boot2

Playing with the bootstrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs | README

amd64.py (17779B)


      1 from common import (
      2     AddI,
      3     ArchDef,
      4     BranchReg,
      5     CondB,
      6     CondBZ,
      7     Enter,
      8     La,
      9     LaBr,
     10     LdArg,
     11     Li,
     12     LogI,
     13     Mem,
     14     Mov,
     15     Nullary,
     16     Rrr,
     17     ShiftI,
     18     byte,
     19     le32,
     20     round_up,
     21 )
     22 
     23 
     24 # ---- Native register numbers --------------------------------------------
     25 #
     26 # Backend-private mapping from P1 register names to native amd64 regnums.
     27 # `br` is the hidden branch-target reg (r15). `scratch` is the per-expansion
     28 # scratch reg (r9). rax/rbp are also used internally (retaddr spill, rcx /
     29 # rdx save slots) and are not P1-visible.
     30 
# Maps both P1 names and raw amd64 names to native 4-bit register numbers.
NAT = {
    # P1-visible registers
    'a0': 7,    # rdi
    'a1': 6,    # rsi
    'a2': 2,    # rdx
    'a3': 1,    # rcx
    't0': 10,   # r10
    't1': 11,   # r11
    't2': 8,    # r8
    's0': 3,    # rbx
    's1': 12,   # r12
    's2': 13,   # r13
    's3': 14,   # r14
    'sp': 4,    # rsp
    # Backend-internal names (see module comment above)
    'br': 15,   # r15
    'scratch': 9,  # r9
    # Raw native names, for sequences pinned to specific amd64 registers
    'rax': 0,
    'rcx': 1,
    'rdx': 2,
    'rbx': 3,
    'rsp': 4,
    'rbp': 5,
    'rsi': 6,
    'rdi': 7,
    'r8': 8,
    'r9': 9,
    'r10': 10,
    'r11': 11,
    'r12': 12,
    'r13': 13,
    'r14': 14,
    'r15': 15,
}
     63 
     64 
# Linux x86-64 syscall numbers; exported to P1 programs via ArchDef below.
SYSCALL_NUMBERS = {
    'SYS_READ': 0,
    'SYS_WRITE': 1,
    'SYS_CLOSE': 3,
    'SYS_OPENAT': 257,
    'SYS_EXIT': 60,
    'SYS_CLONE': 56,
    'SYS_EXECVE': 59,
    'SYS_WAITID': 247,
}
     75 
     76 
     77 # ---- REX / ModRM helpers ------------------------------------------------
     78 
     79 def amd_rex_b_short(r):
     80     # Optional one-byte REX.B (no W) prefix used by push/pop/jmp r/call r/
     81     # mov r,imm32 when the target reg is r8-r15. Returns '' for low regs.
     82     if NAT[r] >= 8:
     83         return byte(0x41)
     84     return ''
     85 
     86 
     87 def amd_rex_wb(r):
     88     # REX.W=1, B=(r>>3) to extend ModRM.rm / SIB.base.
     89     return byte(0x48 | ((NAT[r] >> 3) & 1))
     90 
     91 
     92 def amd_rex_wrb(rg, rm):
     93     # REX.W=1, R=(rg>>3), B=(rm>>3). Used whenever a ModRM.reg field is in
     94     # use together with a ModRM.rm field.
     95     return byte(0x48 | (((NAT[rg] >> 3) & 1) << 2) | ((NAT[rm] >> 3) & 1))
     96 
     97 
     98 def amd_modrm_rr(rg, rm):
     99     return byte(0xC0 | ((NAT[rg] & 7) << 3) | (NAT[rm] & 7))
    100 
    101 
    102 def amd_modrm_ext_r(ext, rm):
    103     return byte(0xC0 | ((ext & 7) << 3) | (NAT[rm] & 7))
    104 
    105 
    106 # ---- Memory-addressing ModRM (+ SIB + disp) ----------------------------
    107 #
    108 # [base + disp] with `reg` in ModRM.reg. Bases whose low 3 bits are 100 -
    109 # rsp and r12 - must go through a SIB byte; all others use the plain
    110 # encoding. disp selects mod=1 (disp8) when it fits in [-128,127], else
    111 # mod=2 (disp32). We never emit mod=0 / no-disp; the extra byte is fine.
    112 
    113 def amd_modrm_disp(reg, base, disp):
    114     use_sib = (NAT[base] & 7) == 4
    115     use_disp8 = -128 <= disp <= 127
    116     reg_lo = NAT[reg] & 7
    117     if use_sib:
    118         if use_disp8:
    119             return byte(0x44 | (reg_lo << 3)) + byte(0x24) + byte(disp)
    120         return byte(0x84 | (reg_lo << 3)) + byte(0x24) + le32(disp)
    121     base_lo = NAT[base] & 7
    122     if use_disp8:
    123         return byte(0x40 | (reg_lo << 3) | base_lo) + byte(disp)
    124     return byte(0x80 | (reg_lo << 3) | base_lo) + le32(disp)
    125 
    126 
    127 # ---- Register / arithmetic primitives ----------------------------------
    128 
    129 def amd_mov_rr(dst, src):
    130     # mov dst, src  --  REX.WRB 89 /r  (source in ModRM.reg, dest in rm)
    131     return amd_rex_wrb(src, dst) + byte(0x89) + amd_modrm_rr(src, dst)
    132 
    133 
    134 def amd_alu_rr(opcode, dst, src):
    135     # ADD/SUB/AND/OR/XOR dst, src  --  REX.WRB <op> /r (src in reg, dst in rm)
    136     return amd_rex_wrb(src, dst) + byte(opcode) + amd_modrm_rr(src, dst)
    137 
    138 
    139 def amd_alu_ri8(ext, dst, imm):
    140     # op dst, imm8 -- REX.WB 83 /ext ib
    141     return amd_rex_wb(dst) + byte(0x83) + amd_modrm_ext_r(ext, dst) + byte(imm)
    142 
    143 
    144 def amd_alu_ri32(ext, dst, imm):
    145     # op dst, imm32 -- REX.WB 81 /ext id
    146     return amd_rex_wb(dst) + byte(0x81) + amd_modrm_ext_r(ext, dst) + le32(imm)
    147 
    148 
    149 def amd_shift_ri8(ext, dst, imm):
    150     # shift dst, imm8 -- REX.WB C1 /ext ib  (SHL=4, SHR=5, SAR=7)
    151     return (amd_rex_wb(dst) + byte(0xC1) + amd_modrm_ext_r(ext, dst)
    152             + byte(imm & 0x3F))
    153 
    154 
    155 def amd_shift_cl(ext, dst):
    156     # shift dst, cl -- REX.WB D3 /ext
    157     return amd_rex_wb(dst) + byte(0xD3) + amd_modrm_ext_r(ext, dst)
    158 
    159 
    160 def amd_imul_rr(dst, src):
    161     # imul dst, src  --  REX.WRB 0F AF /r  (dst in reg, src in rm)
    162     return (amd_rex_wrb(dst, src) + byte(0x0F) + byte(0xAF)
    163             + amd_modrm_rr(dst, src))
    164 
    165 
    166 def amd_idiv_r(src):
    167     # idiv src  --  REX.WB F7 /7
    168     return amd_rex_wb(src) + byte(0xF7) + amd_modrm_ext_r(7, src)
    169 
    170 
    171 def amd_cqo():
    172     # cqo -- 48 99 (sign-extend rax into rdx:rax)
    173     return byte(0x48) + byte(0x99)
    174 
    175 
    176 def amd_push(r):
    177     return amd_rex_b_short(r) + byte(0x50 | (NAT[r] & 7))
    178 
    179 
    180 def amd_pop(r):
    181     return amd_rex_b_short(r) + byte(0x58 | (NAT[r] & 7))
    182 
    183 
    184 def amd_mov_imm32_prefix(rd):
    185     # mov r32, imm32  --  [REX.B] B8+r  (caller appends 4-byte literal).
    186     # Result is zero-extended into the full 64-bit register.
    187     return amd_rex_b_short(rd) + byte(0xB8 | (NAT[rd] & 7))
    188 
    189 
    190 def amd_mov_imm64_prefix(rd):
    191     # mov r64, imm64  --  REX.W[.B] B8+r  (caller appends 8-byte literal).
    192     return amd_rex_wb(rd) + byte(0xB8 | (NAT[rd] & 7))
    193 
    194 
    195 # ---- Memory ops --------------------------------------------------------
    196 
    197 def amd_mem_LD(rt, rn, off):
    198     # mov rT, [rN + off]  --  REX.WRB 8B /r  modrm-with-disp
    199     return amd_rex_wrb(rt, rn) + byte(0x8B) + amd_modrm_disp(rt, rn, off)
    200 
    201 
    202 def amd_mem_ST(rt, rn, off):
    203     # mov [rN + off], rT  --  REX.WRB 89 /r
    204     return amd_rex_wrb(rt, rn) + byte(0x89) + amd_modrm_disp(rt, rn, off)
    205 
    206 
    207 def amd_mem_SB(rt, rn, off):
    208     # mov [rN + off], rT8 -- REX.WRB 88 /r (REX.W forces dil/sil/bpl/spl
    209     # byte-view encoding when the low byte of those regs is needed).
    210     return amd_rex_wrb(rt, rn) + byte(0x88) + amd_modrm_disp(rt, rn, off)
    211 
    212 
    213 def amd_mem_LB(rt, rn, off):
    214     # movzx rT, byte ptr [rN + off]  --  REX.WRB 0F B6 /r
    215     return (amd_rex_wrb(rt, rn) + byte(0x0F) + byte(0xB6)
    216             + amd_modrm_disp(rt, rn, off))
    217 
    218 
    219 # ---- Control-flow primitives -------------------------------------------
    220 
    221 def amd_jmp_r(r):
    222     # jmp r/m64 -- [REX.B] FF /4. 2 bytes for low regs, 3 bytes for r8-r15.
    223     return amd_rex_b_short(r) + byte(0xFF) + byte(0xE0 | (NAT[r] & 7))
    224 
    225 
    226 def amd_call_r(r):
    227     # call r/m64 -- [REX.B] FF /2.
    228     return amd_rex_b_short(r) + byte(0xFF) + byte(0xD0 | (NAT[r] & 7))
    229 
    230 
def amd_ret():
    # ret -- C3.
    return byte(0xC3)
    233 
    234 
def amd_syscall():
    # syscall -- 0F 05.
    return byte(0x0F) + byte(0x05)
    237 
    238 
    239 def amd_cmp_rr(ra, rb):
    240     # cmp rA, rB -- REX.WRB 39 /r (rB in reg, rA in rm).
    241     return amd_rex_wrb(rb, ra) + byte(0x39) + amd_modrm_rr(rb, ra)
    242 
    243 
    244 def amd_test_rr(ra, rb):
    245     return amd_rex_wrb(rb, ra) + byte(0x85) + amd_modrm_rr(rb, ra)
    246 
    247 
    248 # ---- P1 register-register op lowering ----------------------------------
    249 #
    250 # For ADD/SUB/AND/OR/XOR we honor rD==rB aliasing -- the naive
    251 # `mov rD,rA ; op rD,rB` would clobber rB before the op reads it. Route rB
    252 # through the scratch reg when that aliasing shows up.
    253 
# Opcode byte for each plain ALU op, `op r/m64, r64` (store) form.
ALU_OPCODE = {
    'ADD': 0x01,
    'SUB': 0x29,
    'AND': 0x21,
    'OR': 0x09,
    'XOR': 0x31,
}
    261 
    262 
    263 def amd_rrr_simple(opcode, rd, ra, rb):
    264     if NAT[rd] == NAT[rb]:
    265         return (amd_mov_rr('scratch', rb)
    266                 + amd_mov_rr(rd, ra)
    267                 + amd_alu_rr(opcode, rd, 'scratch'))
    268     return amd_mov_rr(rd, ra) + amd_alu_rr(opcode, rd, rb)
    269 
    270 
    271 def amd_rrr_MUL(rd, ra, rb):
    272     if NAT[rd] == NAT[rb]:
    273         return (amd_mov_rr('scratch', rb)
    274                 + amd_mov_rr(rd, ra)
    275                 + amd_imul_rr(rd, 'scratch'))
    276     return amd_mov_rr(rd, ra) + amd_imul_rr(rd, rb)
    277 
    278 
    279 # DIV / REM clobber rax and rdx natively. rax is not a P1 register, so we
    280 # clobber it freely; rdx IS P1 a2, so we stash it to rbp (also outside the
    281 # P1 mapping) for the lifetime of the op. Aliasing-safety plan mirrors the
    282 # M1pp comments verbatim.
    283 
    284 def amd_rrr_DIV(rd, ra, rb):
    285     return ''.join([
    286         amd_mov_rr('rbp', 'rdx'),
    287         amd_mov_rr('scratch', rb),
    288         amd_mov_rr('rax', ra),
    289         amd_cqo(),
    290         amd_idiv_r('scratch'),
    291         amd_mov_rr('rdx', 'rbp'),
    292         amd_mov_rr(rd, 'rax'),
    293     ])
    294 
    295 
    296 def amd_rrr_REM(rd, ra, rb):
    297     return ''.join([
    298         amd_mov_rr('rbp', 'rdx'),
    299         amd_mov_rr('scratch', rb),
    300         amd_mov_rr('rax', ra),
    301         amd_cqo(),
    302         amd_idiv_r('scratch'),
    303         amd_mov_rr('rax', 'rdx'),
    304         amd_mov_rr('rdx', 'rbp'),
    305         amd_mov_rr(rd, 'rax'),
    306     ])
    307 
    308 
    309 # SHL / SHR / SAR with reg count. x86 reads the count from CL only, so
    310 # staging goes through rcx -- which IS P1 a3. Save rcx to rbp for the
    311 # duration. Ordering matches the M1pp comments.
    312 
    313 def amd_rrr_shift(ext, rd, ra, rb):
    314     return ''.join([
    315         amd_mov_rr('rbp', 'rcx'),
    316         amd_mov_rr('scratch', ra),
    317         amd_mov_rr('rcx', rb),
    318         amd_shift_cl(ext, 'scratch'),
    319         amd_mov_rr('rcx', 'rbp'),
    320         amd_mov_rr(rd, 'scratch'),
    321     ])
    322 
    323 
    324 # ---- Encoders ----------------------------------------------------------
    325 
def encode_li(_arch, row):
    # LI rd, imm64: emit the mov r64,imm64 prefix; the 8-byte literal
    # is appended by the caller.
    return amd_mov_imm64_prefix(row.rd)
    328 
    329 
def encode_la(_arch, row):
    # LA rd, addr: mov r32,imm32 prefix; the 4-byte address literal is
    # appended by the caller and zero-extends into the full register.
    return amd_mov_imm32_prefix(row.rd)
    332 
    333 
def encode_labr(_arch, _row):
    # Load a 4-byte address literal into the hidden branch-target reg.
    return amd_mov_imm32_prefix('br')
    336 
    337 
    338 def encode_mov(_arch, row):
    339     # Portable sp is the frame-local base, which is 16 bytes above native
    340     # rsp. Reading sp into a register yields native_rsp + 16, so emit
    341     # `mov rd, rsp ; add rd, 16` for the sp-source case.
    342     if row.rs == 'sp':
    343         return amd_mov_rr(row.rd, 'sp') + amd_alu_ri8(0, row.rd, 16)
    344     return amd_mov_rr(row.rd, row.rs)
    345 
    346 
    347 def encode_rrr(_arch, row):
    348     if row.op == 'MUL':
    349         return amd_rrr_MUL(row.rd, row.ra, row.rb)
    350     if row.op == 'DIV':
    351         return amd_rrr_DIV(row.rd, row.ra, row.rb)
    352     if row.op == 'REM':
    353         return amd_rrr_REM(row.rd, row.ra, row.rb)
    354     if row.op == 'SHL':
    355         return amd_rrr_shift(4, row.rd, row.ra, row.rb)
    356     if row.op == 'SHR':
    357         return amd_rrr_shift(5, row.rd, row.ra, row.rb)
    358     if row.op == 'SAR':
    359         return amd_rrr_shift(7, row.rd, row.ra, row.rb)
    360     return amd_rrr_simple(ALU_OPCODE[row.op], row.rd, row.ra, row.rb)
    361 
    362 
    363 def encode_addi(_arch, row):
    364     head = amd_mov_rr(row.rd, row.ra)
    365     if -128 <= row.imm <= 127:
    366         return head + amd_alu_ri8(0, row.rd, row.imm)
    367     return head + amd_alu_ri32(0, row.rd, row.imm)
    368 
    369 
    370 # AND/OR with imm: 83 /ext ib sign-extends imm8 to 64 bits. That works for
    371 # imm in [-128, 127] (and for -1 as a convenient all-ones mask), but breaks
    372 # for positive imms >= 128 -- ANDI with 255 would become AND with
    373 # 0xFFFFFFFFFFFFFFFF. Widen to the imm32 form when imm8 would misencode.
# ModRM /ext digits in the 81/83 immediate group: AND is /4, OR is /1.
LOGI_EXT = {
    'ANDI': 4,
    'ORI': 1,
}
    378 
    379 
    380 def encode_logi(_arch, row):
    381     head = amd_mov_rr(row.rd, row.ra)
    382     ext = LOGI_EXT[row.op]
    383     if -128 <= row.imm <= 127:
    384         return head + amd_alu_ri8(ext, row.rd, row.imm)
    385     return head + amd_alu_ri32(ext, row.rd, row.imm)
    386 
    387 
# ModRM /ext digits for the C1 shift group (SHL=4, SHR=5, SAR=7).
SHIFTI_EXT = {
    'SHLI': 4,
    'SHRI': 5,
    'SARI': 7,
}
    393 
    394 
    395 def encode_shifti(_arch, row):
    396     return (amd_mov_rr(row.rd, row.ra)
    397             + amd_shift_ri8(SHIFTI_EXT[row.op], row.rd, row.imm))
    398 
    399 
    400 def encode_mem(_arch, row):
    401     # Portable sp points to the frame-local base; the 16-byte hidden frame
    402     # header sits at native_rsp+0..15 and is not portable-addressable.
    403     # Shift sp-relative offsets past the header.
    404     off = row.off + 16 if row.rn == 'sp' else row.off
    405     if row.op == 'LD':
    406         return amd_mem_LD(row.rt, row.rn, off)
    407     if row.op == 'ST':
    408         return amd_mem_ST(row.rt, row.rn, off)
    409     if row.op == 'LB':
    410         return amd_mem_LB(row.rt, row.rn, off)
    411     if row.op == 'SB':
    412         return amd_mem_SB(row.rt, row.rn, off)
    413     raise ValueError(f'unknown mem op: {row.op}')
    414 
    415 
    416 def encode_ldarg(_arch, row):
    417     # Internal callers bypass the +16 sp-base translation: native rsp+8
    418     # holds the saved caller-sp pointer set up by p1_enter, and the first
    419     # incoming stack-arg word lives 16 bytes past that.
    420     return (amd_mem_LD('scratch', 'sp', 8)
    421             + amd_mem_LD(row.rd, 'scratch', 16 + 8 * row.slot))
    422 
    423 
    424 def amd_epilogue_prefix():
    425     # Frame-teardown prefix shared by ERET, TAIL, TAILR. Loads retaddr into
    426     # scratch (r9), saved caller sp into rax, unwinds rsp, then re-pushes
    427     # retaddr so a trailing `ret` or `jmp` finds the right top-of-stack
    428     # layout. (For TAIL/TAILR the trailing op is a jmp, but the retaddr
    429     # still needs to be back on the stack so the eventual callee `ret`
    430     # returns to the original caller.)
    431     return ''.join([
    432         amd_mem_LD('scratch', 'sp', 0),
    433         amd_mem_LD('rax', 'sp', 8),
    434         amd_mov_rr('sp', 'rax'),
    435         amd_push('scratch'),
    436     ])
    437 
    438 
    439 def encode_branch_reg(_arch, row):
    440     if row.kind == 'BR':
    441         return amd_jmp_r(row.rs)
    442     if row.kind == 'CALLR':
    443         return amd_call_r(row.rs)
    444     if row.kind == 'TAILR':
    445         return amd_epilogue_prefix() + amd_jmp_r(row.rs)
    446     raise ValueError(f'unknown branch-reg kind: {row.kind}')
    447 
    448 
    449 # Conditional-branch lowering:
    450 #   cmp / test
    451 #   Jcc_inverse +3       -- skip the 3-byte `jmp r15`
    452 #   jmp r15              -- P1 branch-taken path
    453 #
    454 # Invert codes: BEQ->JNE(75), BNE->JE(74), BLT->JGE(7D), BLTU->JAE(73),
    455 # BLTZ->JGE(7D), BEQZ->JNE(75), BNEZ->JE(74). The 0x03 rel8 skips
    456 # `amd_jmp_r(br)` which is 3 bytes (REX.B 41 + FF + E7).
# Inverted Jcc opcode bytes for the two-register conditional branches.
CONDB_INVERT = {
    'BEQ': 0x75,   # JNE
    'BNE': 0x74,   # JE
    'BLT': 0x7D,   # JGE
    'BLTU': 0x73,  # JAE
}
    463 
# Inverted Jcc opcode bytes for the compare-to-zero branches.
CONDBZ_INVERT = {
    'BEQZ': 0x75,  # JNE
    'BNEZ': 0x74,  # JE
    'BLTZ': 0x7D,  # JGE
}
    469 
    470 
    471 def encode_condb(_arch, row):
    472     return (amd_cmp_rr(row.ra, row.rb)
    473             + byte(CONDB_INVERT[row.op]) + byte(0x03)
    474             + amd_jmp_r('br'))
    475 
    476 
    477 def encode_condbz(_arch, row):
    478     return (amd_test_rr(row.ra, row.ra)
    479             + byte(CONDBZ_INVERT[row.op]) + byte(0x03)
    480             + amd_jmp_r('br'))
    481 
    482 
    483 def encode_enter(arch, row):
    484     # CALL on amd64 pushed the retaddr, so on entry:
    485     #   rsp = caller_sp - 8
    486     #   [rsp] = retaddr
    487     #
    488     # Standard frame after ENTER:
    489     #   [sp + 0]                  = retaddr
    490     #   [sp + 8]                  = saved caller_sp
    491     #   [sp + 16 .. 16 + size - 1] = portable locals
    492     #   total frame = round_up(stack_align, 16 + size)
    493     frame_bytes = round_up(arch.stack_align, 2 * arch.word_bytes + row.size)
    494     return ''.join([
    495         amd_pop('scratch'),
    496         amd_mov_rr('rax', 'sp'),
    497         amd_alu_ri32(5, 'sp', frame_bytes),
    498         amd_mem_ST('scratch', 'sp', 0),
    499         amd_mem_ST('rax', 'sp', 8),
    500     ])
    501 
    502 
def encode_nullary(_arch, row):
    """Encode the operand-less ops: B, CALL, RET, ERET, TAIL, SYSCALL.

    Raises ValueError for any other kind.
    """
    if row.kind == 'B':
        return amd_jmp_r('br')
    if row.kind == 'CALL':
        return amd_call_r('br')
    if row.kind == 'RET':
        return amd_ret()
    if row.kind == 'ERET':
        return amd_epilogue_prefix() + amd_ret()
    if row.kind == 'TAIL':
        return amd_epilogue_prefix() + amd_jmp_r('br')
    if row.kind == 'SYSCALL':
        # P1: a0=num, a1..a3,t0,s0,s1 = args 0..5. Linux amd64: rax=num,
        # rdi/rsi/rdx/r10/r8/r9 = args 0..5, return in rax; syscall also
        # clobbers rcx and r11.
        #
        # Push the P1 registers whose native slots get overwritten or
        # syscall-clobbered -- rsi (a1), rdx (a2), rcx (a3), r11 (t1),
        # r8 (t2) -- then shuffle into the native arg slots, issue
        # syscall, restore, and move the return value (rax) into a0
        # (rdi). Stack offsets after the 5 pushes: [rsp+0]=r8,
        # [rsp+8]=r11, [rsp+16]=rcx (a3), [rsp+24]=rdx (a2),
        # [rsp+32]=rsi (a1).
        return ''.join([
            amd_push('rsi'),
            amd_push('rdx'),
            amd_push('rcx'),
            amd_push('r11'),
            amd_push('r8'),
            amd_mov_rr('rax', 'rdi'),     # a0 -> syscall number
            amd_mem_LD('rdi', 'sp', 32),  # pushed a1 -> arg0
            amd_mem_LD('rsi', 'sp', 24),  # pushed a2 -> arg1
            amd_mem_LD('rdx', 'sp', 16),  # pushed a3 -> arg2
            amd_mov_rr('r8', 'rbx'),      # s0 -> arg4 (t0/r10 = arg3 already in place)
            amd_mov_rr('r9', 'r12'),      # s1 -> arg5
            amd_syscall(),
            amd_pop('r8'),
            amd_pop('r11'),
            amd_pop('rcx'),
            amd_pop('rdx'),
            amd_pop('rsi'),
            amd_mov_rr('rdi', 'rax'),     # return value -> a0
        ])
    raise ValueError(f'unknown nullary kind: {row.kind}')
    547 
    548 
def amd_start_stub():
    """Return the assembler lines forming the backend-owned :_start stub."""
    # Backend-owned :_start stub per docs/P1.md §Program Entry. Linux amd64
    # puts argc at [rsp] and argv starting at [rsp+8]. Load argc into a0
    # (rdi), compute &argv[0] into a1 (rsi), call p1_main under the
    # one-word direct-result convention, then issue sys_exit with
    # p1_main's return value in a0 (== rdi). Mirrors the `%p1_entry`
    # macro in p1/P1-amd64.M1pp.
    #
    # Raw hex outside DEFINE bodies must be single-quoted so bootstrap M0
    # treats it as a literal byte run. The bootstrap amd64 M0 has a 256B
    # token buffer, so each quoted run must stay <= 128 hex chars; we
    # split into multiple short lines defensively.
    def q(hex_bytes):
        # Wrap a hex run in the single quotes bootstrap M0 expects.
        return f"'{hex_bytes}'"

    load_argc = amd_mem_LD('a0', 'sp', 0)
    compute_argv = amd_mov_rr('a1', 'sp') + amd_alu_ri8(0, 'a1', 8)
    labr_prefix = amd_mov_imm32_prefix('br')
    call_main = amd_call_r('br')
    # mov eax, 60 ; syscall. P1 a0 (rdi) already holds p1_main's return.
    sys_exit = byte(0xB8) + le32(60) + amd_syscall()

    return [
        ':_start',
        q(load_argc),
        q(compute_argv),
        q(labr_prefix),
        '&p1_main',  # address literal completing the mov into br
        q(call_main),
        q(sys_exit),
    ]
    580 
    581 
# Row class -> encoder function; consumed via ArchDef(encoders=...) below.
ENCODERS = {
    Li: encode_li,
    La: encode_la,
    LaBr: encode_labr,
    Mov: encode_mov,
    Rrr: encode_rrr,
    AddI: encode_addi,
    LogI: encode_logi,
    ShiftI: encode_shifti,
    Mem: encode_mem,
    LdArg: encode_ldarg,
    Nullary: encode_nullary,
    BranchReg: encode_branch_reg,
    CondB: encode_condb,
    CondBZ: encode_condbz,
    Enter: encode_enter,
}
    599 
    600 
# The complete amd64 backend descriptor (ArchDef is defined in common).
ARCH = ArchDef(
    name='amd64',
    word_bytes=8,
    stack_align=16,
    syscall_numbers=SYSCALL_NUMBERS,
    encoders=ENCODERS,
    start_stub=amd_start_stub,
)