commit ffe72f13e339e8411d89985ad324f01a6483a4e6
parent d7398ef3945e0d7f61cc44a36a2cb86a7ab47093
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 24 Apr 2026 15:54:38 -0700
Add amd64 / riscv64 P1v2 generators; explicit ARCHES wiring
Mirrors p1/gen/aarch64.py; each backend encodes the same byte
sequences as its p1/P1-<arch>.M1pp counterpart.
Drop the ARCH_REGISTRY indirection in common.py. Each arch module now
exposes ARCH = ArchDef(...); p1_gen.py builds an explicit ARCHES dict.
Diffstat:
| A | p1/gen/aarch64.py | | | 410 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | p1/gen/amd64.py | | | 608 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | p1/gen/common.py | | | 49 | +++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | p1/gen/p1_gen.py | | | 259 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | p1/gen/riscv64.py | | | 396 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
5 files changed, 1722 insertions(+), 0 deletions(-)
diff --git a/p1/gen/aarch64.py b/p1/gen/aarch64.py
@@ -0,0 +1,410 @@
+from common import (
+ AddI,
+ ArchDef,
+ BranchReg,
+ CondB,
+ CondBZ,
+ Enter,
+ La,
+ LaBr,
+ LdArg,
+ Li,
+ LogI,
+ Mem,
+ Mov,
+ Nullary,
+ Rrr,
+ ShiftI,
+ le32,
+ round_up,
+)
+
+
+# P1 register name -> AArch64 native register number. Notes:
+#  - encoding 31 is sp or xzr depending on the instruction class;
+#  - 'br' (x17) is the hidden branch-target reg and 'scratch' (x16)
+#    the per-expansion temporary;
+#  - save0..save2 (x23..x25) stage arguments during the SYSCALL shuffle;
+#  - x8 carries the Linux syscall number and backend-internal values.
+NAT = {
+    'a0': 0,
+    'a1': 1,
+    'a2': 2,
+    'a3': 3,
+    'x4': 4,
+    'x5': 5,
+    't0': 9,
+    't1': 10,
+    't2': 11,
+    's0': 19,
+    's1': 20,
+    's2': 21,
+    's3': 22,
+    'sp': 31,
+    'xzr': 31,
+    'lr': 30,
+    'br': 17,
+    'scratch': 16,
+    'x8': 8,
+    'save0': 23,
+    'save1': 24,
+    'save2': 25,
+}
+
+
+# 64-bit three-register ALU base opcodes; aa_rrr ORs in the register
+# fields (Rm<<16, Rn<<5, Rd at bit 0). SHL/SHR/SAR/DIV sit in the
+# 0x9AC0xxxx two-source data-processing family.
+RRR_BASE = {
+    'ADD': 0x8B000000,
+    'SUB': 0xCB000000,
+    'AND': 0x8A000000,
+    'OR': 0xAA000000,
+    'XOR': 0xCA000000,
+    'SHL': 0x9AC02000,
+    'SHR': 0x9AC02400,
+    'SAR': 0x9AC02800,
+    'DIV': 0x9AC00C00,
+}
+
+
+# Linux arm64 syscall numbers for the P1-visible SYS_* constants.
+SYSCALL_NUMBERS = {
+    'SYS_READ': 63,
+    'SYS_WRITE': 64,
+    'SYS_CLOSE': 57,
+    'SYS_OPENAT': 56,
+    'SYS_EXIT': 93,
+    'SYS_CLONE': 220,
+    'SYS_EXECVE': 221,
+    'SYS_WAITID': 95,
+}
+
+
+def aa_rrr(base, rd, ra, rb):
+    # OR the three register fields into a RRR_BASE-style opcode:
+    # rb -> Rm (bit 16), ra -> Rn (bit 5), rd -> Rd (bit 0).
+    d = NAT[rd]
+    a = NAT[ra]
+    b = NAT[rb]
+    return le32(base | (b << 16) | (a << 5) | d)
+
+
+def aa_add_imm(rd, ra, imm12, sub=False):
+    # ADD/SUB (immediate), 64-bit. imm12 must already be non-negative
+    # and fit 12 bits; callers negate and pass sub=True for negative
+    # deltas.
+    d = NAT[rd]
+    a = NAT[ra]
+    base = 0xD1000000 if sub else 0x91000000
+    return le32(base | ((imm12 & 0xFFF) << 10) | (a << 5) | d)
+
+
+def aa_mov_rr(dst, src):
+    # Register-register move. Encoding 31 means xzr in ORR but sp in
+    # ADD-immediate, so moves touching sp must use `add rd, rn, #0`
+    # rather than the `orr rd, xzr, rm` form below.
+    if dst == 'sp':
+        return aa_add_imm('sp', src, 0, sub=False)
+    if src == 'sp':
+        return aa_add_imm(dst, 'sp', 0, sub=False)
+    d = NAT[dst]
+    s = NAT[src]
+    return le32(0xAA000000 | (s << 16) | (31 << 5) | d)
+
+
+def aa_ubfm(rd, ra, immr, imms):
+    # UBFM, 64-bit form: underlies the LSL/LSR-immediate aliases used
+    # by encode_shifti.
+    d = NAT[rd]
+    a = NAT[ra]
+    return le32(0xD3400000 | (immr << 16) | (imms << 10) | (a << 5) | d)
+
+
+def aa_sbfm(rd, ra, immr, imms):
+    # SBFM, 64-bit form: underlies the ASR-immediate alias.
+    d = NAT[rd]
+    a = NAT[ra]
+    return le32(0x93400000 | (immr << 16) | (imms << 10) | (a << 5) | d)
+
+
+def aa_movz(rd, imm16):
+    # movz rd, #imm16 (shift 0): rd = zero-extended imm16.
+    d = NAT[rd]
+    return le32(0xD2800000 | ((imm16 & 0xFFFF) << 5) | d)
+
+
+def aa_movn(rd, imm16):
+    # movn rd, #imm16 (shift 0): rd = ~imm16.
+    d = NAT[rd]
+    return le32(0x92800000 | ((imm16 & 0xFFFF) << 5) | d)
+
+
+def aa_materialize_small_imm(rd, imm):
+    # Single-instruction constant: MOVZ covers 0..0xFFFF; MOVN covers
+    # -0x10000..-1 (movn yields ~imm16, so pass the complement).
+    if imm >= 0:
+        return aa_movz(rd, imm)
+    return aa_movn(rd, (~imm) & 0xFFFF)
+
+
+def aa_ldst_uimm12(base, rt, rn, off_bytes, size_log2):
+    # LDR/STR, unsigned scaled imm12 form: the stored offset is the
+    # byte offset divided by the access size. Caller guarantees
+    # alignment and range (see aa_mem).
+    imm12 = off_bytes >> size_log2
+    t = NAT[rt]
+    n = NAT[rn]
+    return le32(base | (imm12 << 10) | (n << 5) | t)
+
+
+def aa_ldst_unscaled(base, rt, rn, off):
+    # LDUR/STUR, unscaled signed imm9 form (-256..255).
+    imm9 = off & 0x1FF
+    t = NAT[rt]
+    n = NAT[rn]
+    return le32(base | (imm9 << 12) | (n << 5) | t)
+
+
+def aa_mem(op, rt, rn, off):
+    # Lower one load/store, picking the smallest encoding that fits:
+    # scaled unsigned imm12 first, then unscaled signed imm9, then an
+    # add/sub of the offset into 'scratch' plus a zero-offset access.
+    # The scratch fallback means rn must never itself be 'scratch'.
+    bases = {
+        # op -> (scaled-imm12 base, log2(access size), unscaled base)
+        'LD': (0xF9400000, 3, 0xF8400000),
+        'ST': (0xF9000000, 3, 0xF8000000),
+        'LB': (0x39400000, 0, 0x38400000),
+        'SB': (0x39000000, 0, 0x38000000),
+    }
+    uimm_base, size_log2, unscaled_base = bases[op]
+    scale = 1 << size_log2
+    if off >= 0 and off % scale == 0 and off < (4096 << size_log2):
+        return aa_ldst_uimm12(uimm_base, rt, rn, off, size_log2)
+    if -256 <= off <= 255:
+        return aa_ldst_unscaled(unscaled_base, rt, rn, off)
+    if -2048 <= off <= 2047:
+        if off >= 0:
+            addr = aa_add_imm('scratch', rn, off, sub=False)
+        else:
+            addr = aa_add_imm('scratch', rn, -off, sub=True)
+        return addr + aa_ldst_uimm12(uimm_base, rt, 'scratch', 0, size_log2)
+    raise ValueError(f'aarch64 offset out of range for {op}: {off}')
+
+
+def aa_cmp_skip(op, ra, rb):
+    # cmp ra, rb ; b.<inverse-cond> +8 -- the conditional branch skips
+    # the `br` the caller appends, so fall-through means the P1
+    # condition did not hold. Inverse condition codes: BEQ->NE(1),
+    # BNE->EQ(0), BLT->GE(10), BLTU->HS(2).
+    a = NAT[ra]
+    b = NAT[rb]
+    cmp_hex = le32(0xEB000000 | (b << 16) | (a << 5) | 31)
+    skip_cond = {
+        'BEQ': 1,
+        'BNE': 0,
+        'BLT': 10,
+        'BLTU': 2,
+    }[op]
+    return cmp_hex + le32(0x54000040 | skip_cond)
+
+
+def aa_br(reg):
+    # br <reg> -- indirect jump.
+    return le32(0xD61F0000 | (NAT[reg] << 5))
+
+
+def aa_blr(reg):
+    # blr <reg> -- indirect call; retaddr goes to lr.
+    return le32(0xD63F0000 | (NAT[reg] << 5))
+
+
+def aa_ret():
+    # ret (returns through lr).
+    return le32(0xD65F03C0)
+
+
+def aa_epilogue():
+    # Frame teardown, shared by ERET, TAIL, TAILR. Loads lr and the
+    # saved caller sp from the hidden header at native_sp+0/+8, then
+    # unwinds sp. Does NOT transfer control; the caller appends an
+    # aa_ret / aa_br as appropriate. (The caller sp is staged through
+    # x8 because a load cannot write sp directly.)
+    return (
+        aa_mem('LD', 'lr', 'sp', 0)
+        + aa_mem('LD', 'x8', 'sp', 8)
+        + aa_mov_rr('sp', 'x8')
+    )
+
+
+def aa_lit64_prefix(rd):
+    ## 64-bit literal-pool prefix for LI: ldr xN, [pc,#8]; b PC+12.
+    ## The 8 bytes that follow in source become the literal; b skips them.
+    d = NAT[rd]
+    ldr_lit = 0x58000040 | d
+    b_plus12 = 0x14000003
+    return le32(ldr_lit) + le32(b_plus12)
+
+
+def aa_lit32_prefix(rd):
+    ## 32-bit literal-pool prefix for LA / LA_BR: ldr wN, [pc,#8]; b PC+8.
+    ## ldr w zero-extends into the full 64-bit register, so a 4-byte literal
+    ## is enough for any address in the stage0 layout (base 0x00600000,
+    ## programs well under 4 GB). This lets source use `&label` directly
+    ## without padding to 8 bytes.
+    d = NAT[rd]
+    ldr_lit = 0x18000040 | d
+    b_plus8 = 0x14000002
+    return le32(ldr_lit) + le32(b_plus8)
+
+
+def encode_li(_arch, row):
+    # LI rd, imm64 -- prefix only; the 8-byte literal follows in source
+    # (see aa_lit64_prefix).
+    return aa_lit64_prefix(row.rd)
+
+
+def encode_la(_arch, row):
+    # LA rd, &label -- a 4-byte address literal follows in source.
+    return aa_lit32_prefix(row.rd)
+
+
+def encode_labr(_arch, _row):
+    # LA_BR &label -- load the hidden branch-target reg.
+    return aa_lit32_prefix('br')
+
+
+def encode_mov(_arch, row):
+    # Portable `sp` is the frame-local base, which is 16 bytes above
+    # native sp (the backend's 2-word hidden header sits at the low end
+    # of each frame allocation). So reading sp into a register yields
+    # native_sp + 16, not native_sp itself.
+    if row.rs == 'sp':
+        return aa_add_imm(row.rd, 'sp', 16, sub=False)
+    return aa_mov_rr(row.rd, row.rs)
+
+
+def encode_rrr(_arch, row):
+    if row.op == 'MUL':
+        # madd rd, ra, rb, xzr (the 31<<10 Ra field is xzr).
+        d = NAT[row.rd]
+        a = NAT[row.ra]
+        b = NAT[row.rb]
+        return le32(0x9B000000 | (b << 16) | (31 << 10) | (a << 5) | d)
+    if row.op == 'REM':
+        # rem = ra - (ra / rb) * rb:
+        #   sdiv scratch, ra, rb ; msub rd, scratch, rb, ra
+        d = NAT[row.rd]
+        a = NAT[row.ra]
+        b = NAT[row.rb]
+        sc = NAT['scratch']
+        sdiv = 0x9AC00C00 | (b << 16) | (a << 5) | sc
+        msub = 0x9B008000 | (b << 16) | (a << 10) | (sc << 5) | d
+        return le32(sdiv) + le32(msub)
+    return aa_rrr(RRR_BASE[row.op], row.rd, row.ra, row.rb)
+
+
+def encode_addi(_arch, row):
+    # ADDI with a negative imm becomes the SUB-immediate form.
+    if row.imm >= 0:
+        return aa_add_imm(row.rd, row.ra, row.imm, sub=False)
+    return aa_add_imm(row.rd, row.ra, -row.imm, sub=True)
+
+
+def encode_logi(_arch, row):
+    # Materialize imm into scratch (one MOVZ/MOVN), then the reg-reg
+    # AND / ORR.
+    seq = aa_materialize_small_imm('scratch', row.imm)
+    base = {
+        'ANDI': 0x8A000000,
+        'ORI': 0xAA000000,
+    }[row.op]
+    return seq + aa_rrr(base, row.rd, row.ra, 'scratch')
+
+
+def encode_shifti(_arch, row):
+    # Shift-immediate via the bitfield-move aliases:
+    # SHLI -> lsl (ubfm), SHRI -> lsr (ubfm), SARI -> asr (sbfm).
+    if row.op == 'SHLI':
+        return aa_ubfm(row.rd, row.ra, (-row.imm) & 63, 63 - row.imm)
+    if row.op == 'SHRI':
+        return aa_ubfm(row.rd, row.ra, row.imm, 63)
+    return aa_sbfm(row.rd, row.ra, row.imm, 63)
+
+
+def encode_mem(_arch, row):
+    # Portable sp points to the frame-local base; the 2-word hidden
+    # header sits at native_sp+0/+8 and is not portable-addressable.
+    # Shift sp-relative offsets past the header.
+    off = row.off + 16 if row.rn == 'sp' else row.off
+    return aa_mem(row.op, row.rt, row.rn, off)
+
+
+def encode_ldarg(_arch, row):
+    # LDARG rd, slot -- native offsets, deliberately bypassing the +16
+    # sp translation above: [sp+8] holds the saved caller sp, and
+    # incoming stack args start 16 bytes past it, 8 bytes per slot.
+    return aa_mem('LD', 'scratch', 'sp', 8) + aa_mem('LD', row.rd, 'scratch', 16 + 8 * row.slot)
+
+
+def encode_branch_reg(_arch, row):
+    # BR = br, CALLR = blr, TAILR = frame teardown + br.
+    if row.kind == 'BR':
+        return aa_br(row.rs)
+    if row.kind == 'CALLR':
+        return aa_blr(row.rs)
+    if row.kind == 'TAILR':
+        return aa_epilogue() + aa_br(row.rs)
+    raise ValueError(f'unknown branch-reg kind: {row.kind}')
+
+
+def encode_condb(_arch, row):
+    # cmp + b.<inverse> skip pair, then the actual `br` to the target
+    # held in the hidden branch reg.
+    return aa_cmp_skip(row.op, row.ra, row.rb) + aa_br('br')
+
+
+def encode_condbz(_arch, row):
+    # Compare-against-zero branches. 0xB5/0xB4 are the cbnz/cbz bases;
+    # imm19=2 (the 2<<5) skips the single `br` instruction, so the br
+    # runs only when the P1 condition holds. BLTZ falls back to
+    # cmp-with-xzr + b.ge-skip.
+    a = NAT[row.ra]
+    br_hex = aa_br('br')
+    if row.op == 'BEQZ':
+        return le32(0xB5000000 | (2 << 5) | a) + br_hex
+    if row.op == 'BNEZ':
+        return le32(0xB4000000 | (2 << 5) | a) + br_hex
+    cmp_zero = le32(0xEB1F001F | (a << 5))
+    bge = le32(0x54000040 | 10)
+    return cmp_zero + bge + br_hex
+
+
+def encode_enter(arch, row):
+    # Standard frame: [sp+0]=lr, [sp+8]=caller sp, [sp+16..]=portable
+    # locals; total rounded up to stack_align. x8 recomputes the
+    # caller sp (sp before the subtract) for the hidden header.
+    frame_bytes = round_up(arch.stack_align, 2 * arch.word_bytes + row.size)
+    return (
+        aa_add_imm('sp', 'sp', frame_bytes, sub=True)
+        + aa_mem('ST', 'lr', 'sp', 0)
+        + aa_add_imm('x8', 'sp', frame_bytes, sub=False)
+        + aa_mem('ST', 'x8', 'sp', 8)
+    )
+
+
+def encode_nullary(_arch, row):
+    # No-operand control ops; all indirect flow goes through the hidden
+    # branch reg loaded by LA_BR.
+    if row.kind == 'B':
+        return aa_br('br')
+    if row.kind == 'CALL':
+        return aa_blr('br')
+    if row.kind == 'RET':
+        return aa_ret()
+    if row.kind == 'ERET':
+        return aa_epilogue() + aa_ret()
+    if row.kind == 'TAIL':
+        return aa_epilogue() + aa_br('br')
+    if row.kind == 'SYSCALL':
+        # P1: a0=num, a1..a3,t0,s0,s1 = args 0..5. Linux arm64: x8=num,
+        # x0..x5 = args, result in x0 (== P1 a0). The args shift down
+        # one native slot (a1 -> x0, ...), so a1..a3 are staged through
+        # save0..save2 and restored after the svc (0xD4000001 = svc #0);
+        # only a0 comes back clobbered, holding the syscall result.
+        return ''.join([
+            aa_mov_rr('x8', 'a0'),
+            aa_mov_rr('save0', 'a1'),
+            aa_mov_rr('save1', 'a2'),
+            aa_mov_rr('save2', 'a3'),
+            aa_mov_rr('a0', 'save0'),
+            aa_mov_rr('a1', 'save1'),
+            aa_mov_rr('a2', 'save2'),
+            aa_mov_rr('a3', 't0'),
+            aa_mov_rr('x4', 's0'),
+            aa_mov_rr('x5', 's1'),
+            le32(0xD4000001),
+            aa_mov_rr('a1', 'save0'),
+            aa_mov_rr('a2', 'save1'),
+            aa_mov_rr('a3', 'save2'),
+        ])
+    raise ValueError(f'unknown nullary kind: {row.kind}')
+
+
+def aa_start_stub():
+    # Backend-owned :_start stub per docs/P1.md §Program Entry. Captures
+    # argc from [sp] and argv pointer from sp+8, calls p1_main under the
+    # one-word direct-result convention (a0=argc, a1=argv), then issues a
+    # native Linux sys_exit with p1_main's return value. Mirrors the
+    # m1pp-path stub in p1/P1-aarch64.M1pp (`%p1_entry`).
+    #
+    # Raw hex outside `DEFINE` bodies must be single-quoted so bootstrap
+    # M0 treats it as a literal byte run rather than a token.
+    def q(hex_bytes):
+        # Quote one hex run for the emitted M1 source.
+        return f"'{hex_bytes}'"
+    return [
+        ':_start',
+        q(aa_mem('LD', 'a0', 'sp', 0)),
+        q(aa_add_imm('a1', 'sp', 8, sub=False)),
+        q(aa_lit32_prefix('br')),
+        '&p1_main',
+        q(aa_blr('br')),
+        q(aa_movz('x8', 93)),
+        q(le32(0xD4000001)),
+    ]
+
+
+# Row-type -> encoder dispatch table, keyed by the common.py row types.
+ENCODERS = {
+    Li: encode_li,
+    La: encode_la,
+    LaBr: encode_labr,
+    Mov: encode_mov,
+    Rrr: encode_rrr,
+    AddI: encode_addi,
+    LogI: encode_logi,
+    ShiftI: encode_shifti,
+    Mem: encode_mem,
+    LdArg: encode_ldarg,
+    Nullary: encode_nullary,
+    BranchReg: encode_branch_reg,
+    CondB: encode_condb,
+    CondBZ: encode_condbz,
+    Enter: encode_enter,
+}
+
+
+# The module's single export; p1_gen.py wires it into its explicit
+# ARCHES dict.
+ARCH = ArchDef(
+    name='aarch64',
+    word_bytes=8,
+    stack_align=16,
+    syscall_numbers=SYSCALL_NUMBERS,
+    encoders=ENCODERS,
+    start_stub=aa_start_stub,
+)
diff --git a/p1/gen/amd64.py b/p1/gen/amd64.py
@@ -0,0 +1,608 @@
+from common import (
+ AddI,
+ ArchDef,
+ BranchReg,
+ CondB,
+ CondBZ,
+ Enter,
+ La,
+ LaBr,
+ LdArg,
+ Li,
+ LogI,
+ Mem,
+ Mov,
+ Nullary,
+ Rrr,
+ ShiftI,
+ byte,
+ le32,
+ round_up,
+)
+
+
+# ---- Native register numbers --------------------------------------------
+#
+# Backend-private mapping from P1 register names to native amd64 regnums.
+# `br` is the hidden branch-target reg (r15). `scratch` is the per-expansion
+# scratch reg (r9). rax/rbp are also used internally (retaddr spill, rcx /
+# rdx save slots) and are not P1-visible.
+
+# (The native rax..r15 aliases are included alongside the P1 names so
+# the lowering helpers can address internal staging regs directly.)
+NAT = {
+    'a0': 7,   # rdi
+    'a1': 6,   # rsi
+    'a2': 2,   # rdx
+    'a3': 1,   # rcx
+    't0': 10,  # r10
+    't1': 11,  # r11
+    't2': 8,   # r8
+    's0': 3,   # rbx
+    's1': 12,  # r12
+    's2': 13,  # r13
+    's3': 14,  # r14
+    'sp': 4,   # rsp
+    'br': 15,  # r15
+    'scratch': 9,  # r9
+    'rax': 0,
+    'rcx': 1,
+    'rdx': 2,
+    'rbx': 3,
+    'rsp': 4,
+    'rbp': 5,
+    'rsi': 6,
+    'rdi': 7,
+    'r8': 8,
+    'r9': 9,
+    'r10': 10,
+    'r11': 11,
+    'r12': 12,
+    'r13': 13,
+    'r14': 14,
+    'r15': 15,
+}
+
+
+# Linux x86-64 syscall numbers for the P1-visible SYS_* constants.
+SYSCALL_NUMBERS = {
+    'SYS_READ': 0,
+    'SYS_WRITE': 1,
+    'SYS_CLOSE': 3,
+    'SYS_OPENAT': 257,
+    'SYS_EXIT': 60,
+    'SYS_CLONE': 56,
+    'SYS_EXECVE': 59,
+    'SYS_WAITID': 247,
+}
+
+
+# ---- REX / ModRM helpers ------------------------------------------------
+
+def amd_rex_b_short(r):
+    # Optional one-byte REX.B (no W) prefix used by push/pop/jmp r/call r/
+    # mov r,imm32 when the target reg is r8-r15. Returns '' for low regs.
+    if NAT[r] >= 8:
+        return byte(0x41)
+    return ''
+
+
+def amd_rex_wb(r):
+    # REX.W=1, B=(r>>3) to extend ModRM.rm / SIB.base.
+    return byte(0x48 | ((NAT[r] >> 3) & 1))
+
+
+def amd_rex_wrb(rg, rm):
+    # REX.W=1, R=(rg>>3), B=(rm>>3). Used whenever a ModRM.reg field is in
+    # use together with a ModRM.rm field.
+    return byte(0x48 | (((NAT[rg] >> 3) & 1) << 2) | ((NAT[rm] >> 3) & 1))
+
+
+def amd_modrm_rr(rg, rm):
+    # Register-direct ModRM (mod=11): rg in the reg field, rm in rm.
+    return byte(0xC0 | ((NAT[rg] & 7) << 3) | (NAT[rm] & 7))
+
+
+def amd_modrm_ext_r(ext, rm):
+    # Register-direct ModRM with an opcode extension /ext in the reg field.
+    return byte(0xC0 | ((ext & 7) << 3) | (NAT[rm] & 7))
+
+
+# ---- Memory-addressing ModRM (+ SIB + disp) ----------------------------
+#
+# [base + disp] with `reg` in ModRM.reg. Bases whose low 3 bits are 100 -
+# rsp and r12 - must go through a SIB byte; all others use the plain
+# encoding. disp selects mod=1 (disp8) when it fits in [-128,127], else
+# mod=2 (disp32). We never emit mod=0 / no-disp; the extra byte is fine.
+
+def amd_modrm_disp(reg, base, disp):
+    use_sib = (NAT[base] & 7) == 4
+    use_disp8 = -128 <= disp <= 127
+    reg_lo = NAT[reg] & 7
+    if use_sib:
+        # SIB byte 0x24: scale=1, no index, base = (REX.B:)100.
+        if use_disp8:
+            return byte(0x44 | (reg_lo << 3)) + byte(0x24) + byte(disp)
+        return byte(0x84 | (reg_lo << 3)) + byte(0x24) + le32(disp)
+    base_lo = NAT[base] & 7
+    if use_disp8:
+        return byte(0x40 | (reg_lo << 3) | base_lo) + byte(disp)
+    return byte(0x80 | (reg_lo << 3) | base_lo) + le32(disp)
+
+
+# ---- Register / arithmetic primitives ----------------------------------
+
+def amd_mov_rr(dst, src):
+    # mov dst, src -- REX.WRB 89 /r (source in ModRM.reg, dest in rm)
+    return amd_rex_wrb(src, dst) + byte(0x89) + amd_modrm_rr(src, dst)
+
+
+def amd_alu_rr(opcode, dst, src):
+    # ADD/SUB/AND/OR/XOR dst, src -- REX.WRB <op> /r (src in reg, dst in rm)
+    return amd_rex_wrb(src, dst) + byte(opcode) + amd_modrm_rr(src, dst)
+
+
+def amd_alu_ri8(ext, dst, imm):
+    # op dst, imm8 -- REX.WB 83 /ext ib. The CPU sign-extends imm8 to
+    # 64 bits; callers keep imm in [-128, 127].
+    return amd_rex_wb(dst) + byte(0x83) + amd_modrm_ext_r(ext, dst) + byte(imm)
+
+
+def amd_alu_ri32(ext, dst, imm):
+    # op dst, imm32 -- REX.WB 81 /ext id
+    return amd_rex_wb(dst) + byte(0x81) + amd_modrm_ext_r(ext, dst) + le32(imm)
+
+
+def amd_shift_ri8(ext, dst, imm):
+    # shift dst, imm8 -- REX.WB C1 /ext ib (SHL=4, SHR=5, SAR=7);
+    # the count is masked to 0..63 to match the hardware behavior.
+    return (amd_rex_wb(dst) + byte(0xC1) + amd_modrm_ext_r(ext, dst)
+            + byte(imm & 0x3F))
+
+
+def amd_shift_cl(ext, dst):
+    # shift dst, cl -- REX.WB D3 /ext
+    return amd_rex_wb(dst) + byte(0xD3) + amd_modrm_ext_r(ext, dst)
+
+
+def amd_imul_rr(dst, src):
+    # imul dst, src -- REX.WRB 0F AF /r (dst in reg, src in rm)
+    return (amd_rex_wrb(dst, src) + byte(0x0F) + byte(0xAF)
+            + amd_modrm_rr(dst, src))
+
+
+def amd_idiv_r(src):
+    # idiv src -- REX.WB F7 /7
+    return amd_rex_wb(src) + byte(0xF7) + amd_modrm_ext_r(7, src)
+
+
+def amd_cqo():
+    # cqo -- 48 99 (sign-extend rax into rdx:rax)
+    return byte(0x48) + byte(0x99)
+
+
+def amd_push(r):
+    # push r64 -- [REX.B] 50+r
+    return amd_rex_b_short(r) + byte(0x50 | (NAT[r] & 7))
+
+
+def amd_pop(r):
+    # pop r64 -- [REX.B] 58+r
+    return amd_rex_b_short(r) + byte(0x58 | (NAT[r] & 7))
+
+
+def amd_mov_imm32_prefix(rd):
+    # mov r32, imm32 -- [REX.B] B8+r (caller appends 4-byte literal).
+    # Result is zero-extended into the full 64-bit register.
+    return amd_rex_b_short(rd) + byte(0xB8 | (NAT[rd] & 7))
+
+
+def amd_mov_imm64_prefix(rd):
+    # mov r64, imm64 -- REX.W[.B] B8+r (caller appends 8-byte literal).
+    return amd_rex_wb(rd) + byte(0xB8 | (NAT[rd] & 7))
+
+
+# ---- Memory ops --------------------------------------------------------
+
+def amd_mem_LD(rt, rn, off):
+    # mov rT, [rN + off] -- REX.WRB 8B /r modrm-with-disp
+    return amd_rex_wrb(rt, rn) + byte(0x8B) + amd_modrm_disp(rt, rn, off)
+
+
+def amd_mem_ST(rt, rn, off):
+    # mov [rN + off], rT -- REX.WRB 89 /r
+    return amd_rex_wrb(rt, rn) + byte(0x89) + amd_modrm_disp(rt, rn, off)
+
+
+def amd_mem_SB(rt, rn, off):
+    # mov [rN + off], rT8 -- REX.WRB 88 /r (REX.W forces dil/sil/bpl/spl
+    # byte-view encoding when the low byte of those regs is needed).
+    return amd_rex_wrb(rt, rn) + byte(0x88) + amd_modrm_disp(rt, rn, off)
+
+
+def amd_mem_LB(rt, rn, off):
+    # movzx rT, byte ptr [rN + off] -- REX.WRB 0F B6 /r
+    return (amd_rex_wrb(rt, rn) + byte(0x0F) + byte(0xB6)
+            + amd_modrm_disp(rt, rn, off))
+
+
+# ---- Control-flow primitives -------------------------------------------
+
+def amd_jmp_r(r):
+    # jmp r/m64 -- [REX.B] FF /4. 2 bytes for low regs, 3 bytes for r8-r15.
+    return amd_rex_b_short(r) + byte(0xFF) + byte(0xE0 | (NAT[r] & 7))
+
+
+def amd_call_r(r):
+    # call r/m64 -- [REX.B] FF /2.
+    return amd_rex_b_short(r) + byte(0xFF) + byte(0xD0 | (NAT[r] & 7))
+
+
+def amd_ret():
+    # ret (near) -- C3
+    return byte(0xC3)
+
+
+def amd_syscall():
+    # syscall -- 0F 05
+    return byte(0x0F) + byte(0x05)
+
+
+def amd_cmp_rr(ra, rb):
+    # cmp rA, rB -- REX.WRB 39 /r (rB in reg, rA in rm).
+    return amd_rex_wrb(rb, ra) + byte(0x39) + amd_modrm_rr(rb, ra)
+
+
+def amd_test_rr(ra, rb):
+    # test rA, rB -- REX.WRB 85 /r; encode_condbz calls this with
+    # ra == rb as a zero test.
+    return amd_rex_wrb(rb, ra) + byte(0x85) + amd_modrm_rr(rb, ra)
+
+
+# ---- P1 register-register op lowering ----------------------------------
+#
+# For ADD/SUB/AND/OR/XOR we honor rD==rB aliasing -- the naive
+# `mov rD,rA ; op rD,rB` would clobber rB before the op reads it. Route rB
+# through the scratch reg when that aliasing shows up.
+
+# r/m64, r64 forms of the five simple ALU ops (src in ModRM.reg).
+ALU_OPCODE = {
+    'ADD': 0x01,
+    'SUB': 0x29,
+    'AND': 0x21,
+    'OR': 0x09,
+    'XOR': 0x31,
+}
+
+
+def amd_rrr_simple(opcode, rd, ra, rb):
+    # rd = ra <op> rb, with rd==rb aliasing routed through scratch.
+    if NAT[rd] == NAT[rb]:
+        return (amd_mov_rr('scratch', rb)
+                + amd_mov_rr(rd, ra)
+                + amd_alu_rr(opcode, rd, 'scratch'))
+    return amd_mov_rr(rd, ra) + amd_alu_rr(opcode, rd, rb)
+
+
+def amd_rrr_MUL(rd, ra, rb):
+    # Same aliasing plan as amd_rrr_simple, via imul's 0F AF form.
+    if NAT[rd] == NAT[rb]:
+        return (amd_mov_rr('scratch', rb)
+                + amd_mov_rr(rd, ra)
+                + amd_imul_rr(rd, 'scratch'))
+    return amd_mov_rr(rd, ra) + amd_imul_rr(rd, rb)
+
+
+# DIV / REM clobber rax and rdx natively. rax is not a P1 register, so we
+# clobber it freely; rdx IS P1 a2, so we stash it to rbp (also outside the
+# P1 mapping) for the lifetime of the op. Aliasing-safety plan mirrors the
+# M1pp comments verbatim.
+
+def amd_rrr_DIV(rd, ra, rb):
+    # rdx is saved before anything is clobbered, so ra/rb may
+    # themselves be rdx (a2); rd is written last, so rd == a2
+    # deliberately overrides the rdx restore.
+    return ''.join([
+        amd_mov_rr('rbp', 'rdx'),
+        amd_mov_rr('scratch', rb),
+        amd_mov_rr('rax', ra),
+        amd_cqo(),
+        amd_idiv_r('scratch'),
+        amd_mov_rr('rdx', 'rbp'),
+        amd_mov_rr(rd, 'rax'),
+    ])
+
+
+def amd_rrr_REM(rd, ra, rb):
+    # As DIV, but the remainder (rdx) is parked in rax before rdx is
+    # restored, then copied to rd.
+    return ''.join([
+        amd_mov_rr('rbp', 'rdx'),
+        amd_mov_rr('scratch', rb),
+        amd_mov_rr('rax', ra),
+        amd_cqo(),
+        amd_idiv_r('scratch'),
+        amd_mov_rr('rax', 'rdx'),
+        amd_mov_rr('rdx', 'rbp'),
+        amd_mov_rr(rd, 'rax'),
+    ])
+
+
+# SHL / SHR / SAR with reg count. x86 reads the count from CL only, so
+# staging goes through rcx -- which IS P1 a3. Save rcx to rbp for the
+# duration. Ordering matches the M1pp comments.
+
+def amd_rrr_shift(ext, rd, ra, rb):
+    # The value is shifted in scratch so restoring rcx cannot clobber
+    # the result; rd is written last (rd == a3 overrides the restore).
+    return ''.join([
+        amd_mov_rr('rbp', 'rcx'),
+        amd_mov_rr('scratch', ra),
+        amd_mov_rr('rcx', rb),
+        amd_shift_cl(ext, 'scratch'),
+        amd_mov_rr('rcx', 'rbp'),
+        amd_mov_rr(rd, 'scratch'),
+    ])
+
+
+# ---- Encoders ----------------------------------------------------------
+
+def encode_li(_arch, row):
+    # LI rd, imm64 -- prefix only; the 8-byte literal follows in source.
+    return amd_mov_imm64_prefix(row.rd)
+
+
+def encode_la(_arch, row):
+    # LA rd, &label -- 4-byte literal follows (zero-extended by mov r32).
+    return amd_mov_imm32_prefix(row.rd)
+
+
+def encode_labr(_arch, _row):
+    # LA_BR &label -- load the hidden branch-target reg (r15).
+    return amd_mov_imm32_prefix('br')
+
+
+def encode_mov(_arch, row):
+    # Portable sp is the frame-local base, which is 16 bytes above native
+    # rsp. Reading sp into a register yields native_rsp + 16, so emit
+    # `mov rd, rsp ; add rd, 16` for the sp-source case.
+    if row.rs == 'sp':
+        return amd_mov_rr(row.rd, 'sp') + amd_alu_ri8(0, row.rd, 16)
+    return amd_mov_rr(row.rd, row.rs)
+
+
+def encode_rrr(_arch, row):
+    # Dispatch to the aliasing-safe lowering helpers above.
+    if row.op == 'MUL':
+        return amd_rrr_MUL(row.rd, row.ra, row.rb)
+    if row.op == 'DIV':
+        return amd_rrr_DIV(row.rd, row.ra, row.rb)
+    if row.op == 'REM':
+        return amd_rrr_REM(row.rd, row.ra, row.rb)
+    if row.op == 'SHL':
+        return amd_rrr_shift(4, row.rd, row.ra, row.rb)
+    if row.op == 'SHR':
+        return amd_rrr_shift(5, row.rd, row.ra, row.rb)
+    if row.op == 'SAR':
+        return amd_rrr_shift(7, row.rd, row.ra, row.rb)
+    return amd_rrr_simple(ALU_OPCODE[row.op], row.rd, row.ra, row.rb)
+
+
+def encode_addi(_arch, row):
+    # mov + add (ext 0). imm8 sign-extension is exactly what ADD wants,
+    # so the short form is safe for the whole [-128, 127] range.
+    head = amd_mov_rr(row.rd, row.ra)
+    if -128 <= row.imm <= 127:
+        return head + amd_alu_ri8(0, row.rd, row.imm)
+    return head + amd_alu_ri32(0, row.rd, row.imm)
+
+
+# AND/OR with imm: 83 /ext ib sign-extends imm8 to 64 bits. That works for
+# imm in [-128, 127] (and for -1 as a convenient all-ones mask), but breaks
+# for positive imms >= 128 -- ANDI with 255 would become AND with
+# 0xFFFFFFFFFFFFFFFF. Widen to the imm32 form when imm8 would misencode.
+LOGI_EXT = {
+    'ANDI': 4,
+    'ORI': 1,
+}
+
+
+def encode_logi(_arch, row):
+    head = amd_mov_rr(row.rd, row.ra)
+    ext = LOGI_EXT[row.op]
+    if -128 <= row.imm <= 127:
+        return head + amd_alu_ri8(ext, row.rd, row.imm)
+    return head + amd_alu_ri32(ext, row.rd, row.imm)
+
+
+# C1 /ext opcode extensions for the shift-immediate forms.
+SHIFTI_EXT = {
+    'SHLI': 4,
+    'SHRI': 5,
+    'SARI': 7,
+}
+
+
+def encode_shifti(_arch, row):
+    # mov + C1 /ext imm8 (count taken mod 64 by amd_shift_ri8).
+    return (amd_mov_rr(row.rd, row.ra)
+            + amd_shift_ri8(SHIFTI_EXT[row.op], row.rd, row.imm))
+
+
+def encode_mem(_arch, row):
+    # Portable sp points to the frame-local base; the 16-byte hidden frame
+    # header sits at native_rsp+0..15 and is not portable-addressable.
+    # Shift sp-relative offsets past the header.
+    off = row.off + 16 if row.rn == 'sp' else row.off
+    if row.op == 'LD':
+        return amd_mem_LD(row.rt, row.rn, off)
+    if row.op == 'ST':
+        return amd_mem_ST(row.rt, row.rn, off)
+    if row.op == 'LB':
+        return amd_mem_LB(row.rt, row.rn, off)
+    if row.op == 'SB':
+        return amd_mem_SB(row.rt, row.rn, off)
+    raise ValueError(f'unknown mem op: {row.op}')
+
+
+def encode_ldarg(_arch, row):
+    # Internal callers bypass the +16 sp-base translation: native rsp+8
+    # holds the saved caller-sp pointer set up by p1_enter, and the first
+    # incoming stack-arg word lives 16 bytes past that.
+    return (amd_mem_LD('scratch', 'sp', 8)
+            + amd_mem_LD(row.rd, 'scratch', 16 + 8 * row.slot))
+
+
+def amd_epilogue_prefix():
+    # Frame-teardown prefix shared by ERET, TAIL, TAILR. Loads retaddr into
+    # scratch (r9), saved caller sp into rax, unwinds rsp, then re-pushes
+    # retaddr so a trailing `ret` or `jmp` finds the right top-of-stack
+    # layout. (For TAIL/TAILR the trailing op is a jmp, but the retaddr
+    # still needs to be back on the stack so the eventual callee `ret`
+    # returns to the original caller.)
+    return ''.join([
+        amd_mem_LD('scratch', 'sp', 0),
+        amd_mem_LD('rax', 'sp', 8),
+        amd_mov_rr('sp', 'rax'),
+        amd_push('scratch'),
+    ])
+
+
+def encode_branch_reg(_arch, row):
+    # BR = jmp, CALLR = call, TAILR = frame teardown + jmp.
+    if row.kind == 'BR':
+        return amd_jmp_r(row.rs)
+    if row.kind == 'CALLR':
+        return amd_call_r(row.rs)
+    if row.kind == 'TAILR':
+        return amd_epilogue_prefix() + amd_jmp_r(row.rs)
+    raise ValueError(f'unknown branch-reg kind: {row.kind}')
+
+
+# Conditional-branch lowering:
+# cmp / test
+# Jcc_inverse +3 -- skip the 3-byte `jmp r15`
+# jmp r15 -- P1 branch-taken path
+#
+# Invert codes: BEQ->JNE(75), BNE->JE(74), BLT->JGE(7D), BLTU->JAE(73),
+# BLTZ->JGE(7D), BEQZ->JNE(75), BNEZ->JE(74). The 0x03 rel8 skips
+# `amd_jmp_r(br)` which is 3 bytes (REX.B 41 + FF + E7).
+CONDB_INVERT = {
+    'BEQ': 0x75,  # JNE
+    'BNE': 0x74,  # JE
+    'BLT': 0x7D,  # JGE
+    'BLTU': 0x73,  # JAE
+}
+
+CONDBZ_INVERT = {
+    'BEQZ': 0x75,  # JNE
+    'BNEZ': 0x74,  # JE
+    'BLTZ': 0x7D,  # JGE
+}
+
+
+def encode_condb(_arch, row):
+    # cmp ; Jcc-inverse +3 ; jmp r15 -- see the invert-table comment
+    # above. The rel8 of 3 is only valid because br is r15 (REX.B form).
+    return (amd_cmp_rr(row.ra, row.rb)
+            + byte(CONDB_INVERT[row.op]) + byte(0x03)
+            + amd_jmp_r('br'))
+
+
+def encode_condbz(_arch, row):
+    # test ra, ra sets ZF/SF for the zero comparisons; same
+    # skip-then-jmp shape as encode_condb.
+    return (amd_test_rr(row.ra, row.ra)
+            + byte(CONDBZ_INVERT[row.op]) + byte(0x03)
+            + amd_jmp_r('br'))
+
+
+def encode_enter(arch, row):
+    # CALL on amd64 pushed the retaddr, so on entry:
+    #   rsp = caller_sp - 8
+    #   [rsp] = retaddr
+    #
+    # Standard frame after ENTER:
+    #   [sp + 0] = retaddr
+    #   [sp + 8] = saved caller_sp
+    #   [sp + 16 .. 16 + size - 1] = portable locals
+    #   total frame = round_up(stack_align, 16 + size)
+    frame_bytes = round_up(arch.stack_align, 2 * arch.word_bytes + row.size)
+    return ''.join([
+        amd_pop('scratch'),
+        amd_mov_rr('rax', 'sp'),
+        amd_alu_ri32(5, 'sp', frame_bytes),  # sub rsp, frame (ext 5 = SUB)
+        amd_mem_ST('scratch', 'sp', 0),
+        amd_mem_ST('rax', 'sp', 8),
+    ])
+
+
+def encode_nullary(_arch, row):
+    # No-operand ops; indirect control flow targets the hidden br reg.
+    if row.kind == 'B':
+        return amd_jmp_r('br')
+    if row.kind == 'CALL':
+        return amd_call_r('br')
+    if row.kind == 'RET':
+        return amd_ret()
+    if row.kind == 'ERET':
+        return amd_epilogue_prefix() + amd_ret()
+    if row.kind == 'TAIL':
+        return amd_epilogue_prefix() + amd_jmp_r('br')
+    if row.kind == 'SYSCALL':
+        # P1: a0=num, a1..a3,t0,s0,s1 = args 0..5. Linux amd64: rax=num,
+        # rdi/rsi/rdx/r10/r8/r9 = args 0..5, return in rax; syscall also
+        # clobbers rcx and r11.
+        #
+        # Push the P1 registers whose native slots get overwritten or
+        # syscall-clobbered -- rsi (a1), rdx (a2), rcx (a3), r11 (t1),
+        # r8 (t2) -- then shuffle into the native arg slots, issue
+        # syscall, restore, and move the return value (rax) into a0
+        # (rdi). Stack offsets after the 5 pushes: [rsp+0]=r8,
+        # [rsp+8]=r11, [rsp+16]=rcx (a3), [rsp+24]=rdx (a2),
+        # [rsp+32]=rsi (a1).
+        #
+        # arg3 (P1 t0) already sits in its native slot r10, so it needs
+        # no move at all.
+        return ''.join([
+            amd_push('rsi'),
+            amd_push('rdx'),
+            amd_push('rcx'),
+            amd_push('r11'),
+            amd_push('r8'),
+            amd_mov_rr('rax', 'rdi'),
+            amd_mem_LD('rdi', 'sp', 32),
+            amd_mem_LD('rsi', 'sp', 24),
+            amd_mem_LD('rdx', 'sp', 16),
+            amd_mov_rr('r8', 'rbx'),
+            amd_mov_rr('r9', 'r12'),
+            amd_syscall(),
+            amd_pop('r8'),
+            amd_pop('r11'),
+            amd_pop('rcx'),
+            amd_pop('rdx'),
+            amd_pop('rsi'),
+            amd_mov_rr('rdi', 'rax'),
+        ])
+    raise ValueError(f'unknown nullary kind: {row.kind}')
+
+
+def amd_start_stub():
+    # Backend-owned :_start stub per docs/P1.md §Program Entry. Linux amd64
+    # puts argc at [rsp] and argv starting at [rsp+8]. Load argc into a0
+    # (rdi), compute &argv[0] into a1 (rsi), call p1_main under the
+    # one-word direct-result convention, then issue sys_exit with
+    # p1_main's return value in a0 (== rdi). Mirrors the `%p1_entry`
+    # macro in p1/P1-amd64.M1pp.
+    #
+    # Raw hex outside DEFINE bodies must be single-quoted so bootstrap M0
+    # treats it as a literal byte run. The bootstrap amd64 M0 has a 256B
+    # token buffer, so each quoted run must stay <= 128 hex chars; we
+    # split into multiple short lines defensively.
+    def q(hex_bytes):
+        # Quote one hex run for the emitted M1 source.
+        return f"'{hex_bytes}'"
+
+    load_argc = amd_mem_LD('a0', 'sp', 0)
+    compute_argv = amd_mov_rr('a1', 'sp') + amd_alu_ri8(0, 'a1', 8)
+    labr_prefix = amd_mov_imm32_prefix('br')
+    call_main = amd_call_r('br')
+    # mov eax, 60 ; syscall. P1 a0 (rdi) already holds p1_main's return.
+    # (B8 id is the 32-bit form; no REX is needed for eax.)
+    sys_exit = byte(0xB8) + le32(60) + amd_syscall()
+
+    return [
+        ':_start',
+        q(load_argc),
+        q(compute_argv),
+        q(labr_prefix),
+        '&p1_main',
+        q(call_main),
+        q(sys_exit),
+    ]
+
+
+# Row-type -> encoder dispatch table, keyed by the common.py row types.
+ENCODERS = {
+    Li: encode_li,
+    La: encode_la,
+    LaBr: encode_labr,
+    Mov: encode_mov,
+    Rrr: encode_rrr,
+    AddI: encode_addi,
+    LogI: encode_logi,
+    ShiftI: encode_shifti,
+    Mem: encode_mem,
+    LdArg: encode_ldarg,
+    Nullary: encode_nullary,
+    BranchReg: encode_branch_reg,
+    CondB: encode_condb,
+    CondBZ: encode_condbz,
+    Enter: encode_enter,
+}
+
+
+# The module's single export; p1_gen.py wires it into its explicit
+# ARCHES dict.
+ARCH = ArchDef(
+    name='amd64',
+    word_bytes=8,
+    stack_align=16,
+    syscall_numbers=SYSCALL_NUMBERS,
+    encoders=ENCODERS,
+    start_stub=amd_start_stub,
+)
diff --git a/p1/gen/common.py b/p1/gen/common.py
@@ -0,0 +1,49 @@
+from collections import namedtuple
+
+
+# Arch backend descriptor. Each arch module exports `ARCH = ArchDef(...)`;
+# p1_gen.py dispatches through .encoders (row-type -> encoder fn) and
+# .start_stub (returns the backend-owned :_start lines).
+ArchDef = namedtuple(
+    'ArchDef',
+    'name word_bytes stack_align syscall_numbers encoders start_stub',
+)
+
+# ISA-surface row types. `name` is the emitted DEFINE name; the remaining
+# fields are the operands the per-arch encoders consume.
+Banner = namedtuple('Banner', 'text')
+Literal = namedtuple('Literal', 'name hex_by_arch')
+Nullary = namedtuple('Nullary', 'name kind')
+Li = namedtuple('Li', 'name rd')
+La = namedtuple('La', 'name rd')
+LaBr = namedtuple('LaBr', 'name')
+Mov = namedtuple('Mov', 'name rd rs')
+Rrr = namedtuple('Rrr', 'name op rd ra rb')
+AddI = namedtuple('AddI', 'name rd ra imm')
+LogI = namedtuple('LogI', 'name op rd ra imm')
+ShiftI = namedtuple('ShiftI', 'name op rd ra imm')
+Mem = namedtuple('Mem', 'name op rt rn off')
+LdArg = namedtuple('LdArg', 'name rd slot')
+BranchReg = namedtuple('BranchReg', 'name kind rs')
+CondB = namedtuple('CondB', 'name op ra rb')
+CondBZ = namedtuple('CondBZ', 'name op ra')
+Enter = namedtuple('Enter', 'name size')
+
+
+def byte(n):
+    """Hex for the low byte of n: 2 uppercase hex chars."""
+    return f'{n & 0xFF:02X}'
+
+
+def le32(n):
+    """Hex for n as 4 little-endian bytes, uppercase."""
+    return (n & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
+
+
+def le64(n):
+    """Hex for n as 8 little-endian bytes, uppercase."""
+    return (n & 0xFFFFFFFFFFFFFFFF).to_bytes(8, 'little').hex().upper()
+
+
+def word_hex(word_bytes, n):
+    """Hex for n as one native word (4- or 8-byte little-endian)."""
+    if word_bytes == 4:
+        return le32(n)
+    if word_bytes == 8:
+        return le64(n)
+    raise ValueError(f'unsupported word size: {word_bytes}')
+
+
+def round_up(align, n):
+    """Round n up to the next multiple of align (align comes first)."""
+    return ((n + align - 1) // align) * align
diff --git a/p1/gen/p1_gen.py b/p1/gen/p1_gen.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+"""Generate P1 v2 DEFINE tables.
+
+This is a fresh generator for docs/P1v2.md. The ISA surface is described by
+plain namedtuple rows, and each backend registers a simple row-type -> encoder
+mapping. The emitted immediate/offset domains are still curated tables rather
+than the full theoretical spec space, so extending coverage is a one-line data
+edit instead of an architecture rewrite.
+
+Usage:
+ python3 p1/gen/p1_gen.py [--arch ARCH] [build-root]
+ python3 p1/gen/p1_gen.py --check [--arch ARCH] [build-root]
+ python3 p1/gen/p1_gen.py --list-archs
+"""
+
+import os
+import sys
+from itertools import product
+
+from common import (
+ AddI,
+ Banner,
+ BranchReg,
+ CondB,
+ CondBZ,
+ Enter,
+ La,
+ LaBr,
+ LdArg,
+ Li,
+ Literal,
+ LogI,
+ Mem,
+ Mov,
+ Nullary,
+ Rrr,
+ ShiftI,
+ word_hex,
+)
+
+import aarch64
+import amd64
+import riscv64
+
# Explicit arch registry: each backend module exposes ARCH = ArchDef(...),
# keyed here by its name (replaces the old ARCH_REGISTRY indirection).
ARCHES = {a.name: a for a in (aarch64.ARCH, amd64.ARCH, riscv64.ARCH)}


# Portable P1 register names. 'sp' is not a GPR; it is only valid as a
# memory base (P1_BASES) and as a MOV source.
P1_GPRS = ('a0', 'a1', 'a2', 'a3', 't0', 't1', 't2', 's0', 's1', 's2', 's3')
P1_BASES = P1_GPRS + ('sp',)

# Mnemonic families, one tuple per row shape in common.py.
RRR_OPS = ('ADD', 'SUB', 'AND', 'OR', 'XOR', 'SHL', 'SHR', 'SAR', 'MUL', 'DIV', 'REM')
LOGI_OPS = ('ANDI', 'ORI')
SHIFT_OPS = ('SHLI', 'SHRI', 'SARI')
MEM_OPS = ('LD', 'ST', 'LB', 'SB')
CONDB_OPS = ('BEQ', 'BNE', 'BLT', 'BLTU')
CONDBZ_OPS = ('BEQZ', 'BNEZ', 'BLTZ')

# Curated immediate/offset domains (not the full theoretical spec space);
# extending coverage is a one-line edit to these tuples.
ADDI_IMMS = (
    -2048, -1024, -256, -128, -64, -48, -32, -24, -16, -12, -8, -7, -6,
    -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 16, 24, 32, 40,
    48, 63, 64, 127, 128, 255, 256, 512, 1024, 2047,
)

LOGI_IMMS = (
    -1, 0, 1, 2, 3, 4, 6, 7, 8, 15, 16, 31, 32, 63, 64, 127, 255, 511, 1023,
    2047,
)

SHIFT_IMMS = tuple(range(64))

MEM_OFFS = (
    -256, -128, -64, -48, -32, -24, -16, -8, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8,
    15, 16, 24, 32, 40, 48, 56, 64, 128, 255,
)

LDARG_SLOTS = tuple(range(32))
ENTER_SIZES = tuple(range(0, 129))


# Preamble of every emitted file; {arch} is filled in by emit().
HEADER = """## p1_{arch}.M1 — GENERATED by p1/gen/p1_gen.py. Do not edit by hand.
##
## This table targets the P1 v2 ISA described in docs/P1v2.md.
## Row shapes are shared; per-arch lowering lives in p1/gen/<arch>.py.
"""
+
+
def imm_suffix(imm):
    """Spell an immediate for use in a DEFINE name; negatives become NEG<abs>."""
    if imm < 0:
        return f'NEG{-imm}'
    return str(imm)
+
+
def rows(arch):
    """Return the ordered row list for *arch*.

    Banner rows become section comments in the emitted table; every other
    row is encoded by the arch backend. The instruction surface is shared
    across arches -- only the syscall-number Literals at the end vary,
    taken from arch.syscall_numbers.
    """
    out = []

    out.append(Banner('Materialization'))
    for rd in P1_GPRS:
        out.append(Li(name=f'LI_{rd.upper()}', rd=rd))
    for rd in P1_GPRS:
        out.append(La(name=f'LA_{rd.upper()}', rd=rd))
    out.append(LaBr(name='LA_BR'))

    out.append(Banner('Moves'))
    for rd, rs in product(P1_GPRS, P1_GPRS):
        out.append(Mov(name=f'MOV_{rd.upper()}_{rs.upper()}', rd=rd, rs=rs))
    # sp is only a MOV *source* (read the portable stack pointer).
    for rd in P1_GPRS:
        out.append(Mov(name=f'MOV_{rd.upper()}_SP', rd=rd, rs='sp'))

    out.append(Banner('Register Arithmetic'))
    for op, rd, ra, rb in product(RRR_OPS, P1_GPRS, P1_GPRS, P1_GPRS):
        out.append(Rrr(name=f'{op}_{rd.upper()}_{ra.upper()}_{rb.upper()}',
                       op=op, rd=rd, ra=ra, rb=rb))

    out.append(Banner('Immediate Arithmetic'))
    for rd, ra, imm in product(P1_GPRS, P1_GPRS, ADDI_IMMS):
        out.append(AddI(name=f'ADDI_{rd.upper()}_{ra.upper()}_{imm_suffix(imm)}',
                        rd=rd, ra=ra, imm=imm))
    for op, rd, ra, imm in product(LOGI_OPS, P1_GPRS, P1_GPRS, LOGI_IMMS):
        out.append(LogI(name=f'{op}_{rd.upper()}_{ra.upper()}_{imm_suffix(imm)}',
                        op=op, rd=rd, ra=ra, imm=imm))
    for op, rd, ra, imm in product(SHIFT_OPS, P1_GPRS, P1_GPRS, SHIFT_IMMS):
        out.append(ShiftI(name=f'{op}_{rd.upper()}_{ra.upper()}_{imm}',
                          op=op, rd=rd, ra=ra, imm=imm))

    out.append(Banner('Memory'))
    # Bases include sp; offsets are curated, see MEM_OFFS.
    for op, rt, rn, off in product(MEM_OPS, P1_GPRS, P1_BASES, MEM_OFFS):
        out.append(Mem(name=f'{op}_{rt.upper()}_{rn.upper()}_{imm_suffix(off)}',
                       op=op, rt=rt, rn=rn, off=off))

    out.append(Banner('ABI Access'))
    for rd, slot in product(P1_GPRS, LDARG_SLOTS):
        out.append(LdArg(name=f'LDARG_{rd.upper()}_{slot}', rd=rd, slot=slot))

    out.append(Banner('Branches'))
    out.append(Nullary(name='B', kind='B'))
    for rs in P1_GPRS:
        out.append(BranchReg(name=f'BR_{rs.upper()}', kind='BR', rs=rs))
    for op, ra, rb in product(CONDB_OPS, P1_GPRS, P1_GPRS):
        out.append(CondB(name=f'{op}_{ra.upper()}_{rb.upper()}', op=op, ra=ra, rb=rb))
    for op, ra in product(CONDBZ_OPS, P1_GPRS):
        out.append(CondBZ(name=f'{op}_{ra.upper()}', op=op, ra=ra))

    out.append(Banner('Calls And Returns'))
    out.append(Nullary(name='CALL', kind='CALL'))
    out.append(Nullary(name='RET', kind='RET'))
    out.append(Nullary(name='ERET', kind='ERET'))
    out.append(Nullary(name='TAIL', kind='TAIL'))
    for rs in P1_GPRS:
        out.append(BranchReg(name=f'CALLR_{rs.upper()}', kind='CALLR', rs=rs))
    for rs in P1_GPRS:
        out.append(BranchReg(name=f'TAILR_{rs.upper()}', kind='TAILR', rs=rs))

    out.append(Banner('Frame Management'))
    for size in ENTER_SIZES:
        out.append(Enter(name=f'ENTER_{size}', size=size))

    out.append(Banner('System'))
    out.append(Nullary(name='SYSCALL', kind='SYSCALL'))
    # Sorted for deterministic output across Python versions.
    for name, number in sorted(arch.syscall_numbers.items()):
        out.append(Literal(name=name, hex_by_arch={arch.name: word_hex(arch.word_bytes, number)}))

    return out
+
+
def lower_name(name):
    """Lowercase a row name for DEFINE output.

    Names with two or more underscore-separated operands keep the first
    underscore and turn the remaining separators into commas, e.g.
    'ADD_A0_A1_A2' -> 'add_a0,a1,a2'; shorter names just lowercase.
    """
    low = name.lower()
    parts = low.split('_')
    if len(parts) <= 2:
        return low
    return f'{parts[0]}_{",".join(parts[1:])}'
+
+
def encode_row(arch, row):
    """Render *row* to its hex string for *arch*.

    Literal rows carry pre-encoded hex and bypass the encoder dispatch;
    everything else is looked up by row type in arch.encoders.
    """
    if isinstance(row, Literal):
        return row.hex_by_arch[arch.name]
    return arch.encoders[type(row)](arch, row)
+
+
def emit(arch_name):
    """Render the complete .M1 table for *arch_name* as a single string."""
    arch = ARCHES[arch_name]
    out = [HEADER.format(arch=arch.name).rstrip(), '']
    seen = set()
    for row in rows(arch):
        if isinstance(row, Banner):
            out.append('')
            out.append(f'## ---- {row.text}')
            continue
        name = lower_name(row.name)
        # Guard against two rows collapsing to the same DEFINE name
        # (e.g. via the lower_name underscore/comma rewriting).
        if name in seen:
            raise RuntimeError(f'duplicate DEFINE: {name}')
        seen.add(name)
        out.append(f'DEFINE {name} {encode_row(arch, row)}')
    out.append('')
    out.append('## ---- Program Entry')
    out.append('## Backend-owned :_start stub per docs/P1.md §Program Entry.')
    out.append('## Calls p1_main under the one-word direct-result convention')
    out.append("## (a0=argc, a1=argv) and sys_exits its return value.")
    out.extend(arch.start_stub())
    out.append('')
    return '\n'.join(out)
+
+
def parse_args(argv):
    """Parse the command line into (check, archs, build_root).

    --arch may be repeated; if absent, all known arches are selected.
    --list-archs prints the registry and exits. The first non-flag
    argument, if any, overrides the default build root.
    """
    check = False
    archs = []
    positional = []
    args = iter(argv)
    for arg in args:
        if arg == '--check':
            check = True
        elif arg == '--list-archs':
            print('\n'.join(sorted(ARCHES)))
            sys.exit(0)
        elif arg == '--arch':
            # --arch consumes the following argument as its value.
            value = next(args, None)
            if value is None:
                raise SystemExit('--arch requires a value')
            archs.append(value)
        else:
            positional.append(arg)
    build_root = positional[0] if positional else os.path.join('build', 'p1v2')
    return check, archs or sorted(ARCHES), build_root
+
+
def main(argv=None):
    """Generate (or, with --check, verify) the per-arch DEFINE tables.

    Args:
        argv: argument list without the program name; defaults to
            sys.argv[1:]. Exits non-zero if --check finds any stale file.
    """
    # Fix: `parse_args(argv or sys.argv[1:])` treated an explicit empty
    # list as "fall back to sys.argv". Compare against None so callers
    # can pass a genuinely empty argument list.
    if argv is None:
        argv = sys.argv[1:]
    check, archs, build_root = parse_args(argv)
    had_diff = False

    for arch_name in archs:
        arch = ARCHES[arch_name]
        dest_dir = os.path.join(build_root, arch.name)
        path = os.path.join(dest_dir, f'p1_{arch.name}.M1')
        content = emit(arch.name)
        if check:
            # --check never writes; record the diff and keep going so one
            # run reports every stale file.
            try:
                with open(path) as f:
                    existing = f.read()
            except FileNotFoundError:
                existing = ''
            if existing != content:
                sys.stderr.write(f'DIFF: {path}\n')
                had_diff = True
            continue
        os.makedirs(dest_dir, exist_ok=True)
        with open(path, 'w') as f:
            f.write(content)
        print(f'wrote {path} ({len(content)} bytes)')

    if check and had_diff:
        sys.exit(1)
+
+
# Script entry point; see the module docstring for usage.
if __name__ == '__main__':
    main()
diff --git a/p1/gen/riscv64.py b/p1/gen/riscv64.py
@@ -0,0 +1,396 @@
+from common import (
+ AddI,
+ ArchDef,
+ BranchReg,
+ CondB,
+ CondBZ,
+ Enter,
+ La,
+ LaBr,
+ LdArg,
+ Li,
+ LogI,
+ Mem,
+ Mov,
+ Nullary,
+ Rrr,
+ ShiftI,
+ le32,
+ round_up,
+)
+
+
# Portable P1 register name -> native riscv64 register number. Beyond the
# portable set, the backend reserves: br (branch target), scratch
# (LDARG temporary), and save0/save1/save2 (SYSCALL argument shuffle).
# NOTE(review): 'save2' is x16 (a6) -- not a Linux syscall argument
# register, but it aliases the portable-invisible a6; confirm nothing
# else relies on a6 across a SYSCALL.
NAT = {
    'a0': 10,
    'a1': 11,
    'a2': 12,
    'a3': 13,
    'a4': 14,
    'a5': 15,
    'a6': 16,
    'a7': 17,
    't0': 5,
    't1': 6,
    't2': 7,
    's0': 9,
    's1': 18,
    's2': 19,
    's3': 20,
    'sp': 2,
    'zero': 0,
    'ra': 1,
    'fp': 8,
    'br': 31,
    'scratch': 30,
    'save0': 29,
    'save1': 28,
    'save2': 16,
}


# R-type base words (opcode | funct3 | funct7 pre-merged); registers are
# OR-ed in by rv_r_type. MUL/DIV/REM come from the M extension.
RRR_BASE = {
    'ADD': 0x00000033,
    'SUB': 0x40000033,
    'AND': 0x00007033,
    'OR': 0x00006033,
    'XOR': 0x00004033,
    'SHL': 0x00001033,
    'SHR': 0x00005033,
    'SAR': 0x40005033,
    'MUL': 0x02000033,
    'DIV': 0x02004033,
    'REM': 0x02006033,
}


# Inverted-condition B-type opcodes for the skip-taken-over-jalr pattern:
# the skip fires when the P1 condition is FALSE, so the jalr below is the
# taken target.
CONDB_INV_BASE = {
    'BEQ': 0x00001063,   # native BNE -- skip when not equal
    'BNE': 0x00000063,   # native BEQ -- skip when equal
    'BLT': 0x00005063,   # native BGE -- skip when ra >= rb (signed)
    'BLTU': 0x00007063,  # native BGEU -- skip when ra >= rb (unsigned)
}


# Same inversion trick with rb fixed to the zero register.
CONDBZ_INV_BASE = {
    'BEQZ': 0x00001063,
    'BNEZ': 0x00000063,
    'BLTZ': 0x00005063,
}


# Linux riscv64 syscall numbers, emitted as Literal rows by p1_gen.rows().
SYSCALL_NUMBERS = {
    'SYS_READ': 63,
    'SYS_WRITE': 64,
    'SYS_CLOSE': 57,
    'SYS_OPENAT': 56,
    'SYS_EXIT': 93,
    'SYS_CLONE': 220,
    'SYS_EXECVE': 221,
    'SYS_WAITID': 95,
}
+
+
def rv_r_type(base, rd, ra, rb):
    """Encode an R-type instruction: base | rs2<<20 | rs1<<15 | rd<<7."""
    d = NAT[rd]
    a = NAT[ra]
    b = NAT[rb]
    return le32(base | (b << 20) | (a << 15) | (d << 7))


def rv_i_type(base, rd, ra, imm12):
    """Encode an I-type instruction; imm12 is masked to its 12-bit field."""
    d = NAT[rd]
    a = NAT[ra]
    return le32(base | ((imm12 & 0xFFF) << 20) | (a << 15) | (d << 7))
+
+
def rv_s_type(base, rs, ra, imm12):
    """Encode an S-type (store) instruction."""
    s = NAT[rs]
    a = NAT[ra]
    imm = imm12 & 0xFFF
    # S-type splits the immediate: imm[11:5] -> bits [31:25] and
    # imm[4:0] -> bits [11:7]. Masking to 12 bits first reduces negative
    # offsets to their unsigned two's-complement pattern before the split.
    hi = (imm >> 5) & 0x7F
    lo = imm & 0x1F
    return le32(base | (hi << 25) | (s << 20) | (a << 15) | (lo << 7))


def rv_b_type_skip8(base, ra, rb):
    """Encode a B-type branch with a hardcoded +8 (skip-one-insn) offset."""
    # Hardcoded +8 branch: imm = 8, encoded with imm[4:1]=4, imm[11]=0,
    # imm[10:5]=0, imm[12]=0. The combined [11:7] field becomes
    # (imm[4:1] << 1) | imm[11] = 8.
    a = NAT[ra]
    b = NAT[rb]
    return le32(base | (b << 20) | (a << 15) | (8 << 7))
+
+
def rv_addi(rd, ra, imm12):
    """addi rd, ra, imm12 (doubles as a register move when imm12 == 0)."""
    return rv_i_type(0x00000013, rd, ra, imm12)


def rv_ld(rd, ra, imm12):
    """ld rd, imm12(ra) -- 64-bit load."""
    return rv_i_type(0x00003003, rd, ra, imm12)


def rv_sd(rs, ra, imm12):
    """sd rs, imm12(ra) -- 64-bit store."""
    return rv_s_type(0x00003023, rs, ra, imm12)


def rv_lbu(rd, ra, imm12):
    """lbu rd, imm12(ra) -- zero-extending byte load."""
    return rv_i_type(0x00004003, rd, ra, imm12)


def rv_sb(rs, ra, imm12):
    """sb rs, imm12(ra) -- byte store."""
    return rv_s_type(0x00000023, rs, ra, imm12)


def rv_lwu(rd, ra, imm12):
    """lwu rd, imm12(ra) -- zero-extending 32-bit load.

    NOTE(review): not referenced by any encoder in this module
    (rv_lit32_prefix inlines its own lwu word) -- confirm it is
    intentionally kept for completeness.
    """
    return rv_i_type(0x00006003, rd, ra, imm12)


def rv_mov_rr(dst, src):
    """Register-to-register move, lowered as addi dst, src, 0."""
    return rv_addi(dst, src, 0)
+
+
def rv_slli(rd, ra, shamt):
    """slli rd, ra, shamt -- shift left logical immediate (6-bit shamt, RV64)."""
    d = NAT[rd]
    a = NAT[ra]
    return le32(0x00001013 | ((shamt & 0x3F) << 20) | (a << 15) | (d << 7))


def rv_srli(rd, ra, shamt):
    """srli rd, ra, shamt -- shift right logical immediate."""
    d = NAT[rd]
    a = NAT[ra]
    return le32(0x00005013 | ((shamt & 0x3F) << 20) | (a << 15) | (d << 7))


def rv_srai(rd, ra, shamt):
    """srai rd, ra, shamt -- shift right arithmetic immediate."""
    d = NAT[rd]
    a = NAT[ra]
    return le32(0x40005013 | ((shamt & 0x3F) << 20) | (a << 15) | (d << 7))


def rv_jalr(rd, rs, imm12):
    """jalr rd, imm12(rs) -- indirect jump; rd receives the return address."""
    d = NAT[rd]
    s = NAT[rs]
    return le32(0x00000067 | ((imm12 & 0xFFF) << 20) | (s << 15) | (d << 7))


def rv_ecall():
    """ecall -- trap into the kernel (syscall)."""
    return le32(0x00000073)
+
+
def rv_lit64_prefix(rd):
    """Three-instruction prefix that loads the following 8-byte literal into rd."""
    # auipc rd, 0 ; ld rd, 12(rd) ; jal x0, +12.
    # The 8 bytes that follow in source become the literal; the jal at
    # pc+8 lands at pc+20, just past the literal at pc+12..pc+20.
    d = NAT[rd]
    auipc = 0x00000017 | (d << 7)
    ld = 0x00C03003 | (d << 15) | (d << 7)
    jal = 0x00C0006F
    return le32(auipc) + le32(ld) + le32(jal)


def rv_lit32_prefix(rd):
    """Three-instruction prefix that loads the following 4-byte literal into rd."""
    # auipc rd, 0 ; lwu rd, 12(rd) ; jal x0, +8.
    # lwu zero-extends a 4-byte literal; enough for stage0 addresses.
    d = NAT[rd]
    auipc = 0x00000017 | (d << 7)
    lwu = 0x00C06003 | (d << 15) | (d << 7)
    jal = 0x0080006F
    return le32(auipc) + le32(lwu) + le32(jal)


def rv_epilogue():
    """Shared frame-teardown sequence (restores ra and the caller's sp)."""
    # Frame teardown shared by ERET, TAIL, TAILR. Mirrors p1_eret/p1_tail
    # in P1-riscv64.M1pp: load saved ra, load saved caller sp into fp,
    # then move fp into sp. The caller appends the actual jalr.
    return rv_ld('ra', 'sp', 0) + rv_ld('fp', 'sp', 8) + rv_mov_rr('sp', 'fp')
+
+
def encode_li(_arch, row):
    """LI rd: 64-bit literal materialization; the literal follows in source."""
    return rv_lit64_prefix(row.rd)


def encode_la(_arch, row):
    """LA rd: 32-bit address literal (stage0 addresses fit in 4 bytes)."""
    return rv_lit32_prefix(row.rd)


def encode_labr(_arch, _row):
    """LA_BR: load an address literal into the dedicated branch register."""
    return rv_lit32_prefix('br')
+
+
def encode_mov(_arch, row):
    """Lower MOV rd, rs.

    Portable sp is the frame-local base, which sits 16 bytes above native
    sp (the backend's 2-word hidden header occupies the low end of each
    frame), so MOV rd, sp must materialize native_sp + 16 rather than a
    plain register copy.
    """
    if row.rs != 'sp':
        return rv_mov_rr(row.rd, row.rs)
    return rv_addi(row.rd, 'sp', 16)
+
+
def encode_rrr(_arch, row):
    """Three-register ALU op, dispatched through the RRR_BASE opcode table."""
    return rv_r_type(RRR_BASE[row.op], row.rd, row.ra, row.rb)


def encode_addi(_arch, row):
    """ADDI rd, ra, imm."""
    return rv_addi(row.rd, row.ra, row.imm)


def encode_logi(_arch, row):
    """ANDI/ORI rd, ra, imm (unknown ops raise KeyError)."""
    base = {
        'ANDI': 0x00007013,
        'ORI': 0x00006013,
    }[row.op]
    return rv_i_type(base, row.rd, row.ra, row.imm)
+
+
def encode_shifti(_arch, row):
    """Lower an immediate shift (SHLI/SHRI/SARI) via the matching encoder."""
    encoders = {'SHLI': rv_slli, 'SHRI': rv_srli, 'SARI': rv_srai}
    encoder = encoders.get(row.op)
    if encoder is None:
        raise ValueError(f'unknown shift op: {row.op}')
    return encoder(row.rd, row.ra, row.imm)
+
+
def encode_mem(_arch, row):
    """Lower LD/ST/LB/SB rt, off(rn).

    Portable sp points to the frame-local base; the 2-word hidden header
    at native_sp+0/+8 is not portable-addressable, so sp-relative offsets
    are shifted past the header.
    """
    encoders = {'LD': rv_ld, 'ST': rv_sd, 'LB': rv_lbu, 'SB': rv_sb}
    encoder = encoders.get(row.op)
    if encoder is None:
        raise ValueError(f'unknown mem op: {row.op}')
    offset = row.off
    if row.rn == 'sp':
        offset += 16
    return encoder(row.rt, row.rn, offset)
+
+
def encode_ldarg(_arch, row):
    """LDARG rd, slot: load incoming stack argument *slot* into rd."""
    # LDARG loads the saved caller sp from [sp+8] (the hidden header
    # slot), then indexes the incoming stack-arg area off it. Slot 0 is
    # at caller_sp+16 because the native call instruction does not push
    # a return address on riscv64 -- the +16 matches the aarch64 layout
    # by convention for stage0 frame uniformity.
    return rv_ld('scratch', 'sp', 8) + rv_ld(row.rd, 'scratch', 16 + 8 * row.slot)


def encode_branch_reg(_arch, row):
    """Indirect control transfer through rs: BR (jump), CALLR, TAILR."""
    if row.kind == 'BR':
        return rv_jalr('zero', row.rs, 0)
    if row.kind == 'CALLR':
        return rv_jalr('ra', row.rs, 0)
    if row.kind == 'TAILR':
        # Tail call: tear down the current frame, then jump.
        return rv_epilogue() + rv_jalr('zero', row.rs, 0)
    raise ValueError(f'unknown branch-reg kind: {row.kind}')


def encode_condb(_arch, row):
    """Conditional branch: inverted-condition skip over a jalr to br."""
    return rv_b_type_skip8(CONDB_INV_BASE[row.op], row.ra, row.rb) + rv_jalr('zero', 'br', 0)


def encode_condbz(_arch, row):
    """Compare-with-zero conditional branch, same skip-over-jalr pattern."""
    return rv_b_type_skip8(CONDBZ_INV_BASE[row.op], row.ra, 'zero') + rv_jalr('zero', 'br', 0)
+
+
def encode_enter(arch, row):
    """ENTER size: allocate an aligned frame with the 2-word hidden header.

    The frame holds saved ra at [sp+0] and the caller's sp at [sp+8],
    below `size` bytes of portable locals, rounded up to stack_align.
    """
    frame_bytes = round_up(arch.stack_align, 2 * arch.word_bytes + row.size)
    return (
        rv_addi('sp', 'sp', -frame_bytes)
        + rv_sd('ra', 'sp', 0)
        # fp temporarily holds the caller's sp (new sp + frame = old sp).
        + rv_addi('fp', 'sp', frame_bytes)
        + rv_sd('fp', 'sp', 8)
    )


def encode_nullary(_arch, row):
    """Operand-free opcodes: B, CALL, RET, ERET, TAIL, SYSCALL."""
    if row.kind == 'B':
        return rv_jalr('zero', 'br', 0)
    if row.kind == 'CALL':
        return rv_jalr('ra', 'br', 0)
    if row.kind == 'RET':
        return rv_jalr('zero', 'ra', 0)
    if row.kind == 'ERET':
        return rv_epilogue() + rv_jalr('zero', 'ra', 0)
    if row.kind == 'TAIL':
        return rv_epilogue() + rv_jalr('zero', 'br', 0)
    if row.kind == 'SYSCALL':
        # P1: a0=number, a1..a3,t0,s0,s1 = args 0..5.
        # Linux riscv64: a7=number, a0..a5 = args 0..5, return in a0.
        # SYSCALL clobbers only P1 a0; restore a1/a2/a3 after ecall.
        # The save registers are staged BEFORE a0..a3 are overwritten,
        # so the move order below is load-bearing.
        return ''.join([
            rv_mov_rr('save0', 'a1'),
            rv_mov_rr('save1', 'a2'),
            rv_mov_rr('save2', 'a3'),
            rv_mov_rr('a7', 'a0'),
            rv_mov_rr('a0', 'save0'),
            rv_mov_rr('a1', 'save1'),
            rv_mov_rr('a2', 'save2'),
            rv_mov_rr('a3', 't0'),
            rv_mov_rr('a4', 's0'),
            rv_mov_rr('a5', 's1'),
            rv_ecall(),
            rv_mov_rr('a1', 'save0'),
            rv_mov_rr('a2', 'save1'),
            rv_mov_rr('a3', 'save2'),
        ])
    raise ValueError(f'unknown nullary kind: {row.kind}')
+
+
def rv_start_stub():
    """Return the :_start stub lines appended after the DEFINE table."""
    # Backend-owned :_start stub per docs/P1.md §Program Entry. Linux
    # riscv64 puts argc at [sp] and argv starting at [sp+8]; load argc
    # into a0, compute &argv[0] into a1, call p1_main under the one-word
    # direct-result convention, then issue sys_exit. Mirrors %p1_entry
    # in p1/P1-riscv64.M1pp.
    #
    # Raw hex outside DEFINE bodies must be single-quoted so bootstrap
    # M0 treats it as a literal byte run rather than a token.
    def q(hex_bytes):
        return f"'{hex_bytes}'"
    return [
        ':_start',
        q(rv_ld('a0', 'sp', 0)),
        q(rv_addi('a1', 'sp', 8)),
        q(rv_lit32_prefix('br')),
        '&p1_main',
        q(rv_jalr('ra', 'br', 0)),
        # a7 = 93 (sys_exit); a0 already holds p1_main's return value.
        q(rv_addi('a7', 'zero', 93)),
        q(rv_ecall()),
    ]
+
+
# Row-type -> encoder dispatch table consumed by p1_gen.encode_row().
# Must stay in sync with the row shapes declared in common.py.
ENCODERS = {
    Li: encode_li,
    La: encode_la,
    LaBr: encode_labr,
    Mov: encode_mov,
    Rrr: encode_rrr,
    AddI: encode_addi,
    LogI: encode_logi,
    ShiftI: encode_shifti,
    Mem: encode_mem,
    LdArg: encode_ldarg,
    Nullary: encode_nullary,
    BranchReg: encode_branch_reg,
    CondB: encode_condb,
    CondBZ: encode_condbz,
    Enter: encode_enter,
}


# Backend descriptor picked up by p1_gen.py's explicit ARCHES dict.
ARCH = ArchDef(
    name='riscv64',
    word_bytes=8,
    stack_align=16,
    syscall_numbers=SYSCALL_NUMBERS,
    encoders=ENCODERS,
    start_stub=rv_start_stub,
)