commit ffe72f13e339e8411d89985ad324f01a6483a4e6
parent d7398ef3945e0d7f61cc44a36a2cb86a7ab47093
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 24 Apr 2026 15:54:38 -0700
Add amd64 / riscv64 P1v2 generators; explicit ARCHES wiring
Mirrors p1/gen/aarch64.py; each backend encodes the same byte
sequences as its p1/P1-<arch>.M1pp counterpart.
Drop the ARCH_REGISTRY indirection in common.py. Each arch module now
exposes ARCH = ArchDef(...); p1_gen.py builds an explicit ARCHES dict.
Diffstat:
| A | p1/gen/aarch64.py | | | 410 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | p1/gen/amd64.py | | | 608 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | p1/gen/common.py | | | 49 | +++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | p1/gen/p1_gen.py | | | 259 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | p1/gen/riscv64.py | | | 396 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
5 files changed, 1722 insertions(+), 0 deletions(-)
diff --git a/p1/gen/aarch64.py b/p1/gen/aarch64.py
@@ -0,0 +1,410 @@
+from common import (
+ AddI,
+ ArchDef,
+ BranchReg,
+ CondB,
+ CondBZ,
+ Enter,
+ La,
+ LaBr,
+ LdArg,
+ Li,
+ LogI,
+ Mem,
+ Mov,
+ Nullary,
+ Rrr,
+ ShiftI,
+ le32,
+ round_up,
+)
+
+
+# P1 register name -> AArch64 native register number. Notes:
+#  - encoding 31 is sp or xzr depending on the instruction class;
+#  - 'br' (x17) is the hidden branch-target reg and 'scratch' (x16)
+#    the per-expansion temporary;
+#  - save0..save2 (x23..x25) stage arguments during the SYSCALL shuffle;
+#  - x8 carries the Linux syscall number and backend-internal values.
+NAT = {
+    'a0': 0,
+    'a1': 1,
+    'a2': 2,
+    'a3': 3,
+    'x4': 4,
+    'x5': 5,
+    't0': 9,
+    't1': 10,
+    't2': 11,
+    's0': 19,
+    's1': 20,
+    's2': 21,
+    's3': 22,
+    'sp': 31,
+    'xzr': 31,
+    'lr': 30,
+    'br': 17,
+    'scratch': 16,
+    'x8': 8,
+    'save0': 23,
+    'save1': 24,
+    'save2': 25,
+}
+
+
+# 64-bit three-register ALU base opcodes; aa_rrr ORs in the register
+# fields (Rm<<16, Rn<<5, Rd at bit 0). SHL/SHR/SAR/DIV sit in the
+# 0x9AC0xxxx two-source data-processing family.
+RRR_BASE = {
+    'ADD': 0x8B000000,
+    'SUB': 0xCB000000,
+    'AND': 0x8A000000,
+    'OR': 0xAA000000,
+    'XOR': 0xCA000000,
+    'SHL': 0x9AC02000,
+    'SHR': 0x9AC02400,
+    'SAR': 0x9AC02800,
+    'DIV': 0x9AC00C00,
+}
+
+
+# Linux arm64 syscall numbers for the P1-visible SYS_* constants.
+SYSCALL_NUMBERS = {
+    'SYS_READ': 63,
+    'SYS_WRITE': 64,
+    'SYS_CLOSE': 57,
+    'SYS_OPENAT': 56,
+    'SYS_EXIT': 93,
+    'SYS_CLONE': 220,
+    'SYS_EXECVE': 221,
+    'SYS_WAITID': 95,
+}
+
+
+def aa_rrr(base, rd, ra, rb):
+    # OR the three register fields into a RRR_BASE-style opcode:
+    # rb -> Rm (bit 16), ra -> Rn (bit 5), rd -> Rd (bit 0).
+    d = NAT[rd]
+    a = NAT[ra]
+    b = NAT[rb]
+    return le32(base | (b << 16) | (a << 5) | d)
+
+
+def aa_add_imm(rd, ra, imm12, sub=False):
+    # ADD/SUB (immediate), 64-bit. imm12 must already be non-negative
+    # and fit 12 bits; callers negate and pass sub=True for negative
+    # deltas.
+    d = NAT[rd]
+    a = NAT[ra]
+    base = 0xD1000000 if sub else 0x91000000
+    return le32(base | ((imm12 & 0xFFF) << 10) | (a << 5) | d)
+
+
+def aa_mov_rr(dst, src):
+    # Register-register move. Encoding 31 means xzr in ORR but sp in
+    # ADD-immediate, so moves touching sp must use `add rd, rn, #0`
+    # rather than the `orr rd, xzr, rm` form below.
+    if dst == 'sp':
+        return aa_add_imm('sp', src, 0, sub=False)
+    if src == 'sp':
+        return aa_add_imm(dst, 'sp', 0, sub=False)
+    d = NAT[dst]
+    s = NAT[src]
+    return le32(0xAA000000 | (s << 16) | (31 << 5) | d)
+
+
+def aa_ubfm(rd, ra, immr, imms):
+    # UBFM, 64-bit form: underlies the LSL/LSR-immediate aliases used
+    # by encode_shifti.
+    d = NAT[rd]
+    a = NAT[ra]
+    return le32(0xD3400000 | (immr << 16) | (imms << 10) | (a << 5) | d)
+
+
+def aa_sbfm(rd, ra, immr, imms):
+    # SBFM, 64-bit form: underlies the ASR-immediate alias.
+    d = NAT[rd]
+    a = NAT[ra]
+    return le32(0x93400000 | (immr << 16) | (imms << 10) | (a << 5) | d)
+
+
+def aa_movz(rd, imm16):
+    # movz rd, #imm16 (shift 0): rd = zero-extended imm16.
+    d = NAT[rd]
+    return le32(0xD2800000 | ((imm16 & 0xFFFF) << 5) | d)
+
+
+def aa_movn(rd, imm16):
+    # movn rd, #imm16 (shift 0): rd = ~imm16.
+    d = NAT[rd]
+    return le32(0x92800000 | ((imm16 & 0xFFFF) << 5) | d)
+
+
+def aa_materialize_small_imm(rd, imm):
+    # Single-instruction constant: MOVZ covers 0..0xFFFF; MOVN covers
+    # -0x10000..-1 (movn yields ~imm16, so pass the complement).
+    if imm >= 0:
+        return aa_movz(rd, imm)
+    return aa_movn(rd, (~imm) & 0xFFFF)
+
+
+def aa_ldst_uimm12(base, rt, rn, off_bytes, size_log2):
+    # LDR/STR, unsigned scaled imm12 form: the stored offset is the
+    # byte offset divided by the access size. Caller guarantees
+    # alignment and range (see aa_mem).
+    imm12 = off_bytes >> size_log2
+    t = NAT[rt]
+    n = NAT[rn]
+    return le32(base | (imm12 << 10) | (n << 5) | t)
+
+
+def aa_ldst_unscaled(base, rt, rn, off):
+    # LDUR/STUR, unscaled signed imm9 form (-256..255).
+    imm9 = off & 0x1FF
+    t = NAT[rt]
+    n = NAT[rn]
+    return le32(base | (imm9 << 12) | (n << 5) | t)
+
+
+def aa_mem(op, rt, rn, off):
+    # Lower one load/store, picking the smallest encoding that fits:
+    # scaled unsigned imm12 first, then unscaled signed imm9, then an
+    # add/sub of the offset into 'scratch' plus a zero-offset access.
+    # The scratch fallback means rn must never itself be 'scratch'.
+    bases = {
+        # op -> (scaled-imm12 base, log2(access size), unscaled base)
+        'LD': (0xF9400000, 3, 0xF8400000),
+        'ST': (0xF9000000, 3, 0xF8000000),
+        'LB': (0x39400000, 0, 0x38400000),
+        'SB': (0x39000000, 0, 0x38000000),
+    }
+    uimm_base, size_log2, unscaled_base = bases[op]
+    scale = 1 << size_log2
+    if off >= 0 and off % scale == 0 and off < (4096 << size_log2):
+        return aa_ldst_uimm12(uimm_base, rt, rn, off, size_log2)
+    if -256 <= off <= 255:
+        return aa_ldst_unscaled(unscaled_base, rt, rn, off)
+    if -2048 <= off <= 2047:
+        if off >= 0:
+            addr = aa_add_imm('scratch', rn, off, sub=False)
+        else:
+            addr = aa_add_imm('scratch', rn, -off, sub=True)
+        return addr + aa_ldst_uimm12(uimm_base, rt, 'scratch', 0, size_log2)
+    raise ValueError(f'aarch64 offset out of range for {op}: {off}')
+
+
+def aa_cmp_skip(op, ra, rb):
+    # cmp ra, rb ; b.<inverse-cond> +8 -- the conditional branch skips
+    # the `br` the caller appends, so fall-through means the P1
+    # condition did not hold. Inverse condition codes: BEQ->NE(1),
+    # BNE->EQ(0), BLT->GE(10), BLTU->HS(2).
+    a = NAT[ra]
+    b = NAT[rb]
+    cmp_hex = le32(0xEB000000 | (b << 16) | (a << 5) | 31)
+    skip_cond = {
+        'BEQ': 1,
+        'BNE': 0,
+        'BLT': 10,
+        'BLTU': 2,
+    }[op]
+    return cmp_hex + le32(0x54000040 | skip_cond)
+
+
+def aa_br(reg):
+    # br <reg> -- indirect jump.
+    return le32(0xD61F0000 | (NAT[reg] << 5))
+
+
+def aa_blr(reg):
+    # blr <reg> -- indirect call; retaddr goes to lr.
+    return le32(0xD63F0000 | (NAT[reg] << 5))
+
+
+def aa_ret():
+    # ret (returns through lr).
+    return le32(0xD65F03C0)
+
+
+def aa_epilogue():
+    # Frame teardown, shared by ERET, TAIL, TAILR. Loads lr and the
+    # saved caller sp from the hidden header at native_sp+0/+8, then
+    # unwinds sp. Does NOT transfer control; the caller appends an
+    # aa_ret / aa_br as appropriate. (The caller sp is staged through
+    # x8 because a load cannot write sp directly.)
+    return (
+        aa_mem('LD', 'lr', 'sp', 0)
+        + aa_mem('LD', 'x8', 'sp', 8)
+        + aa_mov_rr('sp', 'x8')
+    )
+
+
+def aa_lit64_prefix(rd):
+    ## 64-bit literal-pool prefix for LI: ldr xN, [pc,#8]; b PC+12.
+    ## The 8 bytes that follow in source become the literal; b skips them.
+    d = NAT[rd]
+    ldr_lit = 0x58000040 | d
+    b_plus12 = 0x14000003
+    return le32(ldr_lit) + le32(b_plus12)
+
+
+def aa_lit32_prefix(rd):
+    ## 32-bit literal-pool prefix for LA / LA_BR: ldr wN, [pc,#8]; b PC+8.
+    ## ldr w zero-extends into the full 64-bit register, so a 4-byte literal
+    ## is enough for any address in the stage0 layout (base 0x00600000,
+    ## programs well under 4 GB). This lets source use `&label` directly
+    ## without padding to 8 bytes.
+    d = NAT[rd]
+    ldr_lit = 0x18000040 | d
+    b_plus8 = 0x14000002
+    return le32(ldr_lit) + le32(b_plus8)
+
+
+def encode_li(_arch, row):
+    # LI rd, imm64 -- prefix only; the 8-byte literal follows in source
+    # (see aa_lit64_prefix).
+    return aa_lit64_prefix(row.rd)
+
+
+def encode_la(_arch, row):
+    # LA rd, &label -- a 4-byte address literal follows in source.
+    return aa_lit32_prefix(row.rd)
+
+
+def encode_labr(_arch, _row):
+    # LA_BR &label -- load the hidden branch-target reg.
+    return aa_lit32_prefix('br')
+
+
+def encode_mov(_arch, row):
+    # Portable `sp` is the frame-local base, which is 16 bytes above
+    # native sp (the backend's 2-word hidden header sits at the low end
+    # of each frame allocation). So reading sp into a register yields
+    # native_sp + 16, not native_sp itself.
+    if row.rs == 'sp':
+        return aa_add_imm(row.rd, 'sp', 16, sub=False)
+    return aa_mov_rr(row.rd, row.rs)
+
+
+def encode_rrr(_arch, row):
+    if row.op == 'MUL':
+        # madd rd, ra, rb, xzr (the 31<<10 Ra field is xzr).
+        d = NAT[row.rd]
+        a = NAT[row.ra]
+        b = NAT[row.rb]
+        return le32(0x9B000000 | (b << 16) | (31 << 10) | (a << 5) | d)
+    if row.op == 'REM':
+        # rem = ra - (ra / rb) * rb:
+        #   sdiv scratch, ra, rb ; msub rd, scratch, rb, ra
+        d = NAT[row.rd]
+        a = NAT[row.ra]
+        b = NAT[row.rb]
+        sc = NAT['scratch']
+        sdiv = 0x9AC00C00 | (b << 16) | (a << 5) | sc
+        msub = 0x9B008000 | (b << 16) | (a << 10) | (sc << 5) | d
+        return le32(sdiv) + le32(msub)
+    return aa_rrr(RRR_BASE[row.op], row.rd, row.ra, row.rb)
+
+
+def encode_addi(_arch, row):
+    # ADDI with a negative imm becomes the SUB-immediate form.
+    if row.imm >= 0:
+        return aa_add_imm(row.rd, row.ra, row.imm, sub=False)
+    return aa_add_imm(row.rd, row.ra, -row.imm, sub=True)
+
+
+def encode_logi(_arch, row):
+    # Materialize imm into scratch (one MOVZ/MOVN), then the reg-reg
+    # AND / ORR.
+    seq = aa_materialize_small_imm('scratch', row.imm)
+    base = {
+        'ANDI': 0x8A000000,
+        'ORI': 0xAA000000,
+    }[row.op]
+    return seq + aa_rrr(base, row.rd, row.ra, 'scratch')
+
+
+def encode_shifti(_arch, row):
+    # Shift-immediate via the bitfield-move aliases:
+    # SHLI -> lsl (ubfm), SHRI -> lsr (ubfm), SARI -> asr (sbfm).
+    if row.op == 'SHLI':
+        return aa_ubfm(row.rd, row.ra, (-row.imm) & 63, 63 - row.imm)
+    if row.op == 'SHRI':
+        return aa_ubfm(row.rd, row.ra, row.imm, 63)
+    return aa_sbfm(row.rd, row.ra, row.imm, 63)
+
+
+def encode_mem(_arch, row):
+    # Portable sp points to the frame-local base; the 2-word hidden
+    # header sits at native_sp+0/+8 and is not portable-addressable.
+    # Shift sp-relative offsets past the header.
+    off = row.off + 16 if row.rn == 'sp' else row.off
+    return aa_mem(row.op, row.rt, row.rn, off)
+
+
+def encode_ldarg(_arch, row):
+    # LDARG rd, slot -- native offsets, deliberately bypassing the +16
+    # sp translation above: [sp+8] holds the saved caller sp, and
+    # incoming stack args start 16 bytes past it, 8 bytes per slot.
+    return aa_mem('LD', 'scratch', 'sp', 8) + aa_mem('LD', row.rd, 'scratch', 16 + 8 * row.slot)
+
+
+def encode_branch_reg(_arch, row):
+    # BR = br, CALLR = blr, TAILR = frame teardown + br.
+    if row.kind == 'BR':
+        return aa_br(row.rs)
+    if row.kind == 'CALLR':
+        return aa_blr(row.rs)
+    if row.kind == 'TAILR':
+        return aa_epilogue() + aa_br(row.rs)
+    raise ValueError(f'unknown branch-reg kind: {row.kind}')
+
+
+def encode_condb(_arch, row):
+    # cmp + b.<inverse> skip pair, then the actual `br` to the target
+    # held in the hidden branch reg.
+    return aa_cmp_skip(row.op, row.ra, row.rb) + aa_br('br')
+
+
+def encode_condbz(_arch, row):
+    # Compare-against-zero branches. 0xB5/0xB4 are the cbnz/cbz bases;
+    # imm19=2 (the 2<<5) skips the single `br` instruction, so the br
+    # runs only when the P1 condition holds. BLTZ falls back to
+    # cmp-with-xzr + b.ge-skip.
+    a = NAT[row.ra]
+    br_hex = aa_br('br')
+    if row.op == 'BEQZ':
+        return le32(0xB5000000 | (2 << 5) | a) + br_hex
+    if row.op == 'BNEZ':
+        return le32(0xB4000000 | (2 << 5) | a) + br_hex
+    cmp_zero = le32(0xEB1F001F | (a << 5))
+    bge = le32(0x54000040 | 10)
+    return cmp_zero + bge + br_hex
+
+
+def encode_enter(arch, row):
+    # Standard frame: [sp+0]=lr, [sp+8]=caller sp, [sp+16..]=portable
+    # locals; total rounded up to stack_align. x8 recomputes the
+    # caller sp (sp before the subtract) for the hidden header.
+    frame_bytes = round_up(arch.stack_align, 2 * arch.word_bytes + row.size)
+    return (
+        aa_add_imm('sp', 'sp', frame_bytes, sub=True)
+        + aa_mem('ST', 'lr', 'sp', 0)
+        + aa_add_imm('x8', 'sp', frame_bytes, sub=False)
+        + aa_mem('ST', 'x8', 'sp', 8)
+    )
+
+
+def encode_nullary(_arch, row):
+    # No-operand control ops; all indirect flow goes through the hidden
+    # branch reg loaded by LA_BR.
+    if row.kind == 'B':
+        return aa_br('br')
+    if row.kind == 'CALL':
+        return aa_blr('br')
+    if row.kind == 'RET':
+        return aa_ret()
+    if row.kind == 'ERET':
+        return aa_epilogue() + aa_ret()
+    if row.kind == 'TAIL':
+        return aa_epilogue() + aa_br('br')
+    if row.kind == 'SYSCALL':
+        # P1: a0=num, a1..a3,t0,s0,s1 = args 0..5. Linux arm64: x8=num,
+        # x0..x5 = args, result in x0 (== P1 a0). The args shift down
+        # one native slot (a1 -> x0, ...), so a1..a3 are staged through
+        # save0..save2 and restored after the svc (0xD4000001 = svc #0);
+        # only a0 comes back clobbered, holding the syscall result.
+        return ''.join([
+            aa_mov_rr('x8', 'a0'),
+            aa_mov_rr('save0', 'a1'),
+            aa_mov_rr('save1', 'a2'),
+            aa_mov_rr('save2', 'a3'),
+            aa_mov_rr('a0', 'save0'),
+            aa_mov_rr('a1', 'save1'),
+            aa_mov_rr('a2', 'save2'),
+            aa_mov_rr('a3', 't0'),
+            aa_mov_rr('x4', 's0'),
+            aa_mov_rr('x5', 's1'),
+            le32(0xD4000001),
+            aa_mov_rr('a1', 'save0'),
+            aa_mov_rr('a2', 'save1'),
+            aa_mov_rr('a3', 'save2'),
+        ])
+    raise ValueError(f'unknown nullary kind: {row.kind}')
+
+
+def aa_start_stub():
+    # Backend-owned :_start stub per docs/P1.md §Program Entry. Captures
+    # argc from [sp] and argv pointer from sp+8, calls p1_main under the
+    # one-word direct-result convention (a0=argc, a1=argv), then issues a
+    # native Linux sys_exit with p1_main's return value. Mirrors the
+    # m1pp-path stub in p1/P1-aarch64.M1pp (`%p1_entry`).
+    #
+    # Raw hex outside `DEFINE` bodies must be single-quoted so bootstrap
+    # M0 treats it as a literal byte run rather than a token.
+    def q(hex_bytes):
+        # Quote one hex run for the emitted M1 source.
+        return f"'{hex_bytes}'"
+    return [
+        ':_start',
+        q(aa_mem('LD', 'a0', 'sp', 0)),
+        q(aa_add_imm('a1', 'sp', 8, sub=False)),
+        q(aa_lit32_prefix('br')),
+        '&p1_main',
+        q(aa_blr('br')),
+        q(aa_movz('x8', 93)),
+        q(le32(0xD4000001)),
+    ]
+
+
+# Row-type -> encoder dispatch table, keyed by the common.py row types.
+ENCODERS = {
+    Li: encode_li,
+    La: encode_la,
+    LaBr: encode_labr,
+    Mov: encode_mov,
+    Rrr: encode_rrr,
+    AddI: encode_addi,
+    LogI: encode_logi,
+    ShiftI: encode_shifti,
+    Mem: encode_mem,
+    LdArg: encode_ldarg,
+    Nullary: encode_nullary,
+    BranchReg: encode_branch_reg,
+    CondB: encode_condb,
+    CondBZ: encode_condbz,
+    Enter: encode_enter,
+}
+
+
+# The module's single export; p1_gen.py wires it into its explicit
+# ARCHES dict.
+ARCH = ArchDef(
+    name='aarch64',
+    word_bytes=8,
+    stack_align=16,
+    syscall_numbers=SYSCALL_NUMBERS,
+    encoders=ENCODERS,
+    start_stub=aa_start_stub,
+)
diff --git a/p1/gen/amd64.py b/p1/gen/amd64.py
@@ -0,0 +1,608 @@
+from common import (
+ AddI,
+ ArchDef,
+ BranchReg,
+ CondB,
+ CondBZ,
+ Enter,
+ La,
+ LaBr,
+ LdArg,
+ Li,
+ LogI,
+ Mem,
+ Mov,
+ Nullary,
+ Rrr,
+ ShiftI,
+ byte,
+ le32,
+ round_up,
+)
+
+
+# ---- Native register numbers --------------------------------------------
+#
+# Backend-private mapping from P1 register names to native amd64 regnums.
+# `br` is the hidden branch-target reg (r15). `scratch` is the per-expansion
+# scratch reg (r9). rax/rbp are also used internally (retaddr spill, rcx /
+# rdx save slots) and are not P1-visible.
+
+# (The native rax..r15 aliases are included alongside the P1 names so
+# the lowering helpers can address internal staging regs directly.)
+NAT = {
+    'a0': 7,   # rdi
+    'a1': 6,   # rsi
+    'a2': 2,   # rdx
+    'a3': 1,   # rcx
+    't0': 10,  # r10
+    't1': 11,  # r11
+    't2': 8,   # r8
+    's0': 3,   # rbx
+    's1': 12,  # r12
+    's2': 13,  # r13
+    's3': 14,  # r14
+    'sp': 4,   # rsp
+    'br': 15,  # r15
+    'scratch': 9,  # r9
+    'rax': 0,
+    'rcx': 1,
+    'rdx': 2,
+    'rbx': 3,
+    'rsp': 4,
+    'rbp': 5,
+    'rsi': 6,
+    'rdi': 7,
+    'r8': 8,
+    'r9': 9,
+    'r10': 10,
+    'r11': 11,
+    'r12': 12,
+    'r13': 13,
+    'r14': 14,
+    'r15': 15,
+}
+
+
+# Linux x86-64 syscall numbers for the P1-visible SYS_* constants.
+SYSCALL_NUMBERS = {
+    'SYS_READ': 0,
+    'SYS_WRITE': 1,
+    'SYS_CLOSE': 3,
+    'SYS_OPENAT': 257,
+    'SYS_EXIT': 60,
+    'SYS_CLONE': 56,
+    'SYS_EXECVE': 59,
+    'SYS_WAITID': 247,
+}
+
+
+# ---- REX / ModRM helpers ------------------------------------------------
+
+def amd_rex_b_short(r):
+    # Optional one-byte REX.B (no W) prefix used by push/pop/jmp r/call r/
+    # mov r,imm32 when the target reg is r8-r15. Returns '' for low regs.
+    if NAT[r] >= 8:
+        return byte(0x41)
+    return ''
+
+
+def amd_rex_wb(r):
+    # REX.W=1, B=(r>>3) to extend ModRM.rm / SIB.base.
+    return byte(0x48 | ((NAT[r] >> 3) & 1))
+
+
+def amd_rex_wrb(rg, rm):
+    # REX.W=1, R=(rg>>3), B=(rm>>3). Used whenever a ModRM.reg field is in
+    # use together with a ModRM.rm field.
+    return byte(0x48 | (((NAT[rg] >> 3) & 1) << 2) | ((NAT[rm] >> 3) & 1))
+
+
+def amd_modrm_rr(rg, rm):
+    # Register-direct ModRM (mod=11): rg in the reg field, rm in rm.
+    return byte(0xC0 | ((NAT[rg] & 7) << 3) | (NAT[rm] & 7))
+
+
+def amd_modrm_ext_r(ext, rm):
+    # Register-direct ModRM with an opcode extension /ext in the reg field.
+    return byte(0xC0 | ((ext & 7) << 3) | (NAT[rm] & 7))
+
+
+# ---- Memory-addressing ModRM (+ SIB + disp) ----------------------------
+#
+# [base + disp] with `reg` in ModRM.reg. Bases whose low 3 bits are 100 -
+# rsp and r12 - must go through a SIB byte; all others use the plain
+# encoding. disp selects mod=1 (disp8) when it fits in [-128,127], else
+# mod=2 (disp32). We never emit mod=0 / no-disp; the extra byte is fine.
+
+def amd_modrm_disp(reg, base, disp):
+    use_sib = (NAT[base] & 7) == 4
+    use_disp8 = -128 <= disp <= 127
+    reg_lo = NAT[reg] & 7
+    if use_sib:
+        # SIB byte 0x24: scale=1, no index, base = (REX.B:)100.
+        if use_disp8:
+            return byte(0x44 | (reg_lo << 3)) + byte(0x24) + byte(disp)
+        return byte(0x84 | (reg_lo << 3)) + byte(0x24) + le32(disp)
+    base_lo = NAT[base] & 7
+    if use_disp8:
+        return byte(0x40 | (reg_lo << 3) | base_lo) + byte(disp)
+    return byte(0x80 | (reg_lo << 3) | base_lo) + le32(disp)
+
+
+# ---- Register / arithmetic primitives ----------------------------------
+
+def amd_mov_rr(dst, src):
+    # mov dst, src -- REX.WRB 89 /r (source in ModRM.reg, dest in rm)
+    return amd_rex_wrb(src, dst) + byte(0x89) + amd_modrm_rr(src, dst)
+
+
+def amd_alu_rr(opcode, dst, src):
+    # ADD/SUB/AND/OR/XOR dst, src -- REX.WRB <op> /r (src in reg, dst in rm)
+    return amd_rex_wrb(src, dst) + byte(opcode) + amd_modrm_rr(src, dst)
+
+
+def amd_alu_ri8(ext, dst, imm):
+    # op dst, imm8 -- REX.WB 83 /ext ib. The CPU sign-extends imm8 to
+    # 64 bits; callers keep imm in [-128, 127].
+    return amd_rex_wb(dst) + byte(0x83) + amd_modrm_ext_r(ext, dst) + byte(imm)
+
+
+def amd_alu_ri32(ext, dst, imm):
+    # op dst, imm32 -- REX.WB 81 /ext id
+    return amd_rex_wb(dst) + byte(0x81) + amd_modrm_ext_r(ext, dst) + le32(imm)
+
+
+def amd_shift_ri8(ext, dst, imm):
+    # shift dst, imm8 -- REX.WB C1 /ext ib (SHL=4, SHR=5, SAR=7);
+    # the count is masked to 0..63 to match the hardware behavior.
+    return (amd_rex_wb(dst) + byte(0xC1) + amd_modrm_ext_r(ext, dst)
+            + byte(imm & 0x3F))
+
+
+def amd_shift_cl(ext, dst):
+    # shift dst, cl -- REX.WB D3 /ext
+    return amd_rex_wb(dst) + byte(0xD3) + amd_modrm_ext_r(ext, dst)
+
+
+def amd_imul_rr(dst, src):
+    # imul dst, src -- REX.WRB 0F AF /r (dst in reg, src in rm)
+    return (amd_rex_wrb(dst, src) + byte(0x0F) + byte(0xAF)
+            + amd_modrm_rr(dst, src))
+
+
+def amd_idiv_r(src):
+    # idiv src -- REX.WB F7 /7
+    return amd_rex_wb(src) + byte(0xF7) + amd_modrm_ext_r(7, src)
+
+
+def amd_cqo():
+    # cqo -- 48 99 (sign-extend rax into rdx:rax)
+    return byte(0x48) + byte(0x99)
+
+
+def amd_push(r):
+    # push r64 -- [REX.B] 50+r
+    return amd_rex_b_short(r) + byte(0x50 | (NAT[r] & 7))
+
+
+def amd_pop(r):
+    # pop r64 -- [REX.B] 58+r
+    return amd_rex_b_short(r) + byte(0x58 | (NAT[r] & 7))
+
+
+def amd_mov_imm32_prefix(rd):
+    # mov r32, imm32 -- [REX.B] B8+r (caller appends 4-byte literal).
+    # Result is zero-extended into the full 64-bit register.
+    return amd_rex_b_short(rd) + byte(0xB8 | (NAT[rd] & 7))
+
+
+def amd_mov_imm64_prefix(rd):
+    # mov r64, imm64 -- REX.W[.B] B8+r (caller appends 8-byte literal).
+    return amd_rex_wb(rd) + byte(0xB8 | (NAT[rd] & 7))
+
+
+# ---- Memory ops --------------------------------------------------------
+
+def amd_mem_LD(rt, rn, off):
+    # mov rT, [rN + off] -- REX.WRB 8B /r modrm-with-disp
+    return amd_rex_wrb(rt, rn) + byte(0x8B) + amd_modrm_disp(rt, rn, off)
+
+
+def amd_mem_ST(rt, rn, off):
+    # mov [rN + off], rT -- REX.WRB 89 /r
+    return amd_rex_wrb(rt, rn) + byte(0x89) + amd_modrm_disp(rt, rn, off)
+
+
+def amd_mem_SB(rt, rn, off):
+    # mov [rN + off], rT8 -- REX.WRB 88 /r (REX.W forces dil/sil/bpl/spl
+    # byte-view encoding when the low byte of those regs is needed).
+    return amd_rex_wrb(rt, rn) + byte(0x88) + amd_modrm_disp(rt, rn, off)
+
+
+def amd_mem_LB(rt, rn, off):
+    # movzx rT, byte ptr [rN + off] -- REX.WRB 0F B6 /r
+    return (amd_rex_wrb(rt, rn) + byte(0x0F) + byte(0xB6)
+            + amd_modrm_disp(rt, rn, off))
+
+
+# ---- Control-flow primitives -------------------------------------------
+
+def amd_jmp_r(r):
+    # jmp r/m64 -- [REX.B] FF /4. 2 bytes for low regs, 3 bytes for r8-r15.
+    return amd_rex_b_short(r) + byte(0xFF) + byte(0xE0 | (NAT[r] & 7))
+
+
+def amd_call_r(r):
+    # call r/m64 -- [REX.B] FF /2.
+    return amd_rex_b_short(r) + byte(0xFF) + byte(0xD0 | (NAT[r] & 7))
+
+
+def amd_ret():
+    # ret (near) -- C3
+    return byte(0xC3)
+
+
+def amd_syscall():
+    # syscall -- 0F 05
+    return byte(0x0F) + byte(0x05)
+
+
+def amd_cmp_rr(ra, rb):
+    # cmp rA, rB -- REX.WRB 39 /r (rB in reg, rA in rm).
+    return amd_rex_wrb(rb, ra) + byte(0x39) + amd_modrm_rr(rb, ra)
+
+
+def amd_test_rr(ra, rb):
+    # test rA, rB -- REX.WRB 85 /r; encode_condbz calls this with
+    # ra == rb as a zero test.
+    return amd_rex_wrb(rb, ra) + byte(0x85) + amd_modrm_rr(rb, ra)
+
+
+# ---- P1 register-register op lowering ----------------------------------
+#
+# For ADD/SUB/AND/OR/XOR we honor rD==rB aliasing -- the naive
+# `mov rD,rA ; op rD,rB` would clobber rB before the op reads it. Route rB
+# through the scratch reg when that aliasing shows up.
+
+# r/m64, r64 forms of the five simple ALU ops (src in ModRM.reg).
+ALU_OPCODE = {
+    'ADD': 0x01,
+    'SUB': 0x29,
+    'AND': 0x21,
+    'OR': 0x09,
+    'XOR': 0x31,
+}
+
+
+def amd_rrr_simple(opcode, rd, ra, rb):
+    # rd = ra <op> rb, with rd==rb aliasing routed through scratch.
+    if NAT[rd] == NAT[rb]:
+        return (amd_mov_rr('scratch', rb)
+                + amd_mov_rr(rd, ra)
+                + amd_alu_rr(opcode, rd, 'scratch'))
+    return amd_mov_rr(rd, ra) + amd_alu_rr(opcode, rd, rb)
+
+
+def amd_rrr_MUL(rd, ra, rb):
+    # Same aliasing plan as amd_rrr_simple, via imul's 0F AF form.
+    if NAT[rd] == NAT[rb]:
+        return (amd_mov_rr('scratch', rb)
+                + amd_mov_rr(rd, ra)
+                + amd_imul_rr(rd, 'scratch'))
+    return amd_mov_rr(rd, ra) + amd_imul_rr(rd, rb)
+
+
+# DIV / REM clobber rax and rdx natively. rax is not a P1 register, so we
+# clobber it freely; rdx IS P1 a2, so we stash it to rbp (also outside the
+# P1 mapping) for the lifetime of the op. Aliasing-safety plan mirrors the
+# M1pp comments verbatim.
+
+def amd_rrr_DIV(rd, ra, rb):
+    # rdx is saved before anything is clobbered, so ra/rb may
+    # themselves be rdx (a2); rd is written last, so rd == a2
+    # deliberately overrides the rdx restore.
+    return ''.join([
+        amd_mov_rr('rbp', 'rdx'),
+        amd_mov_rr('scratch', rb),
+        amd_mov_rr('rax', ra),
+        amd_cqo(),
+        amd_idiv_r('scratch'),
+        amd_mov_rr('rdx', 'rbp'),
+        amd_mov_rr(rd, 'rax'),
+    ])
+
+
+def amd_rrr_REM(rd, ra, rb):
+    # As DIV, but the remainder (rdx) is parked in rax before rdx is
+    # restored, then copied to rd.
+    return ''.join([
+        amd_mov_rr('rbp', 'rdx'),
+        amd_mov_rr('scratch', rb),
+        amd_mov_rr('rax', ra),
+        amd_cqo(),
+        amd_idiv_r('scratch'),
+        amd_mov_rr('rax', 'rdx'),
+        amd_mov_rr('rdx', 'rbp'),
+        amd_mov_rr(rd, 'rax'),
+    ])
+
+
+# SHL / SHR / SAR with reg count. x86 reads the count from CL only, so
+# staging goes through rcx -- which IS P1 a3. Save rcx to rbp for the
+# duration. Ordering matches the M1pp comments.
+
+def amd_rrr_shift(ext, rd, ra, rb):
+    # The value is shifted in scratch so restoring rcx cannot clobber
+    # the result; rd is written last (rd == a3 overrides the restore).
+    return ''.join([
+        amd_mov_rr('rbp', 'rcx'),
+        amd_mov_rr('scratch', ra),
+        amd_mov_rr('rcx', rb),
+        amd_shift_cl(ext, 'scratch'),
+        amd_mov_rr('rcx', 'rbp'),
+        amd_mov_rr(rd, 'scratch'),
+    ])
+
+
+# ---- Encoders ----------------------------------------------------------
+
+def encode_li(_arch, row):
+    # LI rd, imm64 -- prefix only; the 8-byte literal follows in source.
+    return amd_mov_imm64_prefix(row.rd)
+
+
+def encode_la(_arch, row):
+    # LA rd, &label -- 4-byte literal follows (zero-extended by mov r32).
+    return amd_mov_imm32_prefix(row.rd)
+
+
+def encode_labr(_arch, _row):
+    # LA_BR &label -- load the hidden branch-target reg (r15).
+    return amd_mov_imm32_prefix('br')
+
+
+def encode_mov(_arch, row):
+    # Portable sp is the frame-local base, which is 16 bytes above native
+    # rsp. Reading sp into a register yields native_rsp + 16, so emit
+    # `mov rd, rsp ; add rd, 16` for the sp-source case.
+    if row.rs == 'sp':
+        return amd_mov_rr(row.rd, 'sp') + amd_alu_ri8(0, row.rd, 16)
+    return amd_mov_rr(row.rd, row.rs)
+
+
+def encode_rrr(_arch, row):
+    # Dispatch to the aliasing-safe lowering helpers above.
+    if row.op == 'MUL':
+        return amd_rrr_MUL(row.rd, row.ra, row.rb)
+    if row.op == 'DIV':
+        return amd_rrr_DIV(row.rd, row.ra, row.rb)
+    if row.op == 'REM':
+        return amd_rrr_REM(row.rd, row.ra, row.rb)
+    if row.op == 'SHL':
+        return amd_rrr_shift(4, row.rd, row.ra, row.rb)
+    if row.op == 'SHR':
+        return amd_rrr_shift(5, row.rd, row.ra, row.rb)
+    if row.op == 'SAR':
+        return amd_rrr_shift(7, row.rd, row.ra, row.rb)
+    return amd_rrr_simple(ALU_OPCODE[row.op], row.rd, row.ra, row.rb)
+
+
+def encode_addi(_arch, row):
+    # mov + add (ext 0). imm8 sign-extension is exactly what ADD wants,
+    # so the short form is safe for the whole [-128, 127] range.
+    head = amd_mov_rr(row.rd, row.ra)
+    if -128 <= row.imm <= 127:
+        return head + amd_alu_ri8(0, row.rd, row.imm)
+    return head + amd_alu_ri32(0, row.rd, row.imm)
+
+
+# AND/OR with imm: 83 /ext ib sign-extends imm8 to 64 bits. That works for
+# imm in [-128, 127] (and for -1 as a convenient all-ones mask), but breaks
+# for positive imms >= 128 -- ANDI with 255 would become AND with
+# 0xFFFFFFFFFFFFFFFF. Widen to the imm32 form when imm8 would misencode.
+LOGI_EXT = {
+    'ANDI': 4,
+    'ORI': 1,
+}
+
+
+def encode_logi(_arch, row):
+    head = amd_mov_rr(row.rd, row.ra)
+    ext = LOGI_EXT[row.op]
+    if -128 <= row.imm <= 127:
+        return head + amd_alu_ri8(ext, row.rd, row.imm)
+    return head + amd_alu_ri32(ext, row.rd, row.imm)
+
+
+# C1 /ext opcode extensions for the shift-immediate forms.
+SHIFTI_EXT = {
+    'SHLI': 4,
+    'SHRI': 5,
+    'SARI': 7,
+}
+
+
+def encode_shifti(_arch, row):
+    # mov + C1 /ext imm8 (count taken mod 64 by amd_shift_ri8).
+    return (amd_mov_rr(row.rd, row.ra)
+            + amd_shift_ri8(SHIFTI_EXT[row.op], row.rd, row.imm))
+
+
+def encode_mem(_arch, row):
+    # Portable sp points to the frame-local base; the 16-byte hidden frame
+    # header sits at native_rsp+0..15 and is not portable-addressable.
+    # Shift sp-relative offsets past the header.
+    off = row.off + 16 if row.rn == 'sp' else row.off
+    if row.op == 'LD':
+        return amd_mem_LD(row.rt, row.rn, off)
+    if row.op == 'ST':
+        return amd_mem_ST(row.rt, row.rn, off)
+    if row.op == 'LB':
+        return amd_mem_LB(row.rt, row.rn, off)
+    if row.op == 'SB':
+        return amd_mem_SB(row.rt, row.rn, off)
+    raise ValueError(f'unknown mem op: {row.op}')
+
+
+def encode_ldarg(_arch, row):
+    # Internal callers bypass the +16 sp-base translation: native rsp+8
+    # holds the saved caller-sp pointer set up by p1_enter, and the first
+    # incoming stack-arg word lives 16 bytes past that.
+    return (amd_mem_LD('scratch', 'sp', 8)
+            + amd_mem_LD(row.rd, 'scratch', 16 + 8 * row.slot))
+
+
+def amd_epilogue_prefix():
+    # Frame-teardown prefix shared by ERET, TAIL, TAILR. Loads retaddr into
+    # scratch (r9), saved caller sp into rax, unwinds rsp, then re-pushes
+    # retaddr so a trailing `ret` or `jmp` finds the right top-of-stack
+    # layout. (For TAIL/TAILR the trailing op is a jmp, but the retaddr
+    # still needs to be back on the stack so the eventual callee `ret`
+    # returns to the original caller.)
+    return ''.join([
+        amd_mem_LD('scratch', 'sp', 0),
+        amd_mem_LD('rax', 'sp', 8),
+        amd_mov_rr('sp', 'rax'),
+        amd_push('scratch'),
+    ])
+
+
+def encode_branch_reg(_arch, row):
+    # BR = jmp, CALLR = call, TAILR = frame teardown + jmp.
+    if row.kind == 'BR':
+        return amd_jmp_r(row.rs)
+    if row.kind == 'CALLR':
+        return amd_call_r(row.rs)
+    if row.kind == 'TAILR':
+        return amd_epilogue_prefix() + amd_jmp_r(row.rs)
+    raise ValueError(f'unknown branch-reg kind: {row.kind}')
+
+
+# Conditional-branch lowering:
+# cmp / test
+# Jcc_inverse +3 -- skip the 3-byte `jmp r15`
+# jmp r15 -- P1 branch-taken path
+#
+# Invert codes: BEQ->JNE(75), BNE->JE(74), BLT->JGE(7D), BLTU->JAE(73),
+# BLTZ->JGE(7D), BEQZ->JNE(75), BNEZ->JE(74). The 0x03 rel8 skips
+# `amd_jmp_r(br)` which is 3 bytes (REX.B 41 + FF + E7).
+CONDB_INVERT = {
+    'BEQ': 0x75,  # JNE
+    'BNE': 0x74,  # JE
+    'BLT': 0x7D,  # JGE
+    'BLTU': 0x73,  # JAE
+}
+
+CONDBZ_INVERT = {
+    'BEQZ': 0x75,  # JNE
+    'BNEZ': 0x74,  # JE
+    'BLTZ': 0x7D,  # JGE
+}
+
+
+def encode_condb(_arch, row):
+    # cmp ; Jcc-inverse +3 ; jmp r15 -- see the invert-table comment
+    # above. The rel8 of 3 is only valid because br is r15 (REX.B form).
+    return (amd_cmp_rr(row.ra, row.rb)
+            + byte(CONDB_INVERT[row.op]) + byte(0x03)
+            + amd_jmp_r('br'))
+
+
+def encode_condbz(_arch, row):
+    # test ra, ra sets ZF/SF for the zero comparisons; same
+    # skip-then-jmp shape as encode_condb.
+    return (amd_test_rr(row.ra, row.ra)
+            + byte(CONDBZ_INVERT[row.op]) + byte(0x03)
+            + amd_jmp_r('br'))
+
+
+def encode_enter(arch, row):
+    # CALL on amd64 pushed the retaddr, so on entry:
+    #   rsp = caller_sp - 8
+    #   [rsp] = retaddr
+    #
+    # Standard frame after ENTER:
+    #   [sp + 0] = retaddr
+    #   [sp + 8] = saved caller_sp
+    #   [sp + 16 .. 16 + size - 1] = portable locals
+    #   total frame = round_up(stack_align, 16 + size)
+    frame_bytes = round_up(arch.stack_align, 2 * arch.word_bytes + row.size)
+    return ''.join([
+        amd_pop('scratch'),
+        amd_mov_rr('rax', 'sp'),
+        amd_alu_ri32(5, 'sp', frame_bytes),  # sub rsp, frame (ext 5 = SUB)
+        amd_mem_ST('scratch', 'sp', 0),
+        amd_mem_ST('rax', 'sp', 8),
+    ])
+
+
+def encode_nullary(_arch, row):
+    # No-operand ops; indirect control flow targets the hidden br reg.
+    if row.kind == 'B':
+        return amd_jmp_r('br')
+    if row.kind == 'CALL':
+        return amd_call_r('br')
+    if row.kind == 'RET':
+        return amd_ret()
+    if row.kind == 'ERET':
+        return amd_epilogue_prefix() + amd_ret()
+    if row.kind == 'TAIL':
+        return amd_epilogue_prefix() + amd_jmp_r('br')
+    if row.kind == 'SYSCALL':
+        # P1: a0=num, a1..a3,t0,s0,s1 = args 0..5. Linux amd64: rax=num,
+        # rdi/rsi/rdx/r10/r8/r9 = args 0..5, return in rax; syscall also
+        # clobbers rcx and r11.
+        #
+        # Push the P1 registers whose native slots get overwritten or
+        # syscall-clobbered -- rsi (a1), rdx (a2), rcx (a3), r11 (t1),
+        # r8 (t2) -- then shuffle into the native arg slots, issue
+        # syscall, restore, and move the return value (rax) into a0
+        # (rdi). Stack offsets after the 5 pushes: [rsp+0]=r8,
+        # [rsp+8]=r11, [rsp+16]=rcx (a3), [rsp+24]=rdx (a2),
+        # [rsp+32]=rsi (a1).
+        #
+        # arg3 (P1 t0) already sits in its native slot r10, so it needs
+        # no move at all.
+        return ''.join([
+            amd_push('rsi'),
+            amd_push('rdx'),
+            amd_push('rcx'),
+            amd_push('r11'),
+            amd_push('r8'),
+            amd_mov_rr('rax', 'rdi'),
+            amd_mem_LD('rdi', 'sp', 32),
+            amd_mem_LD('rsi', 'sp', 24),
+            amd_mem_LD('rdx', 'sp', 16),
+            amd_mov_rr('r8', 'rbx'),
+            amd_mov_rr('r9', 'r12'),
+            amd_syscall(),
+            amd_pop('r8'),
+            amd_pop('r11'),
+            amd_pop('rcx'),
+            amd_pop('rdx'),
+            amd_pop('rsi'),
+            amd_mov_rr('rdi', 'rax'),
+        ])
+    raise ValueError(f'unknown nullary kind: {row.kind}')
+
+
+def amd_start_stub():
+    # Backend-owned :_start stub per docs/P1.md §Program Entry. Linux amd64
+    # puts argc at [rsp] and argv starting at [rsp+8]. Load argc into a0
+    # (rdi), compute &argv[0] into a1 (rsi), call p1_main under the
+    # one-word direct-result convention, then issue sys_exit with
+    # p1_main's return value in a0 (== rdi). Mirrors the `%p1_entry`
+    # macro in p1/P1-amd64.M1pp.
+    #
+    # Raw hex outside DEFINE bodies must be single-quoted so bootstrap M0
+    # treats it as a literal byte run. The bootstrap amd64 M0 has a 256B
+    # token buffer, so each quoted run must stay <= 128 hex chars; we
+    # split into multiple short lines defensively.
+    def q(hex_bytes):
+        # Quote one hex run for the emitted M1 source.
+        return f"'{hex_bytes}'"
+
+    load_argc = amd_mem_LD('a0', 'sp', 0)
+    compute_argv = amd_mov_rr('a1', 'sp') + amd_alu_ri8(0, 'a1', 8)
+    labr_prefix = amd_mov_imm32_prefix('br')
+    call_main = amd_call_r('br')
+    # mov eax, 60 ; syscall. P1 a0 (rdi) already holds p1_main's return.
+    # (B8 id is the 32-bit form; no REX is needed for eax.)
+    sys_exit = byte(0xB8) + le32(60) + amd_syscall()
+
+    return [
+        ':_start',
+        q(load_argc),
+        q(compute_argv),
+        q(labr_prefix),
+        '&p1_main',
+        q(call_main),
+        q(sys_exit),
+    ]
+
+
+# Row-type -> encoder dispatch table, keyed by the common.py row types.
+ENCODERS = {
+    Li: encode_li,
+    La: encode_la,
+    LaBr: encode_labr,
+    Mov: encode_mov,
+    Rrr: encode_rrr,
+    AddI: encode_addi,
+    LogI: encode_logi,
+    ShiftI: encode_shifti,
+    Mem: encode_mem,
+    LdArg: encode_ldarg,
+    Nullary: encode_nullary,
+    BranchReg: encode_branch_reg,
+    CondB: encode_condb,
+    CondBZ: encode_condbz,
+    Enter: encode_enter,
+}
+
+
+# The module's single export; p1_gen.py wires it into its explicit
+# ARCHES dict.
+ARCH = ArchDef(
+    name='amd64',
+    word_bytes=8,
+    stack_align=16,
+    syscall_numbers=SYSCALL_NUMBERS,
+    encoders=ENCODERS,
+    start_stub=amd_start_stub,
+)
diff --git a/p1/gen/common.py b/p1/gen/common.py
@@ -0,0 +1,49 @@
+from collections import namedtuple
+
+
+# Arch backend descriptor. Each arch module exports `ARCH = ArchDef(...)`;
+# p1_gen.py dispatches through .encoders (row-type -> encoder fn) and
+# .start_stub (returns the backend-owned :_start lines).
+ArchDef = namedtuple(
+    'ArchDef',
+    'name word_bytes stack_align syscall_numbers encoders start_stub',
+)
+
+# ISA-surface row types. `name` is the emitted DEFINE name; the remaining
+# fields are the operands the per-arch encoders consume.
+Banner = namedtuple('Banner', 'text')
+Literal = namedtuple('Literal', 'name hex_by_arch')
+Nullary = namedtuple('Nullary', 'name kind')
+Li = namedtuple('Li', 'name rd')
+La = namedtuple('La', 'name rd')
+LaBr = namedtuple('LaBr', 'name')
+Mov = namedtuple('Mov', 'name rd rs')
+Rrr = namedtuple('Rrr', 'name op rd ra rb')
+AddI = namedtuple('AddI', 'name rd ra imm')
+LogI = namedtuple('LogI', 'name op rd ra imm')
+ShiftI = namedtuple('ShiftI', 'name op rd ra imm')
+Mem = namedtuple('Mem', 'name op rt rn off')
+LdArg = namedtuple('LdArg', 'name rd slot')
+BranchReg = namedtuple('BranchReg', 'name kind rs')
+CondB = namedtuple('CondB', 'name op ra rb')
+CondBZ = namedtuple('CondBZ', 'name op ra')
+Enter = namedtuple('Enter', 'name size')
+
+
+def byte(n):
+    """Hex for the low byte of n: 2 uppercase hex chars."""
+    return f'{n & 0xFF:02X}'
+
+
+def le32(n):
+    """Hex for n as 4 little-endian bytes, uppercase."""
+    return (n & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
+
+
+def le64(n):
+    """Hex for n as 8 little-endian bytes, uppercase."""
+    return (n & 0xFFFFFFFFFFFFFFFF).to_bytes(8, 'little').hex().upper()
+
+
+def word_hex(word_bytes, n):
+    """Hex for n as one native word (4- or 8-byte little-endian)."""
+    if word_bytes == 4:
+        return le32(n)
+    if word_bytes == 8:
+        return le64(n)
+    raise ValueError(f'unsupported word size: {word_bytes}')
+
+
+def round_up(align, n):
+    """Round n up to the next multiple of align (align comes first)."""
+    return ((n + align - 1) // align) * align
diff --git a/p1/gen/p1_gen.py b/p1/gen/p1_gen.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+"""Generate P1 v2 DEFINE tables.
+
+This is a fresh generator for docs/P1v2.md. The ISA surface is described by
+plain namedtuple rows, and each backend registers a simple row-type -> encoder
+mapping. The emitted immediate/offset domains are still curated tables rather
+than the full theoretical spec space, so extending coverage is a one-line data
+edit instead of an architecture rewrite.
+
+Usage:
+ python3 p1/gen/p1_gen.py [--arch ARCH] [build-root]
+ python3 p1/gen/p1_gen.py --check [--arch ARCH] [build-root]
+ python3 p1/gen/p1_gen.py --list-archs
+"""
+
+import os
+import sys
+from itertools import product
+
+from common import (
+ AddI,
+ Banner,
+ BranchReg,
+ CondB,
+ CondBZ,
+ Enter,
+ La,
+ LaBr,
+ LdArg,
+ Li,
+ Literal,
+ LogI,
+ Mem,
+ Mov,
+ Nullary,
+ Rrr,
+ ShiftI,
+ word_hex,
+)
+
+import aarch64
+import amd64
+import riscv64
+
# Explicit arch registry: each backend module exposes ARCH = ArchDef(...),
# keyed here by its name (replaces the old ARCH_REGISTRY indirection).
ARCHES = {a.name: a for a in (aarch64.ARCH, amd64.ARCH, riscv64.ARCH)}


# Portable P1 register names. 'sp' is not a GPR; it is only valid as a
# memory base (P1_BASES) and as a MOV source.
P1_GPRS = ('a0', 'a1', 'a2', 'a3', 't0', 't1', 't2', 's0', 's1', 's2', 's3')
P1_BASES = P1_GPRS + ('sp',)

# Mnemonic families, one tuple per row shape in common.py.
RRR_OPS = ('ADD', 'SUB', 'AND', 'OR', 'XOR', 'SHL', 'SHR', 'SAR', 'MUL', 'DIV', 'REM')
LOGI_OPS = ('ANDI', 'ORI')
SHIFT_OPS = ('SHLI', 'SHRI', 'SARI')
MEM_OPS = ('LD', 'ST', 'LB', 'SB')
CONDB_OPS = ('BEQ', 'BNE', 'BLT', 'BLTU')
CONDBZ_OPS = ('BEQZ', 'BNEZ', 'BLTZ')

# Curated immediate/offset domains (not the full theoretical spec space);
# extending coverage is a one-line edit to these tuples.
ADDI_IMMS = (
    -2048, -1024, -256, -128, -64, -48, -32, -24, -16, -12, -8, -7, -6,
    -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 16, 24, 32, 40,
    48, 63, 64, 127, 128, 255, 256, 512, 1024, 2047,
)

LOGI_IMMS = (
    -1, 0, 1, 2, 3, 4, 6, 7, 8, 15, 16, 31, 32, 63, 64, 127, 255, 511, 1023,
    2047,
)

SHIFT_IMMS = tuple(range(64))

MEM_OFFS = (
    -256, -128, -64, -48, -32, -24, -16, -8, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8,
    15, 16, 24, 32, 40, 48, 56, 64, 128, 255,
)

LDARG_SLOTS = tuple(range(32))
ENTER_SIZES = tuple(range(0, 129))


# Preamble of every emitted file; {arch} is filled in by emit().
HEADER = """## p1_{arch}.M1 — GENERATED by p1/gen/p1_gen.py. Do not edit by hand.
##
## This table targets the P1 v2 ISA described in docs/P1v2.md.
## Row shapes are shared; per-arch lowering lives in p1/gen/<arch>.py.
"""
+
+
def imm_suffix(imm):
    """Spell an immediate for use in a DEFINE name; negatives become NEG<abs>."""
    if imm < 0:
        return f'NEG{-imm}'
    return str(imm)
+
+
def rows(arch):
    """Return the ordered row list for *arch*.

    Banner rows become section comments in the emitted table; every other
    row is encoded by the arch backend. The instruction surface is shared
    across arches -- only the syscall-number Literals at the end vary,
    taken from arch.syscall_numbers.
    """
    out = []

    out.append(Banner('Materialization'))
    for rd in P1_GPRS:
        out.append(Li(name=f'LI_{rd.upper()}', rd=rd))
    for rd in P1_GPRS:
        out.append(La(name=f'LA_{rd.upper()}', rd=rd))
    out.append(LaBr(name='LA_BR'))

    out.append(Banner('Moves'))
    for rd, rs in product(P1_GPRS, P1_GPRS):
        out.append(Mov(name=f'MOV_{rd.upper()}_{rs.upper()}', rd=rd, rs=rs))
    # sp is only a MOV *source* (read the portable stack pointer).
    for rd in P1_GPRS:
        out.append(Mov(name=f'MOV_{rd.upper()}_SP', rd=rd, rs='sp'))

    out.append(Banner('Register Arithmetic'))
    for op, rd, ra, rb in product(RRR_OPS, P1_GPRS, P1_GPRS, P1_GPRS):
        out.append(Rrr(name=f'{op}_{rd.upper()}_{ra.upper()}_{rb.upper()}',
                       op=op, rd=rd, ra=ra, rb=rb))

    out.append(Banner('Immediate Arithmetic'))
    for rd, ra, imm in product(P1_GPRS, P1_GPRS, ADDI_IMMS):
        out.append(AddI(name=f'ADDI_{rd.upper()}_{ra.upper()}_{imm_suffix(imm)}',
                        rd=rd, ra=ra, imm=imm))
    for op, rd, ra, imm in product(LOGI_OPS, P1_GPRS, P1_GPRS, LOGI_IMMS):
        out.append(LogI(name=f'{op}_{rd.upper()}_{ra.upper()}_{imm_suffix(imm)}',
                        op=op, rd=rd, ra=ra, imm=imm))
    for op, rd, ra, imm in product(SHIFT_OPS, P1_GPRS, P1_GPRS, SHIFT_IMMS):
        out.append(ShiftI(name=f'{op}_{rd.upper()}_{ra.upper()}_{imm}',
                          op=op, rd=rd, ra=ra, imm=imm))

    out.append(Banner('Memory'))
    # Bases include sp; offsets are curated, see MEM_OFFS.
    for op, rt, rn, off in product(MEM_OPS, P1_GPRS, P1_BASES, MEM_OFFS):
        out.append(Mem(name=f'{op}_{rt.upper()}_{rn.upper()}_{imm_suffix(off)}',
                       op=op, rt=rt, rn=rn, off=off))

    out.append(Banner('ABI Access'))
    for rd, slot in product(P1_GPRS, LDARG_SLOTS):
        out.append(LdArg(name=f'LDARG_{rd.upper()}_{slot}', rd=rd, slot=slot))

    out.append(Banner('Branches'))
    out.append(Nullary(name='B', kind='B'))
    for rs in P1_GPRS:
        out.append(BranchReg(name=f'BR_{rs.upper()}', kind='BR', rs=rs))
    for op, ra, rb in product(CONDB_OPS, P1_GPRS, P1_GPRS):
        out.append(CondB(name=f'{op}_{ra.upper()}_{rb.upper()}', op=op, ra=ra, rb=rb))
    for op, ra in product(CONDBZ_OPS, P1_GPRS):
        out.append(CondBZ(name=f'{op}_{ra.upper()}', op=op, ra=ra))

    out.append(Banner('Calls And Returns'))
    out.append(Nullary(name='CALL', kind='CALL'))
    out.append(Nullary(name='RET', kind='RET'))
    out.append(Nullary(name='ERET', kind='ERET'))
    out.append(Nullary(name='TAIL', kind='TAIL'))
    for rs in P1_GPRS:
        out.append(BranchReg(name=f'CALLR_{rs.upper()}', kind='CALLR', rs=rs))
    for rs in P1_GPRS:
        out.append(BranchReg(name=f'TAILR_{rs.upper()}', kind='TAILR', rs=rs))

    out.append(Banner('Frame Management'))
    for size in ENTER_SIZES:
        out.append(Enter(name=f'ENTER_{size}', size=size))

    out.append(Banner('System'))
    out.append(Nullary(name='SYSCALL', kind='SYSCALL'))
    # Sorted for deterministic output across Python versions.
    for name, number in sorted(arch.syscall_numbers.items()):
        out.append(Literal(name=name, hex_by_arch={arch.name: word_hex(arch.word_bytes, number)}))

    return out
+
+
def lower_name(name):
    """Lowercase a row name for DEFINE output.

    Names with two or more underscore-separated operands keep the first
    underscore and turn the remaining separators into commas, e.g.
    'ADD_A0_A1_A2' -> 'add_a0,a1,a2'; shorter names just lowercase.
    """
    low = name.lower()
    parts = low.split('_')
    if len(parts) <= 2:
        return low
    return f'{parts[0]}_{",".join(parts[1:])}'
+
+
def encode_row(arch, row):
    """Render *row* to its hex string for *arch*.

    Literal rows carry pre-encoded hex and bypass the encoder dispatch;
    everything else is looked up by row type in arch.encoders.
    """
    if isinstance(row, Literal):
        return row.hex_by_arch[arch.name]
    return arch.encoders[type(row)](arch, row)
+
+
def emit(arch_name):
    """Render the complete .M1 table for *arch_name* as a single string."""
    arch = ARCHES[arch_name]
    out = [HEADER.format(arch=arch.name).rstrip(), '']
    seen = set()
    for row in rows(arch):
        if isinstance(row, Banner):
            out.append('')
            out.append(f'## ---- {row.text}')
            continue
        name = lower_name(row.name)
        # Guard against two rows collapsing to the same DEFINE name
        # (e.g. via the lower_name underscore/comma rewriting).
        if name in seen:
            raise RuntimeError(f'duplicate DEFINE: {name}')
        seen.add(name)
        out.append(f'DEFINE {name} {encode_row(arch, row)}')
    out.append('')
    out.append('## ---- Program Entry')
    out.append('## Backend-owned :_start stub per docs/P1.md §Program Entry.')
    out.append('## Calls p1_main under the one-word direct-result convention')
    out.append("## (a0=argc, a1=argv) and sys_exits its return value.")
    out.extend(arch.start_stub())
    out.append('')
    return '\n'.join(out)
+
+
def parse_args(argv):
    """Parse the command line into (check, archs, build_root).

    --arch may be repeated; if absent, all known arches are selected.
    --list-archs prints the registry and exits. The first non-flag
    argument, if any, overrides the default build root.
    """
    check = False
    archs = []
    positional = []
    args = iter(argv)
    for arg in args:
        if arg == '--check':
            check = True
        elif arg == '--list-archs':
            print('\n'.join(sorted(ARCHES)))
            sys.exit(0)
        elif arg == '--arch':
            # --arch consumes the following argument as its value.
            value = next(args, None)
            if value is None:
                raise SystemExit('--arch requires a value')
            archs.append(value)
        else:
            positional.append(arg)
    build_root = positional[0] if positional else os.path.join('build', 'p1v2')
    return check, archs or sorted(ARCHES), build_root
+
+
def main(argv=None):
    """Generate (or, with --check, verify) the per-arch DEFINE tables.

    Args:
        argv: argument list without the program name; defaults to
            sys.argv[1:]. Exits non-zero if --check finds any stale file.
    """
    # Fix: `parse_args(argv or sys.argv[1:])` treated an explicit empty
    # list as "fall back to sys.argv". Compare against None so callers
    # can pass a genuinely empty argument list.
    if argv is None:
        argv = sys.argv[1:]
    check, archs, build_root = parse_args(argv)
    had_diff = False

    for arch_name in archs:
        arch = ARCHES[arch_name]
        dest_dir = os.path.join(build_root, arch.name)
        path = os.path.join(dest_dir, f'p1_{arch.name}.M1')
        content = emit(arch.name)
        if check:
            # --check never writes; record the diff and keep going so one
            # run reports every stale file.
            try:
                with open(path) as f:
                    existing = f.read()
            except FileNotFoundError:
                existing = ''
            if existing != content:
                sys.stderr.write(f'DIFF: {path}\n')
                had_diff = True
            continue
        os.makedirs(dest_dir, exist_ok=True)
        with open(path, 'w') as f:
            f.write(content)
        print(f'wrote {path} ({len(content)} bytes)')

    if check and had_diff:
        sys.exit(1)
+
+
# Script entry point; see the module docstring for usage.
if __name__ == '__main__':
    main()
diff --git a/p1/gen/riscv64.py b/p1/gen/riscv64.py
@@ -0,0 +1,396 @@
+from common import (
+ AddI,
+ ArchDef,
+ BranchReg,
+ CondB,
+ CondBZ,
+ Enter,
+ La,
+ LaBr,
+ LdArg,
+ Li,
+ LogI,
+ Mem,
+ Mov,
+ Nullary,
+ Rrr,
+ ShiftI,
+ le32,
+ round_up,
+)
+
+
# Portable P1 register name -> native riscv64 register number. Beyond the
# portable set, the backend reserves: br (branch target), scratch
# (LDARG temporary), and save0/save1/save2 (SYSCALL argument shuffle).
# NOTE(review): 'save2' is x16 (a6) -- not a Linux syscall argument
# register, but it aliases the portable-invisible a6; confirm nothing
# else relies on a6 across a SYSCALL.
NAT = {
    'a0': 10,
    'a1': 11,
    'a2': 12,
    'a3': 13,
    'a4': 14,
    'a5': 15,
    'a6': 16,
    'a7': 17,
    't0': 5,
    't1': 6,
    't2': 7,
    's0': 9,
    's1': 18,
    's2': 19,
    's3': 20,
    'sp': 2,
    'zero': 0,
    'ra': 1,
    'fp': 8,
    'br': 31,
    'scratch': 30,
    'save0': 29,
    'save1': 28,
    'save2': 16,
}


# R-type base words (opcode | funct3 | funct7 pre-merged); registers are
# OR-ed in by rv_r_type. MUL/DIV/REM come from the M extension.
RRR_BASE = {
    'ADD': 0x00000033,
    'SUB': 0x40000033,
    'AND': 0x00007033,
    'OR': 0x00006033,
    'XOR': 0x00004033,
    'SHL': 0x00001033,
    'SHR': 0x00005033,
    'SAR': 0x40005033,
    'MUL': 0x02000033,
    'DIV': 0x02004033,
    'REM': 0x02006033,
}


# Inverted-condition B-type opcodes for the skip-taken-over-jalr pattern:
# the skip fires when the P1 condition is FALSE, so the jalr below is the
# taken target.
CONDB_INV_BASE = {
    'BEQ': 0x00001063,   # native BNE -- skip when not equal
    'BNE': 0x00000063,   # native BEQ -- skip when equal
    'BLT': 0x00005063,   # native BGE -- skip when ra >= rb (signed)
    'BLTU': 0x00007063,  # native BGEU -- skip when ra >= rb (unsigned)
}


# Same inversion trick with rb fixed to the zero register.
CONDBZ_INV_BASE = {
    'BEQZ': 0x00001063,
    'BNEZ': 0x00000063,
    'BLTZ': 0x00005063,
}


# Linux riscv64 syscall numbers, emitted as Literal rows by p1_gen.rows().
SYSCALL_NUMBERS = {
    'SYS_READ': 63,
    'SYS_WRITE': 64,
    'SYS_CLOSE': 57,
    'SYS_OPENAT': 56,
    'SYS_EXIT': 93,
    'SYS_CLONE': 220,
    'SYS_EXECVE': 221,
    'SYS_WAITID': 95,
}
+
+
def rv_r_type(base, rd, ra, rb):
    """Encode an R-type instruction: base | rs2<<20 | rs1<<15 | rd<<7."""
    d = NAT[rd]
    a = NAT[ra]
    b = NAT[rb]
    return le32(base | (b << 20) | (a << 15) | (d << 7))


def rv_i_type(base, rd, ra, imm12):
    """Encode an I-type instruction; imm12 is masked to its 12-bit field."""
    d = NAT[rd]
    a = NAT[ra]
    return le32(base | ((imm12 & 0xFFF) << 20) | (a << 15) | (d << 7))
+
+
def rv_s_type(base, rs, ra, imm12):
    """Encode an S-type (store) instruction."""
    s = NAT[rs]
    a = NAT[ra]
    imm = imm12 & 0xFFF
    # S-type splits the immediate: imm[11:5] -> bits [31:25] and
    # imm[4:0] -> bits [11:7]. Masking to 12 bits first reduces negative
    # offsets to their unsigned two's-complement pattern before the split.
    hi = (imm >> 5) & 0x7F
    lo = imm & 0x1F
    return le32(base | (hi << 25) | (s << 20) | (a << 15) | (lo << 7))


def rv_b_type_skip8(base, ra, rb):
    """Encode a B-type branch with a hardcoded +8 (skip-one-insn) offset."""
    # Hardcoded +8 branch: imm = 8, encoded with imm[4:1]=4, imm[11]=0,
    # imm[10:5]=0, imm[12]=0. The combined [11:7] field becomes
    # (imm[4:1] << 1) | imm[11] = 8.
    a = NAT[ra]
    b = NAT[rb]
    return le32(base | (b << 20) | (a << 15) | (8 << 7))
+
+
def rv_addi(rd, ra, imm12):
    """addi rd, ra, imm12 (doubles as a register move when imm12 == 0)."""
    return rv_i_type(0x00000013, rd, ra, imm12)


def rv_ld(rd, ra, imm12):
    """ld rd, imm12(ra) -- 64-bit load."""
    return rv_i_type(0x00003003, rd, ra, imm12)


def rv_sd(rs, ra, imm12):
    """sd rs, imm12(ra) -- 64-bit store."""
    return rv_s_type(0x00003023, rs, ra, imm12)


def rv_lbu(rd, ra, imm12):
    """lbu rd, imm12(ra) -- zero-extending byte load."""
    return rv_i_type(0x00004003, rd, ra, imm12)


def rv_sb(rs, ra, imm12):
    """sb rs, imm12(ra) -- byte store."""
    return rv_s_type(0x00000023, rs, ra, imm12)


def rv_lwu(rd, ra, imm12):
    """lwu rd, imm12(ra) -- zero-extending 32-bit load.

    NOTE(review): not referenced by any encoder in this module
    (rv_lit32_prefix inlines its own lwu word) -- confirm it is
    intentionally kept for completeness.
    """
    return rv_i_type(0x00006003, rd, ra, imm12)


def rv_mov_rr(dst, src):
    """Register-to-register move, lowered as addi dst, src, 0."""
    return rv_addi(dst, src, 0)
+
+
def rv_slli(rd, ra, shamt):
    """slli rd, ra, shamt -- shift left logical immediate (6-bit shamt, RV64)."""
    d = NAT[rd]
    a = NAT[ra]
    return le32(0x00001013 | ((shamt & 0x3F) << 20) | (a << 15) | (d << 7))


def rv_srli(rd, ra, shamt):
    """srli rd, ra, shamt -- shift right logical immediate."""
    d = NAT[rd]
    a = NAT[ra]
    return le32(0x00005013 | ((shamt & 0x3F) << 20) | (a << 15) | (d << 7))


def rv_srai(rd, ra, shamt):
    """srai rd, ra, shamt -- shift right arithmetic immediate."""
    d = NAT[rd]
    a = NAT[ra]
    return le32(0x40005013 | ((shamt & 0x3F) << 20) | (a << 15) | (d << 7))


def rv_jalr(rd, rs, imm12):
    """jalr rd, imm12(rs) -- indirect jump; rd receives the return address."""
    d = NAT[rd]
    s = NAT[rs]
    return le32(0x00000067 | ((imm12 & 0xFFF) << 20) | (s << 15) | (d << 7))


def rv_ecall():
    """ecall -- trap into the kernel (syscall)."""
    return le32(0x00000073)
+
+
def rv_lit64_prefix(rd):
    """Three-instruction prefix that loads the following 8-byte literal into rd."""
    # auipc rd, 0 ; ld rd, 12(rd) ; jal x0, +12.
    # The 8 bytes that follow in source become the literal; the jal at
    # pc+8 lands at pc+20, just past the literal at pc+12..pc+20.
    d = NAT[rd]
    auipc = 0x00000017 | (d << 7)
    ld = 0x00C03003 | (d << 15) | (d << 7)
    jal = 0x00C0006F
    return le32(auipc) + le32(ld) + le32(jal)


def rv_lit32_prefix(rd):
    """Three-instruction prefix that loads the following 4-byte literal into rd."""
    # auipc rd, 0 ; lwu rd, 12(rd) ; jal x0, +8.
    # lwu zero-extends a 4-byte literal; enough for stage0 addresses.
    d = NAT[rd]
    auipc = 0x00000017 | (d << 7)
    lwu = 0x00C06003 | (d << 15) | (d << 7)
    jal = 0x0080006F
    return le32(auipc) + le32(lwu) + le32(jal)


def rv_epilogue():
    """Shared frame-teardown sequence (restores ra and the caller's sp)."""
    # Frame teardown shared by ERET, TAIL, TAILR. Mirrors p1_eret/p1_tail
    # in P1-riscv64.M1pp: load saved ra, load saved caller sp into fp,
    # then move fp into sp. The caller appends the actual jalr.
    return rv_ld('ra', 'sp', 0) + rv_ld('fp', 'sp', 8) + rv_mov_rr('sp', 'fp')
+
+
def encode_li(_arch, row):
    """LI rd: 64-bit literal materialization; the literal follows in source."""
    return rv_lit64_prefix(row.rd)


def encode_la(_arch, row):
    """LA rd: 32-bit address literal (stage0 addresses fit in 4 bytes)."""
    return rv_lit32_prefix(row.rd)


def encode_labr(_arch, _row):
    """LA_BR: load an address literal into the dedicated branch register."""
    return rv_lit32_prefix('br')
+
+
def encode_mov(_arch, row):
    """Lower MOV rd, rs.

    Portable sp is the frame-local base, which sits 16 bytes above native
    sp (the backend's 2-word hidden header occupies the low end of each
    frame), so MOV rd, sp must materialize native_sp + 16 rather than a
    plain register copy.
    """
    if row.rs != 'sp':
        return rv_mov_rr(row.rd, row.rs)
    return rv_addi(row.rd, 'sp', 16)
+
+
def encode_rrr(_arch, row):
    """Three-register ALU op, dispatched through the RRR_BASE opcode table."""
    return rv_r_type(RRR_BASE[row.op], row.rd, row.ra, row.rb)


def encode_addi(_arch, row):
    """ADDI rd, ra, imm."""
    return rv_addi(row.rd, row.ra, row.imm)


def encode_logi(_arch, row):
    """ANDI/ORI rd, ra, imm (unknown ops raise KeyError)."""
    base = {
        'ANDI': 0x00007013,
        'ORI': 0x00006013,
    }[row.op]
    return rv_i_type(base, row.rd, row.ra, row.imm)
+
+
def encode_shifti(_arch, row):
    """Lower an immediate shift (SHLI/SHRI/SARI) via the matching encoder."""
    encoders = {'SHLI': rv_slli, 'SHRI': rv_srli, 'SARI': rv_srai}
    encoder = encoders.get(row.op)
    if encoder is None:
        raise ValueError(f'unknown shift op: {row.op}')
    return encoder(row.rd, row.ra, row.imm)
+
+
def encode_mem(_arch, row):
    """Lower LD/ST/LB/SB rt, off(rn).

    Portable sp points to the frame-local base; the 2-word hidden header
    at native_sp+0/+8 is not portable-addressable, so sp-relative offsets
    are shifted past the header.
    """
    encoders = {'LD': rv_ld, 'ST': rv_sd, 'LB': rv_lbu, 'SB': rv_sb}
    encoder = encoders.get(row.op)
    if encoder is None:
        raise ValueError(f'unknown mem op: {row.op}')
    offset = row.off
    if row.rn == 'sp':
        offset += 16
    return encoder(row.rt, row.rn, offset)
+
+
def encode_ldarg(_arch, row):
    """LDARG rd, slot: load incoming stack argument *slot* into rd."""
    # LDARG loads the saved caller sp from [sp+8] (the hidden header
    # slot), then indexes the incoming stack-arg area off it. Slot 0 is
    # at caller_sp+16 because the native call instruction does not push
    # a return address on riscv64 -- the +16 matches the aarch64 layout
    # by convention for stage0 frame uniformity.
    return rv_ld('scratch', 'sp', 8) + rv_ld(row.rd, 'scratch', 16 + 8 * row.slot)


def encode_branch_reg(_arch, row):
    """Indirect control transfer through rs: BR (jump), CALLR, TAILR."""
    if row.kind == 'BR':
        return rv_jalr('zero', row.rs, 0)
    if row.kind == 'CALLR':
        return rv_jalr('ra', row.rs, 0)
    if row.kind == 'TAILR':
        # Tail call: tear down the current frame, then jump.
        return rv_epilogue() + rv_jalr('zero', row.rs, 0)
    raise ValueError(f'unknown branch-reg kind: {row.kind}')


def encode_condb(_arch, row):
    """Conditional branch: inverted-condition skip over a jalr to br."""
    return rv_b_type_skip8(CONDB_INV_BASE[row.op], row.ra, row.rb) + rv_jalr('zero', 'br', 0)


def encode_condbz(_arch, row):
    """Compare-with-zero conditional branch, same skip-over-jalr pattern."""
    return rv_b_type_skip8(CONDBZ_INV_BASE[row.op], row.ra, 'zero') + rv_jalr('zero', 'br', 0)
+
+
def encode_enter(arch, row):
    """ENTER size: allocate an aligned frame with the 2-word hidden header.

    The frame holds saved ra at [sp+0] and the caller's sp at [sp+8],
    below `size` bytes of portable locals, rounded up to stack_align.
    """
    frame_bytes = round_up(arch.stack_align, 2 * arch.word_bytes + row.size)
    return (
        rv_addi('sp', 'sp', -frame_bytes)
        + rv_sd('ra', 'sp', 0)
        # fp temporarily holds the caller's sp (new sp + frame = old sp).
        + rv_addi('fp', 'sp', frame_bytes)
        + rv_sd('fp', 'sp', 8)
    )


def encode_nullary(_arch, row):
    """Operand-free opcodes: B, CALL, RET, ERET, TAIL, SYSCALL."""
    if row.kind == 'B':
        return rv_jalr('zero', 'br', 0)
    if row.kind == 'CALL':
        return rv_jalr('ra', 'br', 0)
    if row.kind == 'RET':
        return rv_jalr('zero', 'ra', 0)
    if row.kind == 'ERET':
        return rv_epilogue() + rv_jalr('zero', 'ra', 0)
    if row.kind == 'TAIL':
        return rv_epilogue() + rv_jalr('zero', 'br', 0)
    if row.kind == 'SYSCALL':
        # P1: a0=number, a1..a3,t0,s0,s1 = args 0..5.
        # Linux riscv64: a7=number, a0..a5 = args 0..5, return in a0.
        # SYSCALL clobbers only P1 a0; restore a1/a2/a3 after ecall.
        # The save registers are staged BEFORE a0..a3 are overwritten,
        # so the move order below is load-bearing.
        return ''.join([
            rv_mov_rr('save0', 'a1'),
            rv_mov_rr('save1', 'a2'),
            rv_mov_rr('save2', 'a3'),
            rv_mov_rr('a7', 'a0'),
            rv_mov_rr('a0', 'save0'),
            rv_mov_rr('a1', 'save1'),
            rv_mov_rr('a2', 'save2'),
            rv_mov_rr('a3', 't0'),
            rv_mov_rr('a4', 's0'),
            rv_mov_rr('a5', 's1'),
            rv_ecall(),
            rv_mov_rr('a1', 'save0'),
            rv_mov_rr('a2', 'save1'),
            rv_mov_rr('a3', 'save2'),
        ])
    raise ValueError(f'unknown nullary kind: {row.kind}')
+
+
def rv_start_stub():
    """Return the :_start stub lines appended after the DEFINE table."""
    # Backend-owned :_start stub per docs/P1.md §Program Entry. Linux
    # riscv64 puts argc at [sp] and argv starting at [sp+8]; load argc
    # into a0, compute &argv[0] into a1, call p1_main under the one-word
    # direct-result convention, then issue sys_exit. Mirrors %p1_entry
    # in p1/P1-riscv64.M1pp.
    #
    # Raw hex outside DEFINE bodies must be single-quoted so bootstrap
    # M0 treats it as a literal byte run rather than a token.
    def q(hex_bytes):
        return f"'{hex_bytes}'"
    return [
        ':_start',
        q(rv_ld('a0', 'sp', 0)),
        q(rv_addi('a1', 'sp', 8)),
        q(rv_lit32_prefix('br')),
        '&p1_main',
        q(rv_jalr('ra', 'br', 0)),
        # a7 = 93 (sys_exit); a0 already holds p1_main's return value.
        q(rv_addi('a7', 'zero', 93)),
        q(rv_ecall()),
    ]
+
+
# Row-type -> encoder dispatch table consumed by p1_gen.encode_row().
# Must stay in sync with the row shapes declared in common.py.
ENCODERS = {
    Li: encode_li,
    La: encode_la,
    LaBr: encode_labr,
    Mov: encode_mov,
    Rrr: encode_rrr,
    AddI: encode_addi,
    LogI: encode_logi,
    ShiftI: encode_shifti,
    Mem: encode_mem,
    LdArg: encode_ldarg,
    Nullary: encode_nullary,
    BranchReg: encode_branch_reg,
    CondB: encode_condb,
    CondBZ: encode_condbz,
    Enter: encode_enter,
}


# Backend descriptor picked up by p1_gen.py's explicit ARCHES dict.
ARCH = ArchDef(
    name='riscv64',
    word_bytes=8,
    stack_align=16,
    syscall_numbers=SYSCALL_NUMBERS,
    encoders=ENCODERS,
    start_stub=rv_start_stub,
)