boot2

Playing with the bootstrap
git clone https://git.ryansepassi.com/git/boot2.git

commit ffe72f13e339e8411d89985ad324f01a6483a4e6
parent d7398ef3945e0d7f61cc44a36a2cb86a7ab47093
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Fri, 24 Apr 2026 15:54:38 -0700

Add amd64 / riscv64 P1v2 generators; explicit ARCHES wiring

Both mirror p1/gen/aarch64.py; each backend encodes the same byte
sequences as its p1/P1-<arch>.M1pp counterpart.

Drop the ARCH_REGISTRY indirection in common.py. Each arch module now
exposes ARCH = ArchDef(...); p1_gen.py builds an explicit ARCHES dict.
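The wiring this describes is explicit in p1/gen/p1_gen.py below; a minimal
sketch of the pattern, taken from the new driver:

    import aarch64
    import amd64
    import riscv64

    # Each backend module exposes a module-level ArchDef; the driver
    # enumerates them by hand instead of registering through common.py.
    ARCHES = {a.name: a for a in (aarch64.ARCH, amd64.ARCH, riscv64.ARCH)}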

Diffstat:
A p1/gen/aarch64.py | 410 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A p1/gen/amd64.py | 608 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A p1/gen/common.py | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
A p1/gen/p1_gen.py | 259 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A p1/gen/riscv64.py | 396 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 1722 insertions(+), 0 deletions(-)

diff --git a/p1/gen/aarch64.py b/p1/gen/aarch64.py
@@ -0,0 +1,410 @@
+from common import (
+    AddI,
+    ArchDef,
+    BranchReg,
+    CondB,
+    CondBZ,
+    Enter,
+    La,
+    LaBr,
+    LdArg,
+    Li,
+    LogI,
+    Mem,
+    Mov,
+    Nullary,
+    Rrr,
+    ShiftI,
+    le32,
+    round_up,
+)
+
+
+NAT = {
+    'a0': 0,
+    'a1': 1,
+    'a2': 2,
+    'a3': 3,
+    'x4': 4,
+    'x5': 5,
+    't0': 9,
+    't1': 10,
+    't2': 11,
+    's0': 19,
+    's1': 20,
+    's2': 21,
+    's3': 22,
+    'sp': 31,
+    'xzr': 31,
+    'lr': 30,
+    'br': 17,
+    'scratch': 16,
+    'x8': 8,
+    'save0': 23,
+    'save1': 24,
+    'save2': 25,
+}
+
+
+RRR_BASE = {
+    'ADD': 0x8B000000,
+    'SUB': 0xCB000000,
+    'AND': 0x8A000000,
+    'OR': 0xAA000000,
+    'XOR': 0xCA000000,
+    'SHL': 0x9AC02000,
+    'SHR': 0x9AC02400,
+    'SAR': 0x9AC02800,
+    'DIV': 0x9AC00C00,
+}
+
+
+SYSCALL_NUMBERS = {
+    'SYS_READ': 63,
+    'SYS_WRITE': 64,
+    'SYS_CLOSE': 57,
+    'SYS_OPENAT': 56,
+    'SYS_EXIT': 93,
+    'SYS_CLONE': 220,
+    'SYS_EXECVE': 221,
+    'SYS_WAITID': 95,
+}
+
+
+def aa_rrr(base, rd, ra, rb):
+    d = NAT[rd]
+    a = NAT[ra]
+    b = NAT[rb]
+    return le32(base | (b << 16) | (a << 5) | d)
+
+
+def aa_add_imm(rd, ra, imm12, sub=False):
+    d = NAT[rd]
+    a = NAT[ra]
+    base = 0xD1000000 if sub else 0x91000000
+    return le32(base | ((imm12 & 0xFFF) << 10) | (a << 5) | d)
+
+
+def aa_mov_rr(dst, src):
+    if dst == 'sp':
+        return aa_add_imm('sp', src, 0, sub=False)
+    if src == 'sp':
+        return aa_add_imm(dst, 'sp', 0, sub=False)
+    d = NAT[dst]
+    s = NAT[src]
+    return le32(0xAA000000 | (s << 16) | (31 << 5) | d)
+
+
+def aa_ubfm(rd, ra, immr, imms):
+    d = NAT[rd]
+    a = NAT[ra]
+    return le32(0xD3400000 | (immr << 16) | (imms << 10) | (a << 5) | d)
+
+
+def aa_sbfm(rd, ra, immr, imms):
+    d = NAT[rd]
+    a = NAT[ra]
+    return le32(0x93400000 | (immr << 16) | (imms << 10) | (a << 5) | d)
+
+
+def aa_movz(rd, imm16):
+    d = NAT[rd]
+    return le32(0xD2800000 | ((imm16 & 0xFFFF) << 5) | d)
+
+
+def aa_movn(rd, imm16):
+    d = NAT[rd]
+    return le32(0x92800000 | ((imm16 & 0xFFFF) << 5) | d)
+
+
+def aa_materialize_small_imm(rd, imm):
+    if imm >= 0:
+        return aa_movz(rd, imm)
+    return aa_movn(rd, (~imm) & 0xFFFF)
+
+
+def aa_ldst_uimm12(base, rt, rn, off_bytes, size_log2):
+    imm12 = off_bytes >> size_log2
+    t = NAT[rt]
+    n = NAT[rn]
+    return le32(base | (imm12 << 10) | (n << 5) | t)
+
+
+def aa_ldst_unscaled(base, rt, rn, off):
+    imm9 = off & 0x1FF
+    t = NAT[rt]
+    n = NAT[rn]
+    return le32(base | (imm9 << 12) | (n << 5) | t)
+
+
+def aa_mem(op, rt, rn, off):
+    bases = {
+        'LD': (0xF9400000, 3, 0xF8400000),
+        'ST': (0xF9000000, 3, 0xF8000000),
+        'LB': (0x39400000, 0, 0x38400000),
+        'SB': (0x39000000, 0, 0x38000000),
+    }
+    uimm_base, size_log2, unscaled_base = bases[op]
+    scale = 1 << size_log2
+    if off >= 0 and off % scale == 0 and off < (4096 << size_log2):
+        return aa_ldst_uimm12(uimm_base, rt, rn, off, size_log2)
+    if -256 <= off <= 255:
+        return aa_ldst_unscaled(unscaled_base, rt, rn, off)
+    if -2048 <= off <= 2047:
+        if off >= 0:
+            addr = aa_add_imm('scratch', rn, off, sub=False)
+        else:
+            addr = aa_add_imm('scratch', rn, -off, sub=True)
+        return addr + aa_ldst_uimm12(uimm_base, rt, 'scratch', 0, size_log2)
+    raise ValueError(f'aarch64 offset out of range for {op}: {off}')
+
+
+def aa_cmp_skip(op, ra, rb):
+    a = NAT[ra]
+    b = NAT[rb]
+    cmp_hex = le32(0xEB000000 | (b << 16) | (a << 5) | 31)
+    skip_cond = {
+        'BEQ': 1,
+        'BNE': 0,
+        'BLT': 10,
+        'BLTU': 2,
+    }[op]
+    return cmp_hex + le32(0x54000040 | skip_cond)
+
+
+def aa_br(reg):
+    return le32(0xD61F0000 | (NAT[reg] << 5))
+
+
+def aa_blr(reg):
+    return le32(0xD63F0000 | (NAT[reg] << 5))
+
+
+def aa_ret():
+    return le32(0xD65F03C0)
+
+
+def aa_epilogue():
+    # Frame teardown, shared by ERET, TAIL, TAILR. Loads lr and the
+    # saved caller sp from the hidden header at native_sp+0/+8, then
+    # unwinds sp. Does NOT transfer control; the caller appends an
+    # aa_ret / aa_br as appropriate.
+    return (
+        aa_mem('LD', 'lr', 'sp', 0)
+        + aa_mem('LD', 'x8', 'sp', 8)
+        + aa_mov_rr('sp', 'x8')
+    )
+
+
+def aa_lit64_prefix(rd):
+    ## 64-bit literal-pool prefix for LI: ldr xN, [pc,#8]; b PC+12.
+    ## The 8 bytes that follow in source become the literal; b skips them.
+    d = NAT[rd]
+    ldr_lit = 0x58000040 | d
+    b_plus12 = 0x14000003
+    return le32(ldr_lit) + le32(b_plus12)
+
+
+def aa_lit32_prefix(rd):
+    ## 32-bit literal-pool prefix for LA / LA_BR: ldr wN, [pc,#8]; b PC+8.
+    ## ldr w zero-extends into the full 64-bit register, so a 4-byte literal
+    ## is enough for any address in the stage0 layout (base 0x00600000,
+    ## programs well under 4 GB). This lets source use `&label` directly
+    ## without padding to 8 bytes.
+    d = NAT[rd]
+    ldr_lit = 0x18000040 | d
+    b_plus8 = 0x14000002
+    return le32(ldr_lit) + le32(b_plus8)
+
+
+def encode_li(_arch, row):
+    return aa_lit64_prefix(row.rd)
+
+
+def encode_la(_arch, row):
+    return aa_lit32_prefix(row.rd)
+
+
+def encode_labr(_arch, _row):
+    return aa_lit32_prefix('br')
+
+
+def encode_mov(_arch, row):
+    # Portable `sp` is the frame-local base, which is 16 bytes above
+    # native sp (the backend's 2-word hidden header sits at the low end
+    # of each frame allocation). So reading sp into a register yields
+    # native_sp + 16, not native_sp itself.
+    if row.rs == 'sp':
+        return aa_add_imm(row.rd, 'sp', 16, sub=False)
+    return aa_mov_rr(row.rd, row.rs)
+
+
+def encode_rrr(_arch, row):
+    if row.op == 'MUL':
+        d = NAT[row.rd]
+        a = NAT[row.ra]
+        b = NAT[row.rb]
+        return le32(0x9B000000 | (b << 16) | (31 << 10) | (a << 5) | d)
+    if row.op == 'REM':
+        d = NAT[row.rd]
+        a = NAT[row.ra]
+        b = NAT[row.rb]
+        sc = NAT['scratch']
+        sdiv = 0x9AC00C00 | (b << 16) | (a << 5) | sc
+        msub = 0x9B008000 | (b << 16) | (a << 10) | (sc << 5) | d
+        return le32(sdiv) + le32(msub)
+    return aa_rrr(RRR_BASE[row.op], row.rd, row.ra, row.rb)
+
+
+def encode_addi(_arch, row):
+    if row.imm >= 0:
+        return aa_add_imm(row.rd, row.ra, row.imm, sub=False)
+    return aa_add_imm(row.rd, row.ra, -row.imm, sub=True)
+
+
+def encode_logi(_arch, row):
+    seq = aa_materialize_small_imm('scratch', row.imm)
+    base = {
+        'ANDI': 0x8A000000,
+        'ORI': 0xAA000000,
+    }[row.op]
+    return seq + aa_rrr(base, row.rd, row.ra, 'scratch')
+
+
+def encode_shifti(_arch, row):
+    if row.op == 'SHLI':
+        return aa_ubfm(row.rd, row.ra, (-row.imm) & 63, 63 - row.imm)
+    if row.op == 'SHRI':
+        return aa_ubfm(row.rd, row.ra, row.imm, 63)
+    return aa_sbfm(row.rd, row.ra, row.imm, 63)
+
+
+def encode_mem(_arch, row):
+    # Portable sp points to the frame-local base; the 2-word hidden
+    # header sits at native_sp+0/+8 and is not portable-addressable.
+    # Shift sp-relative offsets past the header.
+    off = row.off + 16 if row.rn == 'sp' else row.off
+    return aa_mem(row.op, row.rt, row.rn, off)
+
+
+def encode_ldarg(_arch, row):
+    return aa_mem('LD', 'scratch', 'sp', 8) + aa_mem('LD', row.rd, 'scratch', 16 + 8 * row.slot)
+
+
+def encode_branch_reg(_arch, row):
+    if row.kind == 'BR':
+        return aa_br(row.rs)
+    if row.kind == 'CALLR':
+        return aa_blr(row.rs)
+    if row.kind == 'TAILR':
+        return aa_epilogue() + aa_br(row.rs)
+    raise ValueError(f'unknown branch-reg kind: {row.kind}')
+
+
+def encode_condb(_arch, row):
+    return aa_cmp_skip(row.op, row.ra, row.rb) + aa_br('br')
+
+
+def encode_condbz(_arch, row):
+    a = NAT[row.ra]
+    br_hex = aa_br('br')
+    if row.op == 'BEQZ':
+        return le32(0xB5000000 | (2 << 5) | a) + br_hex
+    if row.op == 'BNEZ':
+        return le32(0xB4000000 | (2 << 5) | a) + br_hex
+    cmp_zero = le32(0xEB1F001F | (a << 5))
+    bge = le32(0x54000040 | 10)
+    return cmp_zero + bge + br_hex
+
+
+def encode_enter(arch, row):
+    frame_bytes = round_up(arch.stack_align, 2 * arch.word_bytes + row.size)
+    return (
+        aa_add_imm('sp', 'sp', frame_bytes, sub=True)
+        + aa_mem('ST', 'lr', 'sp', 0)
+        + aa_add_imm('x8', 'sp', frame_bytes, sub=False)
+        + aa_mem('ST', 'x8', 'sp', 8)
+    )
+
+
+def encode_nullary(_arch, row):
+    if row.kind == 'B':
+        return aa_br('br')
+    if row.kind == 'CALL':
+        return aa_blr('br')
+    if row.kind == 'RET':
+        return aa_ret()
+    if row.kind == 'ERET':
+        return aa_epilogue() + aa_ret()
+    if row.kind == 'TAIL':
+        return aa_epilogue() + aa_br('br')
+    if row.kind == 'SYSCALL':
+        return ''.join([
+            aa_mov_rr('x8', 'a0'),
+            aa_mov_rr('save0', 'a1'),
+            aa_mov_rr('save1', 'a2'),
+            aa_mov_rr('save2', 'a3'),
+            aa_mov_rr('a0', 'save0'),
+            aa_mov_rr('a1', 'save1'),
+            aa_mov_rr('a2', 'save2'),
+            aa_mov_rr('a3', 't0'),
+            aa_mov_rr('x4', 's0'),
+            aa_mov_rr('x5', 's1'),
+            le32(0xD4000001),
+            aa_mov_rr('a1', 'save0'),
+            aa_mov_rr('a2', 'save1'),
+            aa_mov_rr('a3', 'save2'),
+        ])
+    raise ValueError(f'unknown nullary kind: {row.kind}')
+
+
+def aa_start_stub():
+    # Backend-owned :_start stub per docs/P1.md §Program Entry. Captures
+    # argc from [sp] and argv pointer from sp+8, calls p1_main under the
+    # one-word direct-result convention (a0=argc, a1=argv), then issues a
+    # native Linux sys_exit with p1_main's return value. Mirrors the
+    # m1pp-path stub in p1/P1-aarch64.M1pp (`%p1_entry`).
+    #
+    # Raw hex outside `DEFINE` bodies must be single-quoted so bootstrap
+    # M0 treats it as a literal byte run rather than a token.
+    def q(hex_bytes):
+        return f"'{hex_bytes}'"
+    return [
+        ':_start',
+        q(aa_mem('LD', 'a0', 'sp', 0)),
+        q(aa_add_imm('a1', 'sp', 8, sub=False)),
+        q(aa_lit32_prefix('br')),
+        '&p1_main',
+        q(aa_blr('br')),
+        q(aa_movz('x8', 93)),
+        q(le32(0xD4000001)),
+    ]
+
+
+ENCODERS = {
+    Li: encode_li,
+    La: encode_la,
+    LaBr: encode_labr,
+    Mov: encode_mov,
+    Rrr: encode_rrr,
+    AddI: encode_addi,
+    LogI: encode_logi,
+    ShiftI: encode_shifti,
+    Mem: encode_mem,
+    LdArg: encode_ldarg,
+    Nullary: encode_nullary,
+    BranchReg: encode_branch_reg,
+    CondB: encode_condb,
+    CondBZ: encode_condbz,
+    Enter: encode_enter,
+}
+
+
+ARCH = ArchDef(
+    name='aarch64',
+    word_bytes=8,
+    stack_align=16,
+    syscall_numbers=SYSCALL_NUMBERS,
+    encoders=ENCODERS,
+    start_stub=aa_start_stub,
+)
diff --git a/p1/gen/amd64.py b/p1/gen/amd64.py
@@ -0,0 +1,608 @@
+from common import (
+    AddI,
+    ArchDef,
+    BranchReg,
+    CondB,
+    CondBZ,
+    Enter,
+    La,
+    LaBr,
+    LdArg,
+    Li,
+    LogI,
+    Mem,
+    Mov,
+    Nullary,
+    Rrr,
+    ShiftI,
+    byte,
+    le32,
+    round_up,
+)
+
+
+# ---- Native register numbers --------------------------------------------
+#
+# Backend-private mapping from P1 register names to native amd64 regnums.
+# `br` is the hidden branch-target reg (r15). `scratch` is the per-expansion
+# scratch reg (r9). rax/rbp are also used internally (retaddr spill, rcx /
+# rdx save slots) and are not P1-visible.
+
+NAT = {
+    'a0': 7,   # rdi
+    'a1': 6,   # rsi
+    'a2': 2,   # rdx
+    'a3': 1,   # rcx
+    't0': 10,  # r10
+    't1': 11,  # r11
+    't2': 8,   # r8
+    's0': 3,   # rbx
+    's1': 12,  # r12
+    's2': 13,  # r13
+    's3': 14,  # r14
+    'sp': 4,   # rsp
+    'br': 15,  # r15
+    'scratch': 9,  # r9
+    'rax': 0,
+    'rcx': 1,
+    'rdx': 2,
+    'rbx': 3,
+    'rsp': 4,
+    'rbp': 5,
+    'rsi': 6,
+    'rdi': 7,
+    'r8': 8,
+    'r9': 9,
+    'r10': 10,
+    'r11': 11,
+    'r12': 12,
+    'r13': 13,
+    'r14': 14,
+    'r15': 15,
+}
+
+
+SYSCALL_NUMBERS = {
+    'SYS_READ': 0,
+    'SYS_WRITE': 1,
+    'SYS_CLOSE': 3,
+    'SYS_OPENAT': 257,
+    'SYS_EXIT': 60,
+    'SYS_CLONE': 56,
+    'SYS_EXECVE': 59,
+    'SYS_WAITID': 247,
+}
+
+
+# ---- REX / ModRM helpers ------------------------------------------------
+
+def amd_rex_b_short(r):
+    # Optional one-byte REX.B (no W) prefix used by push/pop/jmp r/call r/
+    # mov r,imm32 when the target reg is r8-r15. Returns '' for low regs.
+    if NAT[r] >= 8:
+        return byte(0x41)
+    return ''
+
+
+def amd_rex_wb(r):
+    # REX.W=1, B=(r>>3) to extend ModRM.rm / SIB.base.
+    return byte(0x48 | ((NAT[r] >> 3) & 1))
+
+
+def amd_rex_wrb(rg, rm):
+    # REX.W=1, R=(rg>>3), B=(rm>>3). Used whenever a ModRM.reg field is in
+    # use together with a ModRM.rm field.
+    return byte(0x48 | (((NAT[rg] >> 3) & 1) << 2) | ((NAT[rm] >> 3) & 1))
+
+
+def amd_modrm_rr(rg, rm):
+    return byte(0xC0 | ((NAT[rg] & 7) << 3) | (NAT[rm] & 7))
+
+
+def amd_modrm_ext_r(ext, rm):
+    return byte(0xC0 | ((ext & 7) << 3) | (NAT[rm] & 7))
+
+
+# ---- Memory-addressing ModRM (+ SIB + disp) ----------------------------
+#
+# [base + disp] with `reg` in ModRM.reg. Bases whose low 3 bits are 100 -
+# rsp and r12 - must go through a SIB byte; all others use the plain
+# encoding. disp selects mod=1 (disp8) when it fits in [-128,127], else
+# mod=2 (disp32). We never emit mod=0 / no-disp; the extra byte is fine.
+
+def amd_modrm_disp(reg, base, disp):
+    use_sib = (NAT[base] & 7) == 4
+    use_disp8 = -128 <= disp <= 127
+    reg_lo = NAT[reg] & 7
+    if use_sib:
+        if use_disp8:
+            return byte(0x44 | (reg_lo << 3)) + byte(0x24) + byte(disp)
+        return byte(0x84 | (reg_lo << 3)) + byte(0x24) + le32(disp)
+    base_lo = NAT[base] & 7
+    if use_disp8:
+        return byte(0x40 | (reg_lo << 3) | base_lo) + byte(disp)
+    return byte(0x80 | (reg_lo << 3) | base_lo) + le32(disp)
+
+
+# ---- Register / arithmetic primitives ----------------------------------
+
+def amd_mov_rr(dst, src):
+    # mov dst, src -- REX.WRB 89 /r (source in ModRM.reg, dest in rm)
+    return amd_rex_wrb(src, dst) + byte(0x89) + amd_modrm_rr(src, dst)
+
+
+def amd_alu_rr(opcode, dst, src):
+    # ADD/SUB/AND/OR/XOR dst, src -- REX.WRB <op> /r (src in reg, dst in rm)
+    return amd_rex_wrb(src, dst) + byte(opcode) + amd_modrm_rr(src, dst)
+
+
+def amd_alu_ri8(ext, dst, imm):
+    # op dst, imm8 -- REX.WB 83 /ext ib
+    return amd_rex_wb(dst) + byte(0x83) + amd_modrm_ext_r(ext, dst) + byte(imm)
+
+
+def amd_alu_ri32(ext, dst, imm):
+    # op dst, imm32 -- REX.WB 81 /ext id
+    return amd_rex_wb(dst) + byte(0x81) + amd_modrm_ext_r(ext, dst) + le32(imm)
+
+
+def amd_shift_ri8(ext, dst, imm):
+    # shift dst, imm8 -- REX.WB C1 /ext ib (SHL=4, SHR=5, SAR=7)
+    return (amd_rex_wb(dst) + byte(0xC1) + amd_modrm_ext_r(ext, dst)
+            + byte(imm & 0x3F))
+
+
+def amd_shift_cl(ext, dst):
+    # shift dst, cl -- REX.WB D3 /ext
+    return amd_rex_wb(dst) + byte(0xD3) + amd_modrm_ext_r(ext, dst)
+
+
+def amd_imul_rr(dst, src):
+    # imul dst, src -- REX.WRB 0F AF /r (dst in reg, src in rm)
+    return (amd_rex_wrb(dst, src) + byte(0x0F) + byte(0xAF)
+            + amd_modrm_rr(dst, src))
+
+
+def amd_idiv_r(src):
+    # idiv src -- REX.WB F7 /7
+    return amd_rex_wb(src) + byte(0xF7) + amd_modrm_ext_r(7, src)
+
+
+def amd_cqo():
+    # cqo -- 48 99 (sign-extend rax into rdx:rax)
+    return byte(0x48) + byte(0x99)
+
+
+def amd_push(r):
+    return amd_rex_b_short(r) + byte(0x50 | (NAT[r] & 7))
+
+
+def amd_pop(r):
+    return amd_rex_b_short(r) + byte(0x58 | (NAT[r] & 7))
+
+
+def amd_mov_imm32_prefix(rd):
+    # mov r32, imm32 -- [REX.B] B8+r (caller appends 4-byte literal).
+    # Result is zero-extended into the full 64-bit register.
+    return amd_rex_b_short(rd) + byte(0xB8 | (NAT[rd] & 7))
+
+
+def amd_mov_imm64_prefix(rd):
+    # mov r64, imm64 -- REX.W[.B] B8+r (caller appends 8-byte literal).
+    return amd_rex_wb(rd) + byte(0xB8 | (NAT[rd] & 7))
+
+
+# ---- Memory ops ---------------------------------------------------------
+
+def amd_mem_LD(rt, rn, off):
+    # mov rT, [rN + off] -- REX.WRB 8B /r modrm-with-disp
+    return amd_rex_wrb(rt, rn) + byte(0x8B) + amd_modrm_disp(rt, rn, off)
+
+
+def amd_mem_ST(rt, rn, off):
+    # mov [rN + off], rT -- REX.WRB 89 /r
+    return amd_rex_wrb(rt, rn) + byte(0x89) + amd_modrm_disp(rt, rn, off)
+
+
+def amd_mem_SB(rt, rn, off):
+    # mov [rN + off], rT8 -- REX.WRB 88 /r (REX.W forces dil/sil/bpl/spl
+    # byte-view encoding when the low byte of those regs is needed).
+    return amd_rex_wrb(rt, rn) + byte(0x88) + amd_modrm_disp(rt, rn, off)
+
+
+def amd_mem_LB(rt, rn, off):
+    # movzx rT, byte ptr [rN + off] -- REX.WRB 0F B6 /r
+    return (amd_rex_wrb(rt, rn) + byte(0x0F) + byte(0xB6)
+            + amd_modrm_disp(rt, rn, off))
+
+
+# ---- Control-flow primitives --------------------------------------------
+
+def amd_jmp_r(r):
+    # jmp r/m64 -- [REX.B] FF /4. 2 bytes for low regs, 3 bytes for r8-r15.
+    return amd_rex_b_short(r) + byte(0xFF) + byte(0xE0 | (NAT[r] & 7))
+
+
+def amd_call_r(r):
+    # call r/m64 -- [REX.B] FF /2.
+    return amd_rex_b_short(r) + byte(0xFF) + byte(0xD0 | (NAT[r] & 7))
+
+
+def amd_ret():
+    return byte(0xC3)
+
+
+def amd_syscall():
+    return byte(0x0F) + byte(0x05)
+
+
+def amd_cmp_rr(ra, rb):
+    # cmp rA, rB -- REX.WRB 39 /r (rB in reg, rA in rm).
+    return amd_rex_wrb(rb, ra) + byte(0x39) + amd_modrm_rr(rb, ra)
+
+
+def amd_test_rr(ra, rb):
+    return amd_rex_wrb(rb, ra) + byte(0x85) + amd_modrm_rr(rb, ra)
+
+
+# ---- P1 register-register op lowering -----------------------------------
+#
+# For ADD/SUB/AND/OR/XOR we honor rD==rB aliasing -- the naive
+# `mov rD,rA ; op rD,rB` would clobber rB before the op reads it. Route rB
+# through the scratch reg when that aliasing shows up.
+
+ALU_OPCODE = {
+    'ADD': 0x01,
+    'SUB': 0x29,
+    'AND': 0x21,
+    'OR': 0x09,
+    'XOR': 0x31,
+}
+
+
+def amd_rrr_simple(opcode, rd, ra, rb):
+    if NAT[rd] == NAT[rb]:
+        return (amd_mov_rr('scratch', rb)
+                + amd_mov_rr(rd, ra)
+                + amd_alu_rr(opcode, rd, 'scratch'))
+    return amd_mov_rr(rd, ra) + amd_alu_rr(opcode, rd, rb)
+
+
+def amd_rrr_MUL(rd, ra, rb):
+    if NAT[rd] == NAT[rb]:
+        return (amd_mov_rr('scratch', rb)
+                + amd_mov_rr(rd, ra)
+                + amd_imul_rr(rd, 'scratch'))
+    return amd_mov_rr(rd, ra) + amd_imul_rr(rd, rb)
+
+
+# DIV / REM clobber rax and rdx natively. rax is not a P1 register, so we
+# clobber it freely; rdx IS P1 a2, so we stash it to rbp (also outside the
+# P1 mapping) for the lifetime of the op. Aliasing-safety plan mirrors the
+# M1pp comments verbatim.
+
+def amd_rrr_DIV(rd, ra, rb):
+    return ''.join([
+        amd_mov_rr('rbp', 'rdx'),
+        amd_mov_rr('scratch', rb),
+        amd_mov_rr('rax', ra),
+        amd_cqo(),
+        amd_idiv_r('scratch'),
+        amd_mov_rr('rdx', 'rbp'),
+        amd_mov_rr(rd, 'rax'),
+    ])
+
+
+def amd_rrr_REM(rd, ra, rb):
+    return ''.join([
+        amd_mov_rr('rbp', 'rdx'),
+        amd_mov_rr('scratch', rb),
+        amd_mov_rr('rax', ra),
+        amd_cqo(),
+        amd_idiv_r('scratch'),
+        amd_mov_rr('rax', 'rdx'),
+        amd_mov_rr('rdx', 'rbp'),
+        amd_mov_rr(rd, 'rax'),
+    ])
+
+
+# SHL / SHR / SAR with reg count. x86 reads the count from CL only, so
+# staging goes through rcx -- which IS P1 a3. Save rcx to rbp for the
+# duration. Ordering matches the M1pp comments.
+
+def amd_rrr_shift(ext, rd, ra, rb):
+    return ''.join([
+        amd_mov_rr('rbp', 'rcx'),
+        amd_mov_rr('scratch', ra),
+        amd_mov_rr('rcx', rb),
+        amd_shift_cl(ext, 'scratch'),
+        amd_mov_rr('rcx', 'rbp'),
+        amd_mov_rr(rd, 'scratch'),
+    ])
+
+
+# ---- Encoders -----------------------------------------------------------
+
+def encode_li(_arch, row):
+    return amd_mov_imm64_prefix(row.rd)
+
+
+def encode_la(_arch, row):
+    return amd_mov_imm32_prefix(row.rd)
+
+
+def encode_labr(_arch, _row):
+    return amd_mov_imm32_prefix('br')
+
+
+def encode_mov(_arch, row):
+    # Portable sp is the frame-local base, which is 16 bytes above native
+    # rsp. Reading sp into a register yields native_rsp + 16, so emit
+    # `mov rd, rsp ; add rd, 16` for the sp-source case.
+    if row.rs == 'sp':
+        return amd_mov_rr(row.rd, 'sp') + amd_alu_ri8(0, row.rd, 16)
+    return amd_mov_rr(row.rd, row.rs)
+
+
+def encode_rrr(_arch, row):
+    if row.op == 'MUL':
+        return amd_rrr_MUL(row.rd, row.ra, row.rb)
+    if row.op == 'DIV':
+        return amd_rrr_DIV(row.rd, row.ra, row.rb)
+    if row.op == 'REM':
+        return amd_rrr_REM(row.rd, row.ra, row.rb)
+    if row.op == 'SHL':
+        return amd_rrr_shift(4, row.rd, row.ra, row.rb)
+    if row.op == 'SHR':
+        return amd_rrr_shift(5, row.rd, row.ra, row.rb)
+    if row.op == 'SAR':
+        return amd_rrr_shift(7, row.rd, row.ra, row.rb)
+    return amd_rrr_simple(ALU_OPCODE[row.op], row.rd, row.ra, row.rb)
+
+
+def encode_addi(_arch, row):
+    head = amd_mov_rr(row.rd, row.ra)
+    if -128 <= row.imm <= 127:
+        return head + amd_alu_ri8(0, row.rd, row.imm)
+    return head + amd_alu_ri32(0, row.rd, row.imm)
+
+
+# AND/OR with imm: 83 /ext ib sign-extends imm8 to 64 bits. That works for
+# imm in [-128, 127] (and for -1 as a convenient all-ones mask), but breaks
+# for positive imms >= 128 -- ANDI with 255 would become AND with
+# 0xFFFFFFFFFFFFFFFF. Widen to the imm32 form when imm8 would misencode.
+LOGI_EXT = {
+    'ANDI': 4,
+    'ORI': 1,
+}
+
+
+def encode_logi(_arch, row):
+    head = amd_mov_rr(row.rd, row.ra)
+    ext = LOGI_EXT[row.op]
+    if -128 <= row.imm <= 127:
+        return head + amd_alu_ri8(ext, row.rd, row.imm)
+    return head + amd_alu_ri32(ext, row.rd, row.imm)
+
+
+SHIFTI_EXT = {
+    'SHLI': 4,
+    'SHRI': 5,
+    'SARI': 7,
+}
+
+
+def encode_shifti(_arch, row):
+    return (amd_mov_rr(row.rd, row.ra)
+            + amd_shift_ri8(SHIFTI_EXT[row.op], row.rd, row.imm))
+
+
+def encode_mem(_arch, row):
+    # Portable sp points to the frame-local base; the 16-byte hidden frame
+    # header sits at native_rsp+0..15 and is not portable-addressable.
+    # Shift sp-relative offsets past the header.
+    off = row.off + 16 if row.rn == 'sp' else row.off
+    if row.op == 'LD':
+        return amd_mem_LD(row.rt, row.rn, off)
+    if row.op == 'ST':
+        return amd_mem_ST(row.rt, row.rn, off)
+    if row.op == 'LB':
+        return amd_mem_LB(row.rt, row.rn, off)
+    if row.op == 'SB':
+        return amd_mem_SB(row.rt, row.rn, off)
+    raise ValueError(f'unknown mem op: {row.op}')
+
+
+def encode_ldarg(_arch, row):
+    # Internal callers bypass the +16 sp-base translation: native rsp+8
+    # holds the saved caller-sp pointer set up by p1_enter, and the first
+    # incoming stack-arg word lives 16 bytes past that.
+    return (amd_mem_LD('scratch', 'sp', 8)
+            + amd_mem_LD(row.rd, 'scratch', 16 + 8 * row.slot))
+
+
+def amd_epilogue_prefix():
+    # Frame-teardown prefix shared by ERET, TAIL, TAILR. Loads retaddr into
+    # scratch (r9), saved caller sp into rax, unwinds rsp, then re-pushes
+    # retaddr so a trailing `ret` or `jmp` finds the right top-of-stack
+    # layout. (For TAIL/TAILR the trailing op is a jmp, but the retaddr
+    # still needs to be back on the stack so the eventual callee `ret`
+    # returns to the original caller.)
+    return ''.join([
+        amd_mem_LD('scratch', 'sp', 0),
+        amd_mem_LD('rax', 'sp', 8),
+        amd_mov_rr('sp', 'rax'),
+        amd_push('scratch'),
+    ])
+
+
+def encode_branch_reg(_arch, row):
+    if row.kind == 'BR':
+        return amd_jmp_r(row.rs)
+    if row.kind == 'CALLR':
+        return amd_call_r(row.rs)
+    if row.kind == 'TAILR':
+        return amd_epilogue_prefix() + amd_jmp_r(row.rs)
+    raise ValueError(f'unknown branch-reg kind: {row.kind}')
+
+
+# Conditional-branch lowering:
+#   cmp / test
+#   Jcc_inverse +3   -- skip the 3-byte `jmp r15`
+#   jmp r15          -- P1 branch-taken path
+#
+# Invert codes: BEQ->JNE(75), BNE->JE(74), BLT->JGE(7D), BLTU->JAE(73),
+# BLTZ->JGE(7D), BEQZ->JNE(75), BNEZ->JE(74). The 0x03 rel8 skips
+# `amd_jmp_r(br)` which is 3 bytes (REX.B 41 + FF + E7).
+CONDB_INVERT = {
+    'BEQ': 0x75,   # JNE
+    'BNE': 0x74,   # JE
+    'BLT': 0x7D,   # JGE
+    'BLTU': 0x73,  # JAE
+}
+
+CONDBZ_INVERT = {
+    'BEQZ': 0x75,  # JNE
+    'BNEZ': 0x74,  # JE
+    'BLTZ': 0x7D,  # JGE
+}
+
+
+def encode_condb(_arch, row):
+    return (amd_cmp_rr(row.ra, row.rb)
+            + byte(CONDB_INVERT[row.op]) + byte(0x03)
+            + amd_jmp_r('br'))
+
+
+def encode_condbz(_arch, row):
+    return (amd_test_rr(row.ra, row.ra)
+            + byte(CONDBZ_INVERT[row.op]) + byte(0x03)
+            + amd_jmp_r('br'))
+
+
+def encode_enter(arch, row):
+    # CALL on amd64 pushed the retaddr, so on entry:
+    #   rsp   = caller_sp - 8
+    #   [rsp] = retaddr
+    #
+    # Standard frame after ENTER:
+    #   [sp + 0]  = retaddr
+    #   [sp + 8]  = saved caller_sp
+    #   [sp + 16 .. 16 + size - 1] = portable locals
+    #   total frame = round_up(stack_align, 16 + size)
+    frame_bytes = round_up(arch.stack_align, 2 * arch.word_bytes + row.size)
+    return ''.join([
+        amd_pop('scratch'),
+        amd_mov_rr('rax', 'sp'),
+        amd_alu_ri32(5, 'sp', frame_bytes),
+        amd_mem_ST('scratch', 'sp', 0),
+        amd_mem_ST('rax', 'sp', 8),
+    ])
+
+
+def encode_nullary(_arch, row):
+    if row.kind == 'B':
+        return amd_jmp_r('br')
+    if row.kind == 'CALL':
+        return amd_call_r('br')
+    if row.kind == 'RET':
+        return amd_ret()
+    if row.kind == 'ERET':
+        return amd_epilogue_prefix() + amd_ret()
+    if row.kind == 'TAIL':
+        return amd_epilogue_prefix() + amd_jmp_r('br')
+    if row.kind == 'SYSCALL':
+        # P1: a0=num, a1..a3,t0,s0,s1 = args 0..5. Linux amd64: rax=num,
+        # rdi/rsi/rdx/r10/r8/r9 = args 0..5, return in rax; syscall also
+        # clobbers rcx and r11.
+        #
+        # Push the P1 registers whose native slots get overwritten or
+        # syscall-clobbered -- rsi (a1), rdx (a2), rcx (a3), r11 (t1),
+        # r8 (t2) -- then shuffle into the native arg slots, issue
+        # syscall, restore, and move the return value (rax) into a0
+        # (rdi). Stack offsets after the 5 pushes: [rsp+0]=r8,
+        # [rsp+8]=r11, [rsp+16]=rcx (a3), [rsp+24]=rdx (a2),
+        # [rsp+32]=rsi (a1).
+        return ''.join([
+            amd_push('rsi'),
+            amd_push('rdx'),
+            amd_push('rcx'),
+            amd_push('r11'),
+            amd_push('r8'),
+            amd_mov_rr('rax', 'rdi'),
+            amd_mem_LD('rdi', 'sp', 32),
+            amd_mem_LD('rsi', 'sp', 24),
+            amd_mem_LD('rdx', 'sp', 16),
+            amd_mov_rr('r8', 'rbx'),
+            amd_mov_rr('r9', 'r12'),
+            amd_syscall(),
+            amd_pop('r8'),
+            amd_pop('r11'),
+            amd_pop('rcx'),
+            amd_pop('rdx'),
+            amd_pop('rsi'),
+            amd_mov_rr('rdi', 'rax'),
+        ])
+    raise ValueError(f'unknown nullary kind: {row.kind}')
+
+
+def amd_start_stub():
+    # Backend-owned :_start stub per docs/P1.md §Program Entry. Linux amd64
+    # puts argc at [rsp] and argv starting at [rsp+8]. Load argc into a0
+    # (rdi), compute &argv[0] into a1 (rsi), call p1_main under the
+    # one-word direct-result convention, then issue sys_exit with
+    # p1_main's return value in a0 (== rdi). Mirrors the `%p1_entry`
+    # macro in p1/P1-amd64.M1pp.
+    #
+    # Raw hex outside DEFINE bodies must be single-quoted so bootstrap M0
+    # treats it as a literal byte run. The bootstrap amd64 M0 has a 256B
+    # token buffer, so each quoted run must stay <= 128 hex chars; we
+    # split into multiple short lines defensively.
+    def q(hex_bytes):
+        return f"'{hex_bytes}'"
+
+    load_argc = amd_mem_LD('a0', 'sp', 0)
+    compute_argv = amd_mov_rr('a1', 'sp') + amd_alu_ri8(0, 'a1', 8)
+    labr_prefix = amd_mov_imm32_prefix('br')
+    call_main = amd_call_r('br')
+    # mov eax, 60 ; syscall. P1 a0 (rdi) already holds p1_main's return.
+    sys_exit = byte(0xB8) + le32(60) + amd_syscall()
+
+    return [
+        ':_start',
+        q(load_argc),
+        q(compute_argv),
+        q(labr_prefix),
+        '&p1_main',
+        q(call_main),
+        q(sys_exit),
+    ]
+
+
+ENCODERS = {
+    Li: encode_li,
+    La: encode_la,
+    LaBr: encode_labr,
+    Mov: encode_mov,
+    Rrr: encode_rrr,
+    AddI: encode_addi,
+    LogI: encode_logi,
+    ShiftI: encode_shifti,
+    Mem: encode_mem,
+    LdArg: encode_ldarg,
+    Nullary: encode_nullary,
+    BranchReg: encode_branch_reg,
+    CondB: encode_condb,
+    CondBZ: encode_condbz,
+    Enter: encode_enter,
+}
+
+
+ARCH = ArchDef(
+    name='amd64',
+    word_bytes=8,
+    stack_align=16,
+    syscall_numbers=SYSCALL_NUMBERS,
+    encoders=ENCODERS,
+    start_stub=amd_start_stub,
+)
diff --git a/p1/gen/common.py b/p1/gen/common.py
@@ -0,0 +1,49 @@
+from collections import namedtuple
+
+
+ArchDef = namedtuple(
+    'ArchDef',
+    'name word_bytes stack_align syscall_numbers encoders start_stub',
+)
+
+Banner = namedtuple('Banner', 'text')
+Literal = namedtuple('Literal', 'name hex_by_arch')
+Nullary = namedtuple('Nullary', 'name kind')
+Li = namedtuple('Li', 'name rd')
+La = namedtuple('La', 'name rd')
+LaBr = namedtuple('LaBr', 'name')
+Mov = namedtuple('Mov', 'name rd rs')
+Rrr = namedtuple('Rrr', 'name op rd ra rb')
+AddI = namedtuple('AddI', 'name rd ra imm')
+LogI = namedtuple('LogI', 'name op rd ra imm')
+ShiftI = namedtuple('ShiftI', 'name op rd ra imm')
+Mem = namedtuple('Mem', 'name op rt rn off')
+LdArg = namedtuple('LdArg', 'name rd slot')
+BranchReg = namedtuple('BranchReg', 'name kind rs')
+CondB = namedtuple('CondB', 'name op ra rb')
+CondBZ = namedtuple('CondBZ', 'name op ra')
+Enter = namedtuple('Enter', 'name size')
+
+
+def byte(n):
+    return f'{n & 0xFF:02X}'
+
+
+def le32(n):
+    return (n & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
+
+
+def le64(n):
+    return (n & 0xFFFFFFFFFFFFFFFF).to_bytes(8, 'little').hex().upper()
+
+
+def word_hex(word_bytes, n):
+    if word_bytes == 4:
+        return le32(n)
+    if word_bytes == 8:
+        return le64(n)
+    raise ValueError(f'unsupported word size: {word_bytes}')
+
+
+def round_up(align, n):
+    return ((n + align - 1) // align) * align
diff --git a/p1/gen/p1_gen.py b/p1/gen/p1_gen.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+"""Generate P1 v2 DEFINE tables.
+
+This is a fresh generator for docs/P1v2.md. The ISA surface is described by
+plain namedtuple rows, and each backend registers a simple row-type -> encoder
+mapping. The emitted immediate/offset domains are still curated tables rather
+than the full theoretical spec space, so extending coverage is a one-line data
+edit instead of an architecture rewrite.
+
+Usage:
+    python3 p1/gen/p1_gen.py [--arch ARCH] [build-root]
+    python3 p1/gen/p1_gen.py --check [--arch ARCH] [build-root]
+    python3 p1/gen/p1_gen.py --list-archs
+"""
+
+import os
+import sys
+from itertools import product
+
+from common import (
+    AddI,
+    Banner,
+    BranchReg,
+    CondB,
+    CondBZ,
+    Enter,
+    La,
+    LaBr,
+    LdArg,
+    Li,
+    Literal,
+    LogI,
+    Mem,
+    Mov,
+    Nullary,
+    Rrr,
+    ShiftI,
+    word_hex,
+)
+
+import aarch64
+import amd64
+import riscv64
+
+ARCHES = {a.name: a for a in (aarch64.ARCH, amd64.ARCH, riscv64.ARCH)}
+
+
+P1_GPRS = ('a0', 'a1', 'a2', 'a3', 't0', 't1', 't2', 's0', 's1', 's2', 's3')
+P1_BASES = P1_GPRS + ('sp',)
+
+RRR_OPS = ('ADD', 'SUB', 'AND', 'OR', 'XOR', 'SHL', 'SHR', 'SAR', 'MUL', 'DIV', 'REM')
+LOGI_OPS = ('ANDI', 'ORI')
+SHIFT_OPS = ('SHLI', 'SHRI', 'SARI')
+MEM_OPS = ('LD', 'ST', 'LB', 'SB')
+CONDB_OPS = ('BEQ', 'BNE', 'BLT', 'BLTU')
+CONDBZ_OPS = ('BEQZ', 'BNEZ', 'BLTZ')
+
+ADDI_IMMS = (
+    -2048, -1024, -256, -128, -64, -48, -32, -24, -16, -12, -8, -7, -6,
+    -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 16, 24, 32, 40,
+    48, 63, 64, 127, 128, 255, 256, 512, 1024, 2047,
+)
+
+LOGI_IMMS = (
+    -1, 0, 1, 2, 3, 4, 6, 7, 8, 15, 16, 31, 32, 63, 64, 127, 255, 511, 1023,
+    2047,
+)
+
+SHIFT_IMMS = tuple(range(64))
+
+MEM_OFFS = (
+    -256, -128, -64, -48, -32, -24, -16, -8, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8,
+    15, 16, 24, 32, 40, 48, 56, 64, 128, 255,
+)
+
+LDARG_SLOTS = tuple(range(32))
+ENTER_SIZES = tuple(range(0, 129))
+
+
+HEADER = """## p1_{arch}.M1 — GENERATED by p1/gen/p1_gen.py. Do not edit by hand.
+##
+## This table targets the P1 v2 ISA described in docs/P1v2.md.
+## Row shapes are shared; per-arch lowering lives in p1/gen/<arch>.py.
+"""
+
+
+def imm_suffix(imm):
+    return f'NEG{-imm}' if imm < 0 else str(imm)
+
+
+def rows(arch):
+    out = []
+
+    out.append(Banner('Materialization'))
+    for rd in P1_GPRS:
+        out.append(Li(name=f'LI_{rd.upper()}', rd=rd))
+    for rd in P1_GPRS:
+        out.append(La(name=f'LA_{rd.upper()}', rd=rd))
+    out.append(LaBr(name='LA_BR'))
+
+    out.append(Banner('Moves'))
+    for rd, rs in product(P1_GPRS, P1_GPRS):
+        out.append(Mov(name=f'MOV_{rd.upper()}_{rs.upper()}', rd=rd, rs=rs))
+    for rd in P1_GPRS:
+        out.append(Mov(name=f'MOV_{rd.upper()}_SP', rd=rd, rs='sp'))
+
+    out.append(Banner('Register Arithmetic'))
+    for op, rd, ra, rb in product(RRR_OPS, P1_GPRS, P1_GPRS, P1_GPRS):
+        out.append(Rrr(name=f'{op}_{rd.upper()}_{ra.upper()}_{rb.upper()}',
+                       op=op, rd=rd, ra=ra, rb=rb))
+
+    out.append(Banner('Immediate Arithmetic'))
+    for rd, ra, imm in product(P1_GPRS, P1_GPRS, ADDI_IMMS):
+        out.append(AddI(name=f'ADDI_{rd.upper()}_{ra.upper()}_{imm_suffix(imm)}',
+                        rd=rd, ra=ra, imm=imm))
+    for op, rd, ra, imm in product(LOGI_OPS, P1_GPRS, P1_GPRS, LOGI_IMMS):
+        out.append(LogI(name=f'{op}_{rd.upper()}_{ra.upper()}_{imm_suffix(imm)}',
+                        op=op, rd=rd, ra=ra, imm=imm))
+    for op, rd, ra, imm in product(SHIFT_OPS, P1_GPRS, P1_GPRS, SHIFT_IMMS):
+        out.append(ShiftI(name=f'{op}_{rd.upper()}_{ra.upper()}_{imm}',
+                          op=op, rd=rd, ra=ra, imm=imm))
+
+    out.append(Banner('Memory'))
+    for op, rt, rn, off in product(MEM_OPS, P1_GPRS, P1_BASES, MEM_OFFS):
+        out.append(Mem(name=f'{op}_{rt.upper()}_{rn.upper()}_{imm_suffix(off)}',
+                       op=op, rt=rt, rn=rn, off=off))
+
+    out.append(Banner('ABI Access'))
+    for rd, slot in product(P1_GPRS, LDARG_SLOTS):
+        out.append(LdArg(name=f'LDARG_{rd.upper()}_{slot}', rd=rd, slot=slot))
+
+    out.append(Banner('Branches'))
+    out.append(Nullary(name='B', kind='B'))
+    for rs in P1_GPRS:
+        out.append(BranchReg(name=f'BR_{rs.upper()}', kind='BR', rs=rs))
+    for op, ra, rb in product(CONDB_OPS, P1_GPRS, P1_GPRS):
+        out.append(CondB(name=f'{op}_{ra.upper()}_{rb.upper()}', op=op, ra=ra, rb=rb))
+    for op, ra in product(CONDBZ_OPS, P1_GPRS):
+        out.append(CondBZ(name=f'{op}_{ra.upper()}', op=op, ra=ra))
+
+    out.append(Banner('Calls And Returns'))
+    out.append(Nullary(name='CALL', kind='CALL'))
+    out.append(Nullary(name='RET', kind='RET'))
+    out.append(Nullary(name='ERET', kind='ERET'))
+    out.append(Nullary(name='TAIL', kind='TAIL'))
+    for rs in P1_GPRS:
+        out.append(BranchReg(name=f'CALLR_{rs.upper()}', kind='CALLR', rs=rs))
+    for rs in P1_GPRS:
+        out.append(BranchReg(name=f'TAILR_{rs.upper()}', kind='TAILR', rs=rs))
+
+    out.append(Banner('Frame Management'))
+    for size in ENTER_SIZES:
+        out.append(Enter(name=f'ENTER_{size}', size=size))
+
+    out.append(Banner('System'))
+    out.append(Nullary(name='SYSCALL', kind='SYSCALL'))
+    for name, number in sorted(arch.syscall_numbers.items()):
+        out.append(Literal(name=name, hex_by_arch={arch.name: word_hex(arch.word_bytes, number)}))
+
+    return out
+
+
+def lower_name(name):
+    low = name.lower()
+    head, sep, rest = low.partition('_')
+    if not sep:
+        return low
+    if '_' not in rest:
+        return low
+    return f'{head}_{rest.replace("_", ",")}'
+
+
+def encode_row(arch, row):
+    if isinstance(row, Literal):
+        return row.hex_by_arch[arch.name]
+    encoder = arch.encoders[type(row)]
+    return encoder(arch, row)
+
+
+def emit(arch_name):
+    arch = ARCHES[arch_name]
+    out = [HEADER.format(arch=arch.name).rstrip(), '']
+    seen = set()
+    for row in rows(arch):
+        if isinstance(row, Banner):
+            out.append('')
+            out.append(f'## ---- {row.text}')
+            continue
+        name = lower_name(row.name)
+        if name in seen:
+            raise RuntimeError(f'duplicate DEFINE: {name}')
+        seen.add(name)
+        out.append(f'DEFINE {name} {encode_row(arch, row)}')
+    out.append('')
+    out.append('## ---- Program Entry')
+    out.append('## Backend-owned :_start stub per docs/P1.md §Program Entry.')
+    out.append('## Calls p1_main under the one-word direct-result convention')
+    out.append("## (a0=argc, a1=argv) and sys_exits its return value.")
+    out.extend(arch.start_stub())
+    out.append('')
+    return '\n'.join(out)
+
+
+def parse_args(argv):
+    check = False
+    archs = []
+    positional = []
+    i = 0
+    while i < len(argv):
+        arg = argv[i]
+        if arg == '--check':
+            check = True
+        elif arg == '--list-archs':
+            print('\n'.join(sorted(ARCHES)))
+            sys.exit(0)
+        elif arg == '--arch':
+            i += 1
+            if i >= len(argv):
+                raise SystemExit('--arch requires a value')
+            archs.append(argv[i])
+        else:
+            positional.append(arg)
+        i += 1
+    build_root = positional[0] if positional else os.path.join('build', 'p1v2')
+    if not archs:
+        archs = list(sorted(ARCHES))
+    return check, archs, build_root
+
+
+def main(argv=None):
+    check, archs, build_root = parse_args(argv or sys.argv[1:])
+    had_diff = False
+
+    for arch_name in archs:
+        arch = ARCHES[arch_name]
+        dest_dir = os.path.join(build_root, arch.name)
+        path = os.path.join(dest_dir, f'p1_{arch.name}.M1')
+        content = emit(arch.name)
+        if check:
+            try:
+                with open(path) as f:
+                    existing = f.read()
+            except FileNotFoundError:
+                existing = ''
+            if existing != content:
+                sys.stderr.write(f'DIFF: {path}\n')
+                had_diff = True
+            continue
+        os.makedirs(dest_dir, exist_ok=True)
+        with open(path, 'w') as f:
+            f.write(content)
+        print(f'wrote {path} ({len(content)} bytes)')
+
+    if check and had_diff:
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/p1/gen/riscv64.py b/p1/gen/riscv64.py
@@ -0,0 +1,396 @@
+from common import (
+    AddI,
+    ArchDef,
+    BranchReg,
+    CondB,
+    CondBZ,
+    Enter,
+    La,
+    LaBr,
+    LdArg,
+    Li,
+    LogI,
+    Mem,
+    Mov,
+    Nullary,
+    Rrr,
+    ShiftI,
+    le32,
+    round_up,
+)
+
+
+NAT = {
+    'a0': 10,
+    'a1': 11,
+    'a2': 12,
+    'a3': 13,
+    'a4': 14,
+    'a5': 15,
+    'a6': 16,
+    'a7': 17,
+    't0': 5,
+    't1': 6,
+    't2': 7,
+    's0': 9,
+    's1': 18,
+    's2': 19,
+    's3': 20,
+    'sp': 2,
+    'zero': 0,
+    'ra': 1,
+    'fp': 8,
+    'br': 31,
+    'scratch': 30,
+    'save0': 29,
+    'save1': 28,
+    'save2': 16,
+}
+
+
+RRR_BASE = {
+    'ADD': 0x00000033,
+    'SUB': 0x40000033,
+    'AND': 0x00007033,
+    'OR': 0x00006033,
+    'XOR': 0x00004033,
+    'SHL': 0x00001033,
+    'SHR': 0x00005033,
+    'SAR': 0x40005033,
+    'MUL': 0x02000033,
+    'DIV': 0x02004033,
+    'REM': 0x02006033,
+}
+
+
+# Inverted-condition B-type opcodes for the skip-taken-over-jalr pattern:
+# the skip fires when the P1 condition is FALSE, so the jalr below is the
+# taken target.
+CONDB_INV_BASE = {
+    'BEQ': 0x00001063,   # native BNE -- skip when not equal
+    'BNE': 0x00000063,   # native BEQ -- skip when equal
+    'BLT': 0x00005063,   # native BGE -- skip when ra >= rb (signed)
+    'BLTU': 0x00007063,  # native BGEU -- skip when ra >= rb (unsigned)
+}
+
+
+CONDBZ_INV_BASE = {
+    'BEQZ': 0x00001063,
+    'BNEZ': 0x00000063,
+    'BLTZ': 0x00005063,
+}
+
+
+SYSCALL_NUMBERS = {
+    'SYS_READ': 63,
+    'SYS_WRITE': 64,
+    'SYS_CLOSE': 57,
+    'SYS_OPENAT': 56,
+    'SYS_EXIT': 93,
+    'SYS_CLONE': 220,
+    'SYS_EXECVE': 221,
+    'SYS_WAITID': 95,
+}
+
+
+def rv_r_type(base, rd, ra, rb):
+    d = NAT[rd]
+    a = NAT[ra]
+    b = NAT[rb]
+    return le32(base | (b << 20) | (a << 15) | (d << 7))
+
+
+def rv_i_type(base, rd, ra, imm12):
+    d = NAT[rd]
+    a = NAT[ra]
+    return le32(base | ((imm12 & 0xFFF) << 20) | (a << 15) | (d << 7))
+
+
+def rv_s_type(base, rs, ra, imm12):
+    s = NAT[rs]
+    a = NAT[ra]
+    imm = imm12 & 0xFFF
+    # arithmetic-shift the 12-bit signed value: bits 11:5 -> [31:25],
+    # bits 4:0 -> [11:7]. We only need the unsigned 12-bit pattern here
+    # because the m1pp encoder uses (>> imm 5) on the masked value.
+    hi = (imm >> 5) & 0x7F
+    lo = imm & 0x1F
+    return le32(base | (hi << 25) | (s << 20) | (a << 15) | (lo << 7))
+
+
+def rv_b_type_skip8(base, ra, rb):
+    # Hardcoded +8 branch: imm = 8, encoded with imm[4:1]=4, imm[11]=0,
+    # imm[10:5]=0, imm[12]=0. The combined [11:7] field becomes
+    # (imm[4:1] << 1) | imm[11] = 8.
+    a = NAT[ra]
+    b = NAT[rb]
+    return le32(base | (b << 20) | (a << 15) | (8 << 7))
+
+
+def rv_addi(rd, ra, imm12):
+    return rv_i_type(0x00000013, rd, ra, imm12)
+
+
+def rv_ld(rd, ra, imm12):
+    return rv_i_type(0x00003003, rd, ra, imm12)
+
+
+def rv_sd(rs, ra, imm12):
+    return rv_s_type(0x00003023, rs, ra, imm12)
+
+
+def rv_lbu(rd, ra, imm12):
+    return rv_i_type(0x00004003, rd, ra, imm12)
+
+
+def rv_sb(rs, ra, imm12):
+    return rv_s_type(0x00000023, rs, ra, imm12)
+
+
+def rv_lwu(rd, ra, imm12):
+    return rv_i_type(0x00006003, rd, ra, imm12)
+
+
+def rv_mov_rr(dst, src):
+    return rv_addi(dst, src, 0)
+
+
+def rv_slli(rd, ra, shamt):
+    d = NAT[rd]
+    a = NAT[ra]
+    return le32(0x00001013 | ((shamt & 0x3F) << 20) | (a << 15) | (d << 7))
+
+
+def rv_srli(rd, ra, shamt):
+    d = NAT[rd]
+    a = NAT[ra]
+    return le32(0x00005013 | ((shamt & 0x3F) << 20) | (a << 15) | (d << 7))
+
+
+def rv_srai(rd, ra, shamt):
+    d = NAT[rd]
+    a = NAT[ra]
+    return le32(0x40005013 | ((shamt & 0x3F) << 20) | (a << 15) | (d << 7))
+
+
+def rv_jalr(rd, rs, imm12):
+    d = NAT[rd]
+    s = NAT[rs]
+    return le32(0x00000067 | ((imm12 & 0xFFF) << 20) | (s << 15) | (d << 7))
+
+
+def rv_ecall():
+    return le32(0x00000073)
+
+
+def rv_lit64_prefix(rd):
+    # auipc rd, 0 ; ld rd, 12(rd) ; jal x0, +12.
+    # The 8 bytes that follow in source become the literal.
+    d = NAT[rd]
+    auipc = 0x00000017 | (d << 7)
+    ld = 0x00C03003 | (d << 15) | (d << 7)
+    jal = 0x00C0006F
+    return le32(auipc) + le32(ld) + le32(jal)
+
+
+def rv_lit32_prefix(rd):
+    # auipc rd, 0 ; lwu rd, 12(rd) ; jal x0, +8.
+    # lwu zero-extends a 4-byte literal; enough for stage0 addresses.
+    d = NAT[rd]
+    auipc = 0x00000017 | (d << 7)
+    lwu = 0x00C06003 | (d << 15) | (d << 7)
+    jal = 0x0080006F
+    return le32(auipc) + le32(lwu) + le32(jal)
+
+
+def rv_epilogue():
+    # Frame teardown shared by ERET, TAIL, TAILR. Mirrors p1_eret/p1_tail
+    # in P1-riscv64.M1pp: load saved ra, load saved caller sp into fp,
+    # then move fp into sp. The caller appends the actual jalr.
+    return rv_ld('ra', 'sp', 0) + rv_ld('fp', 'sp', 8) + rv_mov_rr('sp', 'fp')
+
+
+def encode_li(_arch, row):
+    return rv_lit64_prefix(row.rd)
+
+
+def encode_la(_arch, row):
+    return rv_lit32_prefix(row.rd)
+
+
+def encode_labr(_arch, _row):
+    return rv_lit32_prefix('br')
+
+
+def encode_mov(_arch, row):
+    # Portable sp is the frame-local base, which sits 16 bytes above
+    # native sp (the backend's 2-word hidden header occupies the low
+    # end of each frame). MOV rd, sp must therefore yield native_sp+16.
+    if row.rs == 'sp':
+        return rv_addi(row.rd, 'sp', 16)
+    return rv_mov_rr(row.rd, row.rs)
+
+
+def encode_rrr(_arch, row):
+    return rv_r_type(RRR_BASE[row.op], row.rd, row.ra, row.rb)
+
+
+def encode_addi(_arch, row):
+    return rv_addi(row.rd, row.ra, row.imm)
+
+
+def encode_logi(_arch, row):
+    base = {
+        'ANDI': 0x00007013,
+        'ORI': 0x00006013,
+    }[row.op]
+    return rv_i_type(base, row.rd, row.ra, row.imm)
+
+
+def encode_shifti(_arch, row):
+    if row.op == 'SHLI':
+        return rv_slli(row.rd, row.ra, row.imm)
+    if row.op == 'SHRI':
+        return rv_srli(row.rd, row.ra, row.imm)
+    if row.op == 'SARI':
+        return rv_srai(row.rd, row.ra, row.imm)
+    raise ValueError(f'unknown shift op: {row.op}')
+
+
+def encode_mem(_arch, row):
+    # Portable sp points to the frame-local base; the 2-word hidden header
+    # at native_sp+0/+8 is not portable-addressable. Shift sp-relative
+    # offsets past the header.
+    off = row.off + 16 if row.rn == 'sp' else row.off
+    if row.op == 'LD':
+        return rv_ld(row.rt, row.rn, off)
+    if row.op == 'ST':
+        return rv_sd(row.rt, row.rn, off)
+    if row.op == 'LB':
+        return rv_lbu(row.rt, row.rn, off)
+    if row.op == 'SB':
+        return rv_sb(row.rt, row.rn, off)
+    raise ValueError(f'unknown mem op: {row.op}')
+
+
+def encode_ldarg(_arch, row):
+    # LDARG loads the saved caller sp from [sp+8] (the hidden header
+    # slot), then indexes the incoming stack-arg area off it. Slot 0 is
+    # at caller_sp+16 because the native call instruction does not push
+    # a return address on riscv64 -- the +16 matches the aarch64 layout
+    # by convention for stage0 frame uniformity.
+    return rv_ld('scratch', 'sp', 8) + rv_ld(row.rd, 'scratch', 16 + 8 * row.slot)
+
+
+def encode_branch_reg(_arch, row):
+    if row.kind == 'BR':
+        return rv_jalr('zero', row.rs, 0)
+    if row.kind == 'CALLR':
+        return rv_jalr('ra', row.rs, 0)
+    if row.kind == 'TAILR':
+        return rv_epilogue() + rv_jalr('zero', row.rs, 0)
+    raise ValueError(f'unknown branch-reg kind: {row.kind}')
+
+
+def encode_condb(_arch, row):
+    return rv_b_type_skip8(CONDB_INV_BASE[row.op], row.ra, row.rb) + rv_jalr('zero', 'br', 0)
+
+
+def encode_condbz(_arch, row):
+    return rv_b_type_skip8(CONDBZ_INV_BASE[row.op], row.ra, 'zero') + rv_jalr('zero', 'br', 0)
+
+
+def encode_enter(arch, row):
+    frame_bytes = round_up(arch.stack_align, 2 * arch.word_bytes + row.size)
+    return (
+        rv_addi('sp', 'sp', -frame_bytes)
+        + rv_sd('ra', 'sp', 0)
+        + rv_addi('fp', 'sp', frame_bytes)
+        + rv_sd('fp', 'sp', 8)
+    )
+
+
+def encode_nullary(_arch, row):
+    if row.kind == 'B':
+        return rv_jalr('zero', 'br', 0)
+    if row.kind == 'CALL':
+        return rv_jalr('ra', 'br', 0)
+    if row.kind == 'RET':
+        return rv_jalr('zero', 'ra', 0)
+    if row.kind == 'ERET':
+        return rv_epilogue() + rv_jalr('zero', 'ra', 0)
+    if row.kind == 'TAIL':
+        return rv_epilogue() + rv_jalr('zero', 'br', 0)
+    if row.kind == 'SYSCALL':
+        # P1: a0=number, a1..a3,t0,s0,s1 = args 0..5.
+        # Linux riscv64: a7=number, a0..a5 = args 0..5, return in a0.
+        # SYSCALL clobbers only P1 a0; restore a1/a2/a3 after ecall.
+        return ''.join([
+            rv_mov_rr('save0', 'a1'),
+            rv_mov_rr('save1', 'a2'),
+            rv_mov_rr('save2', 'a3'),
+            rv_mov_rr('a7', 'a0'),
+            rv_mov_rr('a0', 'save0'),
+            rv_mov_rr('a1', 'save1'),
+            rv_mov_rr('a2', 'save2'),
+            rv_mov_rr('a3', 't0'),
+            rv_mov_rr('a4', 's0'),
+            rv_mov_rr('a5', 's1'),
+            rv_ecall(),
+            rv_mov_rr('a1', 'save0'),
+            rv_mov_rr('a2', 'save1'),
+            rv_mov_rr('a3', 'save2'),
+        ])
+    raise ValueError(f'unknown nullary kind: {row.kind}')
+
+
+def rv_start_stub():
+    # Backend-owned :_start stub per docs/P1.md §Program Entry. Linux
+    # riscv64 puts argc at [sp] and argv starting at [sp+8]; load argc
+    # into a0, compute &argv[0] into a1, call p1_main under the one-word
+    # direct-result convention, then issue sys_exit. Mirrors %p1_entry
+    # in p1/P1-riscv64.M1pp.
+    #
+    # Raw hex outside DEFINE bodies must be single-quoted so bootstrap
+    # M0 treats it as a literal byte run rather than a token.
+    def q(hex_bytes):
+        return f"'{hex_bytes}'"
+    return [
+        ':_start',
+        q(rv_ld('a0', 'sp', 0)),
+        q(rv_addi('a1', 'sp', 8)),
+        q(rv_lit32_prefix('br')),
+        '&p1_main',
+        q(rv_jalr('ra', 'br', 0)),
+        q(rv_addi('a7', 'zero', 93)),
+        q(rv_ecall()),
+    ]
+
+
+ENCODERS = {
+    Li: encode_li,
+    La: encode_la,
+    LaBr: encode_labr,
+    Mov: encode_mov,
+    Rrr: encode_rrr,
+    AddI: encode_addi,
+    LogI: encode_logi,
+    ShiftI: encode_shifti,
+    Mem: encode_mem,
+    LdArg: encode_ldarg,
+    Nullary: encode_nullary,
+    BranchReg: encode_branch_reg,
+    CondB: encode_condb,
+    CondBZ: encode_condbz,
+    Enter: encode_enter,
+}
+
+
+ARCH = ArchDef(
+    name='riscv64',
+    word_bytes=8,
+    stack_align=16,
+    syscall_numbers=SYSCALL_NUMBERS,
+    encoders=ENCODERS,
+    start_stub=rv_start_stub,
+)
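For a sense of how the pieces compose, here is a hand-run sketch of lowering a
single row (names are from the files above; dispatch mirrors p1_gen.encode_row,
and the hex shown was worked out by hand from the aarch64 encoder, so treat it
as illustrative rather than authoritative):

    from common import Mov
    import aarch64

    arch = aarch64.ARCH
    row = Mov(name='MOV_A0_A1', rd='a0', rs='a1')
    # Look up the encoder by row type, exactly as encode_row does.
    hex_bytes = arch.encoders[type(row)](arch, row)
    # Should yield 'E00301AA' -- little-endian hex for `orr x0, xzr, x1`,
    # the aarch64 register-move idiom emitted by aa_mov_rr.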