commit f5dbc20f05f0180d5dc05a8d6f31d2746ee65d33
parent 30406eb9364d5d247ef5f49b1993874b0d35b1d4
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Tue, 21 Apr 2026 06:52:29 -0700
P1 gen: per-arch encoder classes + Cartesian DEFINE expansion
Refactor p1_gen.py from per-op encode(arch) methods full of if/elif
arch switches to three Encoder subclasses (AA64, AMD64, RV64) bundling
each arch's encoding helpers. Thin Op dataclasses double-dispatch
through enc.<method>(fields), so adding an arch is a new class instead
of edits scattered across every Op.
Expand every non-RRR op to the full register Cartesian product and
enumerate RRR from a small curated table. Canonical imm/offset/shamt
sets cover all values used by lisp.M1, hello.M1, demo.M1, and
kaem-minimal.M1; no more per-op tuple lists to edit when new code
uses a legal combo.
Fixes along the way:
* aa64 mem(): positive-but-unaligned offsets fall through to the
unscaled ldur/stur form (needed for lisp LD at +7).
* amd64 adds amd_alu_ri32 for ADDI imms outside imm8 range.
* drops the tranche-12 MOV_R1_R2 skip hack — Cartesian emits each
(rD,rA) exactly once.
Output grows ~18x (9.8KB -> 180KB per arch) but is generated and
gitignored. hello, demo, and lisp all build and run on aarch64,
amd64, and riscv64 unchanged.
Diffstat:
| M | p1_gen.py | | | 1410 | +++++++++++++++++++++++++++++++++---------------------------------------------- |
1 file changed, 595 insertions(+), 815 deletions(-)
diff --git a/p1_gen.py b/p1_gen.py
@@ -1,39 +1,45 @@
#!/usr/bin/env python3
-"""p1_gen.py — generate p1_<arch>.M1 from a shared op table.
+"""p1_gen.py — generate p1_<arch>.M1 from a per-arch encoder table.
Single source of truth for the P1 DEFINE tables across all three target
-arches. Replaces the hand-written p1_<arch>.M1 files; running this script
-rewrites all three in place.
-
-Why a generator: the hand-written defs diverge across arches in
-hard-to-review ways, typos silently produce SIGILL'ing binaries (M1
-passes undefined tokens through as literal text — see `lint.sh` for
-the pre-assemble check), and the combinatorial surface (~1200 DEFINEs
-per arch if fully enumerated) is past the point of hand-maintainability.
-
-Design:
- * OPS is a list of emission rows. Each row is a small class whose
- `encode(arch) -> hex_string` method knows how to lower itself to
- that arch's native bytes.
- * Per-arch encoders live next to the Op classes. Adding a new op
- means adding one Op subclass with three encode methods.
- * Row ordering controls the output order in the .M1 file; tranches
- are grouped by banner comments.
+arches. Running this script rewrites p1_aarch64.M1, p1_amd64.M1, and
+p1_riscv64.M1 in place.
+
+Structure:
+ * Low-level native encoders (amd_*, aa_*, rv_*) — one bank of
+ helpers per arch.
+ * Encoder classes AA64/AMD64/RV64 (subclasses of Encoder): one
+ method per P1 op category, lowering (op, reg-tuple, imm) into
+ native hex. Each arch's encoder is a coherent bundle — adding a
+ new op means one new method on each of the three.
+ * Op dataclasses — thin rows holding the DEFINE's name + data.
+ Op.encode(enc) dispatches into enc.<op-method>() with the Op's
+ fields unpacked. No per-arch branching lives in Op classes.
+ * rows() — builds the output list. Non-RRR ops are emitted as the
+ full register product × a curated imm/offset/shamt set. RRR
+ keeps an explicit table (the full 11-op × 8³ cube is 5.6k
+ entries per arch, >99% dead weight). Adding a new RRR triple or
+ a new imm value is a one-line edit to the tables feeding rows();
+ a new register combination for any other op needs no edit at all.
+ * emit(arch) / main — iterate rows, ask the arch's encoder to
+ lower each, write out the defs file.
Running:
$ python3 p1_gen.py # rewrite all three files
$ python3 p1_gen.py --check # diff against current files
-
-Output files: p1_aarch64.M1, p1_amd64.M1, p1_riscv64.M1.
"""
import os
import sys
from dataclasses import dataclass
+from itertools import product
from typing import Optional
ARCHES = ('aarch64', 'amd64', 'riscv64')
+## P1 GPRs (the 8 caller/callee-split registers exposed to P1 source).
+P1_REGS = ('r0', 'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7')
+
## ---------- Register mappings --------------------------------------------
## P1 register name → native encoding number. The native numbers are what
## the per-arch encoders insert into instruction fields; the human-facing
@@ -112,6 +118,14 @@ def amd_alu_ri8(ext, dst, imm):
d = NAT_AMD64[dst]
return rex(1, 0, 0, d >> 3) + '83' + modrm(3, ext, d) + byte(imm)
+def amd_alu_ri32(ext, dst, imm):
+ """op dst, imm32 (sign-extended). Opcode 81 /ext id. Used when
+ an immediate doesn't fit in the imm8 form (e.g., ADDI with
+ values outside [-128, 127])."""
+ d = NAT_AMD64[dst]
+ imm_le = (imm & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
+ return rex(1, 0, 0, d >> 3) + '81' + modrm(3, ext, d) + imm_le
+
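+## Worked example (per the DIV/REM comments below, P1 r0 = rax,
+## native 0): amd_alu_ri32(0, 'r0', 0x1000) == '4881C000100000',
+## i.e. REX.W 48, opcode 81, ModRM C0 (mod=3, /0=ADD, rm=rax),
+## then imm32 little-endian 00 10 00 00.
+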
def amd_shift_ri8(ext, dst, imm):
"""shl/shr/sar dst, imm8. Opcode C1 /ext ib."""
d = NAT_AMD64[dst]
@@ -215,7 +229,9 @@ def aa_ldst_uimm12(base, rT, rN, off_bytes, size_log2):
return le32(base | (imm12 << 10) | (n << 5) | t)
def aa_ldst_unscaled(base, rT, rN, off):
- """LDUR/STUR (unscaled, signed imm9)."""
+ """LDUR/STUR (unscaled, signed imm9). Handles arbitrary small
+ offsets — negative, or positive-but-not-a-multiple-of-the-access-
+ size (e.g. LD at offset 7). imm9 range is [-256, 255]."""
assert -256 <= off <= 255
imm9 = off & 0x1FF
t, n = NAT_AA64[rT], NAT_AA64[rN]
@@ -247,20 +263,7 @@ def rv_shift_imm(base, rD, rA, shamt):
return le32(base | ((shamt & 0x3F) << 20) | (a << 15) | (d << 7))
-## ---------- Ops ---------------------------------------------------------
-## Each class represents a row in the output. `name` is the DEFINE
-## name (without the P1_ prefix that gets added automatically).
-
-@dataclass
-class Op:
- name: str
- comment: str = ''
-
- def encode(self, arch: str) -> str:
- raise NotImplementedError
-
-## --- Reg-reg-reg arith ---
-## Per-arch base opcodes. Dict-of-dicts: BASES[op][arch] = base_value.
+## ---------- Per-arch op base tables -------------------------------------
AA64_RRR_BASE = {
'ADD': 0x8B000000,
@@ -277,413 +280,496 @@ AMD64_RRR_OPC = {
'ADD': '01', 'SUB': '29', 'AND': '21', 'OR': '09', 'XOR': '31',
}
RV_RRR = {
- 'ADD': (0x00000033,), # funct7=0 funct3=0 opcode=0x33
- 'SUB': (0x40000033,),
- 'XOR': (0x00004033,),
- 'OR': (0x00006033,),
- 'AND': (0x00007033,),
- 'SHL': (0x00001033,),
- 'SHR': (0x00005033,),
- 'SAR': (0x40005033,),
- 'MUL': (0x02000033,),
- 'DIV': (0x02004033,),
- 'REM': (0x02006033,),
+ 'ADD': 0x00000033, # funct7=0 funct3=0 opcode=0x33
+ 'SUB': 0x40000033,
+ 'XOR': 0x00004033,
+ 'OR': 0x00006033,
+ 'AND': 0x00007033,
+ 'SHL': 0x00001033,
+ 'SHR': 0x00005033,
+ 'SAR': 0x40005033,
+ 'MUL': 0x02000033,
+ 'DIV': 0x02004033,
+ 'REM': 0x02006033,
+}
+
+
+## aarch64 bitmask-immediate encoding for ANDI/ORI. Entries are the
+## (N, immr, imms) triples that encode each small imm as an aarch64
+## "logical immediate." Computed by hand because the full encoding
+## algorithm (contiguous-run + rotation for element sizes
+## 2/4/8/16/32/64) is substantial and we only need a handful of
+## values. Extend this table if a new imm shows up in P1 source.
+AA64_LOGI_ENC = {
+ 1: (1, 0, 0), # 0b0001 — single bit at position 0
+ 2: (1, 63, 0), # 0b0010 — single bit at position 1
+ 3: (1, 0, 1), # 0b0011 — 2 contiguous ones
+ 4: (1, 62, 0), # 0b0100 — single bit at position 2
+ 6: (1, 63, 1), # 0b0110 — 2 ones rotated by 1
+ 7: (1, 0, 2), # 0b0111 — 3 contiguous ones
+ 8: (1, 61, 0), # 0b1000 — single bit at position 3
}
+
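+## Cross-check (an illustrative sketch, not used by the emitter):
+## an N=1 entry decodes as (imms+1) contiguous ones rotated right
+## by immr within the 64-bit element.
+def _logi_decode64(immr, imms):
+ ones = (1 << (imms + 1)) - 1
+ return ((ones >> immr) | (ones << (64 - immr))) & ((1 << 64) - 1)
+
+assert all(_logi_decode64(immr, imms) == v
+ for v, (_n, immr, imms) in AA64_LOGI_ENC.items())
+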
+## Frame layout after PROLOGUE_Nk (k >= 1, rounded up so total frame
+## bytes stay 16-byte aligned on aarch64):
+## [sp + 0] = retaddr (aarch64 lr / riscv64 ra / amd64 retaddr)
+## [sp + 8] = slot 1 (callee-private scratch)
+## [sp + 16] = slot 2
+## ...
+## [sp + 8*k] = slot k
+##
+## Frame size = round_up_to_16(8 + 8*k). So k=1 → 16, k=2 → 24 → 32,
+## k=3 → 32, k=4 → 40 → 48.
+
+def prologue_frame_bytes(k: int) -> int:
+ raw = 8 + 8 * k
+ return (raw + 15) & ~15
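+
+## Cheap import-time self-check of the rounding described above:
+assert [prologue_frame_bytes(k) for k in (1, 2, 3, 4)] == [16, 32, 32, 48]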
+
+
+## ---------- Encoders ----------------------------------------------------
+## One class per arch. Each provides one method per P1 op category,
+## mapping (op, reg-tuple, imm) to native bytes. Op classes dispatch
+## here via `Op.encode(enc)` → `enc.<method>(fields)`.
+
+class Encoder:
+ """Per-arch encoder base. Subclasses implement one method per
+ op category. `arch` is used by literal() to pick the right
+ pre-encoded bytes from an arch-keyed dict."""
+ arch = ''
+
+ def literal(self, hex_by_arch):
+ return hex_by_arch[self.arch]
+
+
+class AA64(Encoder):
+ arch = 'aarch64'
+
+ def rrr(self, op, rD, rA, rB):
+ if op == 'MUL':
+ # MUL = MADD with Ra=xzr. 100 11011 000 mmmmm 0 aaaaa nnnnn ddddd
+ d = NAT_AA64[rD]; a = NAT_AA64[rA]; b = NAT_AA64[rB]
+ return le32(0x9B000000 | (b << 16) | (31 << 10) | (a << 5) | d)
+ if op == 'REM':
+ # SDIV x16, xA, xB ; MSUB xD, x16, xB, xA.
+ # x16 (ARM IP0, caller-saved) is no longer a P1 reg (r4
+ # moved to callee-saved x26), so using it as scratch can't
+ # hidden-clobber P1 r4 — the op modifies rD only.
+ # MSUB needs bit 15 set (o0=1); without it it decodes as
+ # MADD and REM returns A + (A/B)*B.
+ d = NAT_AA64[rD]; a = NAT_AA64[rA]; b = NAT_AA64[rB]
+ SC = 16
+ sdiv = 0x9AC00C00 | (b << 16) | (a << 5) | SC
+ msub = 0x9B008000 | (b << 16) | (a << 10) | (SC << 5) | d
+ return le32(sdiv) + le32(msub)
+ return aa_rrr(AA64_RRR_BASE[op], rD, rA, rB)
+
+ def addi(self, rD, rA, imm):
+ if imm >= 0:
+ return aa_add_imm(rD, rA, imm, sub=False)
+ return aa_add_imm(rD, rA, -imm, sub=True)
+
+ def logi(self, op, rD, rA, imm):
+ N, immr, imms = AA64_LOGI_ENC[imm]
+ base = 0x92000000 if op == 'ANDI' else 0xB2000000 # ORI = orr
+ return aa_logical_imm(base, rD, rA, N, immr, imms)
+
+ def shifti(self, op, rD, rA, imm):
+ if op == 'SHLI':
+ return aa_ubfm(rD, rA, (-imm) & 63, 63 - imm)
+ if op == 'SHRI':
+ return aa_ubfm(rD, rA, imm, 63)
+ if op == 'SARI':
+ return aa_sbfm(rD, rA, imm, 63)
+
+ def mov(self, rD, rA):
+ if rA == 'sp':
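+ # (Encoding 31 in ORR's register slots means xzr, not sp,
+ # so a read from sp has to go through ADD xD, sp, #0.)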
+ return aa_add_imm(rD, 'sp', 0, sub=False)
+ # MOV xD, xA = ORR xD, xzr, xA
+ d = NAT_AA64[rD]; a = NAT_AA64[rA]
+ return le32(0xAA000000 | (a << 16) | (31 << 5) | d)
+
+ def li(self, rD):
+ # ldr wD, [pc+8] ; b +8 (caller emits 4 bytes of data next)
+ d = NAT_AA64[rD]
+ ldr_w_lit = 0x18000040 | d # LDR (literal) 32-bit, offset 8
+ b_plus8 = 0x14000002 # B offset 8 (imm26 = 2 words = 8 bytes)
+ return le32(ldr_w_lit) + le32(b_plus8)
+
+ def la(self, rD):
+ return self.li(rD)
+
+ def mem(self, op, rT, rN, off):
+ # Pick uimm12 (scaled, large range) when the offset is a
+ # non-negative multiple of the access width; otherwise fall
+ # back to the unscaled signed-imm9 form (covers negative
+ # offsets and positive-but-misaligned ones like 7).
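+ # E.g. LD at off=7 takes the LDUR path (7 % 8 != 0); LD at
+ # off=8 takes the scaled path with imm12 = 1.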
+ BASES = {
+ 'LD': (0xF9400000, 3, 0xF8400000),
+ 'ST': (0xF9000000, 3, 0xF8000000),
+ 'LB': (0x39400000, 0, 0x38400000),
+ 'SB': (0x39000000, 0, 0x38000000),
+ }
+ uimm_base, size_log2, unscaled_base = BASES[op]
+ scale = 1 << size_log2
+ if off >= 0 and (off % scale) == 0:
+ return aa_ldst_uimm12(uimm_base, rT, rN, off, size_log2)
+ return aa_ldst_unscaled(unscaled_base, rT, rN, off)
+
+ def b(self):
+ return le32(0xD61F0000 | (NAT_AA64['br'] << 5)) # BR x17
+
+ def condb(self, op, rA, rB):
+ # cmp xA, xB = SUBS xzr, xA, xB (0xEB000000 base, rD=31).
+ # b.<inv> +8 skips the BR when the P1 cond fails.
+ # BEQ→NE(1), BNE→EQ(0), BLT→GE(A).
+ a = NAT_AA64[rA]; b_ = NAT_AA64[rB]
+ cmp_ = le32(0xEB000000 | (b_ << 16) | (a << 5) | 31)
+ cond = {'BEQ': 1, 'BNE': 0, 'BLT': 10}[op]
+ bcond = le32(0x54000040 | cond)
+ br = le32(0xD61F0000 | (NAT_AA64['br'] << 5))
+ return cmp_ + bcond + br
+
+ def call(self):
+ return le32(0xD63F0000 | (NAT_AA64['br'] << 5)) # BLR x17
+
+ def ret(self):
+ return le32(0xD65F03C0) # RET (= br x30)
+
+ def prologue(self, k):
+ fb = prologue_frame_bytes(k)
+ sub = aa_add_imm('sp', 'sp', fb, sub=True)
+ str_lr = aa_ldst_uimm12(0xF9000000, 'lr', 'sp', 0, 3)
+ return sub + str_lr
+
+ def epilogue(self, k):
+ fb = prologue_frame_bytes(k)
+ ldr_lr = aa_ldst_uimm12(0xF9400000, 'lr', 'sp', 0, 3)
+ add = aa_add_imm('sp', 'sp', fb, sub=False)
+ return ldr_lr + add
+
+ def tail(self, k):
+ return self.epilogue(k) + self.b()
+
+
+class AMD64(Encoder):
+ arch = 'amd64'
+
+ def rrr(self, op, rD, rA, rB):
+ if op == 'MUL':
+ return amd_mov_rr(rD, rA) + amd_imul_rr(rD, rB)
+ if op in ('DIV', 'REM'):
+ # x86 idiv implicitly reads/writes rax (P1 r0) and rdx
+ # (P1 r3). To keep DIV/REM clobber-free (only rD changes),
+ # stash r0 into r11 and r3 into rcx — neither is a P1 reg —
+ # then restore. If rA or rB alias r0/r3, read from the
+ # saved copy since we've overwritten the originals.
+ # Skip the final restore for whichever of r0/r3 *is* rD,
+ # so rD keeps its newly computed value.
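+ # (E.g. REM with rD='r3': the remainder is already in
+ # rdx/r3, and the r3 restore is skipped so it survives;
+ # only r0 gets restored.)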
+ seq = amd_mov_rr('r11', 'r0') # save r0 (rax)
+ seq += amd_mov_rr('rcx', 'r3') # save r3 (rdx)
+ src_a = 'r11' if rA == 'r0' else ('rcx' if rA == 'r3' else rA)
+ seq += amd_mov_rr('r0', src_a) # rax = rA
+ seq += amd_cqo() # rdx:rax = sign-ext rax
+ src_b = 'r11' if rB == 'r0' else ('rcx' if rB == 'r3' else rB)
+ seq += amd_idiv(src_b)
+ seq += amd_mov_rr(rD, 'r0' if op == 'DIV' else 'r3')
+ if rD != 'r3':
+ seq += amd_mov_rr('r3', 'rcx')
+ if rD != 'r0':
+ seq += amd_mov_rr('r0', 'r11')
+ return seq
+ if op in ('SHL', 'SHR', 'SAR'):
+ ext = {'SHL': 4, 'SHR': 5, 'SAR': 7}[op]
+ seq = amd_mov_rr(rD, rA)
+ seq += amd_mov_rr('rcx', rB)
+ seq += amd_shift_cl(ext, rD)
+ return seq
+ # ADD/SUB/AND/OR/XOR: mov rD,rA ; op rD,rB
+ seq = amd_mov_rr(rD, rA)
+ seq += amd_alu_rr(AMD64_RRR_OPC[op], rD, rB)
+ return seq
+
+ def addi(self, rD, rA, imm):
+ # mov rD,rA ; add rD,imm. Use imm8 form when it fits
+ # ([-128, 127]); otherwise emit the imm32 form.
+ seq = amd_mov_rr(rD, rA)
+ if -128 <= imm <= 127:
+ seq += amd_alu_ri8(0, rD, imm) # /0 = ADD
+ else:
+ seq += amd_alu_ri32(0, rD, imm)
+ return seq
+
+ def logi(self, op, rD, rA, imm):
+ ext = {'ANDI': 4, 'ORI': 1}[op]
+ seq = amd_mov_rr(rD, rA)
+ seq += amd_alu_ri8(ext, rD, imm)
+ return seq
+
+ def shifti(self, op, rD, rA, imm):
+ ext = {'SHLI': 4, 'SHRI': 5, 'SARI': 7}[op]
+ seq = amd_mov_rr(rD, rA)
+ seq += amd_shift_ri8(ext, rD, imm)
+ return seq
+
+ def mov(self, rD, rA):
+ return amd_mov_rr(rD, rA)
+
+ def li(self, rD):
+ # mov <rD as r32>, imm32 — opcode B8+r (with REX.B if r8..r15)
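+ # The 4 imm bytes are the caller-emitted LI operand that
+ # follows this opcode in the instruction stream.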
+ d = NAT_AMD64[rD]
+ if d >= 8:
+ return '41' + byte(0xB8 + (d & 7))
+ return byte(0xB8 + d)
+
+ def la(self, rD):
+ return self.li(rD)
+
+ def mem(self, op, rT, rN, off):
+ if op == 'LD': return amd_mem_rm('8B', rT, rN, off)
+ if op == 'ST': return amd_mem_rm('89', rT, rN, off)
+ if op == 'LB': return amd_mov_rm_b(rT, rN, off, store=False)
+ if op == 'SB': return amd_mov_rm_b(rT, rN, off, store=True)
+
+ def b(self):
+ return '41FFE3' # jmp r11
+
+ def condb(self, op, rA, rB):
+ a, b_ = NAT_AMD64[rA], NAT_AMD64[rB]
+ # cmp rA, rB — opcode 39 /r with rA as r/m
+ cmp_ = rex(1, b_ >> 3, 0, a >> 3) + '39' + modrm(3, b_, a)
+ # jcc rel8 opcode, skip=3 (past jmp r11):
+ # BEQ→JNE 75 03 ; BNE→JE 74 03 ; BLT→JGE 7D 03
+ jop = {'BEQ': '75', 'BNE': '74', 'BLT': '7D'}[op]
+ return cmp_ + jop + '03' + '41FFE3' # jmp r11
+
+ def call(self):
+ return '41FFD3' # call r11
+
+ def ret(self):
+ return 'C3'
+
+ def prologue(self, k):
+ # pop rcx ; sub rsp,fb ; push rcx. rcx is the retaddr-carry
+ # scratch — caller-save, never a P1 reg. r11 (= 'br') is
+ # off-limits because TAIL = EPILOGUE + `jmp r11`, and using
+ # r11 here would clobber the LI_BR-loaded tail target.
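+ # E.g. k=1 (fb=16) assembles to 59 4883EC10 51.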
+ fb = prologue_frame_bytes(k)
+ assert fb <= 127
+ return '59' + '4883EC' + byte(fb) + '51'
+
+ def epilogue(self, k):
+ # Mirror of prologue: pop rcx ; add rsp,fb ; push rcx.
+ fb = prologue_frame_bytes(k)
+ assert fb <= 127
+ return '59' + '4883C4' + byte(fb) + '51'
+
+ def tail(self, k):
+ return self.epilogue(k) + self.b()
+
+
+class RV64(Encoder):
+ arch = 'riscv64'
+
+ def rrr(self, op, rD, rA, rB):
+ return rv_r(RV_RRR[op], rD, rA, rB)
+
+ def addi(self, rD, rA, imm):
+ return rv_i(0x00000013, rD, rA, imm)
+
+ def logi(self, op, rD, rA, imm):
+ base = {'ANDI': 0x00007013, 'ORI': 0x00006013}[op]
+ return rv_i(base, rD, rA, imm)
+
+ def shifti(self, op, rD, rA, imm):
+ base = {'SHLI': 0x00001013, 'SHRI': 0x00005013, 'SARI': 0x40005013}[op]
+ return rv_shift_imm(base, rD, rA, imm)
+
+ def mov(self, rD, rA):
+ return rv_i(0x00000013, rD, rA, 0) # addi rD, rA, 0
+
+ def li(self, rD):
+ # auipc rD,0 ; lwu rD,12(rD) ; jal x0,+8
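+ # The caller's 4-byte operand lands at pc+12, exactly where
+ # the lwu reads; the jal then hops over it.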
+ d = NAT_RV64[rD]
+ auipc = 0x00000017 | (d << 7)
+ lwu = 0x00006003 | (d << 7) | (d << 15) | (12 << 20)
+ jal_p8 = 0x0080006F
+ return le32(auipc) + le32(lwu) + le32(jal_p8)
+
+ def la(self, rD):
+ return self.li(rD)
+
+ def mem(self, op, rT, rN, off):
+ # funct3: LD=3, ST=3, LBU=4, SB=0. Opcodes: load=03, store=23.
+ if op == 'LD': return rv_i(0x00003003, rT, rN, off)
+ if op == 'ST': return rv_s(0x00003023, rT, rN, off)
+ if op == 'LB': return rv_i(0x00004003, rT, rN, off) # LBU
+ if op == 'SB': return rv_s(0x00000023, rT, rN, off)
+
+ def b(self):
+ return le32(0x00000067 | (NAT_RV64['br'] << 15)) # jalr x0, 0(t5)
+
+ def condb(self, op, rA, rB):
+ # B<inv> rA, rB, +8 ; jalr x0, 0(t5). funct3 picks the op:
+ # BEQ→BNE(1), BNE→BEQ(0), BLT→BGE(5).
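+ # +8 in B-type: imm[4:1]=0b0100 sits at inst[11:8], hence 8<<7.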
+ a, b_ = NAT_RV64[rA], NAT_RV64[rB]
+ funct3 = {'BEQ': 1, 'BNE': 0, 'BLT': 5}[op]
+ insn = 0x00000063 | (funct3 << 12) | (a << 15) | (b_ << 20) | (8 << 7)
+ jalr = 0x00000067 | (NAT_RV64['br'] << 15)
+ return le32(insn) + le32(jalr)
+
+ def call(self):
+ return le32(0x000000E7 | (NAT_RV64['br'] << 15)) # jalr ra, 0(t5)
+
+ def ret(self):
+ return le32(0x00008067) # jalr x0, 0(ra)
+
+ def prologue(self, k):
+ fb = prologue_frame_bytes(k)
+ sub = rv_i(0x00000013, 'sp', 'sp', -fb)
+ sd = rv_s(0x00003023, 'ra', 'sp', 0)
+ return sub + sd
+
+ def epilogue(self, k):
+ fb = prologue_frame_bytes(k)
+ ld = rv_i(0x00003003, 'ra', 'sp', 0)
+ add = rv_i(0x00000013, 'sp', 'sp', fb)
+ return ld + add
+
+ def tail(self, k):
+ return self.epilogue(k) + self.b()
+
+
+ENCODERS = {'aarch64': AA64(), 'amd64': AMD64(), 'riscv64': RV64()}
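+
+## e.g. ENCODERS['riscv64'].mov('r1', 'r2') returns the 8 hex chars
+## of a single addi; emit() below drives every row through this.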
+
+
+## ---------- Op dataclasses ----------------------------------------------
+## Thin wrappers: each row holds its DEFINE name + the data needed to
+## reconstruct the encoding. `encode(enc)` calls the matching method
+## on the arch's encoder.
+
+@dataclass
+class Op:
+ name: str
+ comment: str = ''
+
+ def encode(self, enc: Encoder) -> str:
+ raise NotImplementedError
+
@dataclass
class RRR(Op):
op: str = ''
rD: str = ''
rA: str = ''
rB: str = ''
+ def encode(self, enc):
+ return enc.rrr(self.op, self.rD, self.rA, self.rB)
- def encode(self, arch):
- if arch == 'aarch64':
- if self.op == 'MUL':
- # MUL = MADD with Ra=xzr. 100 11011 000 mmmmm 0 aaaaa nnnnn ddddd
- d = NAT_AA64[self.rD]
- a = NAT_AA64[self.rA]
- b = NAT_AA64[self.rB]
- return le32(0x9B000000 | (b << 16) | (31 << 10) | (a << 5) | d)
- if self.op == 'REM':
- # SDIV x16, xA, xB ; MSUB xD, x16, xB, xA.
- # x16 (ARM IP0, caller-saved, not a P1 reg) is scratch so
- # REM does not hidden-clobber P1 r4 — the op modifies rD only.
- # MSUB encoding needs bit 15 set (o0=1); without it the
- # instruction decodes as MADD and REM returns A + (A/B)*B.
- d = NAT_AA64[self.rD]; a = NAT_AA64[self.rA]; b = NAT_AA64[self.rB]
- SC = 16
- sdiv = 0x9AC00C00 | (b << 16) | (a << 5) | SC
- msub = 0x9B008000 | (b << 16) | (a << 10) | (SC << 5) | d
- return le32(sdiv) + le32(msub)
- base = AA64_RRR_BASE[self.op]
- return aa_rrr(base, self.rD, self.rA, self.rB)
-
- if arch == 'amd64':
- if self.op == 'MUL':
- return amd_mov_rr(self.rD, self.rA) + amd_imul_rr(self.rD, self.rB)
- if self.op in ('DIV', 'REM'):
- # x86 idiv implicitly reads/writes rax (P1 r0) and rdx
- # (P1 r3). To keep DIV/REM clobber-free (only rD changes),
- # stash r0 into r11 and r3 into rcx — neither is a P1 reg —
- # then restore. If rA or rB alias r0/r3, read from the
- # saved copy since we've overwritten the originals.
- # Skip the final restore for whichever of r0/r3 *is* rD,
- # so rD keeps its newly computed value.
- seq = amd_mov_rr('r11', 'r0') # save r0 (rax)
- seq += amd_mov_rr('rcx', 'r3') # save r3 (rdx)
- src_a = 'r11' if self.rA == 'r0' else ('rcx' if self.rA == 'r3' else self.rA)
- seq += amd_mov_rr('r0', src_a) # rax = rA
- seq += amd_cqo() # rdx:rax = sign-ext rax
- src_b = 'r11' if self.rB == 'r0' else ('rcx' if self.rB == 'r3' else self.rB)
- seq += amd_idiv(src_b)
- if self.op == 'DIV':
- seq += amd_mov_rr(self.rD, 'r0') # rD = quotient
- else:
- seq += amd_mov_rr(self.rD, 'r3') # rD = remainder
- if self.rD != 'r3':
- seq += amd_mov_rr('r3', 'rcx') # restore r3
- if self.rD != 'r0':
- seq += amd_mov_rr('r0', 'r11') # restore r0
- return seq
- if self.op in ('SHL', 'SHR', 'SAR'):
- ext = {'SHL': 4, 'SHR': 5, 'SAR': 7}[self.op]
- seq = amd_mov_rr(self.rD, self.rA)
- seq += amd_mov_rr('rcx', self.rB)
- seq += amd_shift_cl(ext, self.rD)
- return seq
- # ADD/SUB/AND/OR/XOR: mov rD,rA ; op rD,rB
- seq = amd_mov_rr(self.rD, self.rA)
- seq += amd_alu_rr(AMD64_RRR_OPC[self.op], self.rD, self.rB)
- return seq
-
- if arch == 'riscv64':
- base, = RV_RRR[self.op]
- return rv_r(base, self.rD, self.rA, self.rB)
-
- raise ValueError(arch)
-
-## --- Immediate arith ---
@dataclass
class AddI(Op):
rD: str = ''
rA: str = ''
imm: int = 0
-
- def encode(self, arch):
- if arch == 'aarch64':
- if self.imm >= 0:
- return aa_add_imm(self.rD, self.rA, self.imm, sub=False)
- else:
- return aa_add_imm(self.rD, self.rA, -self.imm, sub=True)
- if arch == 'amd64':
- # mov rD,rA ; add rD,imm8
- seq = amd_mov_rr(self.rD, self.rA)
- seq += amd_alu_ri8(0, self.rD, self.imm) # /0 = ADD
- return seq
- if arch == 'riscv64':
- return rv_i(0x00000013, self.rD, self.rA, self.imm)
+ def encode(self, enc):
+ return enc.addi(self.rD, self.rA, self.imm)
@dataclass
class LogI(Op):
- """Bitwise-imm ops: ANDI, ORI. aarch64 logical-imm encoding is
- pattern-based — caller supplies N/immr/imms. imm is small and
- explicit for clarity."""
- op: str = ''
+ op: str = '' # ANDI / ORI
rD: str = ''
rA: str = ''
imm: int = 0
- aa_N: int = 0
- aa_immr: int = 0
- aa_imms: int = 0
-
- def encode(self, arch):
- if arch == 'aarch64':
- base = 0x92000000 if self.op == 'ANDI' else 0xB2000000 # ORI = orr
- return aa_logical_imm(base, self.rD, self.rA, self.aa_N, self.aa_immr, self.aa_imms)
- if arch == 'amd64':
- ext = {'ANDI': 4, 'ORI': 1}[self.op]
- seq = amd_mov_rr(self.rD, self.rA)
- seq += amd_alu_ri8(ext, self.rD, self.imm)
- return seq
- if arch == 'riscv64':
- base = {'ANDI': 0x00007013, 'ORI': 0x00006013}[self.op]
- return rv_i(base, self.rD, self.rA, self.imm)
+ def encode(self, enc):
+ return enc.logi(self.op, self.rD, self.rA, self.imm)
@dataclass
class ShiftI(Op):
- op: str = '' # SHLI/SHRI/SARI
+ op: str = '' # SHLI / SHRI / SARI
rD: str = ''
rA: str = ''
imm: int = 0
+ def encode(self, enc):
+ return enc.shifti(self.op, self.rD, self.rA, self.imm)
- def encode(self, arch):
- if arch == 'aarch64':
- if self.op == 'SHLI':
- return aa_ubfm(self.rD, self.rA, (-self.imm) & 63, 63 - self.imm)
- if self.op == 'SHRI':
- return aa_ubfm(self.rD, self.rA, self.imm, 63)
- if self.op == 'SARI':
- return aa_sbfm(self.rD, self.rA, self.imm, 63)
- if arch == 'amd64':
- ext = {'SHLI': 4, 'SHRI': 5, 'SARI': 7}[self.op]
- seq = amd_mov_rr(self.rD, self.rA)
- seq += amd_shift_ri8(ext, self.rD, self.imm)
- return seq
- if arch == 'riscv64':
- base = {'SHLI': 0x00001013, 'SHRI': 0x00005013, 'SARI': 0x40005013}[self.op]
- return rv_shift_imm(base, self.rD, self.rA, self.imm)
-
-## --- Moves ---
@dataclass
class Mov(Op):
rD: str = ''
rA: str = ''
+ def encode(self, enc):
+ return enc.mov(self.rD, self.rA)
- def encode(self, arch):
- if arch == 'aarch64':
- if self.rA == 'sp':
- return aa_add_imm(self.rD, 'sp', 0, sub=False)
- # MOV xD, xA = ORR xD, xzr, xA
- d = NAT_AA64[self.rD]; a = NAT_AA64[self.rA]
- return le32(0xAA000000 | (a << 16) | (31 << 5) | d)
- if arch == 'amd64':
- return amd_mov_rr(self.rD, self.rA)
- if arch == 'riscv64':
- return rv_i(0x00000013, self.rD, self.rA, 0) # addi rD, rA, 0
-
-## --- LI (wide literal) ---
@dataclass
class Li(Op):
rD: str = ''
+ def encode(self, enc):
+ return enc.li(self.rD)
- def encode(self, arch):
- if arch == 'aarch64':
- # ldr wD, [pc+8] ; b +8 (caller emits 4 bytes of data next)
- d = NAT_AA64[self.rD]
- ldr_w_lit = 0x18000040 | d # LDR (literal) 32-bit, offset 8: imm19=2 → 0x40 in [23:5]
- b_plus8 = 0x14000002 # B offset 8 (imm26 = 2 words = 8 bytes)
- return le32(ldr_w_lit) + le32(b_plus8)
- if arch == 'amd64':
- # mov <rD as r32>, imm32 — opcode B8+r (with REX.B if r8..r15)
- d = NAT_AMD64[self.rD]
- if d >= 8:
- return '41' + byte(0xB8 + (d & 7))
- return byte(0xB8 + d)
- if arch == 'riscv64':
- # auipc rD,0 ; lwu rD,12(rD) ; jal x0,+8
- d = NAT_RV64[self.rD]
- auipc = 0x00000017 | (d << 7)
- lwu = 0x00006003 | (d << 7) | (d << 15) | (12 << 20)
- jal_p8 = 0x0080006F # jal x0, +8
- return le32(auipc) + le32(lwu) + le32(jal_p8)
-
-## --- LA (address-load) — in the spike, same as LI ---
@dataclass
class La(Op):
rD: str = ''
+ def encode(self, enc):
+ return enc.la(self.rD)
- def encode(self, arch):
- return Li(name=self.name, rD=self.rD).encode(arch)
-
-## --- Memory: LD (64b), ST (64b), LB (8b zero-ext), SB (8b) ---
@dataclass
class Mem(Op):
- op: str = '' # LD/ST/LB/SB
- rT: str = '' # load dest or store src
- rN: str = '' # base
+ op: str = '' # LD / ST / LB / SB
+ rT: str = ''
+ rN: str = ''
off: int = 0
+ def encode(self, enc):
+ return enc.mem(self.op, self.rT, self.rN, self.off)
- def encode(self, arch):
- if arch == 'aarch64':
- if self.op == 'LD':
- if self.off >= 0:
- return aa_ldst_uimm12(0xF9400000, self.rT, self.rN, self.off, 3)
- return aa_ldst_unscaled(0xF8400000, self.rT, self.rN, self.off)
- if self.op == 'ST':
- if self.off >= 0:
- return aa_ldst_uimm12(0xF9000000, self.rT, self.rN, self.off, 3)
- return aa_ldst_unscaled(0xF8000000, self.rT, self.rN, self.off)
- if self.op == 'LB':
- if self.off >= 0:
- return aa_ldst_uimm12(0x39400000, self.rT, self.rN, self.off, 0)
- return aa_ldst_unscaled(0x38400000, self.rT, self.rN, self.off)
- if self.op == 'SB':
- if self.off >= 0:
- return aa_ldst_uimm12(0x39000000, self.rT, self.rN, self.off, 0)
- return aa_ldst_unscaled(0x38000000, self.rT, self.rN, self.off)
- if arch == 'amd64':
- if self.op == 'LD':
- return amd_mem_rm('8B', self.rT, self.rN, self.off)
- if self.op == 'ST':
- return amd_mem_rm('89', self.rT, self.rN, self.off)
- if self.op == 'LB':
- return amd_mov_rm_b(self.rT, self.rN, self.off, store=False)
- if self.op == 'SB':
- return amd_mov_rm_b(self.rT, self.rN, self.off, store=True)
- if arch == 'riscv64':
- # funct3: LD=3, ST=3, LBU=4, SB=0. Opcodes: load=03, store=23.
- if self.op == 'LD':
- return rv_i(0x00003003, self.rT, self.rN, self.off)
- if self.op == 'ST':
- return rv_s(0x00003023, self.rT, self.rN, self.off)
- if self.op == 'LB':
- return rv_i(0x00004003, self.rT, self.rN, self.off) # LBU
- if self.op == 'SB':
- return rv_s(0x00000023, self.rT, self.rN, self.off)
-
-## --- Branches: LI_BR-indirect pattern ---
@dataclass
class B(Op):
- def encode(self, arch):
- if arch == 'aarch64':
- return le32(0xD61F0000 | (NAT_AA64['br'] << 5)) # BR x17
- if arch == 'amd64':
- return '41FFE3' # jmp r11
- if arch == 'riscv64':
- return le32(0x00000067 | (NAT_RV64['br'] << 15)) # jalr x0, 0(t5)
+ def encode(self, enc):
+ return enc.b()
@dataclass
class CondB(Op):
- op: str = '' # BEQ/BNE/BLT
+ op: str = '' # BEQ / BNE / BLT
rA: str = ''
rB: str = ''
+ def encode(self, enc):
+ return enc.condb(self.op, self.rA, self.rB)
- def encode(self, arch):
- if arch == 'aarch64':
- # cmp xA, xB = SUBS xzr, xA, xB (0xEB000000 base, rD=31)
- a = NAT_AA64[self.rA]; b = NAT_AA64[self.rB]
- cmp = le32(0xEB000000 | (b << 16) | (a << 5) | 31)
- # b.cond +8 : opcode 54 00 00 4X where X is the cond
- # Skip when NOT cond holds. BEQ→NE(1), BNE→EQ(0), BLT→GE(A).
- cond = {'BEQ': 1, 'BNE': 0, 'BLT': 10}[self.op]
- bcond = le32(0x54000040 | cond)
- br = le32(0xD61F0000 | (NAT_AA64['br'] << 5))
- return cmp + bcond + br
- if arch == 'amd64':
- a, b = NAT_AMD64[self.rA], NAT_AMD64[self.rB]
- # cmp rA, rB — opcode 39 /r with rA as r/m
- cmp_ = rex(1, b >> 3, 0, a >> 3) + '39' + modrm(3, b, a)
- # jcc rel8 opcode, skip=3 (past jmp r11):
- # BEQ→JNE 75 03 ; BNE→JE 74 03 ; BLT→JGE 7D 03
- jop = {'BEQ': '75', 'BNE': '74', 'BLT': '7D'}[self.op]
- return cmp_ + jop + '03' + '41FFE3' # jmp r11
- if arch == 'riscv64':
- # B<inv> rA, rB, +8 ; jalr x0, 0(t5)
- a, b = NAT_RV64[self.rA], NAT_RV64[self.rB]
- # funct3 picks the op: BEQ→BNE(1), BNE→BEQ(0), BLT→BGE(5).
- funct3 = {'BEQ': 1, 'BNE': 0, 'BLT': 5}[self.op]
- # B-type with imm=+8: imm[11:8]=0b0100 → byte[1] nibble high=4.
- insn = 0x00000063 | (funct3 << 12) | (a << 15) | (b << 20) | (8 << 7)
- jalr = 0x00000067 | (NAT_RV64['br'] << 15) # jalr x0, 0(t5)
- return le32(insn) + le32(jalr)
-
-
-## --- Simple singletons ---
@dataclass
class Literal(Op):
- hex_by_arch: dict = None
-
- def encode(self, arch):
- return self.hex_by_arch[arch]
-
-
-## --- PROLOGUE / EPILOGUE / TAIL — N-slot variants ------------------------
-## Frame layout after PROLOGUE_Nk (k >= 1, rounded up so total frame bytes
-## stay 16-byte aligned on aarch64):
-##
-## [sp + 0] = retaddr (aarch64 lr / riscv64 ra / amd64 retaddr)
-## [sp + 8] = slot 1 (callee-private scratch)
-## [sp + 16] = slot 2
-## [sp + 24] = slot 3
-## ...
-## [sp + 8*k] = slot k
-##
-## Frame size = round_up_to_16(8 + 8*k). So k=1 → 16, k=2 → 24 → 32,
-## k=3 → 32, k=4 → 40 → 48. Keeping the EPILOGUE a strict inverse.
-
-def prologue_frame_bytes(k: int) -> int:
- raw = 8 + 8 * k
- return (raw + 15) & ~15
+ hex_by_arch: Optional[dict] = None
+ def encode(self, enc):
+ return enc.literal(self.hex_by_arch)
@dataclass
class Prologue(Op):
k: int = 1
-
- def encode(self, arch):
- fb = prologue_frame_bytes(self.k)
- if arch == 'aarch64':
- # sub sp, sp, #fb ; str x30, [sp]
- sub = aa_add_imm('sp', 'sp', fb, sub=True)
- str_lr = aa_ldst_uimm12(0xF9000000, 'lr', 'sp', 0, 3)
- return sub + str_lr
- if arch == 'amd64':
- # pop rcx ; sub rsp,fb ; push rcx
- # pop rcx = 59 ; sub rsp,imm8 = 48 83 EC ib (if fb<=127)
- # push rcx = 51
- # rcx is the retaddr-carry scratch. r11 (= P1 'br') is off-limits
- # because TAIL = EPILOGUE + `jmp r11` and using r11 here would
- # clobber the LI_BR-loaded tail target mid-EPILOGUE. rcx is
- # already the shift-count + DIV/REM save scratch — caller-save,
- # never a P1 reg.
- assert fb <= 127
- return '59' + '4883EC' + byte(fb) + '51'
- if arch == 'riscv64':
- # addi sp, sp, -fb ; sd ra, 0(sp)
- sub = rv_i(0x00000013, 'sp', 'sp', -fb)
- sd = rv_s(0x00003023, 'ra', 'sp', 0)
- return sub + sd
+ def encode(self, enc):
+ return enc.prologue(self.k)
@dataclass
class Epilogue(Op):
k: int = 1
-
- def encode(self, arch):
- fb = prologue_frame_bytes(self.k)
- if arch == 'aarch64':
- ldr_lr = aa_ldst_uimm12(0xF9400000, 'lr', 'sp', 0, 3)
- add = aa_add_imm('sp', 'sp', fb, sub=False)
- return ldr_lr + add
- if arch == 'amd64':
- # Mirror of Prologue — pop rcx ; add rsp,fb ; push rcx.
- assert fb <= 127
- return '59' + '4883C4' + byte(fb) + '51'
- if arch == 'riscv64':
- ld = rv_i(0x00003003, 'ra', 'sp', 0)
- add = rv_i(0x00000013, 'sp', 'sp', fb)
- return ld + add
+ def encode(self, enc):
+ return enc.epilogue(self.k)
@dataclass
class Tail(Op):
k: int = 1
-
- def encode(self, arch):
- # Epilogue + unconditional B (= jump through the branch-scratch reg)
- epi = Epilogue(name='', k=self.k).encode(arch)
- br = B(name='').encode(arch)
- return epi + br
+ def encode(self, enc):
+ return enc.tail(self.k)
@dataclass
class Call(Op):
- def encode(self, arch):
- if arch == 'aarch64':
- return le32(0xD63F0000 | (NAT_AA64['br'] << 5)) # BLR x17
- if arch == 'amd64':
- return '41FFD3' # call r11
- if arch == 'riscv64':
- return le32(0x000000E7 | (NAT_RV64['br'] << 15)) # jalr ra, 0(t5)
+ def encode(self, enc):
+ return enc.call()
@dataclass
class Ret(Op):
- def encode(self, arch):
- if arch == 'aarch64':
- return le32(0xD65F03C0) # ret (= br x30)
- if arch == 'amd64':
- return 'C3'
- if arch == 'riscv64':
- return le32(0x00008067) # jalr x0, 0(ra)
-
-## --- SYSCALL / SYSOPEN ---
+ def encode(self, enc):
+ return enc.ret()
+
+
+## ---------- SYSCALL / SYSOPEN pre-encoded sequences ---------------------
+## These are the one-shot syscall wrappers. They shuffle P1's r0=num,
+## r1–r6=args convention into each arch's native syscall ABI and clobber
+## only r0 on return. Encoded by hand (per P1.md §"Syscall conventions").
+
SYSCALL_HEX = {
'aarch64': (
# r4/r5 now live in callee-saved natives (x26/x27), so the
@@ -742,7 +828,7 @@ SYSOPEN_HEX = {
'riscv64': le32(0xF9C00513) + le32(0x03800893) + le32(0x00000073),
}
-## --- Syscall numbers (little-endian 32-bit for LI operand) ---
+## Syscall numbers (little-endian 32-bit for LI operand).
## aarch64 and riscv64 share the asm-generic table (write=64, exit=93,
## clone=220, execve=221, waitid=95). amd64 has its own arch-specific
## numbers. `wait4` is deliberately absent — it's only defined for
@@ -757,11 +843,83 @@ SYS_NUM = {
}
-## ---------- Op table (rows emitted in order) -----------------------------
+## ---------- Canonical imm/offset/shamt sets -----------------------------
+## Enumerated instead of sigil-passed: M1's DEFINE substitutes hex
+## bytes verbatim, so every distinct imm value needs its own DEFINE.
+## These cover every value used across hello/demo/lisp/kaem-minimal
+## plus a little headroom. Extend when a new value appears in P1 src.
+
+## ADDI imms. NEG48/48 handle the ASCII '0' bias; the rest cover tag
+## stripping and loop counters. Full reg product × this set = 8²×N.
+ADDI_IMMS = (-48, -8, -7, -6, -5, -4, -3, -2, -1,
+ 1, 2, 3, 4, 5, 6, 7, 8, 48)
+
+## Shift amounts (for SHLI/SHRI/SARI). 32/52 implement low-N-bit masks
+## (length field extraction; 4096-slot symbol-table index); the small
+## values scale-by-N for byte offsets and fixnum encode/decode.
+SHIFT_IMMS = (1, 2, 3, 5, 16, 32, 52)
+
+## ANDI/ORI imms. Every entry must appear in AA64_LOGI_ENC.
+LOGI_IMMS = (1, 2, 3, 4, 6, 7, 8)
+
+## Memory offsets for LD/ST/LB/SB. 0/8/16/24/32 cover slot offsets in
+## N-slot frames and common struct fields; 7 is the NUL terminator
+## position inside an 8-byte zero-padded slot; -8 reaches one slot
+## below the current base.
+MEM_OFFS = (-8, 0, 7, 8, 16, 24, 32)
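+## (The product yields names like LD_R1_R4_NEG8 and SB_R7_R2_0; the
+## NEG spelling comes from _imm_suf in the row-assembly section.)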
+
+CONDB_OPS = ('BEQ', 'BNE', 'BLT')
+SHIFT_OPS = ('SHLI', 'SHRI', 'SARI')
+LOGI_OPS = ('ANDI', 'ORI')
+MEM_OPS = ('LD', 'ST', 'LB', 'SB')
+
+
+## Curated RRR triples. The full cube is 11 ops × 8³ regs = 5632
+## entries per arch — >99% would be dead weight. Each tuple below
+## is one actually used by hello/demo/lisp/kaem-minimal. Lint
+## catches missing triples on assembly; add a line here and
+## regenerate.
+RRR_TABLE = (
+ # demo/lisp step-1 arith cube
+ ('ADD','r1','r1','r2'), ('ADD','r1','r1','r4'),
+ ('ADD','r2','r2','r6'), ('ADD','r2','r3','r1'),
+ ('SUB','r1','r1','r2'), ('SUB','r2','r2','r6'),
+ ('AND','r1','r1','r5'),
+ ('OR', 'r1','r1','r2'),
+ ('XOR','r1','r1','r2'),
+ ('MUL','r1','r1','r2'),
+ ('DIV','r1','r1','r2'),
+ ('REM','r1','r1','r5'),
+ ('SHL','r1','r1','r2'),
+ ('SHR','r1','r1','r2'),
+ ('SAR','r4','r4','r2'),
+ # alloc / pointer arithmetic
+ ('ADD','r2','r0','r1'),
+ ('ADD','r0','r0','r3'),
+ ('ADD','r2','r2','r0'),
+ ('ADD','r2','r2','r1'),
+ ('SUB','r3','r3','r0'),
+ # reader / display index+offset fold
+ ('ADD','r6','r1','r2'),
+ ('ADD','r6','r6','r0'),
+ ('ADD','r7','r1','r2'),
+ ('SUB','r2','r1','r6'),
+ ('SUB','r3','r1','r6'),
+ ('REM','r1','r1','r2'),
+ # kaem-minimal bump-pointer + accumulator updates
+ ('ADD','r1','r1','r0'),
+ ('ADD','r5','r5','r0'),
+ ('ADD','r7','r7','r0'),
+ ('SUB','r3','r3','r2'),
+ ('SUB','r6','r6','r0'),
+)
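+## (31 triples today vs the 5632-entry full cube: well under 1%.)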
+
+
+## ---------- Row assembly ------------------------------------------------
HEADER = """## p1_{arch}.M1 — GENERATED by p1_gen.py. Do not edit by hand.
##
-## Shared op-table lives in p1_gen.py; each arch's encoders expand
+## Shared op-table lives in p1_gen.py; each arch's encoder lowers
## (op, register-tuple, imm) rows into native bytes. See P1.md for the
## ISA spec and register mapping.
"""
@@ -770,97 +928,70 @@ HEADER = """## p1_{arch}.M1 — GENERATED by p1_gen.py. Do not edit by hand.
class Banner:
text: str
+
+def _imm_suf(imm):
+ return f'NEG{-imm}' if imm < 0 else f'{imm}'
+
+
def rows():
R = []
- # --- LI (wide literal load) ---
- R.append(Banner('LI — load 4-byte zero-extended literal from inline data slot'))
- for rd in ['r0', 'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7']:
+ # --- LI / LA — wide literal and address loads ---
+ R.append(Banner('LI / LA — load 4-byte zero-extended literal or label addr'))
+ for rd in P1_REGS:
R.append(Li(name=f'LI_{rd.upper()}', rD=rd))
# LI_BR loads into the hidden branch-target scratch (x17/r11/t5).
- # Every branch and call site is `LI_BR &target ; P1_B` (or the
- # matching conditional / CALL). The scratch is *not* a P1 reg.
+ # Every branch/call site is `LI_BR &target ; P1_<BR>`. The scratch
+ # is *not* a P1 reg.
R.append(Li(name='LI_BR', rD='br'))
+ for rd in P1_REGS:
+ R.append(La(name=f'LA_{rd.upper()}', rD=rd))
+
+ # --- MOV — register-to-register + MOV rD, sp ---
+ R.append(Banner('MOV — full register product (src may be sp)'))
+ for rd in P1_REGS:
+ for ra in P1_REGS:
+ R.append(Mov(name=f'MOV_{rd.upper()}_{ra.upper()}', rD=rd, rA=ra))
+ R.append(Mov(name=f'MOV_{rd.upper()}_SP', rD=rd, rA='sp'))
+
+ # --- RRR — curated triples (full cube would be 5.6k/arch) ---
+ R.append(Banner('RRR — curated triples (explicit table in p1_gen.py)'))
+ for op, d, a, b in RRR_TABLE:
+ R.append(RRR(name=f'{op}_{d.upper()}_{a.upper()}_{b.upper()}',
+ op=op, rD=d, rA=a, rB=b))
- # --- SYSCALL / SYSOPEN ---
- R.append(Banner('SYSCALL / SYSOPEN — uniform (clobbers r0 only) across arches'))
- R.append(Literal(name='SYSCALL', hex_by_arch=SYSCALL_HEX))
- R.append(Literal(name='SYSOPEN', hex_by_arch=SYSOPEN_HEX))
+ # --- Immediate arith: ADDI × full reg product × imm set ---
+ R.append(Banner('ADDI — full register product × ADDI_IMMS'))
+ for d, a, imm in product(P1_REGS, P1_REGS, ADDI_IMMS):
+ R.append(AddI(name=f'ADDI_{d.upper()}_{a.upper()}_{_imm_suf(imm)}',
+ rD=d, rA=a, imm=imm))
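+ # (8 × 8 regs × 18 imms = 1152 ADDI DEFINEs per arch.)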
- # --- Syscall numbers ---
- R.append(Banner('Linux syscall numbers (per-arch table). LE-32 immediate operands for LI.'))
- for name in ('SYS_WRITE', 'SYS_EXIT', 'SYS_READ', 'SYS_CLOSE',
- 'SYS_CLONE', 'SYS_EXECVE', 'SYS_WAITID'):
- R.append(Literal(name=name, hex_by_arch={a: le32(SYS_NUM[a][name]) for a in ARCHES}))
-
- # --- Reg-reg-reg arith tuples used by demo/lisp ---
- R.append(Banner('Reg-reg-reg arithmetic (tranche 1)'))
- for op, d, a, b in [
- ('ADD','r1','r1','r2'), ('ADD','r1','r1','r4'), ('ADD','r2','r2','r6'),
- ('ADD','r2','r3','r1'),
- ('SUB','r1','r1','r2'), ('SUB','r2','r2','r6'),
- ('AND','r1','r1','r5'),
- ('OR', 'r1','r1','r2'),
- ('XOR','r1','r1','r2'),
- ('MUL','r1','r1','r2'),
- ('DIV','r1','r1','r2'),
- ('REM','r1','r1','r5'),
- ('SHL','r1','r1','r2'),
- ('SHR','r1','r1','r2'),
- ('SAR','r4','r4','r2'),
- ]:
- R.append(RRR(name=f'{op}_{d.upper()}_{a.upper()}_{b.upper()}', op=op, rD=d, rA=a, rB=b))
-
- # --- Immediate arith ---
- R.append(Banner('Immediate arithmetic (tranche 2)'))
- for d, a, imm in [('r1','r1',3), ('r1','r1',1), ('r1','r1',-3), ('r4','r4',-1),
- ('r1','r1',-2), ('r0','r0',1)]:
- suf = f'NEG{-imm}' if imm < 0 else f'{imm}'
- R.append(AddI(name=f'ADDI_{d.upper()}_{a.upper()}_{suf}', rD=d, rA=a, imm=imm))
-
- # SHLI/SHRI with imm=1
- R.append(ShiftI(name='SHLI_R1_R1_1', op='SHLI', rD='r1', rA='r1', imm=1))
- R.append(ShiftI(name='SHRI_R1_R1_1', op='SHRI', rD='r1', rA='r1', imm=1))
- R.append(ShiftI(name='SARI_R4_R4_1', op='SARI', rD='r4', rA='r4', imm=1))
-
- # ANDI 6: aarch64 logical-imm N=1, immr=0, imms=1 (2 ones at bit 1..2)
- R.append(LogI(name='ANDI_R1_R1_6', op='ANDI', rD='r1', rA='r1', imm=6,
- aa_N=1, aa_immr=63, aa_imms=1))
- # ANDI 7: N=1, immr=0, imms=2 (3 contiguous ones)
- R.append(LogI(name='ANDI_R1_R1_7', op='ANDI', rD='r1', rA='r1', imm=7,
- aa_N=1, aa_immr=0, aa_imms=2))
- # ORI 1: N=1, immr=0, imms=0 (1 one at bit 0)
- R.append(LogI(name='ORI_R1_R1_1', op='ORI', rD='r1', rA='r1', imm=1,
- aa_N=1, aa_immr=0, aa_imms=0))
- # ORI 2 on r0: 1 one rotated to bit 1 → immr=63, imms=0
- R.append(LogI(name='ORI_R0_R0_2', op='ORI', rD='r0', rA='r0', imm=2,
- aa_N=1, aa_immr=63, aa_imms=0))
- # ORI 7 on r0: 3 ones at bit 0..2 → N=1, immr=0, imms=2
- R.append(LogI(name='ORI_R0_R0_7', op='ORI', rD='r0', rA='r0', imm=7,
- aa_N=1, aa_immr=0, aa_imms=2))
-
- # --- LA + Memory ops ---
- R.append(Banner('LA + memory ops (tranche 3)'))
- R.append(La(name='LA_R4', rD='r4'))
- for op, rt, rn, off in [
- ('ST','r1','r4',0), ('LD','r1','r4',0),
- ('ST','r1','r4',8), ('LD','r1','r4',8),
- ('SB','r1','r4',16), ('LB','r1','r4',16),
- ('ST','r1','r4',-8), ('LD','r1','r4',-8),
- ]:
- suf = f'NEG{-off}' if off < 0 else f'{off}'
- R.append(Mem(name=f'{op}_{rt.upper()}_{rn.upper()}_{suf}',
+ # --- ANDI / ORI × full reg product × LOGI_IMMS ---
+ R.append(Banner('ANDI / ORI — full register product × LOGI_IMMS'))
+ for op, d, a, imm in product(LOGI_OPS, P1_REGS, P1_REGS, LOGI_IMMS):
+ R.append(LogI(name=f'{op}_{d.upper()}_{a.upper()}_{imm}',
+ op=op, rD=d, rA=a, imm=imm))
+
+ # --- SHLI / SHRI / SARI × full reg product × SHIFT_IMMS ---
+ R.append(Banner('SHLI / SHRI / SARI — full register product × SHIFT_IMMS'))
+ for op, d, a, imm in product(SHIFT_OPS, P1_REGS, P1_REGS, SHIFT_IMMS):
+ R.append(ShiftI(name=f'{op}_{d.upper()}_{a.upper()}_{imm}',
+ op=op, rD=d, rA=a, imm=imm))
+
+ # --- Memory: LD/ST/LB/SB × full reg product × MEM_OFFS ---
+ R.append(Banner('LD / ST / LB / SB — full register product × MEM_OFFS'))
+ for op, rt, rn, off in product(MEM_OPS, P1_REGS, P1_REGS, MEM_OFFS):
+ R.append(Mem(name=f'{op}_{rt.upper()}_{rn.upper()}_{_imm_suf(off)}',
op=op, rT=rt, rN=rn, off=off))
- # --- Branches ---
- R.append(Banner('Branches (tranche 4, LI_BR-indirect)'))
+ # --- Branches: BEQ/BNE/BLT × full reg product + unconditional B ---
+ R.append(Banner('Branches — LI_BR-indirect pattern'))
R.append(B(name='B'))
- for op, a, b in [
- ('BEQ','r2','r3'), ('BNE','r2','r3'), ('BLT','r2','r3'), ('BLT','r4','r2'),
- ]:
- R.append(CondB(name=f'{op}_{a.upper()}_{b.upper()}', op=op, rA=a, rB=b))
+ for op, a, b in product(CONDB_OPS, P1_REGS, P1_REGS):
+ R.append(CondB(name=f'{op}_{a.upper()}_{b.upper()}',
+ op=op, rA=a, rB=b))
- # --- PROLOGUE / EPILOGUE / CALL / RET / TAIL — single-slot + Nk variants ---
+ # --- Control: CALL / RET / PROLOGUE / EPILOGUE / TAIL (Nk = 1..4) ---
R.append(Banner('Control: CALL/RET + single-slot and N-slot PROLOGUE/EPILOGUE/TAIL'))
R.append(Prologue(name='PROLOGUE', k=1))
R.append(Epilogue(name='EPILOGUE', k=1))
@@ -872,368 +1003,17 @@ def rows():
R.append(Epilogue(name=f'EPILOGUE_N{k}', k=k))
R.append(Tail(name=f'TAIL_N{k}', k=k))
- # --- Tranche 6: seed-Lisp step-1 extensions ---
- R.append(Banner('Seed-Lisp step 1 extensions (tranche 6)'))
- R.append(Mov(name='MOV_R1_R6', rD='r1', rA='r6'))
- R.append(Mov(name='MOV_R6_R1', rD='r6', rA='r1'))
- R.append(Mov(name='MOV_R6_R0', rD='r6', rA='r0'))
- R.append(Mov(name='MOV_R0_R3', rD='r0', rA='r3'))
- R.append(Mov(name='MOV_R7_R0', rD='r7', rA='r0'))
- R.append(Mov(name='MOV_R7_R2', rD='r7', rA='r2'))
- R.append(Mov(name='MOV_R2_R6', rD='r2', rA='r6'))
- R.append(Mov(name='MOV_R3_R7', rD='r3', rA='r7'))
- R.append(Mov(name='MOV_R2_R7', rD='r2', rA='r7'))
- R.append(Mov(name='MOV_R4_R7', rD='r4', rA='r7'))
- # MOV rD, sp variants
- R.append(Mov(name='MOV_R2_SP', rD='r2', rA='sp'))
- R.append(Mov(name='MOV_R3_SP', rD='r3', rA='sp'))
- R.append(Mov(name='MOV_R4_SP', rD='r4', rA='sp'))
- R.append(Mov(name='MOV_R6_SP', rD='r6', rA='sp'))
- # Extra MOVs needed around calls
- R.append(Mov(name='MOV_R2_R0', rD='r2', rA='r0'))
-
- # LD/ST extras
- for op, rt, rn, off in [
- ('LD','r0','r6',0), ('LD','r1','r6',16), ('LD','r3','r4',0),
- ('LD','r0','r5',0), ('LB','r1','r4',0), ('ST','r2','r4',0),
- ('ST','r0','r4',8), ('LD','r0','r4',8), ('LB','r1','r0',0),
- ('LD','r0','r1',0), ('LD','r0','r1',8), ('ST','r1','r0',0),
- ('LD','r2','r4',0), ('ST','r2','r0',8), ('LD','r0','r4',0),
- ('ST','r2','r4',16), ('LD','r2','r4',16),
- # r3-based addressing: after the 4:4 split, r4 is callee-saved
- # and cons/alloc use r3 (caller-saved) as sp-temp and heap-cell
- # scratch instead. LD_R1_R1_0 is alloc's `r1 = *heap_end`.
- ('LD','r0','r3',0), ('ST','r2','r3',0),
- ('ST','r1','r3',8), ('LD','r1','r3',8),
- ('ST','r2','r3',16), ('LD','r2','r3',16),
- ('LD','r1','r1',0),
- ]:
- suf = f'NEG{-off}' if off < 0 else f'{off}'
- R.append(Mem(name=f'{op}_{rt.upper()}_{rn.upper()}_{suf}',
- op=op, rT=rt, rN=rn, off=off))
-
- # ADD_R2_R0_R1: alloc's `new_next = old + size`. The other
- # tranche-1 ADDs don't happen to have (d=r2, a=r0, b=r1).
- R.append(RRR(name='ADD_R2_R0_R1', op='ADD', rD='r2', rA='r0', rB='r1'))
-
- # Branches used by lisp step 2 / alloc OOM.
- R.append(CondB(name='BLT_R0_R2', op='BLT', rA='r0', rB='r2'))
- R.append(CondB(name='BLT_R1_R2', op='BLT', rA='r1', rB='r2'))
- R.append(CondB(name='BNE_R1_R2', op='BNE', rA='r1', rB='r2'))
- R.append(CondB(name='BNE_R0_R2', op='BNE', rA='r0', rB='r2'))
-
- # --- Tranche 9: seed-Lisp step-3 extensions (strings + interning) ---
- R.append(Banner('Seed-Lisp step 3 extensions (tranche 9): strings + interning'))
-
- # Extra MOVs
- R.append(Mov(name='MOV_R0_SP', rD='r0', rA='sp'))
- R.append(Mov(name='MOV_R1_R3', rD='r1', rA='r3'))
-
- # ADDI with new (d, a, imm) tuples. NEG5 is for stripping the SYMBOL
- # tag; NEG1 flavors are loop counters; 7/8 form round_up_8 in
- # make_symbol/make_string.
- for d, a, imm in [
- ('r1','r7',7),
- ('r1','r1',8),
- ('r1','r1',-5),
- ('r2','r2',1),
- ('r2','r2',-1),
- ('r3','r0',8),
- ('r3','r3',1),
- ('r3','r3',-1),
- ('r6','r6',1),
- ('r7','r7',-1),
- ]:
- suf = f'NEG{-imm}' if imm < 0 else f'{imm}'
- R.append(AddI(name=f'ADDI_{d.upper()}_{a.upper()}_{suf}',
- rD=d, rA=a, imm=imm))
-
- # Shifts at new immediates. SHLI/SHRI by 52 implement the low-12-bit
- # mask (= index into the 4096-slot symbol table). SHLI/SHRI by 32
- # mask low 32 bits (length extraction from the 48-bit header
- # field). SHLI_R0_R0_3 / SHLI_R1_R1_3 scale an index to byte offset
- # (h * 8). SHLI_R3_R0_5 / SHRI by 3 implement 31*h via (h<<5)-h
- # and round_up_8 via ((x+7)>>3)<<3.
- for op, d, a, imm in [
- ('SHLI','r0','r0',3),
- ('SHLI','r0','r0',52),
- ('SHLI','r1','r1',3),
- ('SHLI','r3','r0',5),
- ('SHLI','r6','r6',32),
- ('SHRI','r0','r0',52),
- ('SHRI','r1','r1',3),
- ('SHRI','r6','r6',32),
- ]:
- R.append(ShiftI(name=f'{op}_{d.upper()}_{a.upper()}_{imm}',
- op=op, rD=d, rA=a, imm=imm))
-
- # ORI_R0_R0_4: sets bit 2 (SYMBOL tag = 0b101 needs bits 0 and 2;
- # 0b101 is not a valid aarch64 bitmask-immediate, so callers do
- # ORI_R0_R0_4 then ORI_R0_R0_1). Single-bit pattern at bit 2:
- # N=1, immr=62 (rotate 1 right by 62 = left by 2), imms=0.
- R.append(LogI(name='ORI_R0_R0_4', op='ORI', rD='r0', rA='r0', imm=4,
- aa_N=1, aa_immr=62, aa_imms=0))
- # ORI_R0_R0_1: sets bit 0 (paired with ORI_R0_R0_4 for SYMBOL tag 0b101).
- R.append(LogI(name='ORI_R0_R0_1', op='ORI', rD='r0', rA='r0', imm=1,
- aa_N=1, aa_immr=0, aa_imms=0))
-
- # Reg-reg-reg arith for hash + intern address computation.
- R.append(RRR(name='ADD_R0_R0_R3', op='ADD', rD='r0', rA='r0', rB='r3'))
- R.append(RRR(name='ADD_R2_R2_R0', op='ADD', rD='r2', rA='r2', rB='r0'))
- R.append(RRR(name='ADD_R2_R2_R1', op='ADD', rD='r2', rA='r2', rB='r1'))
- R.append(RRR(name='SUB_R3_R3_R0', op='SUB', rD='r3', rA='r3', rB='r0'))
-
- # Branches. Zero-comparison variants (rX vs a reg holding 0);
- # length-comparison variants (rlen vs zero/symbol-len); pointer-
- # identity variants (sym_a vs sym_b).
- for op, a, b in [
- ('BEQ','r0','r6'),
- ('BEQ','r2','r6'),
- ('BEQ','r3','r1'),
- ('BEQ','r3','r6'),
- ('BEQ','r7','r1'),
- ('BNE','r0','r1'),
- ('BNE','r0','r6'),
- ('BNE','r0','r7'),
- ('BNE','r6','r3'),
- ]:
- R.append(CondB(name=f'{op}_{a.upper()}_{b.upper()}',
- op=op, rA=a, rB=b))
-
- # Memory extras. Offsets 0, 7, 8, 16, 24 for symbol/string/intern
- # machinery.
- for op, rt, rn, off in [
- ('LB','r0','r1',0),
- ('LB','r2','r6',0),
- ('LB','r7','r2',0),
- ('LD','r0','r2',0),
- ('LD','r0','r3',24),
- ('LD','r1','r3',24),
- ('LD','r3','r2',0),
- ('LD','r6','r1',0),
- ('LD','r6','r3',8),
- ('LD','r7','r3',16),
- ('SB','r1','r0',7),
- ('SB','r2','r3',0),
- ('ST','r0','r2',0),
- ('ST','r0','r3',24),
- ('ST','r0','r4',0),
- ('ST','r6','r0',8),
- ('ST','r6','r3',8),
- ('ST','r7','r0',0),
- ('ST','r7','r0',16),
- ('ST','r7','r3',16),
- ]:
- suf = f'NEG{-off}' if off < 0 else f'{off}'
- R.append(Mem(name=f'{op}_{rt.upper()}_{rn.upper()}_{suf}',
- op=op, rT=rt, rN=rn, off=off))
-
- # --- Tranche 10: seed-Lisp step-4 extensions (reader + display) ---
- R.append(Banner('Seed-Lisp step 4 extensions (tranche 10): reader + display'))
-
- # Moves pairing up registers unused in earlier tranches.
- for d, a in [('r1','r0'), ('r1','r2'), ('r1','r7'),
- ('r2','r1'), ('r6','r2'), ('r7','r1')]:
- R.append(Mov(name=f'MOV_{d.upper()}_{a.upper()}', rD=d, rA=a))
-
- # ADDI: character-class arithmetic and cursor/len book-keeping.
- # NEG48 peels the '0' bias off an ASCII digit. 48 replays the bias
- # for display_uint. 8 advances past a symbol/string header. NEG5
- # strips the SYMBOL tag (0b101) from a tagged-symbol pointer.
- for d, a, imm in [
- ('r0','r0',-1),
- ('r0','r0',-48),
- ('r1','r1',48),
- ('r1','r1',-1),
- ('r2','r2',8),
- ('r2','r2',-5),
- ('r6','r6',-1),
- ]:
- suf = f'NEG{-imm}' if imm < 0 else f'{imm}'
- R.append(AddI(name=f'ADDI_{d.upper()}_{a.upper()}_{suf}',
- rD=d, rA=a, imm=imm))
-
- # Shifts. Pair-tag strip (SHLI/SHRI 16 masks low 48). Fixnum
- # encode/decode at shift-3. SHLI_R0_R6_3 / R1_R6_3 / R2_R6_1
- # realize value*8 + value*2 in read_number's *10 expansion.
- for op, d, a, imm in [
- ('SARI','r1','r1',3),
- ('SHLI','r0','r6',3),
- ('SHLI','r1','r6',3),
- ('SHLI','r2','r6',1),
- ('SHLI','r3','r3',16),
- ('SHRI','r3','r3',16),
- ]:
- R.append(ShiftI(name=f'{op}_{d.upper()}_{a.upper()}_{imm}',
- op=op, rD=d, rA=a, imm=imm))
-
- # RRR: add/sub combinations for pointer arithmetic and list cons.
- R.append(RRR(name='ADD_R6_R1_R2', op='ADD', rD='r6', rA='r1', rB='r2'))
- R.append(RRR(name='ADD_R6_R6_R0', op='ADD', rD='r6', rA='r6', rB='r0'))
- R.append(RRR(name='ADD_R7_R1_R2', op='ADD', rD='r7', rA='r1', rB='r2'))
- R.append(RRR(name='SUB_R2_R1_R6', op='SUB', rD='r2', rA='r1', rB='r6'))
- R.append(RRR(name='SUB_R3_R1_R6', op='SUB', rD='r3', rA='r1', rB='r6'))
- R.append(RRR(name='REM_R1_R1_R2', op='REM', rD='r1', rA='r1', rB='r2'))
-
- # Branches. BEQ/BNE variants for EOF sentinels (rX == -1),
- # tagged-value comparisons, and list-terminator checks.
- for op, a, b in [
- ('BEQ','r0','r1'),
- ('BEQ','r1','r2'),
- ('BEQ','r1','r3'),
- ('BEQ','r2','r1'),
- ('BEQ','r6','r1'),
- ('BNE','r7','r1'),
- ]:
- R.append(CondB(name=f'{op}_{a.upper()}_{b.upper()}',
- op=op, rA=a, rB=b))
-
- # Memory extras. LB/SB at new (reg, off) combinations; LD_R2_R1_0
- # and LD_R2_R2_0 for reading reader state words. ST_R0_R3_8 / _R2_R1_0
- # push heap-list head into a spill slot. ST_R2_R3_24 spills fd
- # inside display_uint's PROLOGUE_N3 so it survives the digit loop.
- for op, rt, rn, off in [
- ('LB','r0','r2',0),
- ('LD','r2','r1',0),
- ('LD','r2','r2',0),
- ('SB','r1','r2',0),
- ('SB','r1','r6',0),
- ('ST','r0','r3',8),
- ('ST','r2','r1',0),
- ('ST','r2','r3',24),
- ]:
- suf = f'NEG{-off}' if off < 0 else f'{off}'
- R.append(Mem(name=f'{op}_{rt.upper()}_{rn.upper()}_{suf}',
- op=op, rT=rt, rN=rn, off=off))
-
- # --- Tranche 11: seed-Lisp step-5/6 (printer + eval) ---
- R.append(Banner('Seed-Lisp step 5/6 extensions (tranche 11): printer + eval'))
-
- # MOV: write_string copies the string length (r3) into r7 so r7
- # can survive putc calls inside the escape loop. _start stashes
- # the last eval result in r4 (callee-saved) across iterations, so
- # MOV_R4_R0 captures it after eval and MOV_R1_R4 reloads it for
- # write / SYS_EXIT.
- R.append(Mov(name='MOV_R7_R3', rD='r7', rA='r3'))
- R.append(Mov(name='MOV_R4_R0', rD='r4', rA='r0'))
- R.append(Mov(name='MOV_R1_R4', rD='r1', rA='r4'))
-
- # BNE_R1_R0: apply's callee-tag check branches to err_not_callable
- # when the low-3 tag doesn't equal 0b110.
- R.append(CondB(name='BNE_R1_R0', op='BNE', rA='r1', rB='r0'))
-
- # LB_R1_R6_0: write_string reads a single byte at the current
- # cursor (r6) into r1 before the escape-test/putc sequence.
- R.append(Mem(name='LB_R1_R6_0', op='LB', rT='r1', rN='r6', off=0))
-
- # ADDI: NEG4 strips the STRING tag (0b100) from a tagged string,
- # NEG6 strips the CLOSURE tag (0b110) inside apply.
- for d, a, imm in [
- ('r1','r1',-6),
- ('r2','r2',-4),
- ]:
- suf = f'NEG{-imm}' if imm < 0 else f'{imm}'
- R.append(AddI(name=f'ADDI_{d.upper()}_{a.upper()}_{suf}',
- rD=d, rA=a, imm=imm))
-
- # Memory ops:
- # LD_R0_R1_{16,24} — apply reads body/env from raw closure ptr
- # LD_R0_R3_8 — eval_self_slot1 returns the saved expr
- # LD_R1_R3_16 — loads slot 2 into r1 when r1 is the cons/car
- # arg and we want a clean register
- # LD_R1_R3_32 — env_extend fetches saved name from slot 4
- # LD_R2_R3_8/24 — eval_begin cursor reload; env_extend env reload
- # LD_R3_R3_16/32 — eval_lambda / apply load env into r3 as the
- # 3rd argument to make_closure / env_extend
- # ST_R0_R3_16/32 — env_extend spills updated vals / saves name
- # ST_R1_R0_{8,16,24} — make_closure writes fields into fresh heap
- for op, rt, rn, off in [
- ('LD','r0','r1',16),
- ('LD','r0','r1',24),
- ('LD','r0','r3',8),
- ('LD','r1','r3',16),
- ('LD','r1','r3',32),
- ('LD','r2','r3',8),
- ('LD','r2','r3',24),
- ('LD','r3','r3',16),
- ('LD','r3','r3',32),
- ('ST','r0','r3',16),
- ('ST','r0','r3',32),
- ('ST','r1','r0',8),
- ('ST','r1','r0',16),
- ('ST','r1','r0',24),
- ]:
- suf = f'NEG{-off}' if off < 0 else f'{off}'
- R.append(Mem(name=f'{op}_{rt.upper()}_{rn.upper()}_{suf}',
- op=op, rT=rt, rN=rn, off=off))
-
- # --- Tranche 12: kaem-minimal (line shell for build scripts) ---
- R.append(Banner('kaem-minimal extensions (tranche 12): line shell for build'))
-
- # MOVs: read_file_all parks (fd, buf, cap, total) in r4–r7 across
- # SYSCALLs (callee-saved natives) so the read loop doesn't need a
- # prologue. write_cstr uses the same trick across its strlen call.
- for d, a in [('r0','r7'), ('r2','r5'), ('r3','r0'), ('r3','r6'),
- ('r4','r1'), ('r4','r2'), ('r5','r1'), ('r5','r2'),
- ('r6','r3'), ('r1','r2')]:
- # MOV_R1_R2 already exists in tranche 10 — skip via duplicate check.
- if d == 'r1' and a == 'r2':
- continue
- R.append(Mov(name=f'MOV_{d.upper()}_{a.upper()}', rD=d, rA=a))
-
- # RRR: bump-pointer and accumulator updates in read_file_all's inner
- # loop (buf += n, cap -= n, total += n). ADD_R1_R1_R0 is end_token's
- # argv_out[count] address calc.
- for op, d, a, b in [
- ('ADD','r1','r1','r0'),
- ('ADD','r5','r5','r0'),
- ('ADD','r7','r7','r0'),
- ('SUB','r3','r3','r2'),
- ('SUB','r6','r6','r0'),
- ]:
- R.append(RRR(name=f'{op}_{d.upper()}_{a.upper()}_{b.upper()}',
- op=op, rD=d, rA=a, rB=b))
-
- # ADDI_R1_R0_2: _start computes r1 = argc + 2 before scaling by 8
- # to land on envp.
- R.append(AddI(name='ADDI_R1_R0_2', rD='r1', rA='r0', imm=2))
-
- # SHLI_R0_R3_3: end_token scales token_count by 8 (slot size) into
- # r0 for the argv_out address addition.
- R.append(ShiftI(name='SHLI_R0_R3_3', op='SHLI', rD='r0', rA='r3', imm=3))
-
- # Branches. BLT_R0_R1 covers the two fd/argc-sign checks in _start
- # and the read EOF check; BEQ_R3_R2 is the strlen-zero-byte check;
- # BNE_R3_R0 is arena_putc's "token already open?" guard.
- for op, a, b in [
- ('BEQ','r3','r2'),
- ('BLT','r0','r1'),
- ('BNE','r3','r0'),
- ]:
- R.append(CondB(name=f'{op}_{a.upper()}_{b.upper()}',
- op=op, rA=a, rB=b))
+ # --- SYSCALL / SYSOPEN — pre-encoded per-arch wrappers ---
+ R.append(Banner('SYSCALL / SYSOPEN — uniform "clobbers r0 only" across arches'))
+ R.append(Literal(name='SYSCALL', hex_by_arch=SYSCALL_HEX))
+ R.append(Literal(name='SYSOPEN', hex_by_arch=SYSOPEN_HEX))
- # Memory ops. LB at offset 24 reads si_status out of the waitid
- # siginfo_buf (SIGCHLD case: byte 24 is the exit code / signal).
- # LD_R3_R3_0 is end_token dereferencing token_arena_next;
- # LD_R3_R1_0 loads mutable globals into r3 for use as a cursor.
- # ST_R3_R1_0 / ST_R3_R2_0 / ST_R0_R1_0 persist cursor/count.
- # SB_{R0,R1}_R3_0 write the NUL terminator / token byte into the
- # arena at token_arena_next.
- for op, rt, rn, off in [
- ('LB','r1','r1',24),
- ('LB','r3','r2',0),
- ('LD','r3','r1',0),
- ('LD','r3','r3',0),
- ('SB','r0','r3',0),
- ('SB','r1','r3',0),
- ('ST','r0','r1',0),
- ('ST','r3','r1',0),
- ('ST','r3','r2',0),
- ]:
- suf = f'NEG{-off}' if off < 0 else f'{off}'
- R.append(Mem(name=f'{op}_{rt.upper()}_{rn.upper()}_{suf}',
- op=op, rT=rt, rN=rn, off=off))
+ # --- Syscall numbers (LE-32 immediates) ---
+ R.append(Banner('Linux syscall numbers (per-arch table). LE-32 operands for LI.'))
+ for name in ('SYS_WRITE', 'SYS_EXIT', 'SYS_READ', 'SYS_CLOSE',
+ 'SYS_CLONE', 'SYS_EXECVE', 'SYS_WAITID'):
+ R.append(Literal(name=name,
+ hex_by_arch={a: le32(SYS_NUM[a][name]) for a in ARCHES}))
return R
@@ -1241,6 +1021,7 @@ def rows():
## ---------- File emission -----------------------------------------------
def emit(arch: str) -> str:
+ enc = ENCODERS[arch]
out = [HEADER.format(arch=arch).rstrip(), '']
seen = set()
for row in rows():
@@ -1252,8 +1033,7 @@ def emit(arch: str) -> str:
if name in seen:
raise RuntimeError(f'duplicate DEFINE: {name}')
seen.add(name)
- hex_bytes = row.encode(arch)
- out.append(f'DEFINE {name} {hex_bytes}')
+ out.append(f'DEFINE {name} {row.encode(enc)}')
out.append('')
return '\n'.join(out)