commit 78312f7592353f51c3b5957eca18ea098cbd4a66
parent 224a62c7d3f1c05dcdeb1fe9c621a87067c8ce91
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Mon, 20 Apr 2026 21:11:20 -0700
P1: generator-driven defs, N-slot prologue, lint, no hidden clobbers
Replace the three hand-written p1_<arch>.M1 files with p1_gen.py, a
single source of truth that expands shared op rows to per-arch native
bytes. This closes P1_TODO.md issue 3 and removes the combinatorial
hand-maintenance problem ahead of the ~1200-DEFINE full surface.
Makefile gains lint.sh as a prerequisite to assembly — M1 otherwise
silently passes undefined P1_*/SYS_* tokens through as literal text,
producing SIGILL-on-run binaries with no diagnostic (issue 1).
PROLOGUE/EPILOGUE extend to N-slot variants (k=1..4), giving callees
multiple walkable spill slots across CALL. lisp.M1 cons() rewrites
off the :cons_save_cdr BSS hack onto PROLOGUE_N2 (issue 4).
Every P1 op now modifies only what its name says it modifies. aarch64
REM moves its msub scratch from x4 (= P1 r4) to x16 (IP0); amd64
DIV/REM preserves rax (P1 r0) via r11 when rD != r0, and handles rA/rB
aliasing r0/r3 by sourcing from the saved copy.
P1.md documents the r7-indirect branch pattern, label-alignment
rules, N-slot frame layout, and the reserved-scratch register set.
Diffstat:
| M | Makefile | | | 21 | ++++++++++++++++++--- |
| M | P1.md | | | 195 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------- |
| A | lint.sh | | | 42 | ++++++++++++++++++++++++++++++++++++++++++ |
| M | lisp.M1 | | | 36 | +++++++++++------------------------- |
| M | p1_aarch64.M1 | | | 405 | ++++++++++++++++++++++--------------------------------------------------------- |
| M | p1_amd64.M1 | | | 378 | +++++++++++++++++++++++++------------------------------------------------------ |
| A | p1_gen.py | | | 949 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| M | p1_riscv64.M1 | | | 365 | ++++++++++++++++++++++++------------------------------------------------------- |
8 files changed, 1518 insertions(+), 873 deletions(-)
diff --git a/Makefile b/Makefile
@@ -64,7 +64,7 @@ PODMAN := podman run --rm --platform $(PLATFORM) \
# --- Targets ---------------------------------------------------------------
-.PHONY: all toolchain run run-all clean
+.PHONY: all toolchain run run-all clean gen check-gen
all: $(OUT_DIR)/$(PROG)
@@ -80,12 +80,18 @@ $(OUT_DIR) $(TOOLS_DIR):
$(TOOLS_DIR)/M0 $(TOOLS_DIR)/hex2-0 $(TOOLS_DIR)/catm $(TOOLS_DIR)/hex0 $(TOOLS_DIR)/hex1 &: bootstrap.sh | $(TOOLS_DIR)
$(PODMAN_BOOTSTRAP) sh bootstrap.sh $(ARCH) /work/lispcc/$(TOOLS_DIR)
-# Assemble: combine per-arch defs + program, feed to M0.
+# Assemble: lint first, then combine per-arch defs + program and feed to M0.
+#
+# Lint catches P1_*/SYS_* tokens with no matching DEFINE — M1 otherwise
+# silently emits the literal token text and produces a SIGILL-on-run
+# binary (see P1_TODO.md issue 1). Runs on the host (plain POSIX sh);
+# no podman dependency.
#
# M0 takes a single positional input (no -f flag), so we catm the two
# sources together first. The intermediate .combined.M1 is kept in OUT_DIR
# so it gets cleaned along with everything else.
-$(OUT_DIR)/$(PROG).hex2: $(PROG).M1 p1_$(ARCH).M1 $(TOOLS_DIR)/M0 $(TOOLS_DIR)/catm | $(OUT_DIR)
+$(OUT_DIR)/$(PROG).hex2: $(PROG).M1 p1_$(ARCH).M1 lint.sh $(TOOLS_DIR)/M0 $(TOOLS_DIR)/catm | $(OUT_DIR)
+ ./lint.sh p1_$(ARCH).M1 $(PROG).M1
$(PODMAN) sh -ec ' \
$(TOOLS_DIR)/catm $(OUT_DIR)/$(PROG).combined.M1 p1_$(ARCH).M1 $(PROG).M1 ; \
$(TOOLS_DIR)/M0 $(OUT_DIR)/$(PROG).combined.M1 $(OUT_DIR)/$(PROG).hex2'
@@ -113,3 +119,12 @@ run-all:
clean:
rm -rf build/
+
+# Regenerate the per-arch p1_<arch>.M1 defs from p1_gen.py.
+# Running `make gen` overwrites all three files in place; `make check-gen`
+# exits non-zero if any are stale (useful for CI). See p1_gen.py.
+gen:
+ python3 p1_gen.py
+
+check-gen:
+ python3 p1_gen.py --check
diff --git a/P1.md b/P1.md
@@ -27,22 +27,26 @@ hand-writable in M1 source, that assembles to three host ISAs via per-arch
- **Not self-hosting.** P1 is a target for humans, not a compiler IR. If you
want a compiler, write it in subset-C and use M2-Planet.
-## Current status (v0.1 spike)
-
-Built in `lispcc/`: `hello.M1` and a compute/print/exit `demo.M1`, both
-written in P1 mnemonics, assemble unchanged across aarch64, amd64, and
-riscv64 from per-arch `p1_<arch>.M1` defs. Ops exercised so far: `LI`,
-`SYSCALL`, `MOV`, `ADD`, `SUB` (the last three in specific
-register-tuple forms). Runs on stock `M1` + `hex2_linker` (amd64,
-aarch64) / `hex2_word` (riscv64). Run with `make PROG=demo run-all`
-from `lispcc/`.
-
-The rest of the ISA (branches, `CALL`/`RET`, `TAIL`, `LD`/`ST`,
-logical/shift/mul-div) is reachable with the same tooling via the
-inline-data `LI` pattern generalized: load the branch target with
-`P1_LI_R7 &label` and jump through the register. Local conditional
-skips over a known number of instructions hand-encode as plain hex
-distances inside the DEFINE.
+## Current status
+
+Three programs assemble unchanged across aarch64, amd64, and riscv64
+from the generator-produced `p1_<arch>.M1` defs:
+ * `hello.M1` — write/exit, prints "Hello, world!".
+ * `demo.M1` — exercises the full tranche 1–5 op set (arith/imm/LD/ST/
+ branches/CALL/RET/PROLOGUE/EPILOGUE/TAIL); exits with code 5.
+ * `lisp.M1` — seed Lisp through step 2 of `LISP.md`: bump heap,
+ `cons`/`car`/`cdr`, tagged-value encoding. Exits with code 42
+ (decoded fixnum from `car(cons(42, nil))`).
+
+All three run on stock stage0 `M0` + `hex2-0`, bootstrapped per-arch from
+`hex0-seed` — no C compiler, no M2-Planet, no Mes. Run with
+`make PROG=<hello|demo|lisp> run-all` from `lispcc/`.
+
+The DEFINE table is generator-driven (`p1_gen.py`); tranches 1–8 are
+enumerated there, plus the full PROLOGUE_Nk family (k=1..4). Branch
+offsets are realized by the r7-indirect pattern
+(`LI_R7 &target ; BXX_rA_rB_R7`), sidestepping the missing
+branch-offset support in hex2.
### Spike deviations from the design
@@ -53,9 +57,11 @@ distances inside the DEFINE.
- `LI` is 4-byte zero-extended today; 8-byte absolute is deferred until
a program needs it. All current references are to addresses under
4 GiB, so `&label` + a 4-byte zero pad suffices.
-- The per-tuple DEFINE table is hand-written for the handful of
- MOV/ADD/SUB register tuples `demo.M1` uses. The ~1500-entry generator
- is still future work.
+- The per-tuple DEFINE table is generator-produced (see `p1_gen.py`)
+ from a shared op table across all three arches. The emitted set
+ covers tranches 1–8 plus the N-slot PROLOGUE/EPILOGUE/TAIL
+ variants. Adding a new tuple is a one-line append to `rows()` in
+ the generator; no hand-encoding.
## Design decisions
@@ -103,12 +109,53 @@ machinery: its value is meaningful only between the `LI_R7` that loads a
target and the branch that consumes it. Never carry a live value through a
branch or call in `r7`.
-**Syscall-reserved registers (not available to P1):** on aarch64, `x21`–
-`x25` are used by the `SYSCALL` expansion as save slots for `r1`–`r5`
-around the arg shuffle; on riscv64, `s3`–`s7` serve the same role. The
-kernel preserves them, and `SYSCALL` restores them, so to P1 programs they
-simply don't exist. amd64 has no such reservation (native arg regs already
-align with P1 `r1`–`r5`).
+**Reserved scratch registers (not available to P1):** certain native
+regs are used internally by op expansions and are never exposed as P1
+registers. Every P1 op writes only what its name says it writes —
+reserved scratch is save/restored within the expansion so no hidden
+clobbers leak across op boundaries.
+
+- **aarch64** — `x21`–`x25` hold `r1`–`r5` across the `SYSCALL` arg
+ shuffle. `x16` (ARM IP0) is scratch for `REM` (carries the `SDIV`
+ quotient into the following `MSUB`). `x8` holds the syscall number.
+- **amd64** — `rcx` and `r11` are kernel-clobbered by the `syscall`
+ instruction itself; `PROLOGUE` additionally uses `r11` to temporarily
+ hold the return address across the `sub rsp, N`. `DIV`/`REM` use both
+ `rcx` (to save `rdx` = P1 `r3`) and `r11` (to save `rax` = P1 `r0`)
+ so that idiv's implicit writes to rax/rdx stay invisible.
+- **riscv64** — `s3`–`s7` hold `r1`–`r5` across the `SYSCALL` arg
+ shuffle; `a7` holds the syscall number.
+
+All of these are off-limits to hand-written P1 programs and are never
+mentioned in P1 source. If you see a register name not in the r0–r7 /
+sp / lr set, it belongs to an op's internal expansion.
+
+## Reading P1 source
+
+P1 has no PC-relative branch immediates (hex2 offers no label-arithmetic
+sigil — branch ranges can't be expressed in hex2 source). Every branch,
+conditional or not, compiles through the **r7-indirect** pattern: the
+caller loads the target into `r7` with a wide `LI`, then the branch op
+jumps through `r7`. A conditional like "jump to `fail` if `r1 != r2`" is
+three source lines:
+
+```
+P1_LI_R7
+&fail
+P1_BNE_R1_R2_R7
+```
+
+The `_R7` suffix on every branch op is a reminder that `r7` is the
+branch-target register: the value loaded by the preceding `LI_R7` is
+consumed by the branch, and both lines always appear together. `CALL`
+follows the same shape (`LI_R7 &callee ; P1_CALL`).
+
+Because every branch site overwrites `r7`, **`r7` is branch-scratch, not
+a general-purpose callee-saved register** — see the "`r7` is not a
+general-purpose callee-saved register" paragraph above. If you need to
+keep a value live across a branch, any of `r0`–`r6` works (branch sites
+clobber only `r7`); to keep one live across a `CALL`, use `r6`
+(callee-saved) or spill to a frame slot — `r0`–`r5` are caller-saved.
## Instruction set (~30 ops)
@@ -171,19 +218,38 @@ SYSCALL # num in r0, args r1-r6, ret in r0
- `CALL %label` pushes a return address (via the arch's native mechanism
or the caller-emitted `PROLOGUE`, see below) and jumps. `RET` pops and
jumps.
-- `PROLOGUE` / `EPILOGUE` set up and tear down a 16-byte frame. After
- `PROLOGUE`, `[sp + 0]` holds the caller's return address and
- `[sp + 8]` is a **callee-private scratch slot** — one 8-byte cell that
- each function may spill into across `CALL`s (or across any sequence
- where the live register set is inconvenient). The slot is private to
- the current frame: a nested `PROLOGUE` allocates its own pair of
- cells, so the parent's spill at `[sp + 8]` survives unchanged.
- Per-arch mechanics differ — aarch64/riscv64 `PROLOGUE` subtracts 16
- from `sp` and stores `lr`/`ra` at `[sp + 0]`; amd64 pops the retaddr
- native `call` already pushed into a non-P1 scratch (`r11`), subtracts
- 16, then re-pushes it so the final layout matches. Access the scratch
- slot via `MOV rX, sp` followed by `LD rY, rX, 8` / `ST rY, rX, 8`; `sp`
- itself isn't a valid base for `LD`/`ST`.
+- `PROLOGUE` / `EPILOGUE` set up and tear down a frame with **k
+ callee-private scratch slots**. `PROLOGUE` is shorthand for
+ `PROLOGUE_N1` (one slot); `PROLOGUE_Nk` for k = 2, 3, 4 reserves that
+ many slots. After `PROLOGUE_Nk`:
+
+ ```
+ [sp + 0] = caller's return address
+ [sp + 8] = slot 1 (callee-private scratch)
+ [sp + 16] = slot 2 (k >= 2)
+ [sp + 24] = slot 3 (k >= 3)
+ [sp + 32] = slot 4 (k >= 4)
+ ```
+
+ Each slot is private to the current frame: a nested `PROLOGUE`
+ allocates its own slots, so the parent's spills survive unchanged.
+ Frame size is `round_up_16(8 + 8*k)`, so k=1→16, k=2→32 (with 8
+ bytes of padding past slot 2), k=3→32, k=4→48. `EPILOGUE_Nk` /
+ `TAIL_Nk` must match the `PROLOGUE_Nk` of the enclosing function.
+
+ Why multiple slots: constructors like `cons(car, cdr)` keep several
+ live values across an inner `alloc()` call. One scratch cell isn't
+ enough, and parking overflow in BSS would break the step-9 mark-sweep
+ GC (which walks the stack for roots). Per-frame slots keep every live
+ value on the walkable stack.
+
+ Per-arch mechanics differ — aarch64/riscv64 `PROLOGUE` subtracts the
+ frame size from `sp` and stores `lr`/`ra` at `[sp + 0]`; amd64 pops
+ the retaddr native `call` already pushed into a non-P1 scratch
+ (`r11`), subtracts the frame size, then re-pushes it so the final
+ layout matches. Access slots via `MOV rX, sp` followed by
+ `LD rY, rX, <off>` / `ST rY, rX, <off>`; `sp` itself isn't a valid
+ base for `LD`/`ST`.
- `TAIL %label` is a tail call — it performs the current function's
standard epilogue (restore `lr` from `[sp+0]`, pop the frame) and then
branches unconditionally to `%label`, reusing the caller's return
@@ -309,6 +375,50 @@ riscv `AUIPC`+`LD`, unlimited for x86 `mov rD, [rip + rel32]` within 2 GiB).
For programs under a few MiB, a single pool per file is fine. For larger
programs, emit a pool per function.
+## Data alignment
+
+**Labels have no inherent alignment.** A label's runtime address is
+`ELF_base + (cumulative bytes emitted before the label)`. Neither M1 nor
+hex2 offers an `.align` directive or any other alignment control — the
+existing hex2 sigils (`: ! @ $ ~ % &` and the `>` base override) cover
+labels and references, not padding. And because the cumulative byte count
+between the ELF header and any label varies per arch (different SYSCALL
+expansions, different branch encodings, different PROLOGUE sizes), the
+same label lands at a different low-3-bits offset on each target.
+
+Concretely: `heap_start` in a program that builds identically for all
+three arches can land at `0x...560` (aligned) on aarch64, `0x...2CB`
+(misaligned) on amd64, and `0x...604` (misaligned) on riscv64. If the
+program then tags pair pointers by ORing bits into the low 3, the tag
+collides with pointer bits on the misaligned arches and every pair is
+corrupt.
+
+Programs that care about alignment therefore align **at boot, in code**:
+
+```
+P1_LI_R4
+&heap_next
+P1_LD_R0_R4_0
+P1_ORI_R0_R0_7 ## x |= 7
+P1_ADDI_R0_R0_1 ## x += 1 → x rounded up to next 8-aligned
+P1_ST_R0_R4_0
+```
+
+The `(x | mask) + 1` idiom advances `x` to the *next* multiple of `mask + 1` — note it is not an idempotent round-up: an already-aligned pointer still moves up a full step. Use
+`mask = 7` for 8-byte alignment (tagged pointers with a 3-bit tag),
+`mask = 15` for 16-byte alignment (cache lines, `malloc`-style).
+
+**Allocator contract.** Any allocator that returns cells eligible to be
+tagged (pair, closure, vector, …) MUST return pointers aligned to at
+least the tag width. The low tag bits are architecturally unowned by
+the allocator — they belong to the caller to stamp a tag into.
+
+**Caller contract.** Callers of bump-style allocators must pass sizes
+that are multiples of the alignment. For the step-2 bump allocator
+that's 8-byte multiples; the caller rounds up. A mature allocator
+(step 9 onward) rounds internally, but the current one trusts the
+caller.
+
## Staged implementation plan
1. **Spike across all three arches.** *Done.* `lispcc/hello.M1` and
@@ -325,9 +435,12 @@ programs, emit a pool per function.
hex2; no extensions required. A loop-and-branch demo (e.g. print
digits 0–9, or sum 1..N) is the natural next program — it forces
conditional branching through the indirect-via-r7 pattern.
-3. **Generator for the ~30-op × register matrix.** Hand-maintenance of
- the per-tuple DEFINEs becomes painful past ~20 entries. A small
- template script produces `p1_<arch>.M1` from a shared op table.
+3. **Generator for the ~30-op × register matrix.** *Done.*
+ `p1_gen.py` is the single source of truth for all three
+ `p1_<arch>.M1` defs files. Each row is an `(op, reg-tuple, imm)`
+ triple; per-arch encoders lower rows to native bytes. Includes the
+ N-slot `PROLOGUE_Nk` / `EPILOGUE_Nk` / `TAIL_Nk` variants (k=1..4).
+ Regenerate with `make gen`; CI-check freshness with `make check-gen`.
4. **Cross-arch differential harness.** Assemble each P1 source three
ways and diff runtime behavior. Currently eyeballed via
`make run-all`.
diff --git a/lint.sh b/lint.sh
@@ -0,0 +1,42 @@
+#!/bin/sh
+## lint.sh — catch undefined P1 tokens before they reach M1.
+##
+## M1 silently passes undefined tokens through as literal text, so a
+## misspelled P1_LD_R0_R4_0 (when only P1_LD_R0_R4_8 is defined) produces
+## a runnable-but-SIGILL-ing binary with no diagnostic. This script
+## extracts every P1_*/SYS_* token referenced in a .M1 program and
+## asserts each has a matching `DEFINE` in the per-arch defs file.
+##
+## Usage: lint.sh <p1_arch.M1> <prog.M1> [<prog.M1> ...]
+## Exit: 0 on success; 1 + diagnostic on any missing token; 2 on misuse.
+##
+## POSIX sh — no bash process substitution, no GNU-only flags.
+
+set -eu
+
+if [ "$#" -lt 2 ]; then
+ echo "usage: $0 <p1_arch.M1> <prog.M1> [<prog.M1> ...]" >&2
+ exit 2
+fi
+
+defs="$1"
+shift
+
+## Strip M1 `#`-to-EOL comments before token extraction: file-like
+## references (P1_TODO.md, etc.) live in comments and shouldn't be
+## flagged as missing DEFINEs. awk emits the pre-comment portion of
+## each line on stdout.
+used=$(awk '{ sub(/#.*/, ""); print }' "$@" | grep -oE 'P1_[A-Z0-9_]+|SYS_[A-Z0-9_]+' | sort -u)
+
+tmp=$(mktemp)
+trap 'rm -f "$tmp"' EXIT INT TERM
+awk '/^DEFINE[ \t]/ {print $2}' "$defs" | sort -u > "$tmp"
+
+missing=$(printf '%s\n' "$used" | grep -Fxv -f "$tmp" || true)
+
+if [ -n "$missing" ]; then
+ echo "error: P1 lint: undefined token(s) referenced in M1 source" >&2
+ echo " (defs file: $defs)" >&2
+ printf ' %s\n' $missing >&2
+ exit 1
+fi
diff --git a/lisp.M1 b/lisp.M1
@@ -17,7 +17,7 @@
## open/read/internal error pads) is removed — it will come back with
## the Reader in step 4.
-## ---- heap-state + cons scratch cell ----------------------------------
+## ---- heap-state --------------------------------------------------------
## Placed at the very start of ELF_text so file-offset alignment is
## predictable. ELF header (64 B) + 1 program header (56 B) = 120 B, so
## :heap_next lands at offset 120 from ELF_base — 8-byte aligned, which
@@ -34,14 +34,6 @@
&heap_tail
'00000000'
-## cons needs both incoming args (car, cdr) to survive an alloc() call.
-## The PROLOGUE scratch slot [sp+8] holds one, so park the other here.
-## Safe because the step-2 interpreter is single-threaded and cons is
-## not reentrant. Step 9 (mark-sweep GC) replaces this with the real
-## root-spill discipline.
-:cons_save_cdr
-'0000000000000000'
-
## ---- _start ----------------------------------------------------------
## Linux process entry. Builds the demo pair and exits with the decoded
@@ -125,19 +117,17 @@
## ---- cons(car, cdr) -> pair -----------------------------------------
## r1 = car, r2 = cdr. Allocates 16 bytes, writes the two cells, returns
-## (ptr | PAIR_TAG). Uses [sp+8] for car and :cons_save_cdr for cdr to
-## shepherd both args across the alloc() call (which clobbers r1..r5).
+## (ptr | PAIR_TAG). Uses the N=2 prologue — car lives in frame slot 1
+## at [sp+8], cdr in slot 2 at [sp+16] — to shepherd both args across
+## alloc(). Both roots stay on the walkable stack, which is the
+## discipline the step-9 GC will depend on (P1_TODO.md issue 4).
:cons
- P1_PROLOGUE
+ P1_PROLOGUE_N2
- ## Spill car to [sp+8].
+ ## Spill car to slot 1 [sp+8], cdr to slot 2 [sp+16].
P1_MOV_R4_SP
P1_ST_R1_R4_8
-
- ## Spill cdr to :cons_save_cdr.
- P1_LI_R4
- &cons_save_cdr
- P1_ST_R2_R4_0
+ P1_ST_R2_R4_16
## alloc(16). 16 is already 8-byte aligned.
P1_LI_R1
@@ -146,21 +136,17 @@
&alloc
P1_CALL ## r0 = raw (untagged) pair ptr
- ## [ptr+0] = car.
+ ## [ptr+0] = car ; [ptr+8] = cdr.
P1_MOV_R4_SP
P1_LD_R1_R4_8 ## r1 = car
P1_ST_R1_R0_0
-
- ## [ptr+8] = cdr.
- P1_LI_R4
- &cons_save_cdr
- P1_LD_R2_R4_0 ## r2 = cdr
+ P1_LD_R2_R4_16 ## r2 = cdr
P1_ST_R2_R0_8
## Tag as a pair and return.
P1_ORI_R0_R0_2
- P1_EPILOGUE
+ P1_EPILOGUE_N2
P1_RET
diff --git a/p1_aarch64.M1 b/p1_aarch64.M1
@@ -1,30 +1,11 @@
-## P1 pseudo-ISA — aarch64 backing defs (v0.1 spike)
+## p1_aarch64.M1 — GENERATED by p1_gen.py. Do not edit by hand.
##
-## Implements the subset needed by hello.M1 and demo.M1: LI, SYSCALL,
-## the demo's arith/imm/memory/branch/call tuples. The full ~1200-entry
-## table described in P1.md is generator-driven; what's here is the
-## spike's hand-written sliver.
-## See P1.md for the full ISA and register mapping.
-##
-## Register mapping (P1 → aarch64):
-## r0 → x0 , r1 → x1 , r2 → x2 , r3 → x3
-## r4 → x4 , r5 → x5 , r6 → x19, r7 → x20
-##
-## LI rD, <4-byte-literal> — load a 4-byte little-endian literal into rD
-## (zero-extended into the 64-bit register). Usage in source:
-##
-## P1_LI_R1
-## &some_label # or '0E000000'
-##
-## Expansion is "ldr w<D>, [pc+8] ; b +8" (8 bytes). The caller emits
-## the 4 inline literal bytes that follow. Addresses that fit in 32 bits
-## (true for our ELF-at-0x400000 binaries) round-trip through the W reg
-## because aarch64 LDR-W zero-extends into X.
-##
-## The 4-byte form is deliberate for the spike: it pairs with hex2's
-## `&label` sigil unchanged. A proper 64-bit LI via a PC-relative LDR X
-## literal is left for later.
+## Shared op-table lives in p1_gen.py; each arch's encoders expand
+## (op, register-tuple, imm) rows into native bytes. See P1.md for the
+## ISA spec and register mapping.
+
+## ---- LI — load 4-byte zero-extended literal from inline data slot
DEFINE P1_LI_R0 4000001802000014
DEFINE P1_LI_R1 4100001802000014
DEFINE P1_LI_R2 4200001802000014
@@ -34,277 +15,115 @@ DEFINE P1_LI_R5 4500001802000014
DEFINE P1_LI_R6 5300001802000014
DEFINE P1_LI_R7 5400001802000014
-## SYSCALL — num in r0, args r1..r6, result in r0. Clobbers r0 only.
-##
-## aarch64 Linux wants num in x8 and args in x0..x5. P1 puts args one
-## register higher (x1..x5, x19), so we save r1..r5 into x21..x25 before
-## shuffling them down into x0..x4; x5 comes from x19 directly; x8 gets
-## the number. After `svc #0` we restore r1..r5 from the x21..x25 saves
-## so the caller sees only r0 clobbered.
-##
-## x21..x25 are reserved for this expansion and invisible to P1 code
-## (see P1.md §"Register mapping"). Using callee-saved native regs means
-## zero memory traffic — the save-and-restore is pure register moves.
-##
-## Expansion (18 insns, 72 bytes):
-## mov x8, x0 ; num
-## mov x21, x1 ; save P1 r1
-## mov x22, x2 ; save P1 r2
-## mov x23, x3 ; save P1 r3
-## mov x24, x4 ; save P1 r4
-## mov x25, x5 ; save P1 r5
-## mov x0, x21 ; arg0 <- saved r1
-## mov x1, x22 ; arg1 <- saved r2
-## mov x2, x23 ; arg2 <- saved r3
-## mov x3, x24 ; arg3 <- saved r4
-## mov x4, x25 ; arg4 <- saved r5
-## mov x5, x19 ; arg5 <- P1 r6
-## svc #0
-## mov x1, x21 ; restore r1
-## mov x2, x22 ; restore r2
-## mov x3, x23 ; restore r3
-## mov x4, x24 ; restore r4
-## mov x5, x25 ; restore r5
+## ---- SYSCALL / SYSOPEN — uniform (clobbers r0 only) across arches
DEFINE P1_SYSCALL E80300AAF50301AAF60302AAF70303AAF80304AAF90305AAE00315AAE10316AAE20317AAE30318AAE40319AAE50313AA010000D4E10315AAE20316AAE30317AAE40318AAE50319AA
+DEFINE P1_SYSOPEN 600C8092080780D2010000D4
-
-## Linux syscall numbers (aarch64 uses the generic table).
-## Emitted as 4-byte little-endian immediates, to be consumed by P1_LI_R*.
+## ---- Linux syscall numbers (per-arch table). LE-32 immediate operands for LI.
DEFINE SYS_WRITE 40000000
-DEFINE SYS_EXIT 5D000000
-
-
-## ---- Tranche 1: full arith reg-reg-reg ----------------------------------
-## Non-identity discriminator chain used by demo.M1 to exercise every
-## P1 arith op. r1 is the running accumulator; each step's result is
-## unique vs. its neighbor ops on the chosen operands (see demo.M1).
-## aarch64 has 3-operand forms for all of these — one insn per op.
-## REM has no native op — expands to sdiv+msub through scratch reg x4.
-
-## MOV rD, rA -> orr xD, xzr, xA
-DEFINE P1_MOV_R1_R6 E10313AA ## mov x1, x19
-DEFINE P1_MOV_R6_R1 F30301AA ## mov x19, x1
-
-## MOV rD, sp -> add xD, sp, #0 (canonical "mov rD, sp" form).
-## Used by the tranche-5 stack-balance discriminator.
-DEFINE P1_MOV_R6_SP F3030091 ## mov x19, sp
-DEFINE P1_MOV_R2_SP E2030091 ## mov x2, sp
-
-## ADD rD, rA, rB -> add xD, xA, xB
-DEFINE P1_ADD_R1_R1_R2 2100028B ## add x1, x1, x2
-DEFINE P1_ADD_R1_R1_R4 2100048B ## add x1, x1, x4
-DEFINE P1_ADD_R2_R2_R6 4200138B ## add x2, x2, x19 (syscall-arg computation)
-
-## SUB / AND / OR / XOR — base opcodes CB / 8A / AA / CA.
-DEFINE P1_SUB_R1_R1_R2 210002CB ## sub x1, x1, x2
-DEFINE P1_SUB_R2_R2_R6 420013CB ## sub x2, x2, x19 (tranche-5 sp delta)
-DEFINE P1_AND_R1_R1_R5 2100058A ## and x1, x1, x5
-DEFINE P1_OR_R1_R1_R2 210002AA ## orr x1, x1, x2
-DEFINE P1_XOR_R1_R1_R2 210002CA ## eor x1, x1, x2
-
-## MUL rD, rA, rB -> mul xD, xA, xB (= madd xD, xA, xB, xzr)
-DEFINE P1_MUL_R1_R1_R2 217C029B ## mul x1, x1, x2
-
-## DIV rD, rA, rB -> sdiv xD, xA, xB
-DEFINE P1_DIV_R1_R1_R2 210CC29A ## sdiv x1, x1, x2
-
-## REM rD, rA, rB -> sdiv x4, xA, xB ; msub xD, x4, xB, xA
-## Two insns; x4 is a caller-saved scratch (= P1 r4). Demo keeps any
-## live r4 value dead across this op.
-DEFINE P1_REM_R1_R1_R5 240CC59A8184059B ## sdiv x4,x1,x5; msub x1,x4,x5,x1
-
-## SHL / SHR -> lslv / lsrv (64-bit).
-DEFINE P1_SHL_R1_R1_R2 2120C29A ## lsl x1, x1, x2
-DEFINE P1_SHR_R1_R1_R2 2124C29A ## lsr x1, x1, x2
-
-## SAR rD, rA, rB -> asrv xD, xA, xB. Discriminator lives on r4
-## (negative value) — see demo.M1 tranche 1.
-DEFINE P1_SAR_R4_R4_R2 8428C29A ## asr x4, x4, x2
-
-
-## ---- Tranche 2: immediate arith ---------------------------------------
-## Demo chain: 5 → 8 → 5 → 10 → 5 → 4 → 5 plus a SARI discriminator
-## on r4 (negative). All imms are valid bitmask-immediates where
-## applicable (see P1.md); 5 is NOT a valid bitmask so ORI uses 1.
-##
-## Shift-imm ops are UBFM/SBFM aliases:
-## LSL #n -> UBFM immr=-n mod 64, imms=63-n
-## LSR #n -> UBFM immr=n, imms=63
-## ASR #n -> SBFM immr=n, imms=63
-## Bitwise-imm ops use aarch64's logical-immediate encoding
-## (N:immr:imms over 13 bits).
-
-## aarch64 has no native "ADDI with negative imm" form — model P1's
-## signed ADDI by emitting ADD (imm) for non-negative values and
-## SUB (imm) for negative ones. Both flavors sign-extend the imm12
-## into the 64-bit destination.
-DEFINE P1_ADDI_R1_R1_3 210C0091 ## add x1, x1, #3
-DEFINE P1_ADDI_R1_R1_1 21040091 ## add x1, x1, #1
-DEFINE P1_ADDI_R1_R1_NEG3 210C00D1 ## sub x1, x1, #3
-DEFINE P1_ADDI_R4_R4_NEG1 840400D1 ## sub x4, x4, #1
-
-DEFINE P1_SHLI_R1_R1_1 21F87FD3 ## lsl x1, x1, #1 (UBFM immr=63, imms=62)
-DEFINE P1_SHRI_R1_R1_1 21FC41D3 ## lsr x1, x1, #1 (UBFM immr=1, imms=63)
-
-DEFINE P1_ANDI_R1_R1_6 21043F92 ## and x1, x1, #6
-DEFINE P1_ORI_R1_R1_1 210040B2 ## orr x1, x1, #1
-
-DEFINE P1_SARI_R4_R4_1 84FC4193 ## asr x4, x4, #1 (SBFM immr=1, imms=63)
-
-
-## ---- Tranche 3: LA + memory ops ---------------------------------------
-## LA is LI in the spike — both load a 4-byte zero-extended literal,
-## which is enough to address the ELF (base 0x00600000 < 2^32).
-DEFINE P1_LA_R4 4400001802000014 ## ldr w4, [pc+8] ; b +8 ; <4 bytes>
-
-## Unsigned-offset forms of LDR/STR at 8/1-byte widths. imm12 is scaled
-## by access size, so e.g. LDR-B offset 16 uses imm12=16.
-## 32-bit LW/SW dropped from the ISA.
-DEFINE P1_ST_R1_R4_0 810000F9 ## str x1, [x4, #0]
-DEFINE P1_LD_R1_R4_0 810040F9 ## ldr x1, [x4, #0]
-DEFINE P1_ST_R1_R4_8 810400F9 ## str x1, [x4, #8] (imm12=1, scaled ×8)
-DEFINE P1_LD_R1_R4_8 810440F9 ## ldr x1, [x4, #8]
-DEFINE P1_SB_R1_R4_16 81400039 ## strb w1, [x4, #16]
-DEFINE P1_LB_R1_R4_16 81404039 ## ldrb w1, [x4, #16] (zero-ext)
-
-## Negative imm12: aarch64's unsigned-offset LDR/STR can't encode a
-## negative offset, so use the unscaled-offset forms STUR/LDUR, which
-## carry a signed 9-bit immediate. sign-extended imm9 for -8 = 0x1F8.
-DEFINE P1_ST_R1_R4_NEG8 81801FF8 ## stur x1, [x4, #-8]
-DEFINE P1_LD_R1_R4_NEG8 81805FF8 ## ldur x1, [x4, #-8]
-
-
-## ---- Tranche 4: branches (r7-indirect, no hex2_word needed) ------------
-## Conditional branches compare ra vs rb, then jump to the address in r7.
-## Pattern: cmp xRa, xRb ; b.<INV> +8 ; br x20
-## - If cond is false we take the +8 skip and fall through the BR.
-## - If cond is true we execute the BR x20 jump to r7.
-## Caller loads the target into r7 (via P1_LI_R7 &label) beforehand.
-## Unconditional P1_B is just BR x20.
-##
-## Conditions for "skip if NOT cond":
-## BEQ -> B.NE (cond 1) BNE -> B.EQ (cond 0)
-## BLT -> B.GE (cond A)
-## Unsigned branches (BLTU/BGEU/BGE) dropped from the ISA — see P1.md.
-## CMP xN, xM = SUBS xzr, xN, xM. BR x20 = 0xD61F0280.
-
-DEFINE P1_B 80021FD6
-DEFINE P1_BEQ_R2_R3_R7 5F0003EB4100005480021FD6 ## cmp x2,x3 ; b.ne +8 ; br x20
-DEFINE P1_BNE_R2_R3_R7 5F0003EB4000005480021FD6 ## cmp x2,x3 ; b.eq +8 ; br x20
-DEFINE P1_BLT_R2_R3_R7 5F0003EB4A00005480021FD6 ## cmp x2,x3 ; b.ge +8 ; br x20
-DEFINE P1_BLT_R4_R2_R7 9F0002EB4A00005480021FD6 ## cmp x4,x2 ; b.ge +8 ; br x20
-
-
-## ---- Tranche 5: CALL / RET / PROLOGUE / EPILOGUE / TAIL -----------------
-## CALL is an r7-indirect branch-and-link, same pattern as branches: caller
-## loads &target into r7, then `blr x20` sets x30 = PC+4 and jumps.
-## RET jumps through x30 (the native `ret` pseudo).
-##
-## PROLOGUE / EPILOGUE explicitly spill and reload x30 on aarch64 so that
-## nested CALLs can't clobber the caller's return address. After PROLOGUE,
-## [sp+0] holds the return address, matching P1.md's uniform convention.
-##
-## TAIL = EPILOGUE + B. Calling convention: load &target into r7 first,
-## then TAIL performs the callee's own epilogue (restoring the parent's
-## caller-retaddr into x30) and jumps to r7. When the tail target later
-## RETs, control returns to the parent's caller.
-
-DEFINE P1_PROLOGUE FF4300D1FE0300F9 ## sub sp,#16 ; str x30,[sp]
-DEFINE P1_EPILOGUE FE0340F9FF430091 ## ldr x30,[sp] ; add sp,#16
-DEFINE P1_RET C0035FD6 ## ret (br x30)
-DEFINE P1_CALL 80023FD6 ## blr x20
-DEFINE P1_TAIL FE0340F9FF43009180021FD6 ## epilogue ; br x20
-
-
-## ---- Tranche 6: seed-Lisp step 1 extensions ---------------------------
-## Ops required by lisp.M1 (LISP.md item 1, proof-of-life): file I/O
-## wrappers, cell loads/stores, and the arg-shuffling MOVs they imply.
-
-## Extra Linux syscall numbers (generic table, same as riscv64).
-DEFINE SYS_READ 3F000000
+DEFINE SYS_EXIT 5D000000
+DEFINE SYS_READ 3F000000
DEFINE SYS_CLOSE 39000000
-## SYSOPEN — portable open(path, flags, mode). Assumes r1=path, r2=flags,
-## r3=mode. aarch64 has no open(2); it uses openat(2) = syscall 56, which
-## wants (dirfd=AT_FDCWD, path, flags, mode). P1's r1..r3 already line up
-## with native x1..x3, so only x0=AT_FDCWD and x8=56 need loading.
-##
-## Expansion:
-## movn x0, #99 ; x0 = -100 = AT_FDCWD (60 0C 80 92)
-## movz x8, #56 ; x8 = SYS_openat (08 07 80 D2)
-## svc #0 (01 00 00 D4)
-DEFINE P1_SYSOPEN 600C8092080780D2010000D4
-
-## MOV xD, xA — extra register pairings used around syscalls / calls.
-DEFINE P1_MOV_R6_R0 F30300AA ## mov x19, x0
-DEFINE P1_MOV_R0_R3 E00303AA ## mov x0, x3
-DEFINE P1_MOV_R7_R0 F40300AA ## mov x20, x0
-DEFINE P1_MOV_R7_R2 F40302AA ## mov x20, x2
-DEFINE P1_MOV_R2_R6 E20313AA ## mov x2, x19
-DEFINE P1_MOV_R3_R7 E30314AA ## mov x3, x20
-DEFINE P1_MOV_R2_R7 E20314AA ## mov x2, x20
-DEFINE P1_MOV_R4_R7 E40314AA ## mov x4, x20
-
-## LD/ST extras — additional dst/base pairings at scaled imm12.
-DEFINE P1_LD_R0_R6_0 600240F9 ## ldr x0, [x19]
-DEFINE P1_LD_R1_R6_16 610A40F9 ## ldr x1, [x19, #16]
-DEFINE P1_LD_R3_R4_0 830040F9 ## ldr x3, [x4]
-DEFINE P1_LD_R0_R5_0 A00040F9 ## ldr x0, [x5]
-DEFINE P1_LB_R1_R4_0 81004039 ## ldrb w1, [x4]
-DEFINE P1_ST_R2_R4_0 820000F9 ## str x2, [x4]
-
-## ADD r2, r3, r1 — x2 = x3 + x1.
-DEFINE P1_ADD_R2_R3_R1 6200018B ## add x2, x3, x1
-
-## BLT r0, r2, r7 — signed jump-if-less; used for argc/open/read checks.
-## cmp x0, x2 ; b.ge +8 ; br x20
+## ---- Reg-reg-reg arithmetic (tranche 1) --------------------------
+DEFINE P1_ADD_R1_R1_R2 2100028B
+DEFINE P1_ADD_R1_R1_R4 2100048B
+DEFINE P1_ADD_R2_R2_R6 4200138B
+DEFINE P1_ADD_R2_R3_R1 6200018B
+DEFINE P1_SUB_R1_R1_R2 210002CB
+DEFINE P1_SUB_R2_R2_R6 420013CB
+DEFINE P1_AND_R1_R1_R5 2100058A
+DEFINE P1_OR_R1_R1_R2 210002AA
+DEFINE P1_XOR_R1_R1_R2 210002CA
+DEFINE P1_MUL_R1_R1_R2 217C029B
+DEFINE P1_DIV_R1_R1_R2 210CC29A
+DEFINE P1_REM_R1_R1_R5 300CC59A0186059B
+DEFINE P1_SHL_R1_R1_R2 2120C29A
+DEFINE P1_SHR_R1_R1_R2 2124C29A
+DEFINE P1_SAR_R4_R4_R2 8428C29A
+
+## ---- Immediate arithmetic (tranche 2) ----------------------------
+DEFINE P1_ADDI_R1_R1_3 210C0091
+DEFINE P1_ADDI_R1_R1_1 21040091
+DEFINE P1_ADDI_R1_R1_NEG3 210C00D1
+DEFINE P1_ADDI_R4_R4_NEG1 840400D1
+DEFINE P1_ADDI_R1_R1_NEG2 210800D1
+DEFINE P1_ADDI_R0_R0_1 00040091
+DEFINE P1_SHLI_R1_R1_1 21F87FD3
+DEFINE P1_SHRI_R1_R1_1 21FC41D3
+DEFINE P1_SARI_R4_R4_1 84FC4193
+DEFINE P1_ANDI_R1_R1_6 21047F92
+DEFINE P1_ANDI_R1_R1_7 21084092
+DEFINE P1_ORI_R1_R1_1 210040B2
+DEFINE P1_ORI_R0_R0_2 00007FB2
+DEFINE P1_ORI_R0_R0_7 000840B2
+
+## ---- LA + memory ops (tranche 3) ---------------------------------
+DEFINE P1_LA_R4 4400001802000014
+DEFINE P1_ST_R1_R4_0 810000F9
+DEFINE P1_LD_R1_R4_0 810040F9
+DEFINE P1_ST_R1_R4_8 810400F9
+DEFINE P1_LD_R1_R4_8 810440F9
+DEFINE P1_SB_R1_R4_16 81400039
+DEFINE P1_LB_R1_R4_16 81404039
+DEFINE P1_ST_R1_R4_NEG8 81801FF8
+DEFINE P1_LD_R1_R4_NEG8 81805FF8
+
+## ---- Branches (tranche 4, r7-indirect) ---------------------------
+DEFINE P1_B 80021FD6
+DEFINE P1_BEQ_R2_R3_R7 5F0003EB4100005480021FD6
+DEFINE P1_BNE_R2_R3_R7 5F0003EB4000005480021FD6
+DEFINE P1_BLT_R2_R3_R7 5F0003EB4A00005480021FD6
+DEFINE P1_BLT_R4_R2_R7 9F0002EB4A00005480021FD6
+
+## ---- Control: CALL/RET + single-slot and N-slot PROLOGUE/EPILOGUE/TAIL
+DEFINE P1_PROLOGUE FF4300D1FE0300F9
+DEFINE P1_EPILOGUE FE0340F9FF430091
+DEFINE P1_RET C0035FD6
+DEFINE P1_CALL 80023FD6
+DEFINE P1_TAIL FE0340F9FF43009180021FD6
+DEFINE P1_PROLOGUE_N2 FF8300D1FE0300F9
+DEFINE P1_EPILOGUE_N2 FE0340F9FF830091
+DEFINE P1_TAIL_N2 FE0340F9FF83009180021FD6
+DEFINE P1_PROLOGUE_N3 FF8300D1FE0300F9
+DEFINE P1_EPILOGUE_N3 FE0340F9FF830091
+DEFINE P1_TAIL_N3 FE0340F9FF83009180021FD6
+DEFINE P1_PROLOGUE_N4 FFC300D1FE0300F9
+DEFINE P1_EPILOGUE_N4 FE0340F9FFC30091
+DEFINE P1_TAIL_N4 FE0340F9FFC3009180021FD6
+
+## ---- Seed-Lisp step 1 extensions (tranche 6) ---------------------
+DEFINE P1_MOV_R1_R6 E10313AA
+DEFINE P1_MOV_R6_R1 F30301AA
+DEFINE P1_MOV_R6_R0 F30300AA
+DEFINE P1_MOV_R0_R3 E00303AA
+DEFINE P1_MOV_R7_R0 F40300AA
+DEFINE P1_MOV_R7_R2 F40302AA
+DEFINE P1_MOV_R2_R6 E20313AA
+DEFINE P1_MOV_R3_R7 E30314AA
+DEFINE P1_MOV_R2_R7 E20314AA
+DEFINE P1_MOV_R4_R7 E40314AA
+DEFINE P1_MOV_R2_SP E2030091
+DEFINE P1_MOV_R4_SP E4030091
+DEFINE P1_MOV_R6_SP F3030091
+DEFINE P1_MOV_R2_R0 E20300AA
+DEFINE P1_LD_R0_R6_0 600240F9
+DEFINE P1_LD_R1_R6_16 610A40F9
+DEFINE P1_LD_R3_R4_0 830040F9
+DEFINE P1_LD_R0_R5_0 A00040F9
+DEFINE P1_LB_R1_R4_0 81004039
+DEFINE P1_ST_R2_R4_0 820000F9
+DEFINE P1_ST_R0_R4_8 800400F9
+DEFINE P1_LD_R0_R4_8 800440F9
+DEFINE P1_LB_R1_R0_0 01004039
+DEFINE P1_LD_R0_R1_0 200040F9
+DEFINE P1_LD_R0_R1_8 200440F9
+DEFINE P1_ST_R1_R0_0 010000F9
+DEFINE P1_LD_R2_R4_0 820040F9
+DEFINE P1_ST_R2_R0_8 020400F9
+DEFINE P1_LD_R0_R4_0 800040F9
+DEFINE P1_ST_R2_R4_16 820800F9
+DEFINE P1_LD_R2_R4_16 820840F9
DEFINE P1_BLT_R0_R2_R7 1F0002EB4A00005480021FD6
-
-
-## ---- Tranche 7: [sp+8] scratch-slot access + assorted MOV/LD/ST -------
-## P1.md §PROLOGUE defines [sp+8] as a callee-private scratch cell.
-## Access pattern: MOV rX, sp then LD/ST rY, [rX, 8]. These defs cover
-## the specific register pairings lisp.M1 needs to spill the `buf`
-## pointer around the read-syscall + BLT + close-syscall sequence.
-
-DEFINE P1_MOV_R4_SP E4030091 ## add x4, sp, #0 (canonical mov x4, sp)
-DEFINE P1_MOV_R2_R0 E20300AA ## mov x2, x0
-DEFINE P1_ST_R0_R4_8 800400F9 ## str x0, [x4, #8]
-DEFINE P1_LD_R0_R4_8 800440F9 ## ldr x0, [x4, #8]
-DEFINE P1_LB_R1_R0_0 01004039 ## ldrb w1, [x0, #0] (zero-extend)
-
-
-## ---- Tranche 8: seed-Lisp step 2 (tagged values) ---------------------
-## Ops required by lisp.M1 (LISP.md item 2, tagged values): cons / car /
-## cdr plus tag checks (ANDI #7, BNE) and pair-tag patch (ORI #2).
-## car and cdr untag the pair pointer by subtracting 2 before loading.
-
-## ADDI signed imm (negative): sub x1, x1, #2 (untag pair pointer).
-DEFINE P1_ADDI_R1_R1_NEG2 210800D1
-
-## Pair-cell loads/stores. Base 0xF9400000 (LDR) / 0xF9000000 (STR);
-## imm12 is scaled ×8 for 64-bit accesses.
-DEFINE P1_LD_R0_R1_0 200040F9 ## ldr x0, [x1] (car slot)
-DEFINE P1_LD_R0_R1_8 200440F9 ## ldr x0, [x1, #8] (cdr slot)
-DEFINE P1_ST_R1_R0_0 010000F9 ## str x1, [x0] (write car)
-DEFINE P1_LD_R2_R4_0 820040F9 ## ldr x2, [x4] (reload cdr from BSS)
-DEFINE P1_ST_R2_R0_8 020400F9 ## str x2, [x0, #8] (write cdr)
-
-## Bitwise-imm tag ops. aarch64 logical-imm uses N:immr:imms:
-## orr x0, x0, #2 — N=1, immr=63, imms=0 (1 one rotated to bit 1)
-## and x1, x1, #7 — N=1, immr=0, imms=2 (3 contiguous ones)
-DEFINE P1_ORI_R0_R0_2 00007FB2 ## orr x0, x0, #2 (tag pair)
-DEFINE P1_ANDI_R1_R1_7 21084092 ## and x1, x1, #7 (extract tag)
-
-## BNE reg-pair extensions for tag-dispatch / nil-check branches.
-## cmp xA, xB ; b.eq +8 ; br x20
-DEFINE P1_BNE_R1_R2_R7 3F0002EB4000005480021FD6
-DEFINE P1_BNE_R0_R2_R7 1F0002EB4000005480021FD6
-
-## heap_next boot-alignment: (heap_next | 7) + 1 rounds the initial
-## bump pointer up to the next 8-byte boundary. heap_start's offset
-## from ELF_base is code-size-dependent and not aligned on every arch.
-## orr x0, x0, #7 — N=1, immr=0, imms=2 (3 contiguous ones)
-DEFINE P1_ORI_R0_R0_7 000840B2 ## orr x0, x0, #7
-DEFINE P1_ADDI_R0_R0_1 00040091 ## add x0, x0, #1
-DEFINE P1_LD_R0_R4_0 800040F9 ## ldr x0, [x4]
+DEFINE P1_BNE_R1_R2_R7 3F0002EB4000005480021FD6
+DEFINE P1_BNE_R0_R2_R7 1F0002EB4000005480021FD6
diff --git a/p1_amd64.M1 b/p1_amd64.M1
@@ -1,267 +1,129 @@
-## P1 pseudo-ISA — amd64 backing defs (v0.1 spike)
+## p1_amd64.M1 — GENERATED by p1_gen.py. Do not edit by hand.
##
-## Implements the subset needed by hello.M1 and demo.M1. See P1.md.
-##
-## Register mapping (P1 → amd64):
-## r0 → rax , r1 → rdi , r2 → rsi , r3 → rdx
-## r4 → r10 , r5 → r8 , r6 → rbx , r7 → r12
-##
-## LI rD, <4-byte-literal> — zero-extended load into rD.
-## Expands to "mov r<D>d, imm32" (5 or 6 bytes). Because x86-64
-## zero-extends 32-bit mov-to-GPR into the 64-bit register, the
-## 4-byte literal that immediately follows in source order ends up
-## as the full 64-bit value.
-##
-## Usage:
-## P1_LI_R1
-## &some_label # or '0E000000'
-
-DEFINE P1_LI_R0 B8 # mov eax, imm32
-DEFINE P1_LI_R1 BF # mov edi, imm32
-DEFINE P1_LI_R2 BE # mov esi, imm32
-DEFINE P1_LI_R3 BA # mov edx, imm32
-DEFINE P1_LI_R4 41BA # mov r10d, imm32
-DEFINE P1_LI_R5 41B8 # mov r8d, imm32
-DEFINE P1_LI_R6 BB # mov ebx, imm32
-DEFINE P1_LI_R7 41BC # mov r12d, imm32
-
-## SYSCALL — num in r0, args r1..r6, result in r0.
-##
-## amd64 Linux syscall ABI: num in rax, args in rdi,rsi,rdx,r10,r8,r9.
-## P1's mapping already places num (r0→rax) and args r1..r5 (→rdi,rsi,
-## rdx,r10,r8) in their native slots. The only mismatch is arg 6:
-## P1 r6 = rbx, native arg6 = r9. So SYSCALL shuffles that one register
-## and then traps.
-##
-## Expansion:
-## mov r9, rbx ; 49 89 D9
-## syscall ; 0F 05
+## Shared op-table lives in p1_gen.py; each arch's encoders expand
+## (op, register-tuple, imm) rows into native bytes. See P1.md for the
+## ISA spec and register mapping.
+
+
+## ---- LI — load 4-byte zero-extended literal from inline data slot
+DEFINE P1_LI_R0 B8
+DEFINE P1_LI_R1 BF
+DEFINE P1_LI_R2 BE
+DEFINE P1_LI_R3 BA
+DEFINE P1_LI_R4 41BA
+DEFINE P1_LI_R5 41B8
+DEFINE P1_LI_R6 BB
+DEFINE P1_LI_R7 41BC
+
+## ---- SYSCALL / SYSOPEN — uniform (clobbers r0 only) across arches
DEFINE P1_SYSCALL 4989D90F05
+DEFINE P1_SYSOPEN B8020000000F05
-
-## Linux syscall numbers (amd64-specific table).
-## Emitted as 4-byte little-endian immediates, consumed by P1_LI_R*.
+## ---- Linux syscall numbers (per-arch table). LE-32 immediate operands for LI.
DEFINE SYS_WRITE 01000000
-DEFINE SYS_EXIT 3C000000
-
-
-## ---- Tranche 1: full arith reg-reg-reg ----------------------------------
-## amd64 lacks a 3-operand arithmetic form, so every "rD = rA op rB"
-## expands unconditionally to two native insns:
-## mov rD_native, rA_native
-## <op> rD_native, rB_native
-## When rD == rA the leading mov is a no-op write, kept anyway — P1 is
-## deliberately unoptimized (P1.md §"Non-goals").
-## x86-64 oddities:
-## - Shifts need count in cl; three-insn form mov rD,rA; mov rcx,rB; shl.
-## - IDIV needs dividend in rdx:rax. We save rdx to rcx before CQO so
-## both the divisor (when it is rdx) and the caller's r3 survive.
-
-## MOV rD, rA -> mov rD_native, rA_native
-DEFINE P1_MOV_R1_R6 4889DF ## mov rdi, rbx
-DEFINE P1_MOV_R6_R1 4889FB ## mov rbx, rdi
-
-## MOV rD, sp — read rsp into a GPR. Used by the tranche-5 stack-
-## balance discriminator. On amd64 PROLOGUE/EPILOGUE are NOPs and
-## CALL/RET self-balance, so this check is a no-op here; still
-## encoded so the same demo assembles unchanged.
-DEFINE P1_MOV_R6_SP 4889E3 ## mov rbx, rsp
-DEFINE P1_MOV_R2_SP 4889E6 ## mov rsi, rsp
-
-## ADD rD, rA, rB -> mov rD,rA ; add rD,rB
-DEFINE P1_ADD_R1_R1_R2 4889FF4801F7 ## mov rdi,rdi ; add rdi,rsi
-DEFINE P1_ADD_R1_R1_R4 4889FF4C01D7 ## mov rdi,rdi ; add rdi,r10
-DEFINE P1_ADD_R2_R2_R6 4889F64801DE ## mov rsi,rsi ; add rsi,rbx (syscall arg)
-
-## SUB / AND / OR / XOR — 2-insn form, leading mov rdi,rdi kept.
-DEFINE P1_SUB_R1_R1_R2 4889FF4829F7 ## mov rdi,rdi ; sub rdi,rsi
-DEFINE P1_SUB_R2_R2_R6 4889F64829DE ## mov rsi,rsi ; sub rsi,rbx (tranche-5 sp delta)
-DEFINE P1_XOR_R1_R1_R2 4889FF4831F7 ## mov rdi,rdi ; xor rdi,rsi
-DEFINE P1_OR_R1_R1_R2 4889FF4809F7 ## mov rdi,rdi ; or rdi,rsi
-DEFINE P1_AND_R1_R1_R5 4889FF4C21C7 ## mov rdi,rdi ; and rdi,r8
-
-## MUL rD, rA, rB -> mov rD,rA ; imul rD,rB (IMUL r64,r/m64 = 0F AF)
-DEFINE P1_MUL_R1_R1_R2 4889FF480FAFFE ## mov rdi,rdi ; imul rdi,rsi
-
-## DIV rD, rA, rB -> divisor in rsi (r2). CQO still clobbers rdx,
-## so save/restore caller's r3 through rcx.
-## mov rcx,rdx ; mov rax,rdi ; cqo ; idiv rsi ; mov rdi,rax ; mov rdx,rcx
-DEFINE P1_DIV_R1_R1_R2 4889D14889F8489948F7FE4889C74889CA
-
-## REM rD, rA, rB -> divisor in r8 (r5). Same rdx-save dance.
-## mov rcx,rdx ; mov rax,rdi ; cqo ; idiv r8 ; mov rdi,rdx ; mov rdx,rcx
-DEFINE P1_REM_R1_R1_R5 4889D14889F8489949F7F84889D74889CA
-
-## SHL / SHR -> mov rD,rA ; mov rcx,rB ; shl/shr rD,cl
-## Opcode D3 /n with REX.W: /4=SHL, /5=SHR, /7=SAR.
-DEFINE P1_SHL_R1_R1_R2 4889FF4889F148D3E7 ## mov rdi,rdi; mov rcx,rsi; shl rdi,cl
-DEFINE P1_SHR_R1_R1_R2 4889FF4889F148D3EF ## mov rdi,rdi; mov rcx,rsi; shr rdi,cl
-
-## SAR — lives on r4 (r10) with negative input; see demo.M1 tranche 1.
-## mov r10,r10 ; mov rcx,rsi ; sar r10,cl
-DEFINE P1_SAR_R4_R4_R2 4D89D24889F149D3FA
-
-
-## ---- Tranche 2: immediate arith ---------------------------------------
-## Pattern: mov rD,rA ; <op> rD, imm8 (sign-extended imm8 via opcode 83;
-## shifts via C1). /n is the opcode-extension field in ModRM.reg.
-## Negative ADDI uses the same `add r/m64, imm8` with a sign-extended
-## imm8 — x86 handles the sign natively.
-
-DEFINE P1_ADDI_R1_R1_3 4889FF4883C703 ## add rdi, 3 (83 /0 ib)
-DEFINE P1_ADDI_R1_R1_1 4889FF4883C701 ## add rdi, 1
-DEFINE P1_ADDI_R1_R1_NEG3 4889FF4883C7FD ## add rdi, -3 (imm8 FD = -3)
-DEFINE P1_ADDI_R4_R4_NEG1 4D89D24983C2FF ## mov r10,r10 ; add r10, -1
-
-DEFINE P1_SHLI_R1_R1_1 4889FF48C1E701 ## shl rdi, 1 (C1 /4 ib)
-DEFINE P1_SHRI_R1_R1_1 4889FF48C1EF01 ## shr rdi, 1 (C1 /5 ib)
-
-DEFINE P1_ANDI_R1_R1_6 4889FF4883E706 ## and rdi, 6 (83 /4 ib)
-DEFINE P1_ORI_R1_R1_1 4889FF4883CF01 ## or rdi, 1 (83 /1 ib)
-
-DEFINE P1_SARI_R4_R4_1 4D89D249C1FA01 ## mov r10,r10 ; sar r10, 1
-
-
-## ---- Tranche 3: LA + memory ops ---------------------------------------
-## LA is LI in the spike (addresses fit in 32 bits → zero-extends cleanly
-## through the mov-to-r32 form). r4 is r10.
-DEFINE P1_LA_R4 41BA ## mov r10d, imm32
-
-## Plain MOV r/m, r / MOV r, r/m with 8-bit displacement. REX.W=1 for
-## 64-bit moves; REX.B=1 always (r10 is the base register).
-## 32-bit LW/SW dropped from the ISA.
-DEFINE P1_ST_R1_R4_0 49893A ## mov [r10], rdi
-DEFINE P1_LD_R1_R4_0 498B3A ## mov rdi, [r10]
-DEFINE P1_ST_R1_R4_8 49897A08 ## mov [r10+8], rdi (disp8)
-DEFINE P1_LD_R1_R4_8 498B7A08 ## mov rdi, [r10+8] (disp8)
-DEFINE P1_SB_R1_R4_16 41887A10 ## mov [r10+16], dil
-DEFINE P1_LB_R1_R4_16 490FB67A10 ## movzx rdi, byte [r10+16]
-
-## Negative disp8: x86-64 disp8 is signed, so -8 = 0xF8.
-DEFINE P1_ST_R1_R4_NEG8 49897AF8 ## mov [r10-8], rdi
-DEFINE P1_LD_R1_R4_NEG8 498B7AF8 ## mov rdi, [r10-8]
-
-
-## ---- Tranche 4: branches (r7-indirect, no hex2_word needed) ------------
-## Pattern: cmp ra,rb ; short native jcc over a jmp-through-r7.
-## If cond is false the native "skip" jcc fires (opposite of the P1 cond)
-## and steps past the 3-byte `jmp r12`, falling through. If cond is true
-## we take the jmp r12 to the address the caller stashed in r7.
-## P1_B is just `jmp r12` unconditionally.
-##
-## CMP rsi, rdx = 48 39 D6 (REX.W, opcode 39 /r, ModRM: 11 010 110).
-## JMP r12 = 41 FF E4 (REX.B, opcode FF /4, ModRM: 11 100 100).
-## jcc rel8 opcodes (skip when NOT cond): JE=74 JNE=75 JL=7C JGE=7D.
-
-DEFINE P1_B 41FFE4 ## jmp r12
-DEFINE P1_BEQ_R2_R3_R7 4839D6750341FFE4 ## cmp ; jne +3 ; jmp r12
-DEFINE P1_BNE_R2_R3_R7 4839D6740341FFE4 ## cmp ; je +3 ; jmp r12
-DEFINE P1_BLT_R2_R3_R7 4839D67D0341FFE4 ## cmp rsi,rdx ; jge +3 ; jmp r12
-DEFINE P1_BLT_R4_R2_R7 4939F27D0341FFE4 ## cmp r10,rsi ; jge +3 ; jmp r12
-
-
-## ---- Tranche 5: CALL / RET / PROLOGUE / EPILOGUE / TAIL -----------------
-## amd64's native CALL already pushes the return address to the stack, but
-## the P1 convention requires a 16-byte frame on entry so that [sp+0] =
-## retaddr and [sp+8] = callee-private scratch (see P1.md §PROLOGUE).
-## The native CALL only supplies [sp+0]; we have to allocate [sp+8]
-## ourselves. The pop/sub/push dance is the smallest way to do that
-## while keeping retaddr at [sp+0]:
-##
-## pop r11 ; r11 = retaddr (scratch — r11 is not a P1 reg)
-## sub rsp, 16 ; reserve 2 cells
-## push r11 ; [sp+0] = retaddr, [sp+8] = scratch
-##
-## r11 is Linux's syscall clobber register and not mapped to any P1 GPR,
-## so using it as the pop scratch is safe. EPILOGUE is the inverse.
-##
-## CALL expects the target pre-loaded into r7 (= r12); expands to `call r12`.
-## TAIL = EPILOGUE + unconditional B (= `jmp r12`).
-
-DEFINE P1_PROLOGUE 415B4883EC104153 ## pop r11 ; sub rsp,16 ; push r11
-DEFINE P1_EPILOGUE 415B4883C4104153 ## pop r11 ; add rsp,16 ; push r11
-DEFINE P1_RET C3 ## ret
-DEFINE P1_CALL 41FFD4 ## call r12
-DEFINE P1_TAIL 415B4883C410415341FFE4 ## epilogue ; jmp r12
-
-
-## ---- Tranche 6: seed-Lisp step 1 extensions ---------------------------
-## Ops required by lisp.M1 (LISP.md item 1, proof-of-life): file I/O
-## wrappers, cell loads/stores, and the arg-shuffling MOVs they imply.
-
-## Extra Linux syscall numbers (amd64-specific table).
-DEFINE SYS_READ 00000000
+DEFINE SYS_EXIT 3C000000
+DEFINE SYS_READ 00000000
DEFINE SYS_CLOSE 03000000
-## SYSOPEN — portable open(path, flags, mode). Assumes r1=path, r2=flags,
-## r3=mode. On amd64 maps to open(2) (syscall 2); P1's r1/r2/r3 already
-## sit in rdi/rsi/rdx, so the expansion is just load-num + syscall.
-##
-## Expansion:
-## mov eax, 2 ; B8 02 00 00 00 (zero-extends to rax)
-## syscall ; 0F 05
-DEFINE P1_SYSOPEN B8020000000F05
-
-## MOV rD, rA — extra register pairings used around syscalls / calls.
-DEFINE P1_MOV_R6_R0 4889C3 ## mov rbx, rax
-DEFINE P1_MOV_R0_R3 4889D0 ## mov rax, rdx
-DEFINE P1_MOV_R7_R0 4989C4 ## mov r12, rax
-DEFINE P1_MOV_R7_R2 4989F4 ## mov r12, rsi
-DEFINE P1_MOV_R2_R6 4889DE ## mov rsi, rbx
-DEFINE P1_MOV_R3_R7 4C89E2 ## mov rdx, r12
-DEFINE P1_MOV_R2_R7 4C89E6 ## mov rsi, r12
-DEFINE P1_MOV_R4_R7 4D89E2 ## mov r10, r12
-
-## LD/ST extras — additional dst/base pairings at imm8 offsets.
-DEFINE P1_LD_R0_R6_0 488B03 ## mov rax, [rbx]
-DEFINE P1_LD_R1_R6_16 488B7B10 ## mov rdi, [rbx+16]
-DEFINE P1_LD_R3_R4_0 498B12 ## mov rdx, [r10]
-DEFINE P1_LD_R0_R5_0 498B00 ## mov rax, [r8]
-DEFINE P1_LB_R1_R4_0 490FB63A ## movzx rdi, byte [r10]
-DEFINE P1_ST_R2_R4_0 498932 ## mov [r10], rsi
-
-## ADD r2, r3, r1 — rsi = rdx + rdi. Two-insn form like other ADDs.
-## mov rsi, rdx ; add rsi, rdi
+## ---- Reg-reg-reg arithmetic (tranche 1) --------------------------
+DEFINE P1_ADD_R1_R1_R2 4889FF4801F7
+DEFINE P1_ADD_R1_R1_R4 4889FF4C01D7
+DEFINE P1_ADD_R2_R2_R6 4889F64801DE
DEFINE P1_ADD_R2_R3_R1 4889D64801FE
+DEFINE P1_SUB_R1_R1_R2 4889FF4829F7
+DEFINE P1_SUB_R2_R2_R6 4889F64829DE
+DEFINE P1_AND_R1_R1_R5 4889FF4C21C7
+DEFINE P1_OR_R1_R1_R2 4889FF4809F7
+DEFINE P1_XOR_R1_R1_R2 4889FF4831F7
+DEFINE P1_MUL_R1_R1_R2 4889FF480FAFFE
+DEFINE P1_DIV_R1_R1_R2 4989C34889D14889F8489948F7FE4889C74889CA4C89D8
+DEFINE P1_REM_R1_R1_R5 4989C34889D14889F8489949F7F84889D74889CA4C89D8
+DEFINE P1_SHL_R1_R1_R2 4889FF4889F148D3E7
+DEFINE P1_SHR_R1_R1_R2 4889FF4889F148D3EF
+DEFINE P1_SAR_R4_R4_R2 4D89D24889F149D3FA
-## BLT r0, r2, r7 — signed jump-if-less; used for argc/open/read checks.
-## cmp rax, rsi ; jge +3 ; jmp r12
+## ---- Immediate arithmetic (tranche 2) ----------------------------
+DEFINE P1_ADDI_R1_R1_3 4889FF4883C703
+DEFINE P1_ADDI_R1_R1_1 4889FF4883C701
+DEFINE P1_ADDI_R1_R1_NEG3 4889FF4883C7FD
+DEFINE P1_ADDI_R4_R4_NEG1 4D89D24983C2FF
+DEFINE P1_ADDI_R1_R1_NEG2 4889FF4883C7FE
+DEFINE P1_ADDI_R0_R0_1 4889C04883C001
+DEFINE P1_SHLI_R1_R1_1 4889FF48C1E701
+DEFINE P1_SHRI_R1_R1_1 4889FF48C1EF01
+DEFINE P1_SARI_R4_R4_1 4D89D249C1FA01
+DEFINE P1_ANDI_R1_R1_6 4889FF4883E706
+DEFINE P1_ANDI_R1_R1_7 4889FF4883E707
+DEFINE P1_ORI_R1_R1_1 4889FF4883CF01
+DEFINE P1_ORI_R0_R0_2 4889C04883C802
+DEFINE P1_ORI_R0_R0_7 4889C04883C807
+
+## ---- LA + memory ops (tranche 3) ---------------------------------
+DEFINE P1_LA_R4 41BA
+DEFINE P1_ST_R1_R4_0 49897A00
+DEFINE P1_LD_R1_R4_0 498B7A00
+DEFINE P1_ST_R1_R4_8 49897A08
+DEFINE P1_LD_R1_R4_8 498B7A08
+DEFINE P1_SB_R1_R4_16 49887A10
+DEFINE P1_LB_R1_R4_16 490FB67A10
+DEFINE P1_ST_R1_R4_NEG8 49897AF8
+DEFINE P1_LD_R1_R4_NEG8 498B7AF8
+
+## ---- Branches (tranche 4, r7-indirect) ---------------------------
+DEFINE P1_B 41FFE4
+DEFINE P1_BEQ_R2_R3_R7 4839D6750341FFE4
+DEFINE P1_BNE_R2_R3_R7 4839D6740341FFE4
+DEFINE P1_BLT_R2_R3_R7 4839D67D0341FFE4
+DEFINE P1_BLT_R4_R2_R7 4939F27D0341FFE4
+
+## ---- Control: CALL/RET + single-slot and N-slot PROLOGUE/EPILOGUE/TAIL
+DEFINE P1_PROLOGUE 415B4883EC104153
+DEFINE P1_EPILOGUE 415B4883C4104153
+DEFINE P1_RET C3
+DEFINE P1_CALL 41FFD4
+DEFINE P1_TAIL 415B4883C410415341FFE4
+DEFINE P1_PROLOGUE_N2 415B4883EC204153
+DEFINE P1_EPILOGUE_N2 415B4883C4204153
+DEFINE P1_TAIL_N2 415B4883C420415341FFE4
+DEFINE P1_PROLOGUE_N3 415B4883EC204153
+DEFINE P1_EPILOGUE_N3 415B4883C4204153
+DEFINE P1_TAIL_N3 415B4883C420415341FFE4
+DEFINE P1_PROLOGUE_N4 415B4883EC304153
+DEFINE P1_EPILOGUE_N4 415B4883C4304153
+DEFINE P1_TAIL_N4 415B4883C430415341FFE4
+
+## ---- Seed-Lisp step 1 extensions (tranche 6) ---------------------
+DEFINE P1_MOV_R1_R6 4889DF
+DEFINE P1_MOV_R6_R1 4889FB
+DEFINE P1_MOV_R6_R0 4889C3
+DEFINE P1_MOV_R0_R3 4889D0
+DEFINE P1_MOV_R7_R0 4989C4
+DEFINE P1_MOV_R7_R2 4989F4
+DEFINE P1_MOV_R2_R6 4889DE
+DEFINE P1_MOV_R3_R7 4C89E2
+DEFINE P1_MOV_R2_R7 4C89E6
+DEFINE P1_MOV_R4_R7 4D89E2
+DEFINE P1_MOV_R2_SP 4889E6
+DEFINE P1_MOV_R4_SP 4989E2
+DEFINE P1_MOV_R6_SP 4889E3
+DEFINE P1_MOV_R2_R0 4889C6
+DEFINE P1_LD_R0_R6_0 488B4300
+DEFINE P1_LD_R1_R6_16 488B7B10
+DEFINE P1_LD_R3_R4_0 498B5200
+DEFINE P1_LD_R0_R5_0 498B4000
+DEFINE P1_LB_R1_R4_0 490FB67A00
+DEFINE P1_ST_R2_R4_0 49897200
+DEFINE P1_ST_R0_R4_8 49894208
+DEFINE P1_LD_R0_R4_8 498B4208
+DEFINE P1_LB_R1_R0_0 480FB67800
+DEFINE P1_LD_R0_R1_0 488B4700
+DEFINE P1_LD_R0_R1_8 488B4708
+DEFINE P1_ST_R1_R0_0 48897800
+DEFINE P1_LD_R2_R4_0 498B7200
+DEFINE P1_ST_R2_R0_8 48897008
+DEFINE P1_LD_R0_R4_0 498B4200
+DEFINE P1_ST_R2_R4_16 49897210
+DEFINE P1_LD_R2_R4_16 498B7210
DEFINE P1_BLT_R0_R2_R7 4839F07D0341FFE4
-
-
-## ---- Tranche 7: [sp+8] scratch-slot access + assorted MOV/LD/ST -------
-## P1.md §PROLOGUE defines [sp+8] as a callee-private scratch cell.
-## Access pattern: MOV rX, sp then LD/ST rY, [rX, 8].
-
-DEFINE P1_MOV_R4_SP 4989E2 ## mov r10, rsp
-DEFINE P1_MOV_R2_R0 4889C6 ## mov rsi, rax
-DEFINE P1_ST_R0_R4_8 49894208 ## mov [r10+8], rax
-DEFINE P1_LD_R0_R4_8 498B4208 ## mov rax, [r10+8]
-DEFINE P1_LB_R1_R0_0 480FB638 ## movzx rdi, byte [rax]
-
-
-## ---- Tranche 8: seed-Lisp step 2 (tagged values) ---------------------
-## See p1_aarch64.M1 §Tranche 8.
-
-## sub rdi, 2 (via the 2-insn mov+add idiom kept for consistency).
-DEFINE P1_ADDI_R1_R1_NEG2 4889FF4883C7FE ## mov rdi,rdi ; add rdi,-2
-
-## Pair-cell loads/stores. REX.W=48 for 64-bit; r10 base needs REX.B.
-DEFINE P1_LD_R0_R1_0 488B07 ## mov rax, [rdi]
-DEFINE P1_LD_R0_R1_8 488B4708 ## mov rax, [rdi+8]
-DEFINE P1_ST_R1_R0_0 488938 ## mov [rax], rdi
-DEFINE P1_LD_R2_R4_0 498B32 ## mov rsi, [r10]
-DEFINE P1_ST_R2_R0_8 48897008 ## mov [rax+8], rsi
-
-## Tag ops — imm8 sign-extended via opcode 83 (/1 or, /4 and).
-DEFINE P1_ORI_R0_R0_2 4889C04883C802 ## mov rax,rax ; or rax, 2
-DEFINE P1_ANDI_R1_R1_7 4889FF4883E707 ## mov rdi,rdi ; and rdi, 7
-
-## BNE pairs: cmp a,b ; je +3 ; jmp r12
-DEFINE P1_BNE_R1_R2_R7 4839F7740341FFE4 ## cmp rdi,rsi ; je +3 ; jmp r12
-DEFINE P1_BNE_R0_R2_R7 4839F0740341FFE4 ## cmp rax,rsi ; je +3 ; jmp r12
-
-## heap_next boot-alignment helpers (see p1_aarch64.M1 §Tranche 8).
-DEFINE P1_ORI_R0_R0_7 4889C04883C807 ## mov rax,rax ; or rax, 7
-DEFINE P1_ADDI_R0_R0_1 4889C04883C001 ## mov rax,rax ; add rax, 1
-DEFINE P1_LD_R0_R4_0 498B02 ## mov rax, [r10]
+DEFINE P1_BNE_R1_R2_R7 4839F7740341FFE4
+DEFINE P1_BNE_R0_R2_R7 4839F0740341FFE4
diff --git a/p1_gen.py b/p1_gen.py
@@ -0,0 +1,949 @@
+#!/usr/bin/env python3
+"""p1_gen.py — generate p1_<arch>.M1 from a shared op table.
+
+Single source of truth for the P1 DEFINE tables across all three target
+arches. Replaces the hand-written p1_<arch>.M1 files; running this script
+rewrites all three in place.
+
+Why a generator: the hand-written defs diverge across arches in
+hard-to-review ways, typos silently produce SIGILL'ing binaries (M1
+passes undefined tokens through as literal text — see P1_TODO.md issue
+1), and the combinatorial surface (~1200 DEFINEs per arch if fully
+enumerated) is past the point of hand-maintainability.
+
+Design:
+ * OPS is a list of emission rows. Each row is a small class whose
+ `encode(arch) -> hex_string` method knows how to lower itself to
+ that arch's native bytes.
+ * Per-arch encoders live next to the Op classes. Adding a new op
+ means adding one Op subclass with three encode methods.
+ * Row ordering controls the output order in the .M1 file; tranches
+ are grouped by banner comments.
+
+Running:
+ $ python3 p1_gen.py # rewrite all three files
+ $ python3 p1_gen.py --check # diff against current files
+
+Output files: p1_aarch64.M1, p1_amd64.M1, p1_riscv64.M1.
+"""
+
+import os
+import sys
+from dataclasses import dataclass
+from typing import Optional
+
+ARCHES = ('aarch64', 'amd64', 'riscv64')
+
+## ---------- Register mappings --------------------------------------------
+## P1 register name → native encoding number. The native numbers are what
+## the per-arch encoders insert into instruction fields; the human-facing
+## names (rax, x1, a2, …) never appear in this file.
+
+NAT_AA64 = {'r0': 0, 'r1': 1, 'r2': 2, 'r3': 3,
+ 'r4': 4, 'r5': 5, 'r6': 19, 'r7': 20,
+ 'sp': 31, 'xzr': 31, 'lr': 30,
+ 'x21': 21, 'x22': 22, 'x23': 23, 'x24': 24, 'x25': 25, 'x8': 8}
+
+## amd64 ModRM.reg/rm + REX.R/B bit: native regnums 0..15 with r8..r15
+## setting the REX bit. We store the 4-bit native number directly.
+NAT_AMD64 = {'r0': 0, # rax
+ 'r1': 7, # rdi
+ 'r2': 6, # rsi
+ 'r3': 2, # rdx
+ 'r4': 10, # r10
+ 'r5': 8, # r8
+ 'r6': 3, # rbx
+ 'r7': 12, # r12
+ 'sp': 4, # rsp
+ 'rcx': 1, # shift-count scratch (not a P1 reg)
+ 'r9': 9, # syscall arg6 slot
+ 'r11': 11, # call-retaddr scratch in PROLOGUE
+ }
+
+NAT_RV64 = {'r0': 10, 'r1': 11, 'r2': 12, 'r3': 13,
+ 'r4': 14, 'r5': 15, 'r6': 9, 'r7': 18,
+ 'sp': 2, 'ra': 1, 'zero': 0, 'a7': 17,
+ 's3': 19, 's4': 20, 's5': 21, 's6': 22, 's7': 23}
+
+
+## ---------- Low-level encoding helpers -----------------------------------
+
+def le32(n: int) -> str:
+ return (n & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
+
+def byte(n: int) -> str:
+ return f'{n & 0xFF:02X}'
+
+
+## ---------- amd64 primitive encoders ------------------------------------
+## amd64 is variable-length. Helpers below emit specific instruction
+## shapes used by the P1 expansions. REX prefix bits: W=64b, R=ModRM.reg
+## high, B=ModRM.rm high, X=SIB.index high (unused here).
+
+def rex(w, r, x, b):
+ v = 0x40 | (w << 3) | (r << 2) | (x << 1) | b
+ return byte(v)
+
+def modrm(mod, reg, rm):
+ return byte((mod << 6) | ((reg & 7) << 3) | (rm & 7))
+
+def amd_mov_rr(dst, src):
+ """mov dst, src — REX.W + 89 /r (MOV r/m64, r64)."""
+ d, s = NAT_AMD64[dst], NAT_AMD64[src]
+ return rex(1, s >> 3, 0, d >> 3) + '89' + modrm(3, s, d)
+
+def amd_alu_rr(op, dst, src):
+ """op dst, src — 2-operand ALU. op is the opcode byte (01 add,
+ 29 sub, 21 and, 09 or, 31 xor)."""
+ d, s = NAT_AMD64[dst], NAT_AMD64[src]
+ return rex(1, s >> 3, 0, d >> 3) + op + modrm(3, s, d)
+
+def amd_alu_ri8(ext, dst, imm):
+ """op dst, imm8 (sign-extended). Opcode 83 /ext ib."""
+ d = NAT_AMD64[dst]
+ return rex(1, 0, 0, d >> 3) + '83' + modrm(3, ext, d) + byte(imm)
+
+def amd_shift_ri8(ext, dst, imm):
+ """shl/shr/sar dst, imm8. Opcode C1 /ext ib."""
+ d = NAT_AMD64[dst]
+ return rex(1, 0, 0, d >> 3) + 'C1' + modrm(3, ext, d) + byte(imm)
+
+def amd_shift_cl(ext, dst):
+ """shl/shr/sar dst, cl. Opcode D3 /ext."""
+ d = NAT_AMD64[dst]
+ return rex(1, 0, 0, d >> 3) + 'D3' + modrm(3, ext, d)
+
+def amd_imul_rr(dst, src):
+ """imul dst, src — 0F AF /r."""
+ d, s = NAT_AMD64[dst], NAT_AMD64[src]
+ return rex(1, d >> 3, 0, s >> 3) + '0FAF' + modrm(3, d, s)
+
+def amd_idiv(src):
+ """idiv src — F7 /7 (signed div of rdx:rax by src)."""
+ s = NAT_AMD64[src]
+ return rex(1, 0, 0, s >> 3) + 'F7' + modrm(3, 7, s)
+
+def amd_cqo():
+ """cqo — sign-extend rax into rdx:rax. 48 99."""
+ return '4899'
+
+def amd_mem_rm(opcode, reg, base, disp):
+ """[base+disp] <-> reg, for MOV r,r/m or MOV r/m,r (opcode=89 store, 8B load).
+ disp is signed int; encodes as disp8 if in range, else disp32."""
+ r, b = NAT_AMD64[reg], NAT_AMD64[base]
+ prefix = rex(1, r >> 3, 0, b >> 3) + opcode
+ if -128 <= disp <= 127:
+ mod = 1
+ d = byte(disp)
+ elif b == 4: # SIB required for rsp
+ mod = 2
+ d = (disp & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
+ else:
+ mod = 2
+ d = (disp & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
+    # ModRM.rm=100 is the SIB escape, so any base with low bits 100 — rsp
+    # AND r12 (P1 r7) — needs a SIB byte. (rm=101 mod=0 would be RIP-rel.)
+    if (b & 7) == 4:
+        return prefix + modrm(mod, r, 4) + '24' + d
+
+def amd_mov_rm_b(reg, base, disp, store):
+ """Byte load/store. 88 /r (store), 0F B6 /r (movzx load)."""
+ r, b = NAT_AMD64[reg], NAT_AMD64[base]
+ if -128 <= disp <= 127:
+ mod = 1
+ d = byte(disp)
+ else:
+ mod = 2
+ d = (disp & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
+    if store:
+        # MOV r/m8, r8 — 88 /r. Requires REX to address dil/sil/bpl/spl.
+        prefix = rex(1, r >> 3, 0, b >> 3) + '88'
+        sib = '24' if (b & 7) == 4 else ''      # rm=100 is the SIB escape (rsp AND r12)
+        rmv = 4 if (b & 7) == 4 else b
+        return prefix + modrm(mod, r, rmv) + sib + d
+    else:
+        # MOVZX r64, r/m8 — REX.W 0F B6 /r.
+        prefix = rex(1, r >> 3, 0, b >> 3) + '0FB6'
+        sib = '24' if (b & 7) == 4 else ''      # REX.B already carries the high bit
+        rmv = 4 if (b & 7) == 4 else b
+        return prefix + modrm(mod, r, rmv) + sib + d
+
+
+## ---------- aarch64 primitive encoders ----------------------------------
+## aarch64 is fixed 4-byte insns. Helpers return the 4 bytes LE-encoded.
+
+def aa_rrr(base, rD, rA, rB):
+ d, a, b = NAT_AA64[rD], NAT_AA64[rA], NAT_AA64[rB]
+ return le32(base | (b << 16) | (a << 5) | d)
+
+def aa_add_imm(rD, rA, imm12, sub=False):
+ """ADD/SUB (immediate, shift=0). imm12 unsigned 0..4095."""
+ d, a = NAT_AA64[rD], NAT_AA64[rA]
+ base = 0xD1000000 if sub else 0x91000000
+ return le32(base | ((imm12 & 0xFFF) << 10) | (a << 5) | d)
+
+def aa_logical_imm(base, rD, rA, N, immr, imms):
+ d, a = NAT_AA64[rD], NAT_AA64[rA]
+ return le32(base | (N << 22) | (immr << 16) | (imms << 10) | (a << 5) | d)
+
+def aa_ubfm(rD, rA, immr, imms):
+ """UBFM (N=1 for sf=64)."""
+ d, a = NAT_AA64[rD], NAT_AA64[rA]
+ return le32(0xD3400000 | (immr << 16) | (imms << 10) | (a << 5) | d)
+
+def aa_sbfm(rD, rA, immr, imms):
+ """SBFM (N=1 for sf=64)."""
+ d, a = NAT_AA64[rD], NAT_AA64[rA]
+ return le32(0x93400000 | (immr << 16) | (imms << 10) | (a << 5) | d)
+
+def aa_ldst_uimm12(base, rT, rN, off_bytes, size_log2):
+ """LDR/STR (unsigned offset). off_bytes must be a multiple of
+ 2^size_log2 and non-negative. imm12 = off_bytes >> size_log2."""
+ assert off_bytes >= 0 and (off_bytes % (1 << size_log2)) == 0
+ imm12 = off_bytes >> size_log2
+ assert 0 <= imm12 < 4096
+ t, n = NAT_AA64[rT], NAT_AA64[rN]
+ return le32(base | (imm12 << 10) | (n << 5) | t)
+
+def aa_ldst_unscaled(base, rT, rN, off):
+ """LDUR/STUR (unscaled, signed imm9)."""
+ assert -256 <= off <= 255
+ imm9 = off & 0x1FF
+ t, n = NAT_AA64[rT], NAT_AA64[rN]
+ return le32(base | (imm9 << 12) | (n << 5) | t)
+
+
+## ---------- riscv64 primitive encoders ----------------------------------
+
+def rv_r(base, rD, rA, rB):
+ d, a, b = NAT_RV64[rD], NAT_RV64[rA], NAT_RV64[rB]
+ return le32(base | (b << 20) | (a << 15) | (d << 7))
+
+def rv_i(base, rD, rA, imm12):
+ """I-type: imm12[11:0], rs1, funct3, rd, opcode. imm12 is a signed
+ int that gets masked to 12 bits."""
+ d, a = NAT_RV64[rD], NAT_RV64[rA]
+ return le32(base | ((imm12 & 0xFFF) << 20) | (a << 15) | (d << 7))
+
+def rv_s(base, rS, rA, imm12):
+ """S-type store: imm12[11:5] rs2 rs1 funct3 imm12[4:0] opcode."""
+ s, a = NAT_RV64[rS], NAT_RV64[rA]
+ hi = (imm12 >> 5) & 0x7F
+ lo = imm12 & 0x1F
+ return le32(base | (hi << 25) | (s << 20) | (a << 15) | (lo << 7))
+
+def rv_shift_imm(base, rD, rA, shamt):
+ """Shift-imm: base already has funct7 set; shamt in [0,63]."""
+ d, a = NAT_RV64[rD], NAT_RV64[rA]
+ return le32(base | ((shamt & 0x3F) << 20) | (a << 15) | (d << 7))
+
+
+## ---------- Ops ---------------------------------------------------------
+## Each class represents a row in the output. `name` is the DEFINE
+## name (without the P1_ prefix that gets added automatically).
+
+@dataclass
+class Op:
+ name: str
+ comment: str = ''
+
+ def encode(self, arch: str) -> str:
+ raise NotImplementedError
+
+## --- Reg-reg-reg arith ---
+## Per-arch base opcodes. Dict-of-dicts: BASES[op][arch] = base_value.
+
+AA64_RRR_BASE = {
+ 'ADD': 0x8B000000,
+ 'SUB': 0xCB000000,
+ 'AND': 0x8A000000,
+ 'OR': 0xAA000000,
+ 'XOR': 0xCA000000,
+ 'SHL': 0x9AC02000,
+ 'SHR': 0x9AC02400,
+ 'SAR': 0x9AC02800,
+ 'DIV': 0x9AC00C00,
+}
+AMD64_RRR_OPC = {
+ 'ADD': '01', 'SUB': '29', 'AND': '21', 'OR': '09', 'XOR': '31',
+}
+RV_RRR = {
+ 'ADD': (0x00000033,), # funct7=0 funct3=0 opcode=0x33
+ 'SUB': (0x40000033,),
+ 'XOR': (0x00004033,),
+ 'OR': (0x00006033,),
+ 'AND': (0x00007033,),
+ 'SHL': (0x00001033,),
+ 'SHR': (0x00005033,),
+ 'SAR': (0x40005033,),
+ 'MUL': (0x02000033,),
+ 'DIV': (0x02004033,),
+ 'REM': (0x02006033,),
+}
+
+@dataclass
+class RRR(Op):
+ op: str = ''
+ rD: str = ''
+ rA: str = ''
+ rB: str = ''
+
+ def encode(self, arch):
+ if arch == 'aarch64':
+ if self.op == 'MUL':
+ # MUL = MADD with Ra=xzr. 100 11011 000 mmmmm 0 aaaaa nnnnn ddddd
+ d = NAT_AA64[self.rD]
+ a = NAT_AA64[self.rA]
+ b = NAT_AA64[self.rB]
+ return le32(0x9B000000 | (b << 16) | (31 << 10) | (a << 5) | d)
+ if self.op == 'REM':
+ # SDIV x16, xA, xB ; MSUB xD, x16, xB, xA.
+ # x16 (ARM IP0, caller-saved, not a P1 reg) is scratch so
+ # REM does not hidden-clobber P1 r4 — the op modifies rD only.
+ d = NAT_AA64[self.rD]; a = NAT_AA64[self.rA]; b = NAT_AA64[self.rB]
+ SC = 16
+ sdiv = 0x9AC00C00 | (b << 16) | (a << 5) | SC
+            msub = 0x9B008000 | (b << 16) | (a << 10) | (SC << 5) | d  # o0=1 selects MSUB; 0x9B000000 would be MADD
+ return le32(sdiv) + le32(msub)
+ base = AA64_RRR_BASE[self.op]
+ return aa_rrr(base, self.rD, self.rA, self.rB)
+
+ if arch == 'amd64':
+ if self.op == 'MUL':
+ return amd_mov_rr(self.rD, self.rA) + amd_imul_rr(self.rD, self.rB)
+ if self.op in ('DIV', 'REM'):
+ # x86 idiv implicitly reads/writes rax (P1 r0) and rdx
+ # (P1 r3). To keep DIV/REM clobber-free (only rD changes),
+ # stash r0 into r11 and r3 into rcx — neither is a P1 reg —
+ # then restore. If rA or rB alias r0/r3, read from the
+ # saved copy since we've overwritten the originals.
+ # Skip the final restore for whichever of r0/r3 *is* rD,
+ # so rD keeps its newly computed value.
+ seq = amd_mov_rr('r11', 'r0') # save r0 (rax)
+ seq += amd_mov_rr('rcx', 'r3') # save r3 (rdx)
+ src_a = 'r11' if self.rA == 'r0' else ('rcx' if self.rA == 'r3' else self.rA)
+ seq += amd_mov_rr('r0', src_a) # rax = rA
+ seq += amd_cqo() # rdx:rax = sign-ext rax
+ src_b = 'r11' if self.rB == 'r0' else ('rcx' if self.rB == 'r3' else self.rB)
+ seq += amd_idiv(src_b)
+ if self.op == 'DIV':
+ seq += amd_mov_rr(self.rD, 'r0') # rD = quotient
+ else:
+ seq += amd_mov_rr(self.rD, 'r3') # rD = remainder
+ if self.rD != 'r3':
+ seq += amd_mov_rr('r3', 'rcx') # restore r3
+ if self.rD != 'r0':
+ seq += amd_mov_rr('r0', 'r11') # restore r0
+ return seq
+ if self.op in ('SHL', 'SHR', 'SAR'):
+ ext = {'SHL': 4, 'SHR': 5, 'SAR': 7}[self.op]
+ seq = amd_mov_rr(self.rD, self.rA)
+ seq += amd_mov_rr('rcx', self.rB)
+ seq += amd_shift_cl(ext, self.rD)
+ return seq
+ # ADD/SUB/AND/OR/XOR: mov rD,rA ; op rD,rB
+ seq = amd_mov_rr(self.rD, self.rA)
+ seq += amd_alu_rr(AMD64_RRR_OPC[self.op], self.rD, self.rB)
+ return seq
+
+ if arch == 'riscv64':
+ base, = RV_RRR[self.op]
+ return rv_r(base, self.rD, self.rA, self.rB)
+
+ raise ValueError(arch)
+
## --- Immediate arith ---
@dataclass
class AddI(Op):
    """ADDI rD, rA, imm — small signed immediate add (rD = rA + imm)."""
    rD: str = ''
    rA: str = ''
    imm: int = 0

    def encode(self, arch):
        if arch == 'aarch64':
            # add/sub-immediate takes an unsigned imm12, so a negative
            # imm becomes the SUB form with the magnitude.
            if self.imm >= 0:
                return aa_add_imm(self.rD, self.rA, self.imm, sub=False)
            else:
                return aa_add_imm(self.rD, self.rA, -self.imm, sub=True)
        if arch == 'amd64':
            # mov rD,rA ; add rD,imm8
            # NOTE(review): assumes imm fits a sign-extended imm8
            # (-128..127) — confirm amd_alu_ri8 guards the range.
            seq = amd_mov_rr(self.rD, self.rA)
            seq += amd_alu_ri8(0, self.rD, self.imm) # /0 = ADD
            return seq
        if arch == 'riscv64':
            # addi: signed imm12, handles negatives natively.
            return rv_i(0x00000013, self.rD, self.rA, self.imm)
+
@dataclass
class LogI(Op):
    """Bitwise-imm ops: ANDI, ORI (rD = rA & imm / rA | imm).

    aarch64 has no plain immediate field for logicals — its
    logical-immediate encoding is pattern-based, so each row supplies
    the N/immr/imms triple alongside the plain `imm` the other two
    arches use. imm is small and explicit for clarity.
    """
    op: str = ''
    rD: str = ''
    rA: str = ''
    imm: int = 0
    aa_N: int = 0
    aa_immr: int = 0
    aa_imms: int = 0

    def encode(self, arch):
        if arch == 'aarch64':
            # 64-bit AND-immediate vs ORR-immediate base words.
            base = 0x92000000 if self.op == 'ANDI' else 0xB2000000 # ORI = orr
            return aa_logical_imm(base, self.rD, self.rA, self.aa_N, self.aa_immr, self.aa_imms)
        if arch == 'amd64':
            # Group-1 ModRM /digit: AND = /4, OR = /1.
            ext = {'ANDI': 4, 'ORI': 1}[self.op]
            seq = amd_mov_rr(self.rD, self.rA)
            seq += amd_alu_ri8(ext, self.rD, self.imm)
            return seq
        if arch == 'riscv64':
            # andi funct3=7, ori funct3=6; signed imm12.
            base = {'ANDI': 0x00007013, 'ORI': 0x00006013}[self.op]
            return rv_i(base, self.rD, self.rA, self.imm)
+
@dataclass
class ShiftI(Op):
    """Shift-by-constant: rD = rA shifted by imm bits."""
    op: str = '' # SHLI/SHRI/SARI
    rD: str = ''
    rA: str = ''
    imm: int = 0

    def encode(self, arch):
        if arch == 'aarch64':
            # aarch64 spells constant shifts as bitfield-move aliases:
            # LSL #n = UBFM immr=(-n)&63, imms=63-n ;
            # LSR #n = UBFM immr=n, imms=63 ; ASR #n = SBFM immr=n, imms=63.
            if self.op == 'SHLI':
                return aa_ubfm(self.rD, self.rA, (-self.imm) & 63, 63 - self.imm)
            if self.op == 'SHRI':
                return aa_ubfm(self.rD, self.rA, self.imm, 63)
            if self.op == 'SARI':
                return aa_sbfm(self.rD, self.rA, self.imm, 63)
        if arch == 'amd64':
            # Group-2 /digit with imm8 count: SHL=/4, SHR=/5, SAR=/7.
            ext = {'SHLI': 4, 'SHRI': 5, 'SARI': 7}[self.op]
            seq = amd_mov_rr(self.rD, self.rA)
            seq += amd_shift_ri8(ext, self.rD, self.imm)
            return seq
        if arch == 'riscv64':
            # slli/srli/srai with a 6-bit shamt on rv64; srai sets bit 30.
            base = {'SHLI': 0x00001013, 'SHRI': 0x00005013, 'SARI': 0x40005013}[self.op]
            return rv_shift_imm(base, self.rD, self.rA, self.imm)
+
## --- Moves ---
@dataclass
class Mov(Op):
    """Plain register move: rD = rA.  rA may be 'sp'."""
    rD: str = ''
    rA: str = ''

    def encode(self, arch):
        if arch == 'amd64':
            return amd_mov_rr(self.rD, self.rA)
        if arch == 'riscv64':
            # The canonical `mv` pseudo: addi rD, rA, 0.
            return rv_i(0x00000013, self.rD, self.rA, 0)
        if arch == 'aarch64':
            # Register number 31 means XZR in ORR's operand slots but SP
            # in ADD-immediate's, so moves from sp take the `add xD, sp, #0`
            # form; everything else is MOV xD, xA = ORR xD, xzr, xA.
            if self.rA == 'sp':
                return aa_add_imm(self.rD, 'sp', 0, sub=False)
            dst = NAT_AA64[self.rD]
            src = NAT_AA64[self.rA]
            return le32(0xAA000000 | (src << 16) | (31 << 5) | dst)
+
## --- LI (wide literal) ---
@dataclass
class Li(Op):
    """LI rD — load a 4-byte little-endian literal, zero-extended to 64
    bits.  The DEFINE emits only the code part; the caller places the 4
    literal bytes (or a Hex2 `&label`) immediately after it."""
    rD: str = ''

    def encode(self, arch):
        if arch == 'aarch64':
            # ldr wD, [pc+8] ; b +8 (caller emits 4 bytes of data next).
            # The 32-bit LDR-literal zero-extends into the full xD.
            d = NAT_AA64[self.rD]
            ldr_w_lit = 0x18000040 | d # LDR (literal) 32-bit, offset 8: imm19=2 → 0x40 in [23:5]
            b_plus8 = 0x14000002 # B offset 8 (imm26 = 2 words = 8 bytes)
            return le32(ldr_w_lit) + le32(b_plus8)
        if arch == 'amd64':
            # mov <rD as r32>, imm32 — opcode B8+r (with REX.B if r8..r15).
            # The caller's 4 literal bytes serve as the imm32; a 32-bit
            # mov zero-extends into the full 64-bit register.
            d = NAT_AMD64[self.rD]
            if d >= 8:
                return '41' + byte(0xB8 + (d & 7))
            return byte(0xB8 + d)
        if arch == 'riscv64':
            # auipc rD,0 ; lwu rD,12(rD) ; jal x0,+8
            # The literal sits 12 bytes past the auipc (3 × 4-byte insns);
            # lwu zero-extends the 32-bit load.
            d = NAT_RV64[self.rD]
            auipc = 0x00000017 | (d << 7)
            lwu = 0x00006003 | (d << 7) | (d << 15) | (12 << 20)
            jal_p8 = 0x0080006F # jal x0, +8
            return le32(auipc) + le32(lwu) + le32(jal_p8)
+
## --- LA (address-load) — in the spike, same as LI ---
@dataclass
class La(Op):
    """LA rD — load a label's address.  Hex2 `&label` already emits the
    4-byte absolute address in little-endian, so LA is byte-identical
    to LI in this spike."""
    rD: str = ''

    def encode(self, arch):
        return Li(name=self.name, rD=self.rD).encode(arch)
+
## --- Memory: LD (64b), ST (64b), LB (8b zero-ext), SB (8b) ---
@dataclass
class Mem(Op):
    """Load/store with a signed byte offset.  LD/ST are 64-bit; LB/SB
    are 8-bit, with LB zero-extending into the full register."""
    op: str = '' # LD/ST/LB/SB
    rT: str = '' # load dest or store src
    rN: str = '' # base
    off: int = 0

    def encode(self, arch):
        if arch == 'aarch64':
            # Non-negative offsets use the scaled uimm12 form; negative
            # offsets fall back to the unscaled (LDUR/STUR-family) form.
            if self.op == 'LD':
                if self.off >= 0:
                    return aa_ldst_uimm12(0xF9400000, self.rT, self.rN, self.off, 3)
                return aa_ldst_unscaled(0xF8400000, self.rT, self.rN, self.off)
            if self.op == 'ST':
                if self.off >= 0:
                    return aa_ldst_uimm12(0xF9000000, self.rT, self.rN, self.off, 3)
                return aa_ldst_unscaled(0xF8000000, self.rT, self.rN, self.off)
            if self.op == 'LB':
                if self.off >= 0:
                    return aa_ldst_uimm12(0x39400000, self.rT, self.rN, self.off, 0)
                return aa_ldst_unscaled(0x38400000, self.rT, self.rN, self.off)
            if self.op == 'SB':
                if self.off >= 0:
                    return aa_ldst_uimm12(0x39000000, self.rT, self.rN, self.off, 0)
                return aa_ldst_unscaled(0x39000000 if False else 0x38000000, self.rT, self.rN, self.off)
        if arch == 'amd64':
            # mov r64,r/m64 (8B) / mov r/m64,r64 (89); byte forms via helper.
            if self.op == 'LD':
                return amd_mem_rm('8B', self.rT, self.rN, self.off)
            if self.op == 'ST':
                return amd_mem_rm('89', self.rT, self.rN, self.off)
            if self.op == 'LB':
                return amd_mov_rm_b(self.rT, self.rN, self.off, store=False)
            if self.op == 'SB':
                return amd_mov_rm_b(self.rT, self.rN, self.off, store=True)
        if arch == 'riscv64':
            # funct3: LD=3, ST=3, LBU=4, SB=0. Opcodes: load=03, store=23.
            # Signed imm12 covers negative offsets natively.
            if self.op == 'LD':
                return rv_i(0x00003003, self.rT, self.rN, self.off)
            if self.op == 'ST':
                return rv_s(0x00003023, self.rT, self.rN, self.off)
            if self.op == 'LB':
                return rv_i(0x00004003, self.rT, self.rN, self.off) # LBU
            if self.op == 'SB':
                return rv_s(0x00000023, self.rT, self.rN, self.off)
+
## --- Branches: r7-indirect pattern ---
@dataclass
class B(Op):
    """Unconditional jump to the address held in P1 r7 (loaded by the
    caller beforehand).  Indirect jumps avoid having to encode
    arbitrary pc-relative displacements in fixed DEFINE bytes."""
    def encode(self, arch):
        if arch == 'aarch64':
            # BR x20 — 0xD61F0280 (P1 r7 → x20)
            return le32(0xD61F0280)
        if arch == 'amd64':
            # jmp r12 — 41 FF E4 (P1 r7 → r12)
            return '41FFE4'
        if arch == 'riscv64':
            # jalr x0, 0(s2) — 0x00090067 (P1 r7 → s2/x18)
            return le32(0x00090067)
+
@dataclass
class CondB(Op):
    """Conditional branch to the address in r7: compare rA,rB, then an
    *inverted* short branch skips the unconditional indirect jump when
    the P1 condition does NOT hold."""
    op: str = '' # BEQ/BNE/BLT
    rA: str = ''
    rB: str = ''

    def encode(self, arch):
        if arch == 'aarch64':
            # cmp xA, xB = SUBS xzr, xA, xB (0xEB000000 base, rD=31)
            a = NAT_AA64[self.rA]; b = NAT_AA64[self.rB]
            cmp = le32(0xEB000000 | (b << 16) | (a << 5) | 31)
            # b.<inv> +8 over the BR: base 0x54000040 (imm19=2 → +8),
            # low nibble = condition.  Skip when NOT cond holds:
            # BEQ→NE(1), BNE→EQ(0), BLT→GE(0xA).
            cond = {'BEQ': 1, 'BNE': 0, 'BLT': 10}[self.op]
            bcond = le32(0x54000040 | cond)
            br_x20 = le32(0xD61F0280)
            return cmp + bcond + br_x20
        if arch == 'amd64':
            a, b = NAT_AMD64[self.rA], NAT_AMD64[self.rB]
            # cmp rA, rB — opcode 39 /r with rA as r/m
            cmp_ = rex(1, b >> 3, 0, a >> 3) + '39' + modrm(3, b, a)
            # inverted jcc rel8, skip=3 bytes (past jmp r12):
            # BEQ→JNE 75 03 ; BNE→JE 74 03 ; BLT→JGE 7D 03
            jop = {'BEQ': '75', 'BNE': '74', 'BLT': '7D'}[self.op]
            jmp_r12 = '41FFE4'
            return cmp_ + jop + '03' + jmp_r12
        if arch == 'riscv64':
            # b<inv> rA, rB, +8 ; jalr x0, 0(s2)
            a, b = NAT_RV64[self.rA], NAT_RV64[self.rB]
            # funct3 selects the *inverted* test:
            # BEQ→BNE(1), BNE→BEQ(0), BLT→BGE(5).
            funct3 = {'BEQ': 1, 'BNE': 0, 'BLT': 5}[self.op]
            # B-type: opcode=0x63, rs1=rA, rs2=rB; the immediate is
            # scattered over bits [31:25]=imm[12|10:5] and [11:7]=imm[4:1|11].
            insn = 0x00000063 | (funct3 << 12) | (a << 15) | (b << 20)
            # imm=+8 sets only imm[3], which lands in instruction bit 10,
            # i.e. field [11:7] = 0b01000 (= 8); [31:25] stays all zero.
            insn |= (8 << 7)
            jalr_x0_s2 = 0x00090067
            return le32(insn) + le32(jalr_x0_s2)
+
+
## --- Simple singletons ---
@dataclass
class Literal(Op):
    """Fixed per-arch hex supplied verbatim (tables, syscall stubs)."""
    hex_by_arch: dict = None  # arch name → hex string; required in practice

    def encode(self, arch):
        return self.hex_by_arch[arch]
+
+
## --- PROLOGUE / EPILOGUE / TAIL — N-slot variants ------------------------
## Frame layout after PROLOGUE_Nk (k >= 1, rounded up so total frame bytes
## stay 16-byte aligned on aarch64):
##
##   [sp + 0]   = retaddr (aarch64 lr / riscv64 ra / amd64 retaddr)
##   [sp + 8]   = slot 1 (callee-private scratch)
##   [sp + 16]  = slot 2
##   [sp + 24]  = slot 3
##   ...
##   [sp + 8*k] = slot k
##
## Frame size = round_up_to_16(8 + 8*k). So k=1 → 16, k=2 → 24 → 32,
## k=3 → 32, k=4 → 40 → 48. Keeping the EPILOGUE a strict inverse.

def prologue_frame_bytes(k: int) -> int:
    """Total frame bytes for a k-slot prologue: one retaddr slot plus k
    spill slots (8 bytes each), rounded up to a 16-byte multiple."""
    needed = 8 * (k + 1)
    return ((needed + 15) // 16) * 16
+
@dataclass
class Prologue(Op):
    """PROLOGUE_Nk: allocate a frame with the return address at [sp+0]
    and k walkable spill slots above it (layout comment above)."""
    k: int = 1

    def encode(self, arch):
        fb = prologue_frame_bytes(self.k)
        if arch == 'aarch64':
            # sub sp, sp, #fb ; str x30, [sp] — lr saved at [sp+0]
            sub = aa_add_imm('sp', 'sp', fb, sub=True)
            str_lr = aa_ldst_uimm12(0xF9000000, 'lr', 'sp', 0, 3)
            return sub + str_lr
        if arch == 'amd64':
            # pop r11 ; sub rsp,fb ; push r11 — re-homes the CALL's
            # return address to [rsp+0] so the frame layout matches the
            # other arches.  r11 is scratch, not a P1 reg.
            # pop r11 = 41 5B ; sub rsp,imm8 = 48 83 EC ib (if fb<=127)
            # push r11 = 41 53
            assert fb <= 127
            return '415B' + '4883EC' + byte(fb) + '4153'
        if arch == 'riscv64':
            # addi sp, sp, -fb ; sd ra, 0(sp)
            sub = rv_i(0x00000013, 'sp', 'sp', -fb)
            sd = rv_s(0x00003023, 'ra', 'sp', 0)
            return sub + sd
+
@dataclass
class Epilogue(Op):
    """Strict inverse of Prologue: reload the return address from
    [sp+0] and free the same frame size."""
    k: int = 1

    def encode(self, arch):
        fb = prologue_frame_bytes(self.k)
        if arch == 'aarch64':
            # ldr x30, [sp] ; add sp, sp, #fb
            ldr_lr = aa_ldst_uimm12(0xF9400000, 'lr', 'sp', 0, 3)
            add = aa_add_imm('sp', 'sp', fb, sub=False)
            return ldr_lr + add
        if arch == 'amd64':
            # pop r11 (retaddr from [rsp+0]) ; add rsp,fb ; push r11 —
            # leaves the return address on top for the following RET.
            assert fb <= 127
            return '415B' + '4883C4' + byte(fb) + '4153'
        if arch == 'riscv64':
            # ld ra, 0(sp) ; addi sp, sp, fb
            ld = rv_i(0x00003003, 'ra', 'sp', 0)
            add = rv_i(0x00000013, 'sp', 'sp', fb)
            return ld + add
+
@dataclass
class Tail(Op):
    """TAIL_Nk — tail call: EPILOGUE_Nk followed by the unconditional
    r7-indirect jump, so the frame is gone before control transfers."""
    k: int = 1

    def encode(self, arch):
        # Composed from the real Epilogue/B rows so TAIL can never
        # drift out of sync with either of them.
        pieces = [Epilogue(name='', k=self.k), B(name='')]
        return ''.join(step.encode(arch) for step in pieces)
+
@dataclass
class Call(Op):
    """Indirect call through r7 (target address loaded by the caller);
    the return address lands in the arch's link register, or on the
    stack for amd64."""
    def encode(self, arch):
        if arch == 'aarch64':
            return le32(0xD63F0280) # BLR x20
        if arch == 'amd64':
            return '41FFD4' # call r12
        if arch == 'riscv64':
            return le32(0x000900E7) # jalr ra, 0(s2)
+
@dataclass
class Ret(Op):
    """Return via the arch's native link mechanism; pairs with Call."""
    def encode(self, arch):
        if arch == 'aarch64':
            return le32(0xD65F03C0) # ret (= br x30)
        if arch == 'amd64':
            return 'C3'
        if arch == 'riscv64':
            return le32(0x00008067) # jalr x0, 0(ra)
+
## --- SYSCALL / SYSOPEN ---
## P1 contract: syscall number in r0, args in r1..r6, result in r0;
## from P1's point of view only r0 is clobbered.
SYSCALL_HEX = {
    'aarch64': (
        # mov x8,x0 ; save r1..r5 into x21..x25 ; shuffle into x0..x4 ;
        # mov x5,x19 ; svc #0 ; restore r1..r5.
        # x21..x25 are reserved scratch, invisible to P1 code.
        '' .join([
            le32(0xAA0003E8), # mov x8, x0 (ORR x8, xzr, x0)
            le32(0xAA0103F5), # mov x21, x1
            le32(0xAA0203F6), # mov x22, x2
            le32(0xAA0303F7), # mov x23, x3
            le32(0xAA0403F8), # mov x24, x4
            le32(0xAA0503F9), # mov x25, x5
            le32(0xAA1503E0), # mov x0, x21
            le32(0xAA1603E1), # mov x1, x22
            le32(0xAA1703E2), # mov x2, x23
            le32(0xAA1803E3), # mov x3, x24
            le32(0xAA1903E4), # mov x4, x25
            le32(0xAA1303E5), # mov x5, x19
            le32(0xD4000001), # svc #0
            le32(0xAA1503E1), # mov x1, x21
            le32(0xAA1603E2), # mov x2, x22
            le32(0xAA1703E3), # mov x3, x23
            le32(0xAA1803E4), # mov x4, x24
            le32(0xAA1903E5), # mov x5, x25
        ])
    ),
    # amd64 needs only arg5: mov r9, rbx ; syscall.
    # NOTE(review): presumes P1 r1..r5 already sit in the kernel's arg
    # registers and r6 → rbx per NAT_AMD64 — confirm against that table.
    # The syscall insn itself clobbers rcx/r11, neither a P1 reg.
    'amd64': '4989D9' + '0F05', # mov r9, rbx ; syscall
    'riscv64': (
        # Same shape as aarch64: num → a7, save a1..a5 in reserved
        # s3..s7, shuffle down to a0..a4, a5 ← s1 (P1 r6), ecall, restore.
        ''.join([
            le32(0x00050893), # addi a7, a0, 0 (mv a7, a0)
            le32(0x00058993), # mv s3, a1
            le32(0x00060A13), # mv s4, a2
            le32(0x00068A93), # mv s5, a3
            le32(0x00070B13), # mv s6, a4
            le32(0x00078B93), # mv s7, a5
            le32(0x00098513), # mv a0, s3
            le32(0x000A0593), # mv a1, s4
            le32(0x000A8613), # mv a2, s5
            le32(0x000B0693), # mv a3, s6
            le32(0x000B8713), # mv a4, s7
            le32(0x00048793), # mv a5, s1
            le32(0x00000073), # ecall
            le32(0x00098593), # mv a1, s3
            le32(0x000A0613), # mv a2, s4
            le32(0x000A8693), # mv a3, s5
            le32(0x000B0713), # mv a4, s6
            le32(0x000B8793), # mv a5, s7
        ])
    ),
}
+
SYSOPEN_HEX = {
    # aarch64/riscv64 have no bare open(2): use openat(AT_FDCWD, ...),
    # with AT_FDCWD = -100 loaded into arg0.
    # aarch64: movn x0,#99 (x0 = -100) ; movz x8,#56 (openat) ; svc #0
    'aarch64': le32(0x92800C60) + le32(0xD2800708) + le32(0xD4000001),
    # amd64: mov eax,2 (open) ; syscall
    'amd64': 'B8' + le32(2) + '0F05',
    # riscv64: addi a0,zero,-100 (AT_FDCWD) ; addi a7,zero,56 (openat) ; ecall
    'riscv64': le32(0xF9C00513) + le32(0x03800893) + le32(0x00000073),
}
+
## --- Syscall numbers (little-endian 32-bit for LI operand) ---
## aarch64 and riscv64 share the generic Linux syscall table.
SYS_NUM = {
    'aarch64': {'SYS_WRITE': 64, 'SYS_EXIT': 93, 'SYS_READ': 63, 'SYS_CLOSE': 57},
    # amd64 uses its own (legacy) table
    'amd64': {'SYS_WRITE': 1, 'SYS_EXIT': 60, 'SYS_READ': 0, 'SYS_CLOSE': 3},
    'riscv64': {'SYS_WRITE': 64, 'SYS_EXIT': 93, 'SYS_READ': 63, 'SYS_CLOSE': 57},
}
+
+
## ---------- Op table (rows emitted in order) -----------------------------

# Header stamped verbatim at the top of every generated p1_<arch>.M1.
HEADER = """## p1_{arch}.M1 — GENERATED by p1_gen.py. Do not edit by hand.
##
## Shared op-table lives in p1_gen.py; each arch's encoders expand
## (op, register-tuple, imm) rows into native bytes. See P1.md for the
## ISA spec and register mapping.
"""

@dataclass
class Banner:
    # Section-separator row: rendered by emit() as a '## ---- …' comment
    # line, not a DEFINE.  Deliberately not an Op subclass.
    text: str
+
def rows():
    """Build the ordered row list (Banner separators + Op instances).
    Row order here is the DEFINE order in every generated file."""
    R = []

    # --- LI (wide literal load) ---
    R.append(Banner('LI — load 4-byte zero-extended literal from inline data slot'))
    for rd in ['r0', 'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7']:
        R.append(Li(name=f'LI_{rd.upper()}', rD=rd))

    # --- SYSCALL / SYSOPEN ---
    R.append(Banner('SYSCALL / SYSOPEN — uniform (clobbers r0 only) across arches'))
    R.append(Literal(name='SYSCALL', hex_by_arch=SYSCALL_HEX))
    R.append(Literal(name='SYSOPEN', hex_by_arch=SYSOPEN_HEX))

    # --- Syscall numbers ---
    R.append(Banner('Linux syscall numbers (per-arch table). LE-32 immediate operands for LI.'))
    for name in ('SYS_WRITE', 'SYS_EXIT', 'SYS_READ', 'SYS_CLOSE'):
        R.append(Literal(name=name, hex_by_arch={a: le32(SYS_NUM[a][name]) for a in ARCHES}))

    # --- Reg-reg-reg arith tuples used by demo/lisp ---
    R.append(Banner('Reg-reg-reg arithmetic (tranche 1)'))
    for op, d, a, b in [
        ('ADD','r1','r1','r2'), ('ADD','r1','r1','r4'), ('ADD','r2','r2','r6'),
        ('ADD','r2','r3','r1'),
        ('SUB','r1','r1','r2'), ('SUB','r2','r2','r6'),
        ('AND','r1','r1','r5'),
        ('OR', 'r1','r1','r2'),
        ('XOR','r1','r1','r2'),
        ('MUL','r1','r1','r2'),
        ('DIV','r1','r1','r2'),
        ('REM','r1','r1','r5'),
        ('SHL','r1','r1','r2'),
        ('SHR','r1','r1','r2'),
        ('SAR','r4','r4','r2'),
    ]:
        R.append(RRR(name=f'{op}_{d.upper()}_{a.upper()}_{b.upper()}', op=op, rD=d, rA=a, rB=b))

    # --- Immediate arith ---
    # Negative immediates take a NEG<n> name suffix.
    R.append(Banner('Immediate arithmetic (tranche 2)'))
    for d, a, imm in [('r1','r1',3), ('r1','r1',1), ('r1','r1',-3), ('r4','r4',-1),
                      ('r1','r1',-2), ('r0','r0',1)]:
        suf = f'NEG{-imm}' if imm < 0 else f'{imm}'
        R.append(AddI(name=f'ADDI_{d.upper()}_{a.upper()}_{suf}', rD=d, rA=a, imm=imm))

    # SHLI/SHRI with imm=1
    R.append(ShiftI(name='SHLI_R1_R1_1', op='SHLI', rD='r1', rA='r1', imm=1))
    R.append(ShiftI(name='SHRI_R1_R1_1', op='SHRI', rD='r1', rA='r1', imm=1))
    R.append(ShiftI(name='SARI_R4_R4_1', op='SARI', rD='r4', rA='r4', imm=1))

    # aarch64 logical-imm triples below: imms+1 consecutive ones rotated
    # right by immr within the 64-bit element (N=1).
    # ANDI 6: N=1, immr=63, imms=1 — 2 ones rotated up to bits 1..2
    R.append(LogI(name='ANDI_R1_R1_6', op='ANDI', rD='r1', rA='r1', imm=6,
                  aa_N=1, aa_immr=63, aa_imms=1))
    # ANDI 7: N=1, immr=0, imms=2 (3 contiguous ones)
    R.append(LogI(name='ANDI_R1_R1_7', op='ANDI', rD='r1', rA='r1', imm=7,
                  aa_N=1, aa_immr=0, aa_imms=2))
    # ORI 1: N=1, immr=0, imms=0 (1 one at bit 0)
    R.append(LogI(name='ORI_R1_R1_1', op='ORI', rD='r1', rA='r1', imm=1,
                  aa_N=1, aa_immr=0, aa_imms=0))
    # ORI 2 on r0: 1 one rotated to bit 1 → immr=63, imms=0
    R.append(LogI(name='ORI_R0_R0_2', op='ORI', rD='r0', rA='r0', imm=2,
                  aa_N=1, aa_immr=63, aa_imms=0))
    # ORI 7 on r0: 3 ones at bit 0..2 → N=1, immr=0, imms=2
    R.append(LogI(name='ORI_R0_R0_7', op='ORI', rD='r0', rA='r0', imm=7,
                  aa_N=1, aa_immr=0, aa_imms=2))

    # --- LA + Memory ops ---
    R.append(Banner('LA + memory ops (tranche 3)'))
    R.append(La(name='LA_R4', rD='r4'))
    for op, rt, rn, off in [
        ('ST','r1','r4',0), ('LD','r1','r4',0),
        ('ST','r1','r4',8), ('LD','r1','r4',8),
        ('SB','r1','r4',16), ('LB','r1','r4',16),
        ('ST','r1','r4',-8), ('LD','r1','r4',-8),
    ]:
        suf = f'NEG{-off}' if off < 0 else f'{off}'
        R.append(Mem(name=f'{op}_{rt.upper()}_{rn.upper()}_{suf}',
                     op=op, rT=rt, rN=rn, off=off))

    # --- Branches ---
    R.append(Banner('Branches (tranche 4, r7-indirect)'))
    R.append(B(name='B'))
    for op, a, b in [
        ('BEQ','r2','r3'), ('BNE','r2','r3'), ('BLT','r2','r3'), ('BLT','r4','r2'),
    ]:
        R.append(CondB(name=f'{op}_{a.upper()}_{b.upper()}_R7', op=op, rA=a, rB=b))

    # --- PROLOGUE / EPILOGUE / CALL / RET / TAIL — single-slot + Nk variants ---
    R.append(Banner('Control: CALL/RET + single-slot and N-slot PROLOGUE/EPILOGUE/TAIL'))
    R.append(Prologue(name='PROLOGUE', k=1))
    R.append(Epilogue(name='EPILOGUE', k=1))
    R.append(Ret(name='RET'))
    R.append(Call(name='CALL'))
    R.append(Tail(name='TAIL', k=1))
    for k in (2, 3, 4):
        R.append(Prologue(name=f'PROLOGUE_N{k}', k=k))
        R.append(Epilogue(name=f'EPILOGUE_N{k}', k=k))
        R.append(Tail(name=f'TAIL_N{k}', k=k))

    # --- Tranche 6: seed-Lisp step-1 extensions ---
    R.append(Banner('Seed-Lisp step 1 extensions (tranche 6)'))
    R.append(Mov(name='MOV_R1_R6', rD='r1', rA='r6'))
    R.append(Mov(name='MOV_R6_R1', rD='r6', rA='r1'))
    R.append(Mov(name='MOV_R6_R0', rD='r6', rA='r0'))
    R.append(Mov(name='MOV_R0_R3', rD='r0', rA='r3'))
    R.append(Mov(name='MOV_R7_R0', rD='r7', rA='r0'))
    R.append(Mov(name='MOV_R7_R2', rD='r7', rA='r2'))
    R.append(Mov(name='MOV_R2_R6', rD='r2', rA='r6'))
    R.append(Mov(name='MOV_R3_R7', rD='r3', rA='r7'))
    R.append(Mov(name='MOV_R2_R7', rD='r2', rA='r7'))
    R.append(Mov(name='MOV_R4_R7', rD='r4', rA='r7'))
    # MOV rD, sp variants
    R.append(Mov(name='MOV_R2_SP', rD='r2', rA='sp'))
    R.append(Mov(name='MOV_R4_SP', rD='r4', rA='sp'))
    R.append(Mov(name='MOV_R6_SP', rD='r6', rA='sp'))
    # Extra MOVs needed around calls
    R.append(Mov(name='MOV_R2_R0', rD='r2', rA='r0'))

    # LD/ST extras
    for op, rt, rn, off in [
        ('LD','r0','r6',0), ('LD','r1','r6',16), ('LD','r3','r4',0),
        ('LD','r0','r5',0), ('LB','r1','r4',0), ('ST','r2','r4',0),
        ('ST','r0','r4',8), ('LD','r0','r4',8), ('LB','r1','r0',0),
        ('LD','r0','r1',0), ('LD','r0','r1',8), ('ST','r1','r0',0),
        ('LD','r2','r4',0), ('ST','r2','r0',8), ('LD','r0','r4',0),
        # N=2 scratch slot 2 at [sp+16]: used by cons after dropping
        # the :cons_save_cdr BSS spill (P1_TODO.md issue 4).
        ('ST','r2','r4',16), ('LD','r2','r4',16),
    ]:
        suf = f'NEG{-off}' if off < 0 else f'{off}'
        R.append(Mem(name=f'{op}_{rt.upper()}_{rn.upper()}_{suf}',
                     op=op, rT=rt, rN=rn, off=off))

    # BLT r0, r2 ; BNE r1, r2 ; BNE r0, r2
    R.append(CondB(name='BLT_R0_R2_R7', op='BLT', rA='r0', rB='r2'))
    R.append(CondB(name='BNE_R1_R2_R7', op='BNE', rA='r1', rB='r2'))
    R.append(CondB(name='BNE_R0_R2_R7', op='BNE', rA='r0', rB='r2'))

    return R
+
+
## ---------- File emission -----------------------------------------------

def emit(arch: str) -> str:
    """Render the complete p1_<arch>.M1 text: header, banner comments,
    and one DEFINE per op row.  Raises on duplicate DEFINE names."""
    chunks = [HEADER.format(arch=arch).rstrip(), '']
    seen_names = set()
    for row in rows():
        if isinstance(row, Banner):
            # Blank line + '## ---- <text> ---…' padded toward column 60.
            dashes = '-' * max(0, 60 - len(row.text))
            chunks.append('')
            chunks.append('## ---- ' + row.text + ' ' + dashes)
            continue
        # SYS_* rows keep their bare names; everything else gets P1_.
        define = row.name if row.name.startswith('SYS_') else 'P1_' + row.name
        if define in seen_names:
            raise RuntimeError(f'duplicate DEFINE: {define}')
        seen_names.add(define)
        chunks.append(f'DEFINE {define} {row.encode(arch)}')
    chunks.append('')
    return '\n'.join(chunks)
+
+
def main():
    """Regenerate p1_<arch>.M1 next to this script for every arch, or
    with --check just compare and exit 1 if any file is stale."""
    here = os.path.dirname(os.path.abspath(__file__))
    check_mode = '--check' in sys.argv

    stale = []
    for arch in ARCHES:
        path = os.path.join(here, f'p1_{arch}.M1')
        rendered = emit(arch)
        if check_mode:
            with open(path) as fh:
                if fh.read() != rendered:
                    sys.stderr.write(f'DIFF: {path}\n')
                    stale.append(path)
        else:
            with open(path, 'w') as fh:
                fh.write(rendered)
            print(f'wrote {path} ({len(rendered)} bytes)')

    if check_mode and stale:
        sys.exit(1)


if __name__ == '__main__':
    main()
diff --git a/p1_riscv64.M1 b/p1_riscv64.M1
@@ -1,32 +1,11 @@
-## P1 pseudo-ISA — riscv64 backing defs (v0.1 spike)
+## p1_riscv64.M1 — GENERATED by p1_gen.py. Do not edit by hand.
##
-## Implements the subset needed by hello.M1 and demo.M1. See P1.md.
-##
-## Register mapping (P1 → RISC-V):
-## r0 → a0 (x10) , r1 → a1 (x11) , r2 → a2 (x12) , r3 → a3 (x13)
-## r4 → a4 (x14) , r5 → a5 (x15) , r6 → s1 (x9) , r7 → s2 (x18)
-##
-## LI rD, <4-byte-literal> — zero-extended load into rD.
-## RISC-V lacks a single "load imm32" form, so the expansion uses the
-## same PC-relative-inline-data trick as aarch64, in three words:
-##
-## auipc rD, 0 ; rD = pc_of_auipc
-## lwu rD, 12(rD) ; rD = *(u32*)(pc_of_auipc + 12) [zero-ext to 64]
-## jal x0, +8 ; skip past the 4-byte data slot
-## <4-byte literal>
-##
-## The LWU offset is 12 because the literal lives 12 bytes past the
-## auipc: auipc(4) + lwu(4) + jal(4) = 12.
-##
-## Hex2 `&label` emits the 4-byte absolute address in little-endian,
-## matching how lwu reads it.
-##
-## Usage:
-## P1_LI_R1
-## &some_label # or '0E000000'
+## Shared op-table lives in p1_gen.py; each arch's encoders expand
+## (op, register-tuple, imm) rows into native bytes. See P1.md for the
+## ISA spec and register mapping.
-## Each DEFINE below is three little-endian 32-bit words concatenated:
-## [auipc rD,0] [lwu rD,12(rD)] [jal x0,+8].
+
+## ---- LI — load 4-byte zero-extended literal from inline data slot
DEFINE P1_LI_R0 170500000365C5006F008000
DEFINE P1_LI_R1 9705000083E5C5006F008000
DEFINE P1_LI_R2 170600000366C6006F008000
@@ -36,235 +15,115 @@ DEFINE P1_LI_R5 9707000083E7C7006F008000
DEFINE P1_LI_R6 9704000083E4C4006F008000
DEFINE P1_LI_R7 170900000369C9006F008000
-## SYSCALL — num in r0, args r1..r6, result in r0. Clobbers r0 only.
-##
-## riscv64 Linux wants num in a7, args in a0..a5. P1 puts args one
-## register higher (a1..a5, s1), so we save a1..a5 into s3..s7 before
-## shuffling them down into a0..a4; a5 comes from s1; a7 gets the
-## number. After `ecall` we restore a1..a5 from s3..s7 so the caller
-## sees only r0 clobbered.
-##
-## s3..s7 are reserved for this expansion and invisible to P1 code (see
-## P1.md §"Register mapping").
-##
-## Expansion (18 insns, 72 bytes):
-## mv a7, a0 ; num
-## mv s3, a1 ; save P1 r1
-## mv s4, a2 ; save P1 r2
-## mv s5, a3 ; save P1 r3
-## mv s6, a4 ; save P1 r4
-## mv s7, a5 ; save P1 r5
-## mv a0, s3 ; arg0 <- saved r1
-## mv a1, s4 ; arg1 <- saved r2
-## mv a2, s5 ; arg2 <- saved r3
-## mv a3, s6 ; arg3 <- saved r4
-## mv a4, s7 ; arg4 <- saved r5
-## mv a5, s1 ; arg5 <- P1 r6
-## ecall
-## mv a1, s3 ; restore r1
-## mv a2, s4 ; restore r2
-## mv a3, s5 ; restore r3
-## mv a4, s6 ; restore r4
-## mv a5, s7 ; restore r5
+## ---- SYSCALL / SYSOPEN — uniform (clobbers r0 only) across arches
DEFINE P1_SYSCALL 9308050093890500130A0600938A0600130B0700938B07001385090093050A0013860A0093060B0013870B0093870400730000009385090013060A0093860A0013070B0093870B00
+DEFINE P1_SYSOPEN 1305C0F99308800373000000
-
-## Linux syscall numbers (riscv64 uses the generic table — same as aarch64).
+## ---- Linux syscall numbers (per-arch table). LE-32 immediate operands for LI.
DEFINE SYS_WRITE 40000000
-DEFINE SYS_EXIT 5D000000
-
-
-## ---- Tranche 1: full arith reg-reg-reg ----------------------------------
-## Non-identity discriminator chain used by demo.M1. All one-insn R-type
-## ops. MUL/DIV/REM are M-extension ops (standard on rv64gc).
-
-## MOV rD, rA -> addi rD, rA, 0 (the `mv` pseudo)
-DEFINE P1_MOV_R1_R6 93850400 ## mv a1, s1
-DEFINE P1_MOV_R6_R1 93840500 ## mv s1, a1
-
-## MOV rD, sp -> addi rD, sp, 0 (sp = x2).
-## Used by the tranche-5 stack-balance discriminator.
-DEFINE P1_MOV_R6_SP 93040100 ## addi s1, sp, 0
-DEFINE P1_MOV_R2_SP 13060100 ## addi a2, sp, 0
-
-## ADD / SUB / AND / OR / XOR — R-type, funct3 picks the op.
-DEFINE P1_ADD_R1_R1_R2 B385C500 ## add a1, a1, a2
-DEFINE P1_ADD_R1_R1_R4 B385E500 ## add a1, a1, a4
-DEFINE P1_ADD_R2_R2_R6 33069600 ## add a2, a2, s1 (syscall-arg computation)
-
-DEFINE P1_SUB_R1_R1_R2 B385C540 ## sub a1, a1, a2 (funct7=0x20)
-DEFINE P1_SUB_R2_R2_R6 33069640 ## sub a2, a2, s1 (tranche-5 sp delta)
-DEFINE P1_XOR_R1_R1_R2 B3C5C500 ## xor a1, a1, a2 (funct3=4)
-DEFINE P1_OR_R1_R1_R2 B3E5C500 ## or a1, a1, a2 (funct3=6)
-DEFINE P1_AND_R1_R1_R5 B3F5F500 ## and a1, a1, a5 (funct3=7)
-
-## MUL / DIV / REM — M extension, funct7=1.
-DEFINE P1_MUL_R1_R1_R2 B385C502 ## mul a1, a1, a2
-DEFINE P1_DIV_R1_R1_R2 B3C5C502 ## div a1, a1, a2 (funct3=4)
-DEFINE P1_REM_R1_R1_R5 B3E5F502 ## rem a1, a1, a5 (funct3=6)
-
-## SHL / SHR -> sll / srl.
-DEFINE P1_SHL_R1_R1_R2 B395C500 ## sll a1, a1, a2 (funct3=1)
-DEFINE P1_SHR_R1_R1_R2 B3D5C500 ## srl a1, a1, a2 (funct3=5)
-
-## SAR rD, rA, rB -> sra xD, xA, xB. Discriminator lives on r4
-## (negative value) — see demo.M1 tranche 1.
-DEFINE P1_SAR_R4_R4_R2 3357C740 ## sra a4, a4, a2 (funct7=0x20)
-
-
-## ---- Tranche 2: immediate arith ---------------------------------------
-## I-type / shift-immediate forms. Shift amount is 6 bits (shamt6) for
-## rv64. SRAI sets bit 30 to distinguish from SRLI. Negative ADDI uses
-## sign-extended imm12 directly — no separate "subi" needed.
-
-DEFINE P1_ADDI_R1_R1_3 93853500 ## addi a1, a1, 3
-DEFINE P1_ADDI_R1_R1_1 93851500 ## addi a1, a1, 1
-DEFINE P1_ADDI_R1_R1_NEG3 9385D5FF ## addi a1, a1, -3
-DEFINE P1_ADDI_R4_R4_NEG1 1307F7FF ## addi a4, a4, -1
-
-DEFINE P1_SHLI_R1_R1_1 93951500 ## slli a1, a1, 1 (funct3=1)
-DEFINE P1_SHRI_R1_R1_1 93D51500 ## srli a1, a1, 1 (funct3=5)
-
-DEFINE P1_ANDI_R1_R1_6 93F56500 ## andi a1, a1, 6 (funct3=7)
-DEFINE P1_ORI_R1_R1_1 93E51500 ## ori a1, a1, 1 (funct3=6)
-
-DEFINE P1_SARI_R4_R4_1 13571740 ## srai a4, a4, 1 (imm[11:6]=0x10)
-
-
-## ---- Tranche 3: LA + memory ops ---------------------------------------
-## LA is LI in the spike.
-DEFINE P1_LA_R4 170700000367C7006F008000 ## auipc a4,0; lwu a4,12(a4); jal x0,+8
-
-## LOAD (opcode 0x03) / STORE (opcode 0x23) with signed 12-bit offset.
-## For P1: LD=64b, LB=8b zero-ext (= LBU). ST=SD (64b), SB=8b.
-## 32-bit LW/SW dropped from the ISA.
-DEFINE P1_ST_R1_R4_0 2330B700 ## sd a1, 0(a4)
-DEFINE P1_LD_R1_R4_0 83350700 ## ld a1, 0(a4)
-DEFINE P1_ST_R1_R4_8 2334B700 ## sd a1, 8(a4)
-DEFINE P1_LD_R1_R4_8 83358700 ## ld a1, 8(a4)
-DEFINE P1_SB_R1_R4_16 2308B700 ## sb a1, 16(a4)
-DEFINE P1_LB_R1_R4_16 83450701 ## lbu a1, 16(a4)
-
-## Negative imm12: RISC-V load/store immediates are signed 12-bit,
-## so -8 = 0xFF8 sign-extended.
-DEFINE P1_ST_R1_R4_NEG8 233CB7FE ## sd a1, -8(a4)
-DEFINE P1_LD_R1_R4_NEG8 833587FF ## ld a1, -8(a4)
-
-
-## ---- Tranche 4: branches (r7-indirect, no hex2_word needed) ------------
-## RISC-V has B-type conditional branches with a scattered immediate; writing
-## literal byte strings for arbitrary offsets is painful without hex2_word.
-## Sidestep it with the r7-indirect pattern: a fixed-offset branch that skips
-## the unconditional `jalr x0, 0(s2)` when the P1 condition is NOT met.
-##
-## Fixed offset: the conditional branches below all use imm=8 (skip past the
-## 4-byte JALR on the false path). B-type imm=8 encodes to [11:7]=0x08,
-## [31:25]=0x00.
-##
-## P1_B is just `jalr x0, 0(s2)` — unconditional jump to address in r7.
-## Unsigned branches (BLTU/BGEU/BGE) dropped from the ISA — see P1.md.
-
-DEFINE P1_B 67000900 ## jalr x0, 0(s2)
-DEFINE P1_BEQ_R2_R3_R7 6314D60067000900 ## bne a2,a3,+8 ; jalr x0,0(s2)
-DEFINE P1_BNE_R2_R3_R7 6304D60067000900 ## beq a2,a3,+8 ; jalr
-DEFINE P1_BLT_R2_R3_R7 6354D60067000900 ## bge a2,a3,+8 ; jalr
-DEFINE P1_BLT_R4_R2_R7 6354C70067000900 ## bge a4,a2,+8 ; jalr
-
-
-## ---- Tranche 5: CALL / RET / PROLOGUE / EPILOGUE / TAIL -----------------
-## CALL is JALR through s2 (= P1 r7), saving PC+4 into ra. Caller loads
-## &target into r7 beforehand. RET is the canonical `ret` pseudo
-## (JALR x0, 0(ra)).
-##
-## PROLOGUE / EPILOGUE save and restore ra around nested calls; after
-## PROLOGUE, [sp+0] holds the return address (matching P1.md §"return
-## address lives in [sp+0] after prologue"). TAIL = EPILOGUE + B.
-
-DEFINE P1_PROLOGUE 130101FF23301100 ## addi sp,sp,-16 ; sd ra,0(sp)
-DEFINE P1_EPILOGUE 8330010013010101 ## ld ra,0(sp) ; addi sp,sp,16
-DEFINE P1_RET 67800000 ## jalr x0, 0(ra)
-DEFINE P1_CALL E7000900 ## jalr ra, 0(s2)
-DEFINE P1_TAIL 833001001301010167000900 ## epilogue ; jalr x0, 0(s2)
-
-
-## ---- Tranche 6: seed-Lisp step 1 extensions ---------------------------
-## Ops required by lisp.M1 (LISP.md item 1, proof-of-life): file I/O
-## wrappers, cell loads/stores, and the arg-shuffling MOVs they imply.
-
-## Extra Linux syscall numbers (generic table, same as aarch64).
-DEFINE SYS_READ 3F000000
+DEFINE SYS_EXIT 5D000000
+DEFINE SYS_READ 3F000000
DEFINE SYS_CLOSE 39000000
-## SYSOPEN — portable open(path, flags, mode). Assumes r1=path, r2=flags,
-## r3=mode. riscv64 uses openat(2) = syscall 56; P1's r1..r3 already line
-## up with native a1..a3, so only a0=AT_FDCWD and a7=56 need loading.
-##
-## Expansion:
-## addi a0, x0, -100 ; a0 = AT_FDCWD (13 05 C0 F9)
-## addi a7, x0, 56 ; a7 = SYS_openat (93 08 80 03)
-## ecall (73 00 00 00)
-DEFINE P1_SYSOPEN 1305C0F99308800373000000
-
-## MOV rD, rA — extra register pairings used around syscalls / calls.
-## MOV rD, rA -> addi rD, rA, 0 (the `mv` pseudo).
-DEFINE P1_MOV_R6_R0 93040500 ## mv s1, a0
-DEFINE P1_MOV_R0_R3 13850600 ## mv a0, a3
-DEFINE P1_MOV_R7_R0 13090500 ## mv s2, a0
-DEFINE P1_MOV_R7_R2 13090600 ## mv s2, a2
-DEFINE P1_MOV_R2_R6 13860400 ## mv a2, s1
-DEFINE P1_MOV_R3_R7 93060900 ## mv a3, s2
-DEFINE P1_MOV_R2_R7 13060900 ## mv a2, s2
-DEFINE P1_MOV_R4_R7 13070900 ## mv a4, s2
-
-## LD/ST extras — additional dst/base pairings at signed imm12 offsets.
-DEFINE P1_LD_R0_R6_0 03B50400 ## ld a0, 0(s1)
-DEFINE P1_LD_R1_R6_16 83B50401 ## ld a1, 16(s1)
-DEFINE P1_LD_R3_R4_0 83360700 ## ld a3, 0(a4)
-DEFINE P1_LD_R0_R5_0 03B50700 ## ld a0, 0(a5)
-DEFINE P1_LB_R1_R4_0 83450700 ## lbu a1, 0(a4)
-DEFINE P1_ST_R2_R4_0 2330C700 ## sd a2, 0(a4)
-
-## ADD r2, r3, r1 — a2 = a3 + a1.
-DEFINE P1_ADD_R2_R3_R1 3386B600 ## add a2, a3, a1
-
-## BLT r0, r2, r7 — signed jump-if-less; used for argc/open/read checks.
-## bge a0, a2, +8 ; jalr x0, 0(s2)
+## ---- Reg-reg-reg arithmetic (tranche 1) --------------------------
+DEFINE P1_ADD_R1_R1_R2 B385C500
+DEFINE P1_ADD_R1_R1_R4 B385E500
+DEFINE P1_ADD_R2_R2_R6 33069600
+DEFINE P1_ADD_R2_R3_R1 3386B600
+DEFINE P1_SUB_R1_R1_R2 B385C540
+DEFINE P1_SUB_R2_R2_R6 33069640
+DEFINE P1_AND_R1_R1_R5 B3F5F500
+DEFINE P1_OR_R1_R1_R2 B3E5C500
+DEFINE P1_XOR_R1_R1_R2 B3C5C500
+DEFINE P1_MUL_R1_R1_R2 B385C502
+DEFINE P1_DIV_R1_R1_R2 B3C5C502
+DEFINE P1_REM_R1_R1_R5 B3E5F502
+DEFINE P1_SHL_R1_R1_R2 B395C500
+DEFINE P1_SHR_R1_R1_R2 B3D5C500
+DEFINE P1_SAR_R4_R4_R2 3357C740
+
+## ---- Immediate arithmetic (tranche 2): I-type, sign-extended imm12 ----
+DEFINE P1_ADDI_R1_R1_3 93853500  ## addi a1, a1, 3
+DEFINE P1_ADDI_R1_R1_1 93851500  ## addi a1, a1, 1
+DEFINE P1_ADDI_R1_R1_NEG3 9385D5FF  ## addi a1, a1, -3 (imm12 = 0xFFD)
+DEFINE P1_ADDI_R4_R4_NEG1 1307F7FF  ## addi a4, a4, -1 (imm12 = 0xFFF)
+DEFINE P1_ADDI_R1_R1_NEG2 9385E5FF  ## addi a1, a1, -2 (imm12 = 0xFFE)
+DEFINE P1_ADDI_R0_R0_1 13051500  ## addi a0, a0, 1
+DEFINE P1_SHLI_R1_R1_1 93951500  ## slli a1, a1, 1
+DEFINE P1_SHRI_R1_R1_1 93D51500  ## srli a1, a1, 1 (logical)
+DEFINE P1_SARI_R4_R4_1 13571740  ## srai a4, a4, 1 (arithmetic, bit30 set)
+DEFINE P1_ANDI_R1_R1_6 93F56500  ## andi a1, a1, 6
+DEFINE P1_ANDI_R1_R1_7 93F57500  ## andi a1, a1, 7
+DEFINE P1_ORI_R1_R1_1 93E51500  ## ori a1, a1, 1
+DEFINE P1_ORI_R0_R0_2 13652500  ## ori a0, a0, 2
+DEFINE P1_ORI_R0_R0_7 13657500  ## ori a0, a0, 7
+
+## ---- LA + memory ops (tranche 3): LA reads an inline literal; LD/ST use imm12 offsets
+DEFINE P1_LA_R4 170700000367C7006F008000  ## auipc a4, 0 ; lwu a4, 12(a4) ; jal x0, +8 — jal hops the 4-byte address literal at pc+12 (emitted after this macro; zero-extended by lwu)
+DEFINE P1_ST_R1_R4_0 2330B700  ## sd a1, 0(a4)
+DEFINE P1_LD_R1_R4_0 83350700  ## ld a1, 0(a4)
+DEFINE P1_ST_R1_R4_8 2334B700  ## sd a1, 8(a4)
+DEFINE P1_LD_R1_R4_8 83358700  ## ld a1, 8(a4)
+DEFINE P1_SB_R1_R4_16 2308B700  ## sb a1, 16(a4)
+DEFINE P1_LB_R1_R4_16 83450701  ## lbu a1, 16(a4)
+DEFINE P1_ST_R1_R4_NEG8 233CB7FE  ## sd a1, -8(a4) (imm12 = 0xFF8)
+DEFINE P1_LD_R1_R4_NEG8 833587FF  ## ld a1, -8(a4)
+
+## ---- Branches (tranche 4, r7-indirect): inverted native branch skips a `jalr x0, 0(s2)`, so the jump fires only when the P1 condition holds
+DEFINE P1_B 67000900  ## jalr x0, 0(s2) — unconditional jump to address in r7 (s2)
+DEFINE P1_BEQ_R2_R3_R7 6314D60067000900  ## bne a2, a3, +8 ; jalr x0, 0(s2)
+DEFINE P1_BNE_R2_R3_R7 6304D60067000900  ## beq a2, a3, +8 ; jalr x0, 0(s2)
+DEFINE P1_BLT_R2_R3_R7 6354D60067000900  ## bge a2, a3, +8 ; jalr x0, 0(s2) (signed)
+DEFINE P1_BLT_R4_R2_R7 6354C70067000900  ## bge a4, a2, +8 ; jalr x0, 0(s2) (signed)
+
+## ---- Control: CALL/RET + single-slot and N-slot PROLOGUE/EPILOGUE/TAIL.
+DEFINE P1_PROLOGUE 130101FF23301100  ## addi sp, sp, -16 ; sd ra, 0(sp) — ra + one spill slot at [sp+8]
+DEFINE P1_EPILOGUE 8330010013010101  ## ld ra, 0(sp) ; addi sp, sp, 16
+DEFINE P1_RET 67800000  ## jalr x0, 0(ra)
+DEFINE P1_CALL E7000900  ## jalr ra, 0(s2) — call through r7 (s2)
+DEFINE P1_TAIL 833001001301010167000900  ## ld ra, 0(sp) ; addi sp, sp, 16 ; jalr x0, 0(s2) — pop frame, tail-jump via r7
+DEFINE P1_PROLOGUE_N2 130101FE23301100  ## addi sp, sp, -32 ; sd ra, 0(sp) — ra + 2 slots (24B rounds up to 32 for 16-byte sp alignment)
+DEFINE P1_EPILOGUE_N2 8330010013010102  ## ld ra, 0(sp) ; addi sp, sp, 32
+DEFINE P1_TAIL_N2 833001001301010267000900  ## ld ra, 0(sp) ; addi sp, sp, 32 ; jalr x0, 0(s2)
+DEFINE P1_PROLOGUE_N3 130101FE23301100  ## addi sp, sp, -32 ; sd ra, 0(sp) — ra + 3 slots fill 32B exactly; same bytes as N2 by alignment, kept as a distinct name for intent
+DEFINE P1_EPILOGUE_N3 8330010013010102  ## ld ra, 0(sp) ; addi sp, sp, 32
+DEFINE P1_TAIL_N3 833001001301010267000900  ## ld ra, 0(sp) ; addi sp, sp, 32 ; jalr x0, 0(s2)
+DEFINE P1_PROLOGUE_N4 130101FD23301100  ## addi sp, sp, -48 ; sd ra, 0(sp) — ra + 4 slots (40B rounds up to 48)
+DEFINE P1_EPILOGUE_N4 8330010013010103  ## ld ra, 0(sp) ; addi sp, sp, 48
+DEFINE P1_TAIL_N4 833001001301010367000900  ## ld ra, 0(sp) ; addi sp, sp, 48 ; jalr x0, 0(s2)
+
+## ---- Seed-Lisp step 1 extensions (tranche 6): MOV = `mv` pseudo (addi rd, rs, 0)
+DEFINE P1_MOV_R1_R6 93850400  ## mv a1, s1
+DEFINE P1_MOV_R6_R1 93840500  ## mv s1, a1
+DEFINE P1_MOV_R6_R0 93040500  ## mv s1, a0
+DEFINE P1_MOV_R0_R3 13850600  ## mv a0, a3
+DEFINE P1_MOV_R7_R0 13090500  ## mv s2, a0
+DEFINE P1_MOV_R7_R2 13090600  ## mv s2, a2
+DEFINE P1_MOV_R2_R6 13860400  ## mv a2, s1
+DEFINE P1_MOV_R3_R7 93060900  ## mv a3, s2
+DEFINE P1_MOV_R2_R7 13060900  ## mv a2, s2
+DEFINE P1_MOV_R4_R7 13070900  ## mv a4, s2
+DEFINE P1_MOV_R2_SP 13060100  ## mv a2, sp
+DEFINE P1_MOV_R4_SP 13070100  ## mv a4, sp
+DEFINE P1_MOV_R6_SP 93040100  ## mv s1, sp
+DEFINE P1_MOV_R2_R0 13060500  ## mv a2, a0
+DEFINE P1_LD_R0_R6_0 03B50400  ## ld a0, 0(s1)
+DEFINE P1_LD_R1_R6_16 83B50401  ## ld a1, 16(s1)
+DEFINE P1_LD_R3_R4_0 83360700  ## ld a3, 0(a4)
+DEFINE P1_LD_R0_R5_0 03B50700  ## ld a0, 0(a5)
+DEFINE P1_LB_R1_R4_0 83450700  ## lbu a1, 0(a4)
+DEFINE P1_ST_R2_R4_0 2330C700  ## sd a2, 0(a4)
+DEFINE P1_ST_R0_R4_8 2334A700  ## sd a0, 8(a4)
+DEFINE P1_LD_R0_R4_8 03358700  ## ld a0, 8(a4)
+DEFINE P1_LB_R1_R0_0 83450500  ## lbu a1, 0(a0)
+DEFINE P1_LD_R0_R1_0 03B50500  ## ld a0, 0(a1)
+DEFINE P1_LD_R0_R1_8 03B58500  ## ld a0, 8(a1)
+DEFINE P1_ST_R1_R0_0 2330B500  ## sd a1, 0(a0)
+DEFINE P1_LD_R2_R4_0 03360700  ## ld a2, 0(a4)
+DEFINE P1_ST_R2_R0_8 2334C500  ## sd a2, 8(a0)
+DEFINE P1_LD_R0_R4_0 03350700  ## ld a0, 0(a4)
+DEFINE P1_ST_R2_R4_16 2338C700  ## sd a2, 16(a4)
+DEFINE P1_LD_R2_R4_16 03360701  ## ld a2, 16(a4)
DEFINE P1_BLT_R0_R2_R7 6354C50067000900
-
-
-## ---- Tranche 7: [sp+8] scratch-slot access + assorted MOV/LD/ST -------
-## P1.md §PROLOGUE defines [sp+8] as a callee-private scratch cell.
-## Access pattern: MOV rX, sp then LD/ST rY, [rX, 8].
-
-DEFINE P1_MOV_R4_SP 13070100 ## addi a4, sp, 0 (mv a4, sp)
-DEFINE P1_MOV_R2_R0 13060500 ## addi a2, a0, 0 (mv a2, a0)
-DEFINE P1_ST_R0_R4_8 2334A700 ## sd a0, 8(a4)
-DEFINE P1_LD_R0_R4_8 03358700 ## ld a0, 8(a4)
-DEFINE P1_LB_R1_R0_0 83450500 ## lbu a1, 0(a0)
-
-
-## ---- Tranche 8: seed-Lisp step 2 (tagged values) ---------------------
-## See p1_aarch64.M1 §Tranche 8.
-
-## addi a1, a1, -2 — sign-extended imm12 = 0xFFE.
-DEFINE P1_ADDI_R1_R1_NEG2 9385E5FF
-
-## Pair-cell loads/stores. Opcode 0x03 (LOAD), 0x23 (STORE), funct3=011 (d).
-DEFINE P1_LD_R0_R1_0 03B50500 ## ld a0, 0(a1)
-DEFINE P1_LD_R0_R1_8 03B58500 ## ld a0, 8(a1)
-DEFINE P1_ST_R1_R0_0 2330B500 ## sd a1, 0(a0)
-DEFINE P1_LD_R2_R4_0 03360700 ## ld a2, 0(a4)
-DEFINE P1_ST_R2_R0_8 2334C500 ## sd a2, 8(a0)
-
-## Tag ops — I-type (funct3=110 ori, 111 andi).
-DEFINE P1_ORI_R0_R0_2 13652500 ## ori a0, a0, 2
-DEFINE P1_ANDI_R1_R1_7 93F57500 ## andi a1, a1, 7
-
-## BNE pairs: beq a,b,+8 ; jalr x0,0(s2) (skip past JALR on equal-false).
-DEFINE P1_BNE_R1_R2_R7 6384C50067000900 ## beq a1,a2,+8 ; jalr x0,0(s2)
-DEFINE P1_BNE_R0_R2_R7 6304C50067000900 ## beq a0,a2,+8 ; jalr x0,0(s2)
-
-## heap_next boot-alignment helpers (see p1_aarch64.M1 §Tranche 8).
-DEFINE P1_ORI_R0_R0_7 13657500 ## ori a0, a0, 7
-DEFINE P1_ADDI_R0_R0_1 13051500 ## addi a0, a0, 1
-DEFINE P1_LD_R0_R4_0 03350700 ## ld a0, 0(a4)
+DEFINE P1_BNE_R1_R2_R7 6384C50067000900  ## beq a1, a2, +8 ; jalr x0, 0(s2) — inverted test skips the r7 jump
+DEFINE P1_BNE_R0_R2_R7 6304C50067000900  ## beq a0, a2, +8 ; jalr x0, 0(s2)