commit b9b5b0447f9729f86deac54ff643d8097c82636b
parent 49dcd6e7b024c3b4921442ba79db41bb510e5cc3
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Tue, 21 Apr 2026 08:16:46 -0700
build: only curdir is mounted into podman
Podman invocations now mount $(CURDIR):/work only. populate-upstream.sh
runs on the host, mirroring the files bootstrap.sh consumes
(hex0-seeds, hex0/1/2/catm/M0 sources, ELF-<arch>.hex2) from
$UPSTREAM/seed/stage0-posix/ into build/upstream/. PODMAN_BOOTSTRAP
collapses into the unified PODMAN.
arch/ is gone: the vendored (and subtly reformatted) ELF-*.hex2 files
are deleted — the link rule now reads them straight from
build/upstream/. p1_gen.py moves to src/ and writes its three
p1_<arch>.M1 defs files into build/<arch>/. Only original sources
live in curdir; everything derived or copied lands under build/.
Diffstat:
| M | .gitignore | | | 3 | --- |
| M | Makefile | | | 77 | +++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------- |
| M | README.md | | | 57 | ++++++++++++++++++++++++++------------------------------- |
| D | arch/ELF-aarch64.hex2 | | | 75 | --------------------------------------------------------------------------- |
| D | arch/ELF-amd64.hex2 | | | 74 | -------------------------------------------------------------------------- |
| D | arch/ELF-riscv64.hex2 | | | 74 | -------------------------------------------------------------------------- |
| D | arch/p1_gen.py | | | 1066 | ------------------------------------------------------------------------------- |
| M | bootstrap.sh | | | 19 | +++++++++++-------- |
| A | populate-upstream.sh | | | 48 | ++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | src/p1_gen.py | | | 1089 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
10 files changed, 1225 insertions(+), 1357 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1 @@
build/
-arch/p1_aarch64.M1
-arch/p1_amd64.M1
-arch/p1_riscv64.M1
diff --git a/Makefile b/Makefile
@@ -48,7 +48,11 @@ ifeq ($(PLATFORM),)
$(error ARCH '$(ARCH)' not supported — use aarch64, amd64, or riscv64)
endif
-UPSTREAM := $(abspath $(CURDIR)/../live-bootstrap)
+# Default upstream checkout path. Only consumed by populate-upstream.sh,
+# which runs on the host to populate build/upstream/ before the container
+# ever starts. Podman itself never mounts this — the container only ever
+# sees curdir, so all inputs must already live inside it.
+UPSTREAM ?= $(abspath $(CURDIR)/../live-bootstrap)
# Pinned to a manifest-list digest (not the :latest tag): podman on macOS
# only stores one image per tag locally, so cross-arch pulls under :latest
@@ -63,16 +67,22 @@ RUNTIME_IMAGE := public.ecr.aws/docker/library/alpine@sha256:5b10f432ef3da1b8d4c
OUT_DIR := build/$(ARCH)
TOOLS_DIR := $(OUT_DIR)/tools
-# Two container views:
-# PODMAN_BOOTSTRAP — toolchain build. Needs read-only access to stage0-posix
-# under ../live-bootstrap; writes only into build/$(ARCH)/tools.
-# PODMAN — assemble / link / run. Sees only the lispcc dir.
-PODMAN_BOOTSTRAP := podman run --rm --platform $(PLATFORM) \
- -v $(UPSTREAM):/work/live-bootstrap:ro \
- -v $(CURDIR):/work/lispcc \
- -w /work/lispcc \
- $(RUNTIME_IMAGE)
-
+# stage0-posix uses mixed-case arch dirs (AArch64, AMD64) that don't match
+# our lowercase ARCH. Map them so build/upstream/ mirrors upstream layout.
+ARCH_DIR_aarch64 := AArch64
+ARCH_DIR_amd64 := AMD64
+ARCH_DIR_riscv64 := riscv64
+ARCH_DIR := $(ARCH_DIR_$(ARCH))
+
+# Host-populated mirror of the upstream files we consume. Everything
+# bootstrap.sh needs (seeds, hex0/1/2 sources, catm, M0, ELF headers)
+# lands here before any podman work begins.
+UPSTREAM_DIR := build/upstream
+UPSTREAM_STAMP := $(UPSTREAM_DIR)/.stamp
+
+# Single podman view: curdir mounted at /work. Toolchain build, assembly,
+# link, and run all share this view. Keeping it narrow means nothing
+# outside the repo is visible to the container.
PODMAN := podman run --rm --platform $(PLATFORM) \
-v $(CURDIR):/work \
-w /work \
@@ -80,21 +90,36 @@ PODMAN := podman run --rm --platform $(PLATFORM) \
# --- Targets ---------------------------------------------------------------
-.PHONY: all toolchain run run-all test-lisp test-lisp-all clean
+.PHONY: all toolchain populate-upstream run run-all test-lisp test-lisp-all clean
all: $(OUT_DIR)/$(PROG)
toolchain: $(TOOLS_DIR)/M0
+populate-upstream: $(UPSTREAM_STAMP)
+
$(OUT_DIR) $(TOOLS_DIR):
mkdir -p $@
+# Mirror the upstream seed + hex0/1/2/catm/M0/ELF files we need from
+# $(UPSTREAM) into build/upstream/. Host-side so the container mount stays
+# minimal. The stamp doubles as an order marker and avoids re-copying on
+# every toolchain build.
+$(UPSTREAM_STAMP): populate-upstream.sh
+ sh populate-upstream.sh $(UPSTREAM)
+ @touch $@
+
+# Any file anyone asks for under build/upstream/ is produced by the stamp
+# rule above. Empty recipe — the file is already on disk once the stamp
+# exists, and the stamp's timestamp stands in for every file's freshness.
+$(UPSTREAM_DIR)/%: $(UPSTREAM_STAMP) ;
+
# Bootstrap M0, hex2-0, catm (and the throwaway hex0/hex1) from hex0-seed.
# One shot per arch — see bootstrap.sh for the phase-by-phase chain.
#
# Grouped target (&:) so all five outputs come from a single recipe run.
-$(TOOLS_DIR)/M0 $(TOOLS_DIR)/hex2-0 $(TOOLS_DIR)/catm $(TOOLS_DIR)/hex0 $(TOOLS_DIR)/hex1 &: bootstrap.sh | $(TOOLS_DIR)
- $(PODMAN_BOOTSTRAP) sh bootstrap.sh $(ARCH) /work/lispcc/$(TOOLS_DIR)
+$(TOOLS_DIR)/M0 $(TOOLS_DIR)/hex2-0 $(TOOLS_DIR)/catm $(TOOLS_DIR)/hex0 $(TOOLS_DIR)/hex1 &: bootstrap.sh $(UPSTREAM_STAMP) | $(TOOLS_DIR)
+ $(PODMAN) sh bootstrap.sh $(ARCH) /work/$(TOOLS_DIR)
# Assemble: lint first, then combine per-arch defs + program and feed to M0.
#
@@ -105,10 +130,10 @@ $(TOOLS_DIR)/M0 $(TOOLS_DIR)/hex2-0 $(TOOLS_DIR)/catm $(TOOLS_DIR)/hex0 $(TOOLS_
# M0 takes a single positional input (no -f flag), so we catm the two
# sources together first. The intermediate .combined.M1 is kept in OUT_DIR
# so it gets cleaned along with everything else.
-$(OUT_DIR)/$(PROG).hex2: $(PROG_SRC) arch/p1_$(ARCH).M1 lint.sh $(TOOLS_DIR)/M0 $(TOOLS_DIR)/catm | $(OUT_DIR)
- ./lint.sh arch/p1_$(ARCH).M1 $(PROG_SRC)
+$(OUT_DIR)/$(PROG).hex2: $(PROG_SRC) $(OUT_DIR)/p1_$(ARCH).M1 lint.sh $(TOOLS_DIR)/M0 $(TOOLS_DIR)/catm | $(OUT_DIR)
+ ./lint.sh $(OUT_DIR)/p1_$(ARCH).M1 $(PROG_SRC)
$(PODMAN) sh -ec ' \
- $(TOOLS_DIR)/catm $(OUT_DIR)/$(PROG).combined.M1 arch/p1_$(ARCH).M1 $(PROG_SRC) ; \
+ $(TOOLS_DIR)/catm $(OUT_DIR)/$(PROG).combined.M1 $(OUT_DIR)/p1_$(ARCH).M1 $(PROG_SRC) ; \
$(TOOLS_DIR)/M0 $(OUT_DIR)/$(PROG).combined.M1 $(OUT_DIR)/$(PROG).hex2'
# Link: prepend the ELF header and feed to hex2-0.
@@ -117,9 +142,9 @@ $(OUT_DIR)/$(PROG).hex2: $(PROG_SRC) arch/p1_$(ARCH).M1 lint.sh $(TOOLS_DIR)/M0
# base address 0x00600000 (no --base-address flag), which is why the ELF
# header references `&ELF_base` symbolically rather than baking in a
# concrete VA — the header travels to whatever base the linker chose.
-$(OUT_DIR)/$(PROG): $(OUT_DIR)/$(PROG).hex2 arch/ELF-$(ARCH).hex2 $(TOOLS_DIR)/hex2-0 $(TOOLS_DIR)/catm
+$(OUT_DIR)/$(PROG): $(OUT_DIR)/$(PROG).hex2 $(UPSTREAM_DIR)/$(ARCH_DIR)/ELF-$(ARCH).hex2 $(TOOLS_DIR)/hex2-0 $(TOOLS_DIR)/catm
$(PODMAN) sh -ec ' \
- $(TOOLS_DIR)/catm $(OUT_DIR)/$(PROG).linked.hex2 arch/ELF-$(ARCH).hex2 $(OUT_DIR)/$(PROG).hex2 ; \
+ $(TOOLS_DIR)/catm $(OUT_DIR)/$(PROG).linked.hex2 $(UPSTREAM_DIR)/$(ARCH_DIR)/ELF-$(ARCH).hex2 $(OUT_DIR)/$(PROG).hex2 ; \
$(TOOLS_DIR)/hex2-0 $(OUT_DIR)/$(PROG).linked.hex2 $(OUT_DIR)/$(PROG)'
run: $(OUT_DIR)/$(PROG)
@@ -168,12 +193,12 @@ test-lisp-all:
$(MAKE) --no-print-directory ARCH=riscv64 test-lisp
clean:
- rm -rf build/ arch/p1_aarch64.M1 arch/p1_amd64.M1 arch/p1_riscv64.M1
+ rm -rf build/
-# Generate all three per-arch DEFINE tables from arch/p1_gen.py in a
+# Generate all three per-arch DEFINE tables from src/p1_gen.py in a
# single shot. Grouped target (&:) because p1_gen.py writes all three
-# files unconditionally (next to itself, i.e. into arch/). These are
-# build artifacts — gitignored; the build regenerates them on any
-# p1_gen.py edit so there's no staleness risk.
-arch/p1_aarch64.M1 arch/p1_amd64.M1 arch/p1_riscv64.M1 &: arch/p1_gen.py
- python3 arch/p1_gen.py
+# files unconditionally. Output lands under build/<arch>/ (build/ is
+# wiped by clean, so the build regenerates on any p1_gen.py edit with
+# no staleness risk).
+build/aarch64/p1_aarch64.M1 build/amd64/p1_amd64.M1 build/riscv64/p1_riscv64.M1 &: src/p1_gen.py
+ python3 src/p1_gen.py build
diff --git a/README.md b/README.md
@@ -10,50 +10,45 @@ Goal is a 4–6× shrink in auditable LOC. See [docs/PLAN.md](docs/PLAN.md).
Stage 0: hello-world in the P1 portable pseudo-ISA (see [docs/P1.md](docs/P1.md)),
assembled and run inside a pristine alpine container on all three target
arches (aarch64, amd64, riscv64). The same `tests/hello.M1` source assembles
-for every arch; only the backing `arch/p1_<arch>.M1` defs file varies.
-Toolchain (M1, hex2) builds statically from the upstream mescc-tools C
-source.
+for every arch; only the backing `build/<arch>/p1_<arch>.M1` defs file
+varies. Toolchain (M1, hex2) builds statically from the upstream mescc-tools
+C source.
## Layout
```
-docs/ design docs (PLAN, SEED, P1, C1, LISP)
-src/ real programs (lisp.M1, kaem-minimal.M1)
-tests/ smoke programs (hello.M1, demo.M1) + fixtures
- lisp/ lisp test fixtures (*.scm + *.expected)
- kaem.run smoke input for kaem-minimal
-arch/ per-arch defs + ELF headers
- p1_gen.py generator for p1_<arch>.M1
- p1_<arch>.M1 per-arch P1 defs (gitignored, generated)
- ELF-<arch>.hex2 per-arch ELF header template
-bootstrap.sh hex0-seed → M0/hex2-0/catm toolchain build
-lint.sh M1 undefined-token guard
-Makefile podman-driven build, ARCH-parameterized
-build/<arch>/ per-arch outputs + toolchain
+docs/ design docs (PLAN, SEED, P1, C1, LISP)
+src/ real programs (lisp.M1, kaem-minimal.M1) + p1_gen.py
+tests/ smoke programs (hello.M1, demo.M1) + fixtures
+ lisp/ lisp test fixtures (*.scm + *.expected)
+ kaem.run smoke input for kaem-minimal
+bootstrap.sh hex0-seed → M0/hex2-0/catm toolchain build
+populate-upstream.sh host-side copy of upstream seeds + sources into build/upstream/
+lint.sh M1 undefined-token guard
+Makefile podman-driven build, ARCH-parameterized
+build/ all derived artifacts (gitignored)
+ upstream/ mirror of the files bootstrap.sh consumes from live-bootstrap
+ <arch>/ per-arch outputs
+ tools/ bootstrapped M0, hex2-0, catm (+ throwaway hex0/hex1)
+ p1_<arch>.M1 generated P1 defs
+ <prog> final ELF binary
```
## Build & run
-Requires podman. Non-native arches run via podman's binfmt + qemu-user
-path (works transparently on a default `podman machine` setup).
+Requires podman. Uses Alpine as the host. Non-native arches run via podman's
+binfmt + qemu-user path (works transparently on a default `podman machine`
+setup).
```
-make image # one-time: build the alpine+gcc builder image
-make # default ARCH=aarch64 → build/aarch64/hello
-make ARCH=amd64 # build/amd64/hello
-make ARCH=riscv64 # build/riscv64/hello
-make run # run build/$(ARCH)/hello in pristine alpine
make run-all # build + run on all three arches
make clean # wipe build/
```
-Two images are used: `lispcc-builder` (alpine+gcc, ~184 MB) only compiles
-M1/hex2 at host arch; `alpine:latest` pulled per target platform runs
-the assembled binary with the static toolchain mounted in.
-
## Source layout assumption
-The Makefile reaches the upstream mescc-tools C source via the parent dir
-mount (`HOST_ROOT := $(abspath $(CURDIR)/..)`), expecting
-`../live-bootstrap/seed/stage0-posix/mescc-tools/`. Override `TOOLCHAIN_SRC`
-in the Makefile if your layout differs.
+`populate-upstream.sh` runs on the host and mirrors the files bootstrap.sh
+needs from `$UPSTREAM/seed/stage0-posix/` into `build/upstream/`; the
+default is `../live-bootstrap`. Override by invoking `make UPSTREAM=/path
+populate-upstream`. Podman itself only ever mounts curdir, so everything
+the container sees must live inside the repo.
diff --git a/arch/ELF-aarch64.hex2 b/arch/ELF-aarch64.hex2
@@ -1,75 +0,0 @@
-### Copyright (C) 2016 Jeremiah Orians
-### Copyright (C) 2017 Jan Nieuwenhuizen <janneke@gnu.org>
-### Copyright (C) 2020 deesix <deesix@tuta.io>
-### This file is part of M2-Planet.
-###
-### M2-Planet is free software: you can redistribute it and/or modify
-### it under the terms of the GNU General Public License as published by
-### the Free Software Foundation, either version 3 of the License, or
-### (at your option) any later version.
-###
-### M2-Planet is distributed in the hope that it will be useful,
-### but WITHOUT ANY WARRANTY; without even the implied warranty of
-### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-### GNU General Public License for more details.
-###
-### You should have received a copy of the GNU General Public License
-### along with M2-Planet. If not, see <http://www.gnu.org/licenses/>.
-
-### stage0's hex2 format
-### !<label> 1 byte relative
-### $<label> 2 byte address
-### @<label> 2 byte relative
-### &<label> 4 byte address
-### %<label> 4 byte relative
-
-### if you wish to use this header, you need to add :ELF_end to the end of your
-### M1 or hex2 files.
-
-## ELF Header
-
-:ELF_base
-7F 45 4C 46 # e_ident[EI_MAG0-3] ELF's magic number
-
-02 # e_ident[EI_CLASS] Indicating 64 bit
-01 # e_ident[EI_DATA] Indicating little endianness
-01 # e_ident[EI_VERSION] Indicating original elf
-
-03 # e_ident[EI_OSABI] Set at 3 because FreeBSD is strict
-00 # e_ident[EI_ABIVERSION] See above
-
-00 00 00 00 00 00 00 # e_ident[EI_PAD]
-
-02 00 # e_type Indicating Executable
-B7 00 # e_machine Indicating AArch64
-01 00 00 00 # e_version Indicating original elf
-
-&_start 00 00 00 00 # e_entry Address of the entry point
-%ELF_program_headers>ELF_base 00 00 00 00 # e_phoff Address of program header table
-00 00 00 00 00 00 00 00 # e_shoff Address of section header table
-
-00 00 00 00 # e_flags
-
-40 00 # e_ehsize Indicating our 64 Byte header
-
-38 00 # e_phentsize size of a program header table
-01 00 # e_phnum number of entries in program table
-
-00 00 # e_shentsize size of a section header table
-00 00 # e_shnum number of entries in section table
-
-00 00 # e_shstrndx index of the section names
-
-
-:ELF_program_headers
-:ELF_program_header__text
-01 00 00 00 # ph_type: PT-LOAD = 1
-07 00 00 00 # ph_flags: PF-X|PF-W|PF-R = 7
-00 00 00 00 00 00 00 00 # ph_offset
-&ELF_base 00 00 00 00 # ph_vaddr
-&ELF_base 00 00 00 00 # ph_physaddr
-%ELF_end>ELF_base 00 00 00 00 # ph_filesz
-%ELF_end>ELF_base 00 00 00 00 # ph_memsz
-01 00 00 00 00 00 00 00 # ph_align
-
-:ELF_text
diff --git a/arch/ELF-amd64.hex2 b/arch/ELF-amd64.hex2
@@ -1,74 +0,0 @@
-### Copyright (C) 2016 Jeremiah Orians
-### Copyright (C) 2017 Jan Nieuwenhuizen <janneke@gnu.org>
-### This file is part of M2-Planet.
-###
-### M2-Planet is free software: you can redistribute it and/or modify
-### it under the terms of the GNU General Public License as published by
-### the Free Software Foundation, either version 3 of the License, or
-### (at your option) any later version.
-###
-### M2-Planet is distributed in the hope that it will be useful,
-### but WITHOUT ANY WARRANTY; without even the implied warranty of
-### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-### GNU General Public License for more details.
-###
-### You should have received a copy of the GNU General Public License
-### along with M2-Planet. If not, see <http://www.gnu.org/licenses/>.
-
-### stage0's hex2 format
-### !<label> 1 byte relative
-### $<label> 2 byte address
-### @<label> 2 byte relative
-### &<label> 4 byte address
-### %<label> 4 byte relative
-
-### if you wish to use this header, you need to add :ELF_end to the end of your
-### M1 or hex2 files.
-
-## ELF Header
-
-:ELF_base
-7F 45 4C 46 # e_ident[EI_MAG0-3] ELF's magic number
-
-02 # e_ident[EI_CLASS] Indicating 64 bit
-01 # e_ident[EI_DATA] Indicating little endianness
-01 # e_ident[EI_VERSION] Indicating original elf
-
-03 # e_ident[EI_OSABI] Set at 3 because FreeBSD is strict
-00 # e_ident[EI_ABIVERSION] See above
-
-00 00 00 00 00 00 00 # e_ident[EI_PAD]
-
-02 00 # e_type Indicating Executable
-3E 00 # e_machine Indicating AMD64
-01 00 00 00 # e_version Indicating original elf
-
-&_start 00 00 00 00 # e_entry Address of the entry point
-%ELF_program_headers>ELF_base 00 00 00 00 # e_phoff Address of program header table
-00 00 00 00 00 00 00 00 # e_shoff Address of section header table
-
-00 00 00 00 # e_flags
-
-40 00 # e_ehsize Indicating our 64 Byte header
-
-38 00 # e_phentsize size of a program header table
-01 00 # e_phnum number of entries in program table
-
-00 00 # e_shentsize size of a section header table
-00 00 # e_shnum number of entries in section table
-
-00 00 # e_shstrndx index of the section names
-
-
-:ELF_program_headers
-:ELF_program_header__text
-01 00 00 00 # ph_type: PT-LOAD = 1
-07 00 00 00 # ph_flags: PF-X|PF-W|PF-R = 7
-00 00 00 00 00 00 00 00 # ph_offset
-&ELF_base 00 00 00 00 # ph_vaddr
-&ELF_base 00 00 00 00 # ph_physaddr
-%ELF_end>ELF_base 00 00 00 00 # ph_filesz
-%ELF_end>ELF_base 00 00 00 00 # ph_memsz
-01 00 00 00 00 00 00 00 # ph_align
-
-:ELF_text
diff --git a/arch/ELF-riscv64.hex2 b/arch/ELF-riscv64.hex2
@@ -1,74 +0,0 @@
-### Copyright (C) 2016 Jeremiah Orians
-### Copyright (C) 2017 Jan Nieuwenhuizen <janneke@gnu.org>
-### This file is part of M2-Planet.
-###
-### M2-Planet is free software: you can redistribute it and/or modify
-### it under the terms of the GNU General Public License as published by
-### the Free Software Foundation, either version 3 of the License, or
-### (at your option) any later version.
-###
-### M2-Planet is distributed in the hope that it will be useful,
-### but WITHOUT ANY WARRANTY; without even the implied warranty of
-### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-### GNU General Public License for more details.
-###
-### You should have received a copy of the GNU General Public License
-### along with M2-Planet. If not, see <http://www.gnu.org/licenses/>.
-
-### stage0's hex2 format
-### !<label> 1 byte relative
-### $<label> 2 byte address
-### @<label> 2 byte relative
-### &<label> 4 byte address
-### %<label> 4 byte relative
-
-### if you wish to use this header, you need to add :ELF_end to the end of your
-### M1 or hex2 files.
-
-## ELF Header
-
-:ELF_base
-7F 45 4C 46 # e_ident[EI_MAG0-3] ELF's magic number
-
-02 # e_ident[EI_CLASS] Indicating 64 bit
-01 # e_ident[EI_DATA] Indicating little endianness
-01 # e_ident[EI_VERSION] Indicating original elf
-
-03 # e_ident[EI_OSABI] Set at 3 because FreeBSD is strict
-00 # e_ident[EI_ABIVERSION] See above
-
-00 00 00 00 00 00 00 # e_ident[EI_PAD]
-
-02 00 # e_type Indicating Executable
-F3 00 # e_machine Indicating RISC-V
-01 00 00 00 # e_version Indicating original elf
-
-&_start 00 00 00 00 # e_entry Address of the entry point
-%ELF_program_headers>ELF_base 00 00 00 00 # e_phoff Address of program header table
-00 00 00 00 00 00 00 00 # e_shoff Address of section header table
-
-00 00 00 00 # e_flags
-
-40 00 # e_ehsize Indicating our 64 Byte header
-
-38 00 # e_phentsize size of a program header table
-01 00 # e_phnum number of entries in program table
-
-00 00 # e_shentsize size of a section header table
-00 00 # e_shnum number of entries in section table
-
-00 00 # e_shstrndx index of the section names
-
-
-:ELF_program_headers
-:ELF_program_header__text
-01 00 00 00 # ph_type: PT-LOAD = 1
-07 00 00 00 # ph_flags: PF-X|PF-W|PF-R = 7
-00 00 00 00 00 00 00 00 # ph_offset
-&ELF_base 00 00 00 00 # ph_vaddr
-&ELF_base 00 00 00 00 # ph_physaddr
-%ELF_end>ELF_base 00 00 00 00 # ph_filesz
-%ELF_end>ELF_base 00 00 00 00 # ph_memsz
-01 00 00 00 00 00 00 00 # ph_align
-
-:ELF_text
diff --git a/arch/p1_gen.py b/arch/p1_gen.py
@@ -1,1066 +0,0 @@
-#!/usr/bin/env python3
-"""p1_gen.py — generate p1_<arch>.M1 from a per-arch encoder table.
-
-Single source of truth for the P1 DEFINE tables across all three target
-arches. Running this script rewrites p1_aarch64.M1, p1_amd64.M1, and
-p1_riscv64.M1 in place.
-
-Structure:
- * Low-level native encoders (amd_*, aa_*, rv_*) — one bank of
- helpers per arch.
- * Encoder classes AA64/AMD64/RV64 (subclasses of Encoder): one
- method per P1 op category, lowering (op, reg-tuple, imm) into
- native hex. Each arch's encoder is a coherent bundle — adding a
- new op means one new method on each of the three.
- * Op dataclasses — thin rows holding the DEFINE's name + data.
- Op.encode(enc) dispatches into enc.<op-method>() with the Op's
- fields unpacked. No per-arch branching lives in Op classes.
- * rows() — builds the output list. Non-RRR ops are emitted as the
- full register product × a curated imm/offset/shamt set. RRR
- keeps an explicit table (the full 8³ cube is 5.6k entries per
- arch, >99% dead weight). Adding a new RRR triple or a new imm
- value is a one-line edit to rows(); a new register combination
- for any other op needs no edit at all.
- * emit(arch) / main — iterate rows, ask the arch's encoder to
- lower each, write out the defs file.
-
-Running:
- $ python3 p1_gen.py # rewrite all three files
- $ python3 p1_gen.py --check # diff against current files
-"""
-
-import os
-import sys
-from dataclasses import dataclass
-from itertools import product
-from typing import Optional
-
-ARCHES = ('aarch64', 'amd64', 'riscv64')
-
-## P1 GPRs (the 8 caller/callee-split registers exposed to P1 source).
-P1_REGS = ('r0', 'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7')
-
-## ---------- Register mappings --------------------------------------------
-## P1 register name → native encoding number. The native numbers are what
-## the per-arch encoders insert into instruction fields; the human-facing
-## names (rax, x1, a2, …) never appear in this file.
-
-## 4:4 caller/callee-saved split. r0–r3 caller (native argregs); r4–r7
-## callee (native callee-saved). `br` is the hidden branch-target scratch
-## (not a P1 reg) — picked so every op's expansion clobbers only what its
-## name declares.
-NAT_AA64 = {'r0': 0, 'r1': 1, 'r2': 2, 'r3': 3,
- 'r4': 26, 'r5': 27, 'r6': 19, 'r7': 20,
- 'br': 17, # x17 (IP1, caller-saved linker scratch)
- 'sp': 31, 'xzr': 31, 'lr': 30,
- 'x21': 21, 'x22': 22, 'x23': 23, 'x8': 8}
-
-## amd64 ModRM.reg/rm + REX.R/B bit: native regnums 0..15 with r8..r15
-## setting the REX bit. We store the 4-bit native number directly.
-NAT_AMD64 = {'r0': 0, # rax
- 'r1': 7, # rdi
- 'r2': 6, # rsi
- 'r3': 2, # rdx
- 'r4': 13, # r13 (callee-saved)
- 'r5': 14, # r14 (callee-saved)
- 'r6': 3, # rbx
- 'r7': 12, # r12
- 'br': 11, # r11 — branch/call target scratch + DIV/REM r0 save
- 'sp': 4, # rsp
- 'rcx': 1, # shift-count scratch + DIV/REM rdx save (not a P1 reg)
- 'r10': 10, # syscall arg4 slot (not a P1 reg)
- 'r8': 8, # syscall arg5 slot (not a P1 reg)
- 'r9': 9, # syscall arg6 slot (not a P1 reg)
- 'r11': 11, # alias for br (some expansions spell it r11 directly)
- }
-
-NAT_RV64 = {'r0': 10, 'r1': 11, 'r2': 12, 'r3': 13,
- 'r4': 20, 'r5': 21, 'r6': 9, 'r7': 18,
- 'br': 30, # t5 (caller-saved temp)
- 'sp': 2, 'ra': 1, 'zero': 0, 'a7': 17,
- 's3': 19, 's6': 22, 's7': 23}
-
-
-## ---------- Low-level encoding helpers -----------------------------------
-
-def le32(n: int) -> str:
- return (n & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
-
-def byte(n: int) -> str:
- return f'{n & 0xFF:02X}'
-
-
-## ---------- amd64 primitive encoders ------------------------------------
-## amd64 is variable-length. Helpers below emit specific instruction
-## shapes used by the P1 expansions. REX prefix bits: W=64b, R=ModRM.reg
-## high, B=ModRM.rm high, X=SIB.index high (unused here).
-
-def rex(w, r, x, b):
- v = 0x40 | (w << 3) | (r << 2) | (x << 1) | b
- return byte(v)
-
-def modrm(mod, reg, rm):
- return byte((mod << 6) | ((reg & 7) << 3) | (rm & 7))
-
-def amd_mov_rr(dst, src):
- """mov dst, src — REX.W + 89 /r (MOV r/m64, r64)."""
- d, s = NAT_AMD64[dst], NAT_AMD64[src]
- return rex(1, s >> 3, 0, d >> 3) + '89' + modrm(3, s, d)
-
-def amd_alu_rr(op, dst, src):
- """op dst, src — 2-operand ALU. op is the opcode byte (01 add,
- 29 sub, 21 and, 09 or, 31 xor)."""
- d, s = NAT_AMD64[dst], NAT_AMD64[src]
- return rex(1, s >> 3, 0, d >> 3) + op + modrm(3, s, d)
-
-def amd_alu_ri8(ext, dst, imm):
- """op dst, imm8 (sign-extended). Opcode 83 /ext ib."""
- d = NAT_AMD64[dst]
- return rex(1, 0, 0, d >> 3) + '83' + modrm(3, ext, d) + byte(imm)
-
-def amd_alu_ri32(ext, dst, imm):
- """op dst, imm32 (sign-extended). Opcode 81 /ext id. Used when
- an immediate doesn't fit in the imm8 form (e.g., ADDI with
- values outside [-128, 127])."""
- d = NAT_AMD64[dst]
- imm_le = (imm & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
- return rex(1, 0, 0, d >> 3) + '81' + modrm(3, ext, d) + imm_le
-
-def amd_shift_ri8(ext, dst, imm):
- """shl/shr/sar dst, imm8. Opcode C1 /ext ib."""
- d = NAT_AMD64[dst]
- return rex(1, 0, 0, d >> 3) + 'C1' + modrm(3, ext, d) + byte(imm)
-
-def amd_shift_cl(ext, dst):
- """shl/shr/sar dst, cl. Opcode D3 /ext."""
- d = NAT_AMD64[dst]
- return rex(1, 0, 0, d >> 3) + 'D3' + modrm(3, ext, d)
-
-def amd_imul_rr(dst, src):
- """imul dst, src — 0F AF /r."""
- d, s = NAT_AMD64[dst], NAT_AMD64[src]
- return rex(1, d >> 3, 0, s >> 3) + '0FAF' + modrm(3, d, s)
-
-def amd_idiv(src):
- """idiv src — F7 /7 (signed div of rdx:rax by src)."""
- s = NAT_AMD64[src]
- return rex(1, 0, 0, s >> 3) + 'F7' + modrm(3, 7, s)
-
-def amd_cqo():
- """cqo — sign-extend rax into rdx:rax. 48 99."""
- return '4899'
-
-def amd_mem_rm(opcode, reg, base, disp):
- """[base+disp] <-> reg, for MOV r,r/m or MOV r/m,r (opcode=89 store, 8B load).
- disp is signed int; encodes as disp8 if in range, else disp32."""
- r, b = NAT_AMD64[reg], NAT_AMD64[base]
- prefix = rex(1, r >> 3, 0, b >> 3) + opcode
- if -128 <= disp <= 127:
- mod = 1
- d = byte(disp)
- elif b == 4: # SIB required for rsp
- mod = 2
- d = (disp & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
- else:
- mod = 2
- d = (disp & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
- # rsp as base requires SIB byte (rm=4 with no SIB is rip-relative).
- if b == 4:
- return prefix + modrm(mod, r, 4) + '24' + d
- return prefix + modrm(mod, r, b) + d
-
-def amd_mov_rm_b(reg, base, disp, store):
- """Byte load/store. 88 /r (store), 0F B6 /r (movzx load)."""
- r, b = NAT_AMD64[reg], NAT_AMD64[base]
- if -128 <= disp <= 127:
- mod = 1
- d = byte(disp)
- else:
- mod = 2
- d = (disp & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
- if store:
- # MOV r/m8, r8 — 88 /r. Requires REX to address dil/sil/bpl/spl.
- prefix = rex(1, r >> 3, 0, b >> 3) + '88'
- sib = '24' if b == 4 else ''
- rmv = 4 if b == 4 else b
- return prefix + modrm(mod, r, rmv) + sib + d
- else:
- # MOVZX r64, r/m8 — REX.W 0F B6 /r.
- prefix = rex(1, r >> 3, 0, b >> 3) + '0FB6'
- sib = '24' if b == 4 else ''
- rmv = 4 if b == 4 else b
- return prefix + modrm(mod, r, rmv) + sib + d
-
-
-## ---------- aarch64 primitive encoders ----------------------------------
-## aarch64 is fixed 4-byte insns. Helpers return the 4 bytes LE-encoded.
-
-def aa_rrr(base, rD, rA, rB):
- d, a, b = NAT_AA64[rD], NAT_AA64[rA], NAT_AA64[rB]
- return le32(base | (b << 16) | (a << 5) | d)
-
-def aa_add_imm(rD, rA, imm12, sub=False):
- """ADD/SUB (immediate, shift=0). imm12 unsigned 0..4095."""
- d, a = NAT_AA64[rD], NAT_AA64[rA]
- base = 0xD1000000 if sub else 0x91000000
- return le32(base | ((imm12 & 0xFFF) << 10) | (a << 5) | d)
-
-def aa_logical_imm(base, rD, rA, N, immr, imms):
- d, a = NAT_AA64[rD], NAT_AA64[rA]
- return le32(base | (N << 22) | (immr << 16) | (imms << 10) | (a << 5) | d)
-
-def aa_ubfm(rD, rA, immr, imms):
- """UBFM (N=1 for sf=64)."""
- d, a = NAT_AA64[rD], NAT_AA64[rA]
- return le32(0xD3400000 | (immr << 16) | (imms << 10) | (a << 5) | d)
-
-def aa_sbfm(rD, rA, immr, imms):
- """SBFM (N=1 for sf=64)."""
- d, a = NAT_AA64[rD], NAT_AA64[rA]
- return le32(0x93400000 | (immr << 16) | (imms << 10) | (a << 5) | d)
-
-def aa_ldst_uimm12(base, rT, rN, off_bytes, size_log2):
- """LDR/STR (unsigned offset). off_bytes must be a multiple of
- 2^size_log2 and non-negative. imm12 = off_bytes >> size_log2."""
- assert off_bytes >= 0 and (off_bytes % (1 << size_log2)) == 0
- imm12 = off_bytes >> size_log2
- assert 0 <= imm12 < 4096
- t, n = NAT_AA64[rT], NAT_AA64[rN]
- return le32(base | (imm12 << 10) | (n << 5) | t)
-
-def aa_ldst_unscaled(base, rT, rN, off):
- """LDUR/STUR (unscaled, signed imm9). Handles arbitrary small
- offsets — negative, or positive-but-not-a-multiple-of-the-access-
- size (e.g. LD at offset 7). imm9 range is [-256, 255]."""
- assert -256 <= off <= 255
- imm9 = off & 0x1FF
- t, n = NAT_AA64[rT], NAT_AA64[rN]
- return le32(base | (imm9 << 12) | (n << 5) | t)
-
-
-## ---------- riscv64 primitive encoders ----------------------------------
-
-def rv_r(base, rD, rA, rB):
- d, a, b = NAT_RV64[rD], NAT_RV64[rA], NAT_RV64[rB]
- return le32(base | (b << 20) | (a << 15) | (d << 7))
-
-def rv_i(base, rD, rA, imm12):
- """I-type: imm12[11:0], rs1, funct3, rd, opcode. imm12 is a signed
- int that gets masked to 12 bits."""
- d, a = NAT_RV64[rD], NAT_RV64[rA]
- return le32(base | ((imm12 & 0xFFF) << 20) | (a << 15) | (d << 7))
-
-def rv_s(base, rS, rA, imm12):
- """S-type store: imm12[11:5] rs2 rs1 funct3 imm12[4:0] opcode."""
- s, a = NAT_RV64[rS], NAT_RV64[rA]
- hi = (imm12 >> 5) & 0x7F
- lo = imm12 & 0x1F
- return le32(base | (hi << 25) | (s << 20) | (a << 15) | (lo << 7))
-
-def rv_shift_imm(base, rD, rA, shamt):
- """Shift-imm: base already has funct7 set; shamt in [0,63]."""
- d, a = NAT_RV64[rD], NAT_RV64[rA]
- return le32(base | ((shamt & 0x3F) << 20) | (a << 15) | (d << 7))
-
-
-## ---------- Per-arch op base tables -------------------------------------
-
-AA64_RRR_BASE = {
- 'ADD': 0x8B000000,
- 'SUB': 0xCB000000,
- 'AND': 0x8A000000,
- 'OR': 0xAA000000,
- 'XOR': 0xCA000000,
- 'SHL': 0x9AC02000,
- 'SHR': 0x9AC02400,
- 'SAR': 0x9AC02800,
- 'DIV': 0x9AC00C00,
-}
-AMD64_RRR_OPC = {
- 'ADD': '01', 'SUB': '29', 'AND': '21', 'OR': '09', 'XOR': '31',
-}
-RV_RRR = {
- 'ADD': 0x00000033, # funct7=0 funct3=0 opcode=0x33
- 'SUB': 0x40000033,
- 'XOR': 0x00004033,
- 'OR': 0x00006033,
- 'AND': 0x00007033,
- 'SHL': 0x00001033,
- 'SHR': 0x00005033,
- 'SAR': 0x40005033,
- 'MUL': 0x02000033,
- 'DIV': 0x02004033,
- 'REM': 0x02006033,
-}
-
-
-## aarch64 bitmask-immediate encoding for ANDI/ORI. Entries are the
-## (N, immr, imms) triples that encode each small imm as an aarch64
-## "logical immediate." Computed by hand because the full encoding
-## algorithm (contiguous-run + rotation for element sizes
-## 2/4/8/16/32/64) is substantial and we only need a handful of
-## values. Extend this table if a new imm shows up in P1 source.
-AA64_LOGI_ENC = {
- 1: (1, 0, 0), # 0b0001 — single bit at position 0
- 2: (1, 63, 0), # 0b0010 — single bit at position 1
- 3: (1, 0, 1), # 0b0011 — 2 contiguous ones
- 4: (1, 62, 0), # 0b0100 — single bit at position 2
- 6: (1, 63, 1), # 0b0110 — 2 ones rotated by 1
- 7: (1, 0, 2), # 0b0111 — 3 contiguous ones
- 8: (1, 61, 0), # 0b1000 — single bit at position 3
-}
-
-
-## Frame layout after PROLOGUE_Nk (k >= 1, rounded up so total frame
-## bytes stay 16-byte aligned on aarch64):
-## [sp + 0] = retaddr (aarch64 lr / riscv64 ra / amd64 retaddr)
-## [sp + 8] = slot 1 (callee-private scratch)
-## [sp + 16] = slot 2
-## ...
-## [sp + 8*k] = slot k
-##
-## Frame size = round_up_to_16(8 + 8*k). So k=1 → 16, k=2 → 24 → 32,
-## k=3 → 32, k=4 → 40 → 48.
-
-def prologue_frame_bytes(k: int) -> int:
- raw = 8 + 8 * k
- return (raw + 15) & ~15
-
-
-## ---------- Encoders ----------------------------------------------------
-## One class per arch. Each provides one method per P1 op category,
-## mapping (op, reg-tuple, imm) to native bytes. Op classes dispatch
-## here via `Op.encode(enc)` → `enc.<method>(fields)`.
-
-class Encoder:
- """Per-arch encoder base. Subclasses implement one method per
- op category. `arch` is used by literal() to pick the right
- pre-encoded bytes from an arch-keyed dict."""
- arch = ''
-
- def literal(self, hex_by_arch):
- return hex_by_arch[self.arch]
-
-
-class AA64(Encoder):
- arch = 'aarch64'
-
- def rrr(self, op, rD, rA, rB):
- if op == 'MUL':
- # MUL = MADD with Ra=xzr. 100 11011 000 mmmmm 0 aaaaa nnnnn ddddd
- d = NAT_AA64[rD]; a = NAT_AA64[rA]; b = NAT_AA64[rB]
- return le32(0x9B000000 | (b << 16) | (31 << 10) | (a << 5) | d)
- if op == 'REM':
- # SDIV x16, xA, xB ; MSUB xD, x16, xB, xA.
- # x16 (ARM IP0, caller-saved, not a P1 reg) is scratch so
- # REM does not hidden-clobber P1 r4 — the op modifies rD only.
- # MSUB needs bit 15 set (o0=1); without it it decodes as
- # MADD and REM returns A + (A/B)*B.
- d = NAT_AA64[rD]; a = NAT_AA64[rA]; b = NAT_AA64[rB]
- SC = 16
- sdiv = 0x9AC00C00 | (b << 16) | (a << 5) | SC
- msub = 0x9B008000 | (b << 16) | (a << 10) | (SC << 5) | d
- return le32(sdiv) + le32(msub)
- return aa_rrr(AA64_RRR_BASE[op], rD, rA, rB)
-
- def addi(self, rD, rA, imm):
- if imm >= 0:
- return aa_add_imm(rD, rA, imm, sub=False)
- return aa_add_imm(rD, rA, -imm, sub=True)
-
- def logi(self, op, rD, rA, imm):
- N, immr, imms = AA64_LOGI_ENC[imm]
- base = 0x92000000 if op == 'ANDI' else 0xB2000000 # ORI = orr
- return aa_logical_imm(base, rD, rA, N, immr, imms)
-
- def shifti(self, op, rD, rA, imm):
- if op == 'SHLI':
- return aa_ubfm(rD, rA, (-imm) & 63, 63 - imm)
- if op == 'SHRI':
- return aa_ubfm(rD, rA, imm, 63)
- if op == 'SARI':
- return aa_sbfm(rD, rA, imm, 63)
-
- def mov(self, rD, rA):
- if rA == 'sp':
- return aa_add_imm(rD, 'sp', 0, sub=False)
- # MOV xD, xA = ORR xD, xzr, xA
- d = NAT_AA64[rD]; a = NAT_AA64[rA]
- return le32(0xAA000000 | (a << 16) | (31 << 5) | d)
-
- def li(self, rD):
- # ldr wD, [pc+8] ; b +8 (caller emits 4 bytes of data next)
- d = NAT_AA64[rD]
- ldr_w_lit = 0x18000040 | d # LDR (literal) 32-bit, offset 8
- b_plus8 = 0x14000002 # B offset 8 (imm26 = 2 words = 8 bytes)
- return le32(ldr_w_lit) + le32(b_plus8)
-
- def la(self, rD):
- return self.li(rD)
-
- def mem(self, op, rT, rN, off):
- # Pick uimm12 (scaled, large range) when the offset is a
- # non-negative multiple of the access width; otherwise fall
- # back to the unscaled signed-imm9 form (covers negative
- # offsets and positive-but-misaligned ones like 7).
- BASES = {
- 'LD': (0xF9400000, 3, 0xF8400000),
- 'ST': (0xF9000000, 3, 0xF8000000),
- 'LB': (0x39400000, 0, 0x38400000),
- 'SB': (0x39000000, 0, 0x38000000),
- }
- uimm_base, size_log2, unscaled_base = BASES[op]
- scale = 1 << size_log2
- if off >= 0 and (off % scale) == 0:
- return aa_ldst_uimm12(uimm_base, rT, rN, off, size_log2)
- return aa_ldst_unscaled(unscaled_base, rT, rN, off)
-
- def b(self):
- return le32(0xD61F0000 | (NAT_AA64['br'] << 5)) # BR x17
-
- def condb(self, op, rA, rB):
- # cmp xA, xB = SUBS xzr, xA, xB (0xEB000000 base, rD=31).
- # Skip when NOT cond holds. BEQ→NE(1), BNE→EQ(0), BLT→GE(A).
- a = NAT_AA64[rA]; b_ = NAT_AA64[rB]
- cmp_ = le32(0xEB000000 | (b_ << 16) | (a << 5) | 31)
- cond = {'BEQ': 1, 'BNE': 0, 'BLT': 10}[op]
- bcond = le32(0x54000040 | cond)
- br = le32(0xD61F0000 | (NAT_AA64['br'] << 5))
- return cmp_ + bcond + br
-
- def call(self):
- return le32(0xD63F0000 | (NAT_AA64['br'] << 5)) # BLR x17
-
- def ret(self):
- return le32(0xD65F03C0) # RET (= br x30)
-
- def prologue(self, k):
- fb = prologue_frame_bytes(k)
- sub = aa_add_imm('sp', 'sp', fb, sub=True)
- str_lr = aa_ldst_uimm12(0xF9000000, 'lr', 'sp', 0, 3)
- return sub + str_lr
-
- def epilogue(self, k):
- fb = prologue_frame_bytes(k)
- ldr_lr = aa_ldst_uimm12(0xF9400000, 'lr', 'sp', 0, 3)
- add = aa_add_imm('sp', 'sp', fb, sub=False)
- return ldr_lr + add
-
- def tail(self, k):
- return self.epilogue(k) + self.b()
-
-
-class AMD64(Encoder):
- arch = 'amd64'
-
- def rrr(self, op, rD, rA, rB):
- if op == 'MUL':
- return amd_mov_rr(rD, rA) + amd_imul_rr(rD, rB)
- if op in ('DIV', 'REM'):
- # x86 idiv implicitly reads/writes rax (P1 r0) and rdx
- # (P1 r3). To keep DIV/REM clobber-free (only rD changes),
- # stash r0 into r11 and r3 into rcx — neither is a P1 reg —
- # then restore. If rA or rB alias r0/r3, read from the
- # saved copy since we've overwritten the originals.
- # Skip the final restore for whichever of r0/r3 *is* rD,
- # so rD keeps its newly computed value.
- seq = amd_mov_rr('r11', 'r0') # save r0 (rax)
- seq += amd_mov_rr('rcx', 'r3') # save r3 (rdx)
- src_a = 'r11' if rA == 'r0' else ('rcx' if rA == 'r3' else rA)
- seq += amd_mov_rr('r0', src_a) # rax = rA
- seq += amd_cqo() # rdx:rax = sign-ext rax
- src_b = 'r11' if rB == 'r0' else ('rcx' if rB == 'r3' else rB)
- seq += amd_idiv(src_b)
- seq += amd_mov_rr(rD, 'r0' if op == 'DIV' else 'r3')
- if rD != 'r3':
- seq += amd_mov_rr('r3', 'rcx')
- if rD != 'r0':
- seq += amd_mov_rr('r0', 'r11')
- return seq
- if op in ('SHL', 'SHR', 'SAR'):
- ext = {'SHL': 4, 'SHR': 5, 'SAR': 7}[op]
- seq = amd_mov_rr(rD, rA)
- seq += amd_mov_rr('rcx', rB)
- seq += amd_shift_cl(ext, rD)
- return seq
- # ADD/SUB/AND/OR/XOR: mov rD,rA ; op rD,rB
- seq = amd_mov_rr(rD, rA)
- seq += amd_alu_rr(AMD64_RRR_OPC[op], rD, rB)
- return seq
-
- def addi(self, rD, rA, imm):
- # mov rD,rA ; add rD,imm. Use imm8 form when it fits
- # ([-128, 127]); otherwise emit the imm32 form.
- seq = amd_mov_rr(rD, rA)
- if -128 <= imm <= 127:
- seq += amd_alu_ri8(0, rD, imm) # /0 = ADD
- else:
- seq += amd_alu_ri32(0, rD, imm)
- return seq
-
- def logi(self, op, rD, rA, imm):
- ext = {'ANDI': 4, 'ORI': 1}[op]
- seq = amd_mov_rr(rD, rA)
- seq += amd_alu_ri8(ext, rD, imm)
- return seq
-
- def shifti(self, op, rD, rA, imm):
- ext = {'SHLI': 4, 'SHRI': 5, 'SARI': 7}[op]
- seq = amd_mov_rr(rD, rA)
- seq += amd_shift_ri8(ext, rD, imm)
- return seq
-
- def mov(self, rD, rA):
- return amd_mov_rr(rD, rA)
-
- def li(self, rD):
- # mov <rD as r32>, imm32 — opcode B8+r (with REX.B if r8..r15)
- d = NAT_AMD64[rD]
- if d >= 8:
- return '41' + byte(0xB8 + (d & 7))
- return byte(0xB8 + d)
-
- def la(self, rD):
- return self.li(rD)
-
- def mem(self, op, rT, rN, off):
- if op == 'LD': return amd_mem_rm('8B', rT, rN, off)
- if op == 'ST': return amd_mem_rm('89', rT, rN, off)
- if op == 'LB': return amd_mov_rm_b(rT, rN, off, store=False)
- if op == 'SB': return amd_mov_rm_b(rT, rN, off, store=True)
-
- def b(self):
- return '41FFE3' # jmp r11
-
- def condb(self, op, rA, rB):
- a, b_ = NAT_AMD64[rA], NAT_AMD64[rB]
- # cmp rA, rB — opcode 39 /r with rA as r/m
- cmp_ = rex(1, b_ >> 3, 0, a >> 3) + '39' + modrm(3, b_, a)
- # jcc rel8 opcode, skip=3 (past jmp r11):
- # BEQ→JNE 75 03 ; BNE→JE 74 03 ; BLT→JGE 7D 03
- jop = {'BEQ': '75', 'BNE': '74', 'BLT': '7D'}[op]
- return cmp_ + jop + '03' + '41FFE3' # jmp r11
-
- def call(self):
- return '41FFD3' # call r11
-
- def ret(self):
- return 'C3'
-
- def prologue(self, k):
- # pop rcx ; sub rsp,fb ; push rcx. rcx is the retaddr-carry
- # scratch — caller-save, never a P1 reg. r11 (= 'br') is
- # off-limits because TAIL = EPILOGUE + `jmp r11`, and using
- # r11 here would clobber the LI_BR-loaded tail target.
- fb = prologue_frame_bytes(k)
- assert fb <= 127
- return '59' + '4883EC' + byte(fb) + '51'
-
- def epilogue(self, k):
- # Mirror of prologue: pop rcx ; add rsp,fb ; push rcx.
- fb = prologue_frame_bytes(k)
- assert fb <= 127
- return '59' + '4883C4' + byte(fb) + '51'
-
- def tail(self, k):
- return self.epilogue(k) + self.b()
-
-
-class RV64(Encoder):
- arch = 'riscv64'
-
- def rrr(self, op, rD, rA, rB):
- return rv_r(RV_RRR[op], rD, rA, rB)
-
- def addi(self, rD, rA, imm):
- return rv_i(0x00000013, rD, rA, imm)
-
- def logi(self, op, rD, rA, imm):
- base = {'ANDI': 0x00007013, 'ORI': 0x00006013}[op]
- return rv_i(base, rD, rA, imm)
-
- def shifti(self, op, rD, rA, imm):
- base = {'SHLI': 0x00001013, 'SHRI': 0x00005013, 'SARI': 0x40005013}[op]
- return rv_shift_imm(base, rD, rA, imm)
-
- def mov(self, rD, rA):
- return rv_i(0x00000013, rD, rA, 0) # addi rD, rA, 0
-
- def li(self, rD):
- # auipc rD,0 ; lwu rD,12(rD) ; jal x0,+8
- d = NAT_RV64[rD]
- auipc = 0x00000017 | (d << 7)
- lwu = 0x00006003 | (d << 7) | (d << 15) | (12 << 20)
- jal_p8 = 0x0080006F
- return le32(auipc) + le32(lwu) + le32(jal_p8)
-
- def la(self, rD):
- return self.li(rD)
-
- def mem(self, op, rT, rN, off):
- # funct3: LD=3, ST=3, LBU=4, SB=0. Opcodes: load=03, store=23.
- if op == 'LD': return rv_i(0x00003003, rT, rN, off)
- if op == 'ST': return rv_s(0x00003023, rT, rN, off)
- if op == 'LB': return rv_i(0x00004003, rT, rN, off) # LBU
- if op == 'SB': return rv_s(0x00000023, rT, rN, off)
-
- def b(self):
- return le32(0x00000067 | (NAT_RV64['br'] << 15)) # jalr x0, 0(t5)
-
- def condb(self, op, rA, rB):
- # B<inv> rA, rB, +8 ; jalr x0, 0(t5). funct3 picks the op:
- # BEQ→BNE(1), BNE→BEQ(0), BLT→BGE(5).
- a, b_ = NAT_RV64[rA], NAT_RV64[rB]
- funct3 = {'BEQ': 1, 'BNE': 0, 'BLT': 5}[op]
- insn = 0x00000063 | (funct3 << 12) | (a << 15) | (b_ << 20) | (8 << 7)
- jalr = 0x00000067 | (NAT_RV64['br'] << 15)
- return le32(insn) + le32(jalr)
-
- def call(self):
- return le32(0x000000E7 | (NAT_RV64['br'] << 15)) # jalr ra, 0(t5)
-
- def ret(self):
- return le32(0x00008067) # jalr x0, 0(ra)
-
- def prologue(self, k):
- fb = prologue_frame_bytes(k)
- sub = rv_i(0x00000013, 'sp', 'sp', -fb)
- sd = rv_s(0x00003023, 'ra', 'sp', 0)
- return sub + sd
-
- def epilogue(self, k):
- fb = prologue_frame_bytes(k)
- ld = rv_i(0x00003003, 'ra', 'sp', 0)
- add = rv_i(0x00000013, 'sp', 'sp', fb)
- return ld + add
-
- def tail(self, k):
- return self.epilogue(k) + self.b()
-
-
-ENCODERS = {'aarch64': AA64(), 'amd64': AMD64(), 'riscv64': RV64()}
-
-
-## ---------- Op dataclasses ----------------------------------------------
-## Thin wrappers: each row holds its DEFINE name + the data needed to
-## reconstruct the encoding. `encode(enc)` calls the matching method
-## on the arch's encoder.
-
-@dataclass
-class Op:
- name: str
- comment: str = ''
-
- def encode(self, enc: Encoder) -> str:
- raise NotImplementedError
-
-@dataclass
-class RRR(Op):
- op: str = ''
- rD: str = ''
- rA: str = ''
- rB: str = ''
- def encode(self, enc):
- return enc.rrr(self.op, self.rD, self.rA, self.rB)
-
-@dataclass
-class AddI(Op):
- rD: str = ''
- rA: str = ''
- imm: int = 0
- def encode(self, enc):
- return enc.addi(self.rD, self.rA, self.imm)
-
-@dataclass
-class LogI(Op):
- op: str = '' # ANDI / ORI
- rD: str = ''
- rA: str = ''
- imm: int = 0
- def encode(self, enc):
- return enc.logi(self.op, self.rD, self.rA, self.imm)
-
-@dataclass
-class ShiftI(Op):
- op: str = '' # SHLI / SHRI / SARI
- rD: str = ''
- rA: str = ''
- imm: int = 0
- def encode(self, enc):
- return enc.shifti(self.op, self.rD, self.rA, self.imm)
-
-@dataclass
-class Mov(Op):
- rD: str = ''
- rA: str = ''
- def encode(self, enc):
- return enc.mov(self.rD, self.rA)
-
-@dataclass
-class Li(Op):
- rD: str = ''
- def encode(self, enc):
- return enc.li(self.rD)
-
-@dataclass
-class La(Op):
- rD: str = ''
- def encode(self, enc):
- return enc.la(self.rD)
-
-@dataclass
-class Mem(Op):
- op: str = '' # LD / ST / LB / SB
- rT: str = ''
- rN: str = ''
- off: int = 0
- def encode(self, enc):
- return enc.mem(self.op, self.rT, self.rN, self.off)
-
-@dataclass
-class B(Op):
- def encode(self, enc):
- return enc.b()
-
-@dataclass
-class CondB(Op):
- op: str = '' # BEQ / BNE / BLT
- rA: str = ''
- rB: str = ''
- def encode(self, enc):
- return enc.condb(self.op, self.rA, self.rB)
-
-@dataclass
-class Literal(Op):
- hex_by_arch: Optional[dict] = None
- def encode(self, enc):
- return enc.literal(self.hex_by_arch)
-
-@dataclass
-class Prologue(Op):
- k: int = 1
- def encode(self, enc):
- return enc.prologue(self.k)
-
-@dataclass
-class Epilogue(Op):
- k: int = 1
- def encode(self, enc):
- return enc.epilogue(self.k)
-
-@dataclass
-class Tail(Op):
- k: int = 1
- def encode(self, enc):
- return enc.tail(self.k)
-
-@dataclass
-class Call(Op):
- def encode(self, enc):
- return enc.call()
-
-@dataclass
-class Ret(Op):
- def encode(self, enc):
- return enc.ret()
-
-
-## ---------- SYSCALL pre-encoded sequences -------------------------------
-## The one-shot syscall wrapper. Shuffles P1's r0=num, r1–r6=args into
-## each arch's native syscall ABI and clobbers only r0 on return.
-## Encoded by hand (per P1.md §"Syscall conventions").
-
-SYSCALL_HEX = {
- 'aarch64': (
- # r4/r5 now live in callee-saved natives (x26/x27), so the
- # kernel preserves them — no save/restore needed. Only r1/r2/r3
- # (in caller-saved x1/x2/x3) must be stashed across the shuffle.
- '' .join([
- le32(0xAA0003E8), # mov x8, x0 (syscall number)
- le32(0xAA0103F5), # mov x21, x1 (save r1)
- le32(0xAA0203F6), # mov x22, x2 (save r2)
- le32(0xAA0303F7), # mov x23, x3 (save r3)
- le32(0xAA1503E0), # mov x0, x21 (arg1 = r1)
- le32(0xAA1603E1), # mov x1, x22 (arg2 = r2)
- le32(0xAA1703E2), # mov x2, x23 (arg3 = r3)
- le32(0xAA1A03E3), # mov x3, x26 (arg4 = r4)
- le32(0xAA1B03E4), # mov x4, x27 (arg5 = r5)
- le32(0xAA1303E5), # mov x5, x19 (arg6 = r6)
- le32(0xD4000001), # svc #0
- le32(0xAA1503E1), # mov x1, x21 (restore r1)
- le32(0xAA1603E2), # mov x2, x22
- le32(0xAA1703E3), # mov x3, x23
- ])
- ),
- # r4=r13, r5=r14 are callee-saved natively, but syscall wants args
- # 4/5 in r10/r8. r6=rbx, but arg6 lives in r9. Three shuffle moves,
- # then syscall. The kernel preserves rdi/rsi/rdx/r12–r15/rbx, so no
- # P1 reg is clobbered beyond r0 (syscall return).
- 'amd64': '4D89EA' + '4D89F0' + '4989D9' + '0F05',
- 'riscv64': (
- # Same story as aarch64: r4/r5 in callee-saved s4/s5 (=x20/x21),
- # so we only save/restore a1/a2/a3. Scratch slots: s3, s6, s7.
- ''.join([
- le32(0x00050893), # mv a7, a0 (syscall number)
- le32(0x00058993), # mv s3, a1 (save r1)
- le32(0x00060B13), # mv s6, a2 (save r2)
- le32(0x00068B93), # mv s7, a3 (save r3)
- le32(0x00098513), # mv a0, s3 (arg1 = r1)
- le32(0x000B0593), # mv a1, s6 (arg2 = r2)
- le32(0x000B8613), # mv a2, s7 (arg3 = r3)
- le32(0x000A0693), # mv a3, s4 (arg4 = r4)
- le32(0x000A8713), # mv a4, s5 (arg5 = r5)
- le32(0x00048793), # mv a5, s1 (arg6 = r6)
- le32(0x00000073), # ecall
- le32(0x00098593), # mv a1, s3 (restore r1)
- le32(0x000B0613), # mv a2, s6
- le32(0x000B8693), # mv a3, s7
- ])
- ),
-}
-
-## Syscall numbers (little-endian 32-bit for LI operand).
-## aarch64 and riscv64 share the asm-generic table; amd64 has its own.
-##
-## Portability notes — every entry below is a syscall that exists on all
-## three with the same semantics under the uniform P1 SYSCALL convention
-## (r0 = num, r1-r6 = args):
-## - `fork` is amd64-only; `wait4` is asm-generic 32-bit compat only.
-## Use `clone(SIGCHLD)` and `waitid` instead.
-## - `open` is amd64-only (removed from asm-generic). Use `openat` with
-## dirfd = AT_FDCWD (-100) as arg1.
-## - `clone` arg order differs: amd64 is (flags, stack, ptid, ctid, tls);
-## aarch64/riscv64 are (flags, stack, ptid, tls, ctid). Benign when
-## ptid/ctid/tls are all zero (the fork-equivalent case).
-SYS_NUM = {
- 'aarch64': {'SYS_WRITE': 64, 'SYS_EXIT': 93, 'SYS_READ': 63, 'SYS_CLOSE': 57,
- 'SYS_OPENAT': 56,
- 'SYS_CLONE': 220, 'SYS_EXECVE': 221, 'SYS_WAITID': 95},
- 'amd64': {'SYS_WRITE': 1, 'SYS_EXIT': 60, 'SYS_READ': 0, 'SYS_CLOSE': 3,
- 'SYS_OPENAT':257,
- 'SYS_CLONE': 56, 'SYS_EXECVE': 59, 'SYS_WAITID':247},
- 'riscv64': {'SYS_WRITE': 64, 'SYS_EXIT': 93, 'SYS_READ': 63, 'SYS_CLOSE': 57,
- 'SYS_OPENAT': 56,
- 'SYS_CLONE': 220, 'SYS_EXECVE': 221, 'SYS_WAITID': 95},
-}
-
-
-## ---------- Canonical imm/offset/shamt sets -----------------------------
-## Enumerated instead of sigil-passed: M1's DEFINE substitutes hex
-## bytes verbatim, so every distinct imm value needs its own DEFINE.
-## These cover every value used across hello/demo/lisp/kaem-minimal
-## plus a little headroom. Extend when a new value appears in P1 src.
-
-## ADDI imms. NEG48/48 handle the ASCII '0' bias; the rest cover tag
-## stripping and loop counters. Full reg product × this set = 8²×N.
-ADDI_IMMS = (-48, -8, -7, -6, -5, -4, -3, -2, -1,
- 1, 2, 3, 4, 5, 6, 7, 8, 48)
-
-## Shift amounts (for SHLI/SHRI/SARI). 32/52 implement low-N-bit masks
-## (length field extraction; 4096-slot symbol-table index); the small
-## values scale-by-N for byte offsets and fixnum encode/decode.
-SHIFT_IMMS = (1, 2, 3, 5, 16, 32, 52)
-
-## ANDI/ORI imms. Every entry must appear in AA64_LOGI_ENC.
-LOGI_IMMS = (1, 2, 3, 4, 6, 7, 8)
-
-## Memory offsets for LD/ST/LB/SB. 0/8/16/24/32 cover slot offsets in
-## N-slot frames and common struct fields; 7 is the NUL terminator
-## position inside an 8-byte zero-padded slot; -8 reaches one slot
-## below the current base.
-MEM_OFFS = (-8, 0, 7, 8, 16, 24, 32)
-
-CONDB_OPS = ('BEQ', 'BNE', 'BLT')
-SHIFT_OPS = ('SHLI', 'SHRI', 'SARI')
-LOGI_OPS = ('ANDI', 'ORI')
-MEM_OPS = ('LD', 'ST', 'LB', 'SB')
-
-
-## Curated RRR triples. The full cube is 11 ops × 8³ regs = 5632
-## entries per arch — >99% would be dead weight. Each tuple below
-## is one actually used by hello/demo/lisp/kaem-minimal. Lint
-## catches missing triples on assembly; add a line here and
-## regenerate.
-RRR_TABLE = (
- # demo/lisp step-1 arith cube
- ('ADD','r1','r1','r2'), ('ADD','r1','r1','r4'),
- ('ADD','r2','r2','r6'), ('ADD','r2','r3','r1'),
- ('SUB','r1','r1','r2'), ('SUB','r2','r2','r6'),
- ('AND','r1','r1','r5'),
- ('OR', 'r1','r1','r2'),
- ('XOR','r1','r1','r2'),
- ('MUL','r1','r1','r2'),
- ('DIV','r1','r1','r2'),
- ('REM','r1','r1','r5'),
- ('SHL','r1','r1','r2'),
- ('SHR','r1','r1','r2'),
- ('SAR','r4','r4','r2'),
- # alloc / pointer arithmetic
- ('ADD','r2','r0','r1'),
- ('ADD','r0','r0','r3'),
- ('ADD','r2','r2','r0'),
- ('ADD','r2','r2','r1'),
- ('SUB','r3','r3','r0'),
- # reader / display index+offset fold
- ('ADD','r6','r1','r2'),
- ('ADD','r6','r6','r0'),
- ('ADD','r7','r1','r2'),
- ('SUB','r2','r1','r6'),
- ('SUB','r3','r1','r6'),
- ('REM','r1','r1','r2'),
- # kaem-minimal bump-pointer + accumulator updates
- ('ADD','r1','r1','r0'),
- ('ADD','r5','r5','r0'),
- ('ADD','r7','r7','r0'),
- ('SUB','r3','r3','r2'),
- ('SUB','r6','r6','r0'),
-)
-
-
-## ---------- Row assembly ------------------------------------------------
-
-HEADER = """## p1_{arch}.M1 — GENERATED by p1_gen.py. Do not edit by hand.
-##
-## Shared op-table lives in p1_gen.py; each arch's encoder lowers
-## (op, register-tuple, imm) rows into native bytes. See P1.md for the
-## ISA spec and register mapping.
-"""
-
-@dataclass
-class Banner:
- text: str
-
-
-def _imm_suf(imm):
- return f'NEG{-imm}' if imm < 0 else f'{imm}'
-
-
-def rows():
- R = []
-
- # --- LI / LA — wide literal and address loads ---
- R.append(Banner('LI / LA — load 4-byte zero-extended literal or label addr'))
- for rd in P1_REGS:
- R.append(Li(name=f'LI_{rd.upper()}', rD=rd))
- # LI_BR loads into the hidden branch-target scratch (x17/r11/t5).
- # Every branch/call site is `LI_BR &target ; P1_<BR>`. The scratch
- # is *not* a P1 reg.
- R.append(Li(name='LI_BR', rD='br'))
- for rd in P1_REGS:
- R.append(La(name=f'LA_{rd.upper()}', rD=rd))
-
- # --- MOV — register-to-register + MOV rD, sp ---
- R.append(Banner('MOV — full register product (src may be sp)'))
- for rd in P1_REGS:
- for ra in P1_REGS:
- R.append(Mov(name=f'MOV_{rd.upper()}_{ra.upper()}', rD=rd, rA=ra))
- R.append(Mov(name=f'MOV_{rd.upper()}_SP', rD=rd, rA='sp'))
-
- # --- RRR — curated triples (full cube would be 5.6k/arch) ---
- R.append(Banner('RRR — curated triples (explicit table in p1_gen.py)'))
- for op, d, a, b in RRR_TABLE:
- R.append(RRR(name=f'{op}_{d.upper()}_{a.upper()}_{b.upper()}',
- op=op, rD=d, rA=a, rB=b))
-
- # --- Immediate arith: ADDI × full reg product × imm set ---
- R.append(Banner('ADDI — full register product × ADDI_IMMS'))
- for d, a, imm in product(P1_REGS, P1_REGS, ADDI_IMMS):
- R.append(AddI(name=f'ADDI_{d.upper()}_{a.upper()}_{_imm_suf(imm)}',
- rD=d, rA=a, imm=imm))
-
- # --- ANDI / ORI × full reg product × LOGI_IMMS ---
- R.append(Banner('ANDI / ORI — full register product × LOGI_IMMS'))
- for op, d, a, imm in product(LOGI_OPS, P1_REGS, P1_REGS, LOGI_IMMS):
- R.append(LogI(name=f'{op}_{d.upper()}_{a.upper()}_{imm}',
- op=op, rD=d, rA=a, imm=imm))
-
- # --- SHLI / SHRI / SARI × full reg product × SHIFT_IMMS ---
- R.append(Banner('SHLI / SHRI / SARI — full register product × SHIFT_IMMS'))
- for op, d, a, imm in product(SHIFT_OPS, P1_REGS, P1_REGS, SHIFT_IMMS):
- R.append(ShiftI(name=f'{op}_{d.upper()}_{a.upper()}_{imm}',
- op=op, rD=d, rA=a, imm=imm))
-
- # --- Memory: LD/ST/LB/SB × full reg product × MEM_OFFS ---
- R.append(Banner('LD / ST / LB / SB — full register product × MEM_OFFS'))
- for op, rt, rn, off in product(MEM_OPS, P1_REGS, P1_REGS, MEM_OFFS):
- R.append(Mem(name=f'{op}_{rt.upper()}_{rn.upper()}_{_imm_suf(off)}',
- op=op, rT=rt, rN=rn, off=off))
-
- # --- Branches: BEQ/BNE/BLT × full reg product + unconditional B ---
- R.append(Banner('Branches — LI_BR-indirect pattern'))
- R.append(B(name='B'))
- for op, a, b in product(CONDB_OPS, P1_REGS, P1_REGS):
- R.append(CondB(name=f'{op}_{a.upper()}_{b.upper()}',
- op=op, rA=a, rB=b))
-
- # --- Control: CALL / RET / PROLOGUE / EPILOGUE / TAIL (Nk = 1..4) ---
- R.append(Banner('Control: CALL/RET + single-slot and N-slot PROLOGUE/EPILOGUE/TAIL'))
- R.append(Prologue(name='PROLOGUE', k=1))
- R.append(Epilogue(name='EPILOGUE', k=1))
- R.append(Ret(name='RET'))
- R.append(Call(name='CALL'))
- R.append(Tail(name='TAIL', k=1))
- for k in (2, 3, 4):
- R.append(Prologue(name=f'PROLOGUE_N{k}', k=k))
- R.append(Epilogue(name=f'EPILOGUE_N{k}', k=k))
- R.append(Tail(name=f'TAIL_N{k}', k=k))
-
- # --- SYSCALL — pre-encoded per-arch wrapper ---
- R.append(Banner('SYSCALL — uniform "clobbers r0 only" across arches'))
- R.append(Literal(name='SYSCALL', hex_by_arch=SYSCALL_HEX))
-
- # --- Syscall numbers (LE-32 immediates) ---
- R.append(Banner('Linux syscall numbers (per-arch table). LE-32 operands for LI.'))
- for name in ('SYS_WRITE', 'SYS_EXIT', 'SYS_READ', 'SYS_CLOSE', 'SYS_OPENAT',
- 'SYS_CLONE', 'SYS_EXECVE', 'SYS_WAITID'):
- R.append(Literal(name=name,
- hex_by_arch={a: le32(SYS_NUM[a][name]) for a in ARCHES}))
-
- return R
-
-
-## ---------- File emission -----------------------------------------------
-
-def emit(arch: str) -> str:
- enc = ENCODERS[arch]
- out = [HEADER.format(arch=arch).rstrip(), '']
- seen = set()
- for row in rows():
- if isinstance(row, Banner):
- out.append('')
- out.append('## ---- ' + row.text + ' ' + '-' * max(0, 60 - len(row.text)))
- continue
- name = 'P1_' + row.name if not row.name.startswith('SYS_') else row.name
- if name in seen:
- raise RuntimeError(f'duplicate DEFINE: {name}')
- seen.add(name)
- out.append(f'DEFINE {name} {row.encode(enc)}')
- out.append('')
- return '\n'.join(out)
-
-
-def main():
- here = os.path.dirname(os.path.abspath(__file__))
- check = '--check' in sys.argv
-
- had_diff = False
- for arch in ARCHES:
- path = os.path.join(here, f'p1_{arch}.M1')
- content = emit(arch)
- if check:
- with open(path) as f:
- existing = f.read()
- if existing != content:
- sys.stderr.write(f'DIFF: {path}\n')
- had_diff = True
- else:
- with open(path, 'w') as f:
- f.write(content)
- print(f'wrote {path} ({len(content)} bytes)')
-
- if check and had_diff:
- sys.exit(1)
-
-
-if __name__ == '__main__':
- main()
diff --git a/bootstrap.sh b/bootstrap.sh
@@ -9,6 +9,10 @@
# bytes, hand-assembled, shipped by stage0-posix). Nothing above M0 is built,
# which is the whole point — no C compiler is involved, not even cc_<arch>.
#
+# Inputs are read from build/upstream/, which populate-upstream.sh mirrors
+# from live-bootstrap's stage0-posix on the host. The container mounts only
+# curdir, so everything bootstrap.sh needs must already live inside it.
+#
# Phase map (stage0-posix mescc-tools-{seed,mini}-kaem.kaem phases 0-3):
# 0) hex0-seed + hex0_<A>.hex0 -> hex0
# 1) hex0 + hex1_<A>.hex0 -> hex1
@@ -34,9 +38,8 @@ case "$ARCH" in
*) echo "bootstrap.sh: unsupported arch '$ARCH'" >&2 ; exit 1 ;;
esac
-S=/work/live-bootstrap/seed/stage0-posix
+S=build/upstream
mkdir -p "$OUT"
-cd "$S"
# qemu-user amd64 workaround: the shipped hex0-seed and the hex0 it produces
# both have a program header with p_flags=0x01 (PF_X only, no PF_R). A native
@@ -47,7 +50,7 @@ cd "$S"
#
# This only affects foreign-arch builds on non-amd64 hosts; on a native amd64
# host the patch is a no-op (binary would load fine either way).
-SEED=./bootstrap-seeds/POSIX/"$A"/hex0-seed
+SEED="$S"/bootstrap-seeds/POSIX/"$A"/hex0-seed
if [ "$ARCH" = amd64 ]; then
cp "$SEED" "$OUT"/hex0-seed
printf '\5' | dd of="$OUT"/hex0-seed bs=1 seek=68 count=1 conv=notrunc status=none
@@ -55,13 +58,13 @@ if [ "$ARCH" = amd64 ]; then
SEED="$OUT"/hex0-seed
fi
-"$SEED" "$A"/hex0_"$A".hex0 "$OUT"/hex0
+"$SEED" "$S"/"$A"/hex0_"$A".hex0 "$OUT"/hex0
if [ "$ARCH" = amd64 ]; then
printf '\5' | dd of="$OUT"/hex0 bs=1 seek=68 count=1 conv=notrunc status=none
fi
-"$OUT"/hex0 "$A"/hex1_"$A".hex0 "$OUT"/hex1
-"$OUT"/hex1 "$A"/hex2_"$A".hex1 "$OUT"/hex2-0
-"$OUT"/"$CATM_ASM" "$A"/"$CATM_SRC" "$OUT"/catm
-"$OUT"/catm "$OUT"/M0.hex2 "$A"/ELF-"$ARCH".hex2 "$A"/M0_"$A".hex2
+"$OUT"/hex0 "$S"/"$A"/hex1_"$A".hex0 "$OUT"/hex1
+"$OUT"/hex1 "$S"/"$A"/hex2_"$A".hex1 "$OUT"/hex2-0
+"$OUT"/"$CATM_ASM" "$S"/"$A"/"$CATM_SRC" "$OUT"/catm
+"$OUT"/catm "$OUT"/M0.hex2 "$S"/"$A"/ELF-"$ARCH".hex2 "$S"/"$A"/M0_"$A".hex2
"$OUT"/hex2-0 "$OUT"/M0.hex2 "$OUT"/M0
diff --git a/populate-upstream.sh b/populate-upstream.sh
@@ -0,0 +1,48 @@
+#!/bin/sh
+# Copy the files bootstrap.sh needs from live-bootstrap's stage0-posix into
+# build/upstream/, mirroring the upstream directory layout. Runs on the host:
+# the podman invocations in the Makefile only mount curdir, so anything
+# bootstrap.sh needs has to land inside curdir first.
+#
+# Inputs per arch (A in AArch64|AMD64|riscv64):
+# bootstrap-seeds/POSIX/$A/hex0-seed
+# $A/hex0_$A.hex0
+# $A/hex1_$A.hex0
+# $A/hex2_$A.hex1
+# $A/catm_$A.(hex1|hex2) extension differs across arches
+# $A/M0_$A.hex2
+# $A/ELF-<arch>.hex2 used by both bootstrap.sh (M0.hex2 link) and
+# the Makefile's final program link
+#
+# Usage: populate-upstream.sh [UPSTREAM]
+# UPSTREAM: path to live-bootstrap checkout (default: ../live-bootstrap)
+set -eu
+
+UPSTREAM=${1:-../live-bootstrap}
+S="$UPSTREAM/seed/stage0-posix"
+OUT=build/upstream
+
+if [ ! -d "$S" ]; then
+ echo "populate-upstream.sh: expected '$S' to exist" >&2
+ exit 1
+fi
+
+for A in AArch64 AMD64 riscv64; do
+ case "$A" in
+ AArch64) arch=aarch64 ; CATM=catm_AArch64.hex1 ;;
+ AMD64) arch=amd64 ; CATM=catm_AMD64.hex2 ;;
+ riscv64) arch=riscv64 ; CATM=catm_riscv64.hex2 ;;
+ esac
+
+ mkdir -p "$OUT/bootstrap-seeds/POSIX/$A" "$OUT/$A"
+
+ cp "$S/bootstrap-seeds/POSIX/$A/hex0-seed" "$OUT/bootstrap-seeds/POSIX/$A/"
+ cp "$S/$A/hex0_$A.hex0" "$OUT/$A/"
+ cp "$S/$A/hex1_$A.hex0" "$OUT/$A/"
+ cp "$S/$A/hex2_$A.hex1" "$OUT/$A/"
+ cp "$S/$A/$CATM" "$OUT/$A/"
+ cp "$S/$A/M0_$A.hex2" "$OUT/$A/"
+ cp "$S/$A/ELF-$arch.hex2" "$OUT/$A/"
+done
+
+echo "populate-upstream: copied into $OUT from $UPSTREAM"
diff --git a/src/p1_gen.py b/src/p1_gen.py
@@ -0,0 +1,1089 @@
+#!/usr/bin/env python3
+"""p1_gen.py — generate p1_<arch>.M1 from a per-arch encoder table.
+
+Single source of truth for the P1 DEFINE tables across all three target
+arches. Running this script writes <build>/aarch64/p1_aarch64.M1 and the
+amd64/riscv64 siblings (default <build> = "build").
+
+Structure:
+ * Low-level native encoders (amd_*, aa_*, rv_*) — one bank of
+ helpers per arch.
+ * Encoder classes AA64/AMD64/RV64 (subclasses of Encoder): one
+ method per P1 op category, lowering (op, reg-tuple, imm) into
+ native hex. Each arch's encoder is a coherent bundle — adding a
+ new op means one new method on each of the three.
+ * Op dataclasses — thin rows holding the DEFINE's name + data.
+ Op.encode(enc) dispatches into enc.<op-method>() with the Op's
+ fields unpacked. No per-arch branching lives in Op classes.
+ * rows() — builds the output list. Non-RRR ops are emitted as the
+ full register product × a curated imm/offset/shamt set. RRR
+ keeps an explicit table (the full 8³ cube is 5.6k entries per
+ arch, >99% dead weight). Adding a new RRR triple or a new imm
+ value is a one-line edit to rows(); a new register combination
+ for any other op needs no edit at all.
+ * emit(arch) / main — iterate rows, ask the arch's encoder to
+ lower each, write out the defs file.
+
+Running:
+    $ python3 src/p1_gen.py [build-root]          # rewrite all three files
+    $ python3 src/p1_gen.py --check [build-root]  # diff against current files
+"""
+
+import os
+import sys
+from dataclasses import dataclass
+from itertools import product
+from typing import Optional
+
+ARCHES = ('aarch64', 'amd64', 'riscv64')
+
+## P1 GPRs (the 8 caller/callee-split registers exposed to P1 source).
+P1_REGS = ('r0', 'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7')
+
+## ---------- Register mappings --------------------------------------------
+## P1 register name → native encoding number. The native numbers are what
+## the per-arch encoders insert into instruction fields; the human-facing
+## names (rax, x1, a2, …) never appear in this file.
+
+## 4:4 caller/callee-saved split. r0–r3 caller (native argregs); r4–r7
+## callee (native callee-saved). `br` is the hidden branch-target scratch
+## (not a P1 reg) — picked so every op's expansion clobbers only what its
+## name declares.
+NAT_AA64 = {'r0': 0, 'r1': 1, 'r2': 2, 'r3': 3,
+ 'r4': 26, 'r5': 27, 'r6': 19, 'r7': 20,
+ 'br': 17, # x17 (IP1, caller-saved linker scratch)
+ 'sp': 31, 'xzr': 31, 'lr': 30,
+ 'x21': 21, 'x22': 22, 'x23': 23, 'x8': 8}
+
+## amd64 ModRM.reg/rm + REX.R/B bit: native regnums 0..15 with r8..r15
+## setting the REX bit. We store the 4-bit native number directly.
+NAT_AMD64 = {'r0': 0, # rax
+ 'r1': 7, # rdi
+ 'r2': 6, # rsi
+ 'r3': 2, # rdx
+ 'r4': 13, # r13 (callee-saved)
+ 'r5': 14, # r14 (callee-saved)
+ 'r6': 3, # rbx
+ 'r7': 12, # r12
+ 'br': 11, # r11 — branch/call target scratch + DIV/REM r0 save
+ 'sp': 4, # rsp
+ 'rcx': 1, # shift-count scratch + DIV/REM rdx save (not a P1 reg)
+ 'r10': 10, # syscall arg4 slot (not a P1 reg)
+ 'r8': 8, # syscall arg5 slot (not a P1 reg)
+ 'r9': 9, # syscall arg6 slot (not a P1 reg)
+ 'r11': 11, # alias for br (some expansions spell it r11 directly)
+ }
+
+NAT_RV64 = {'r0': 10, 'r1': 11, 'r2': 12, 'r3': 13,
+ 'r4': 20, 'r5': 21, 'r6': 9, 'r7': 18,
+ 'br': 30, # t5 (caller-saved temp)
+ 'sp': 2, 'ra': 1, 'zero': 0, 'a7': 17,
+ 's3': 19, 's6': 22, 's7': 23}
+
+
+## ---------- Low-level encoding helpers -----------------------------------
+
+def le32(n: int) -> str:
+ return (n & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
+
+def byte(n: int) -> str:
+ return f'{n & 0xFF:02X}'
+
+
+## ---------- amd64 primitive encoders ------------------------------------
+## amd64 is variable-length. Helpers below emit specific instruction
+## shapes used by the P1 expansions. REX prefix bits: W=64b, R=ModRM.reg
+## high, B=ModRM.rm high, X=SIB.index high (unused here).
+
+def rex(w, r, x, b):
+ v = 0x40 | (w << 3) | (r << 2) | (x << 1) | b
+ return byte(v)
+
+def modrm(mod, reg, rm):
+ return byte((mod << 6) | ((reg & 7) << 3) | (rm & 7))
+
+def amd_mov_rr(dst, src):
+ """mov dst, src — REX.W + 89 /r (MOV r/m64, r64)."""
+ d, s = NAT_AMD64[dst], NAT_AMD64[src]
+ return rex(1, s >> 3, 0, d >> 3) + '89' + modrm(3, s, d)
+
+def amd_alu_rr(op, dst, src):
+ """op dst, src — 2-operand ALU. op is the opcode byte (01 add,
+ 29 sub, 21 and, 09 or, 31 xor)."""
+ d, s = NAT_AMD64[dst], NAT_AMD64[src]
+ return rex(1, s >> 3, 0, d >> 3) + op + modrm(3, s, d)
+
+def amd_alu_ri8(ext, dst, imm):
+ """op dst, imm8 (sign-extended). Opcode 83 /ext ib."""
+ d = NAT_AMD64[dst]
+ return rex(1, 0, 0, d >> 3) + '83' + modrm(3, ext, d) + byte(imm)
+
+def amd_alu_ri32(ext, dst, imm):
+ """op dst, imm32 (sign-extended). Opcode 81 /ext id. Used when
+ an immediate doesn't fit in the imm8 form (e.g., ADDI with
+ values outside [-128, 127])."""
+ d = NAT_AMD64[dst]
+ imm_le = (imm & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
+ return rex(1, 0, 0, d >> 3) + '81' + modrm(3, ext, d) + imm_le
+
+def amd_shift_ri8(ext, dst, imm):
+ """shl/shr/sar dst, imm8. Opcode C1 /ext ib."""
+ d = NAT_AMD64[dst]
+ return rex(1, 0, 0, d >> 3) + 'C1' + modrm(3, ext, d) + byte(imm)
+
+def amd_shift_cl(ext, dst):
+ """shl/shr/sar dst, cl. Opcode D3 /ext."""
+ d = NAT_AMD64[dst]
+ return rex(1, 0, 0, d >> 3) + 'D3' + modrm(3, ext, d)
+
+def amd_imul_rr(dst, src):
+ """imul dst, src — 0F AF /r."""
+ d, s = NAT_AMD64[dst], NAT_AMD64[src]
+ return rex(1, d >> 3, 0, s >> 3) + '0FAF' + modrm(3, d, s)
+
+def amd_idiv(src):
+ """idiv src — F7 /7 (signed div of rdx:rax by src)."""
+ s = NAT_AMD64[src]
+ return rex(1, 0, 0, s >> 3) + 'F7' + modrm(3, 7, s)
+
+def amd_cqo():
+ """cqo — sign-extend rax into rdx:rax. 48 99."""
+ return '4899'
+
+def amd_mem_rm(opcode, reg, base, disp):
+    """[base+disp] <-> reg, for MOV r,r/m or MOV r/m,r (opcode=89 store, 8B load).
+    disp is signed int; encodes as disp8 if in range, else disp32."""
+    r, b = NAT_AMD64[reg], NAT_AMD64[base]
+    prefix = rex(1, r >> 3, 0, b >> 3) + opcode
+    if -128 <= disp <= 127:
+        mod = 1
+        d = byte(disp)
+    elif b == 4:  # same as the else arm; kept to flag the rsp case (SIB appended at return)
+        mod = 2
+        d = (disp & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
+    else:
+        mod = 2
+        d = (disp & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
+    # rsp base needs a SIB byte: rm=4 always means "SIB follows" (RIP-relative is mod=0,rm=5).
+    if b == 4:
+        return prefix + modrm(mod, r, 4) + '24' + d
+    return prefix + modrm(mod, r, b) + d
+
+def amd_mov_rm_b(reg, base, disp, store):
+    """Byte load/store. 88 /r (store), 0F B6 /r (movzx load)."""
+    r, b = NAT_AMD64[reg], NAT_AMD64[base]
+    if -128 <= disp <= 127:
+        mod = 1
+        d = byte(disp)
+    else:
+        mod = 2
+        d = (disp & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
+    if store:
+        # MOV r/m8, r8 — 88 /r. REX emitted so reg 4-7 mean spl/bpl/sil/dil (W ignored by byte ops).
+        prefix = rex(1, r >> 3, 0, b >> 3) + '88'
+        sib = '24' if b == 4 else ''
+        rmv = 4 if b == 4 else b
+        return prefix + modrm(mod, r, rmv) + sib + d
+    else:
+        # MOVZX r64, r/m8 — REX.W 0F B6 /r (zero-extends the byte into the full 64-bit reg).
+        prefix = rex(1, r >> 3, 0, b >> 3) + '0FB6'
+        sib = '24' if b == 4 else ''
+        rmv = 4 if b == 4 else b
+        return prefix + modrm(mod, r, rmv) + sib + d
+
+
+## ---------- aarch64 primitive encoders ----------------------------------
+## aarch64 is fixed 4-byte insns. Helpers return the 4 bytes LE-encoded.
+
+def aa_rrr(base, rD, rA, rB):
+ d, a, b = NAT_AA64[rD], NAT_AA64[rA], NAT_AA64[rB]
+ return le32(base | (b << 16) | (a << 5) | d)
+
+def aa_add_imm(rD, rA, imm12, sub=False):
+ """ADD/SUB (immediate, shift=0). imm12 unsigned 0..4095."""
+ d, a = NAT_AA64[rD], NAT_AA64[rA]
+ base = 0xD1000000 if sub else 0x91000000
+ return le32(base | ((imm12 & 0xFFF) << 10) | (a << 5) | d)
+
+def aa_logical_imm(base, rD, rA, N, immr, imms):
+ d, a = NAT_AA64[rD], NAT_AA64[rA]
+ return le32(base | (N << 22) | (immr << 16) | (imms << 10) | (a << 5) | d)
+
+def aa_ubfm(rD, rA, immr, imms):
+ """UBFM (N=1 for sf=64)."""
+ d, a = NAT_AA64[rD], NAT_AA64[rA]
+ return le32(0xD3400000 | (immr << 16) | (imms << 10) | (a << 5) | d)
+
+def aa_sbfm(rD, rA, immr, imms):
+ """SBFM (N=1 for sf=64)."""
+ d, a = NAT_AA64[rD], NAT_AA64[rA]
+ return le32(0x93400000 | (immr << 16) | (imms << 10) | (a << 5) | d)
+
+def aa_ldst_uimm12(base, rT, rN, off_bytes, size_log2):
+ """LDR/STR (unsigned offset). off_bytes must be a multiple of
+ 2^size_log2 and non-negative. imm12 = off_bytes >> size_log2."""
+ assert off_bytes >= 0 and (off_bytes % (1 << size_log2)) == 0
+ imm12 = off_bytes >> size_log2
+ assert 0 <= imm12 < 4096
+ t, n = NAT_AA64[rT], NAT_AA64[rN]
+ return le32(base | (imm12 << 10) | (n << 5) | t)
+
+def aa_ldst_unscaled(base, rT, rN, off):
+ """LDUR/STUR (unscaled, signed imm9). Handles arbitrary small
+ offsets — negative, or positive-but-not-a-multiple-of-the-access-
+ size (e.g. LD at offset 7). imm9 range is [-256, 255]."""
+ assert -256 <= off <= 255
+ imm9 = off & 0x1FF
+ t, n = NAT_AA64[rT], NAT_AA64[rN]
+ return le32(base | (imm9 << 12) | (n << 5) | t)
+
+
+## ---------- riscv64 primitive encoders ----------------------------------
+
+def rv_r(base, rD, rA, rB):
+ d, a, b = NAT_RV64[rD], NAT_RV64[rA], NAT_RV64[rB]
+ return le32(base | (b << 20) | (a << 15) | (d << 7))
+
+def rv_i(base, rD, rA, imm12):
+ """I-type: imm12[11:0], rs1, funct3, rd, opcode. imm12 is a signed
+ int that gets masked to 12 bits."""
+ d, a = NAT_RV64[rD], NAT_RV64[rA]
+ return le32(base | ((imm12 & 0xFFF) << 20) | (a << 15) | (d << 7))
+
+def rv_s(base, rS, rA, imm12):
+ """S-type store: imm12[11:5] rs2 rs1 funct3 imm12[4:0] opcode."""
+ s, a = NAT_RV64[rS], NAT_RV64[rA]
+ hi = (imm12 >> 5) & 0x7F
+ lo = imm12 & 0x1F
+ return le32(base | (hi << 25) | (s << 20) | (a << 15) | (lo << 7))
+
+def rv_shift_imm(base, rD, rA, shamt):
+ """Shift-imm: base already has funct7 set; shamt in [0,63]."""
+ d, a = NAT_RV64[rD], NAT_RV64[rA]
+ return le32(base | ((shamt & 0x3F) << 20) | (a << 15) | (d << 7))
+
+
+## ---------- Per-arch op base tables -------------------------------------
+
+AA64_RRR_BASE = {
+ 'ADD': 0x8B000000,
+ 'SUB': 0xCB000000,
+ 'AND': 0x8A000000,
+ 'OR': 0xAA000000,
+ 'XOR': 0xCA000000,
+ 'SHL': 0x9AC02000,
+ 'SHR': 0x9AC02400,
+ 'SAR': 0x9AC02800,
+ 'DIV': 0x9AC00C00,
+}
+AMD64_RRR_OPC = {
+ 'ADD': '01', 'SUB': '29', 'AND': '21', 'OR': '09', 'XOR': '31',
+}
+RV_RRR = {
+ 'ADD': 0x00000033, # funct7=0 funct3=0 opcode=0x33
+ 'SUB': 0x40000033,
+ 'XOR': 0x00004033,
+ 'OR': 0x00006033,
+ 'AND': 0x00007033,
+ 'SHL': 0x00001033,
+ 'SHR': 0x00005033,
+ 'SAR': 0x40005033,
+ 'MUL': 0x02000033,
+ 'DIV': 0x02004033,
+ 'REM': 0x02006033,
+}
+
+
+## aarch64 bitmask-immediate encoding for ANDI/ORI. Entries are the
+## (N, immr, imms) triples that encode each small imm as an aarch64
+## "logical immediate." Computed by hand because the full encoding
+## algorithm (contiguous-run + rotation for element sizes
+## 2/4/8/16/32/64) is substantial and we only need a handful of
+## values. Extend this table if a new imm shows up in P1 source.
+AA64_LOGI_ENC = {
+ 1: (1, 0, 0), # 0b0001 — single bit at position 0
+ 2: (1, 63, 0), # 0b0010 — single bit at position 1
+ 3: (1, 0, 1), # 0b0011 — 2 contiguous ones
+ 4: (1, 62, 0), # 0b0100 — single bit at position 2
+ 6: (1, 63, 1), # 0b0110 — 2 ones rotated by 1
+ 7: (1, 0, 2), # 0b0111 — 3 contiguous ones
+ 8: (1, 61, 0), # 0b1000 — single bit at position 3
+}
+
+
+## Frame layout after PROLOGUE_Nk (k >= 1, rounded up so total frame
+## bytes stay 16-byte aligned on aarch64):
+## [sp + 0] = retaddr (aarch64 lr / riscv64 ra / amd64 retaddr)
+## [sp + 8] = slot 1 (callee-private scratch)
+## [sp + 16] = slot 2
+## ...
+## [sp + 8*k] = slot k
+##
+## Frame size = round_up_to_16(8 + 8*k). So k=1 → 16, k=2 → 24 → 32,
+## k=3 → 32, k=4 → 40 → 48.
+
+def prologue_frame_bytes(k: int) -> int:
+ raw = 8 + 8 * k
+ return (raw + 15) & ~15
+
+
+## ---------- Encoders ----------------------------------------------------
+## One class per arch. Each provides one method per P1 op category,
+## mapping (op, reg-tuple, imm) to native bytes. Op classes dispatch
+## here via `Op.encode(enc)` → `enc.<method>(fields)`.
+
+class Encoder:
+ """Per-arch encoder base. Subclasses implement one method per
+ op category. `arch` is used by literal() to pick the right
+ pre-encoded bytes from an arch-keyed dict."""
+ arch = ''
+
+ def literal(self, hex_by_arch):
+ return hex_by_arch[self.arch]
+
+
+class AA64(Encoder):
+ arch = 'aarch64'
+
+ def rrr(self, op, rD, rA, rB):
+ if op == 'MUL':
+ # MUL = MADD with Ra=xzr. 100 11011 000 mmmmm 0 aaaaa nnnnn ddddd
+ d = NAT_AA64[rD]; a = NAT_AA64[rA]; b = NAT_AA64[rB]
+ return le32(0x9B000000 | (b << 16) | (31 << 10) | (a << 5) | d)
+ if op == 'REM':
+ # SDIV x16, xA, xB ; MSUB xD, x16, xB, xA.
+ # x16 (ARM IP0, caller-saved, not a P1 reg) is scratch so
+ # REM does not hidden-clobber P1 r4 — the op modifies rD only.
+ # MSUB needs bit 15 set (o0=1); without it it decodes as
+ # MADD and REM returns A + (A/B)*B.
+ d = NAT_AA64[rD]; a = NAT_AA64[rA]; b = NAT_AA64[rB]
+ SC = 16
+ sdiv = 0x9AC00C00 | (b << 16) | (a << 5) | SC
+ msub = 0x9B008000 | (b << 16) | (a << 10) | (SC << 5) | d
+ return le32(sdiv) + le32(msub)
+ return aa_rrr(AA64_RRR_BASE[op], rD, rA, rB)
+
+ def addi(self, rD, rA, imm):
+ if imm >= 0:
+ return aa_add_imm(rD, rA, imm, sub=False)
+ return aa_add_imm(rD, rA, -imm, sub=True)
+
+ def logi(self, op, rD, rA, imm):
+ N, immr, imms = AA64_LOGI_ENC[imm]
+ base = 0x92000000 if op == 'ANDI' else 0xB2000000 # ORI = orr
+ return aa_logical_imm(base, rD, rA, N, immr, imms)
+
+ def shifti(self, op, rD, rA, imm):
+ if op == 'SHLI':
+ return aa_ubfm(rD, rA, (-imm) & 63, 63 - imm)
+ if op == 'SHRI':
+ return aa_ubfm(rD, rA, imm, 63)
+ if op == 'SARI':
+ return aa_sbfm(rD, rA, imm, 63)
+
+ def mov(self, rD, rA):
+ if rA == 'sp':
+ return aa_add_imm(rD, 'sp', 0, sub=False)
+ # MOV xD, xA = ORR xD, xzr, xA
+ d = NAT_AA64[rD]; a = NAT_AA64[rA]
+ return le32(0xAA000000 | (a << 16) | (31 << 5) | d)
+
+ def li(self, rD):
+ # ldr wD, [pc+8] ; b +8 (caller emits 4 bytes of data next)
+ d = NAT_AA64[rD]
+ ldr_w_lit = 0x18000040 | d # LDR (literal) 32-bit, offset 8
+ b_plus8 = 0x14000002 # B offset 8 (imm26 = 2 words = 8 bytes)
+ return le32(ldr_w_lit) + le32(b_plus8)
+
+ def la(self, rD):
+ return self.li(rD)
+
+ def mem(self, op, rT, rN, off):
+ # Pick uimm12 (scaled, large range) when the offset is a
+ # non-negative multiple of the access width; otherwise fall
+ # back to the unscaled signed-imm9 form (covers negative
+ # offsets and positive-but-misaligned ones like 7).
+ BASES = {
+ 'LD': (0xF9400000, 3, 0xF8400000),
+ 'ST': (0xF9000000, 3, 0xF8000000),
+ 'LB': (0x39400000, 0, 0x38400000),
+ 'SB': (0x39000000, 0, 0x38000000),
+ }
+ uimm_base, size_log2, unscaled_base = BASES[op]
+ scale = 1 << size_log2
+ if off >= 0 and (off % scale) == 0:
+ return aa_ldst_uimm12(uimm_base, rT, rN, off, size_log2)
+ return aa_ldst_unscaled(unscaled_base, rT, rN, off)
+
+ def b(self):
+ return le32(0xD61F0000 | (NAT_AA64['br'] << 5)) # BR x17
+
+ def condb(self, op, rA, rB):
+ # cmp xA, xB = SUBS xzr, xA, xB (0xEB000000 base, rD=31).
+ # Skip when NOT cond holds. BEQ→NE(1), BNE→EQ(0), BLT→GE(A).
+ a = NAT_AA64[rA]; b_ = NAT_AA64[rB]
+ cmp_ = le32(0xEB000000 | (b_ << 16) | (a << 5) | 31)
+ cond = {'BEQ': 1, 'BNE': 0, 'BLT': 10}[op]
+ bcond = le32(0x54000040 | cond)
+ br = le32(0xD61F0000 | (NAT_AA64['br'] << 5))
+ return cmp_ + bcond + br
+
+ def call(self):
+ return le32(0xD63F0000 | (NAT_AA64['br'] << 5)) # BLR x17
+
+ def ret(self):
+ return le32(0xD65F03C0) # RET (= br x30)
+
+ def prologue(self, k):
+ fb = prologue_frame_bytes(k)
+ sub = aa_add_imm('sp', 'sp', fb, sub=True)
+ str_lr = aa_ldst_uimm12(0xF9000000, 'lr', 'sp', 0, 3)
+ return sub + str_lr
+
+ def epilogue(self, k):
+ fb = prologue_frame_bytes(k)
+ ldr_lr = aa_ldst_uimm12(0xF9400000, 'lr', 'sp', 0, 3)
+ add = aa_add_imm('sp', 'sp', fb, sub=False)
+ return ldr_lr + add
+
+ def tail(self, k):
+ return self.epilogue(k) + self.b()
+
+
+class AMD64(Encoder):
+ arch = 'amd64'
+
+ def rrr(self, op, rD, rA, rB):
+ if op == 'MUL':
+ return amd_mov_rr(rD, rA) + amd_imul_rr(rD, rB)
+ if op in ('DIV', 'REM'):
+ # x86 idiv implicitly reads/writes rax (P1 r0) and rdx
+ # (P1 r3). To keep DIV/REM clobber-free (only rD changes),
+ # stash r0 into r11 and r3 into rcx — neither is a P1 reg —
+ # then restore. If rA or rB alias r0/r3, read from the
+ # saved copy since we've overwritten the originals.
+ # Skip the final restore for whichever of r0/r3 *is* rD,
+ # so rD keeps its newly computed value.
+ seq = amd_mov_rr('r11', 'r0') # save r0 (rax)
+ seq += amd_mov_rr('rcx', 'r3') # save r3 (rdx)
+ src_a = 'r11' if rA == 'r0' else ('rcx' if rA == 'r3' else rA)
+ seq += amd_mov_rr('r0', src_a) # rax = rA
+ seq += amd_cqo() # rdx:rax = sign-ext rax
+ src_b = 'r11' if rB == 'r0' else ('rcx' if rB == 'r3' else rB)
+ seq += amd_idiv(src_b)
+ seq += amd_mov_rr(rD, 'r0' if op == 'DIV' else 'r3')
+ if rD != 'r3':
+ seq += amd_mov_rr('r3', 'rcx')
+ if rD != 'r0':
+ seq += amd_mov_rr('r0', 'r11')
+ return seq
+ if op in ('SHL', 'SHR', 'SAR'):
+ ext = {'SHL': 4, 'SHR': 5, 'SAR': 7}[op]
+ seq = amd_mov_rr(rD, rA)
+ seq += amd_mov_rr('rcx', rB)
+ seq += amd_shift_cl(ext, rD)
+ return seq
+ # ADD/SUB/AND/OR/XOR: mov rD,rA ; op rD,rB
+ seq = amd_mov_rr(rD, rA)
+ seq += amd_alu_rr(AMD64_RRR_OPC[op], rD, rB)
+ return seq
+
+ def addi(self, rD, rA, imm):
+ # mov rD,rA ; add rD,imm. Use imm8 form when it fits
+ # ([-128, 127]); otherwise emit the imm32 form.
+ seq = amd_mov_rr(rD, rA)
+ if -128 <= imm <= 127:
+ seq += amd_alu_ri8(0, rD, imm) # /0 = ADD
+ else:
+ seq += amd_alu_ri32(0, rD, imm)
+ return seq
+
+ def logi(self, op, rD, rA, imm):
+ ext = {'ANDI': 4, 'ORI': 1}[op]
+ seq = amd_mov_rr(rD, rA)
+ seq += amd_alu_ri8(ext, rD, imm)
+ return seq
+
+ def shifti(self, op, rD, rA, imm):
+ ext = {'SHLI': 4, 'SHRI': 5, 'SARI': 7}[op]
+ seq = amd_mov_rr(rD, rA)
+ seq += amd_shift_ri8(ext, rD, imm)
+ return seq
+
+ def mov(self, rD, rA):
+ return amd_mov_rr(rD, rA)
+
+ def li(self, rD):
+ # mov <rD as r32>, imm32 — opcode B8+r (with REX.B if r8..r15)
+ d = NAT_AMD64[rD]
+ if d >= 8:
+ return '41' + byte(0xB8 + (d & 7))
+ return byte(0xB8 + d)
+
+ def la(self, rD):
+ return self.li(rD)
+
+ def mem(self, op, rT, rN, off):
+ if op == 'LD': return amd_mem_rm('8B', rT, rN, off)
+ if op == 'ST': return amd_mem_rm('89', rT, rN, off)
+ if op == 'LB': return amd_mov_rm_b(rT, rN, off, store=False)
+ if op == 'SB': return amd_mov_rm_b(rT, rN, off, store=True)
+
+ def b(self):
+ return '41FFE3' # jmp r11
+
+ def condb(self, op, rA, rB):
+ a, b_ = NAT_AMD64[rA], NAT_AMD64[rB]
+ # cmp rA, rB — opcode 39 /r with rA as r/m
+ cmp_ = rex(1, b_ >> 3, 0, a >> 3) + '39' + modrm(3, b_, a)
+ # jcc rel8 opcode, skip=3 (past jmp r11):
+ # BEQ→JNE 75 03 ; BNE→JE 74 03 ; BLT→JGE 7D 03
+ jop = {'BEQ': '75', 'BNE': '74', 'BLT': '7D'}[op]
+ return cmp_ + jop + '03' + '41FFE3' # jmp r11
+
+ def call(self):
+ return '41FFD3' # call r11
+
+ def ret(self):
+ return 'C3'
+
+ def prologue(self, k):
+ # pop rcx ; sub rsp,fb ; push rcx. rcx is the retaddr-carry
+ # scratch — caller-save, never a P1 reg. r11 (= 'br') is
+ # off-limits because TAIL = EPILOGUE + `jmp r11`, and using
+ # r11 here would clobber the LI_BR-loaded tail target.
+ fb = prologue_frame_bytes(k)
+ assert fb <= 127
+ return '59' + '4883EC' + byte(fb) + '51'
+
+ def epilogue(self, k):
+ # Mirror of prologue: pop rcx ; add rsp,fb ; push rcx.
+ fb = prologue_frame_bytes(k)
+ assert fb <= 127
+ return '59' + '4883C4' + byte(fb) + '51'
+
+ def tail(self, k):
+ return self.epilogue(k) + self.b()
+
+
+class RV64(Encoder):
+ arch = 'riscv64'
+
+ def rrr(self, op, rD, rA, rB):
+ return rv_r(RV_RRR[op], rD, rA, rB)
+
+ def addi(self, rD, rA, imm):
+ return rv_i(0x00000013, rD, rA, imm)
+
+ def logi(self, op, rD, rA, imm):
+ base = {'ANDI': 0x00007013, 'ORI': 0x00006013}[op]
+ return rv_i(base, rD, rA, imm)
+
+ def shifti(self, op, rD, rA, imm):
+ base = {'SHLI': 0x00001013, 'SHRI': 0x00005013, 'SARI': 0x40005013}[op]
+ return rv_shift_imm(base, rD, rA, imm)
+
+ def mov(self, rD, rA):
+ return rv_i(0x00000013, rD, rA, 0) # addi rD, rA, 0
+
+ def li(self, rD):
+ # auipc rD,0 ; lwu rD,12(rD) ; jal x0,+8
+ d = NAT_RV64[rD]
+ auipc = 0x00000017 | (d << 7)
+ lwu = 0x00006003 | (d << 7) | (d << 15) | (12 << 20)
+ jal_p8 = 0x0080006F
+ return le32(auipc) + le32(lwu) + le32(jal_p8)
+
+ def la(self, rD):
+ return self.li(rD)
+
+ def mem(self, op, rT, rN, off):
+ # funct3: LD=3, ST=3, LBU=4, SB=0. Opcodes: load=03, store=23.
+ if op == 'LD': return rv_i(0x00003003, rT, rN, off)
+ if op == 'ST': return rv_s(0x00003023, rT, rN, off)
+ if op == 'LB': return rv_i(0x00004003, rT, rN, off) # LBU
+ if op == 'SB': return rv_s(0x00000023, rT, rN, off)
+
+ def b(self):
+ return le32(0x00000067 | (NAT_RV64['br'] << 15)) # jalr x0, 0(t5)
+
+ def condb(self, op, rA, rB):
+ # B<inv> rA, rB, +8 ; jalr x0, 0(t5). funct3 picks the op:
+ # BEQ→BNE(1), BNE→BEQ(0), BLT→BGE(5).
+ a, b_ = NAT_RV64[rA], NAT_RV64[rB]
+ funct3 = {'BEQ': 1, 'BNE': 0, 'BLT': 5}[op]
+ insn = 0x00000063 | (funct3 << 12) | (a << 15) | (b_ << 20) | (8 << 7)
+ jalr = 0x00000067 | (NAT_RV64['br'] << 15)
+ return le32(insn) + le32(jalr)
+
+ def call(self):
+ return le32(0x000000E7 | (NAT_RV64['br'] << 15)) # jalr ra, 0(t5)
+
+ def ret(self):
+ return le32(0x00008067) # jalr x0, 0(ra)
+
+ def prologue(self, k):
+ fb = prologue_frame_bytes(k)
+ sub = rv_i(0x00000013, 'sp', 'sp', -fb)
+ sd = rv_s(0x00003023, 'ra', 'sp', 0)
+ return sub + sd
+
+ def epilogue(self, k):
+ fb = prologue_frame_bytes(k)
+ ld = rv_i(0x00003003, 'ra', 'sp', 0)
+ add = rv_i(0x00000013, 'sp', 'sp', fb)
+ return ld + add
+
+ def tail(self, k):
+ return self.epilogue(k) + self.b()
+
+
+ENCODERS = {'aarch64': AA64(), 'amd64': AMD64(), 'riscv64': RV64()}
+
+
+## ---------- Op dataclasses ----------------------------------------------
+## Thin wrappers: each row holds its DEFINE name + the data needed to
+## reconstruct the encoding. `encode(enc)` calls the matching method
+## on the arch's encoder.
+
+@dataclass
+class Op:
+ name: str
+ comment: str = ''
+
+ def encode(self, enc: Encoder) -> str:
+ raise NotImplementedError
+
+@dataclass
+class RRR(Op):
+ op: str = ''
+ rD: str = ''
+ rA: str = ''
+ rB: str = ''
+ def encode(self, enc):
+ return enc.rrr(self.op, self.rD, self.rA, self.rB)
+
+@dataclass
+class AddI(Op):
+ rD: str = ''
+ rA: str = ''
+ imm: int = 0
+ def encode(self, enc):
+ return enc.addi(self.rD, self.rA, self.imm)
+
+@dataclass
+class LogI(Op):
+ op: str = '' # ANDI / ORI
+ rD: str = ''
+ rA: str = ''
+ imm: int = 0
+ def encode(self, enc):
+ return enc.logi(self.op, self.rD, self.rA, self.imm)
+
+@dataclass
+class ShiftI(Op):
+ op: str = '' # SHLI / SHRI / SARI
+ rD: str = ''
+ rA: str = ''
+ imm: int = 0
+ def encode(self, enc):
+ return enc.shifti(self.op, self.rD, self.rA, self.imm)
+
+@dataclass
+class Mov(Op):
+ rD: str = ''
+ rA: str = ''
+ def encode(self, enc):
+ return enc.mov(self.rD, self.rA)
+
+@dataclass
+class Li(Op):
+ rD: str = ''
+ def encode(self, enc):
+ return enc.li(self.rD)
+
+@dataclass
+class La(Op):
+ rD: str = ''
+ def encode(self, enc):
+ return enc.la(self.rD)
+
+@dataclass
+class Mem(Op):
+ op: str = '' # LD / ST / LB / SB
+ rT: str = ''
+ rN: str = ''
+ off: int = 0
+ def encode(self, enc):
+ return enc.mem(self.op, self.rT, self.rN, self.off)
+
+@dataclass
+class B(Op):
+ def encode(self, enc):
+ return enc.b()
+
+@dataclass
+class CondB(Op):
+ op: str = '' # BEQ / BNE / BLT
+ rA: str = ''
+ rB: str = ''
+ def encode(self, enc):
+ return enc.condb(self.op, self.rA, self.rB)
+
+@dataclass
+class Literal(Op):
+ hex_by_arch: Optional[dict] = None
+ def encode(self, enc):
+ return enc.literal(self.hex_by_arch)
+
+@dataclass
+class Prologue(Op):
+ k: int = 1
+ def encode(self, enc):
+ return enc.prologue(self.k)
+
+@dataclass
+class Epilogue(Op):
+ k: int = 1
+ def encode(self, enc):
+ return enc.epilogue(self.k)
+
+@dataclass
+class Tail(Op):
+ k: int = 1
+ def encode(self, enc):
+ return enc.tail(self.k)
+
+@dataclass
+class Call(Op):
+ def encode(self, enc):
+ return enc.call()
+
+@dataclass
+class Ret(Op):
+ def encode(self, enc):
+ return enc.ret()
+
+
+## ---------- SYSCALL pre-encoded sequences -------------------------------
+## The one-shot syscall wrapper. Shuffles P1's r0=num, r1–r6=args into
+## each arch's native syscall ABI and clobbers only r0 on return.
+## Encoded by hand (per P1.md §"Syscall conventions").
+
+SYSCALL_HEX = {
+ 'aarch64': (
+ # r4/r5 now live in callee-saved natives (x26/x27), so the
+ # kernel preserves them — no save/restore needed. Only r1/r2/r3
+ # (in caller-saved x1/x2/x3) must be stashed across the shuffle.
+ '' .join([
+ le32(0xAA0003E8), # mov x8, x0 (syscall number)
+ le32(0xAA0103F5), # mov x21, x1 (save r1)
+ le32(0xAA0203F6), # mov x22, x2 (save r2)
+ le32(0xAA0303F7), # mov x23, x3 (save r3)
+ le32(0xAA1503E0), # mov x0, x21 (arg1 = r1)
+ le32(0xAA1603E1), # mov x1, x22 (arg2 = r2)
+ le32(0xAA1703E2), # mov x2, x23 (arg3 = r3)
+ le32(0xAA1A03E3), # mov x3, x26 (arg4 = r4)
+ le32(0xAA1B03E4), # mov x4, x27 (arg5 = r5)
+ le32(0xAA1303E5), # mov x5, x19 (arg6 = r6)
+ le32(0xD4000001), # svc #0
+ le32(0xAA1503E1), # mov x1, x21 (restore r1)
+ le32(0xAA1603E2), # mov x2, x22
+ le32(0xAA1703E3), # mov x3, x23
+ ])
+ ),
+ # r4=r13, r5=r14 are callee-saved natively, but syscall wants args
+ # 4/5 in r10/r8. r6=rbx, but arg6 lives in r9. Three shuffle moves,
+ # then syscall. The kernel preserves rdi/rsi/rdx/r12–r15/rbx, so no
+ # P1 reg is clobbered beyond r0 (syscall return).
+ 'amd64': '4D89EA' + '4D89F0' + '4989D9' + '0F05',
+ 'riscv64': (
+ # Same story as aarch64: r4/r5 in callee-saved s4/s5 (=x20/x21),
+ # so we only save/restore a1/a2/a3. Scratch slots: s3, s6, s7.
+ ''.join([
+ le32(0x00050893), # mv a7, a0 (syscall number)
+ le32(0x00058993), # mv s3, a1 (save r1)
+ le32(0x00060B13), # mv s6, a2 (save r2)
+ le32(0x00068B93), # mv s7, a3 (save r3)
+ le32(0x00098513), # mv a0, s3 (arg1 = r1)
+ le32(0x000B0593), # mv a1, s6 (arg2 = r2)
+ le32(0x000B8613), # mv a2, s7 (arg3 = r3)
+ le32(0x000A0693), # mv a3, s4 (arg4 = r4)
+ le32(0x000A8713), # mv a4, s5 (arg5 = r5)
+ le32(0x00048793), # mv a5, s1 (arg6 = r6)
+ le32(0x00000073), # ecall
+ le32(0x00098593), # mv a1, s3 (restore r1)
+ le32(0x000B0613), # mv a2, s6
+ le32(0x000B8693), # mv a3, s7
+ ])
+ ),
+}
+
+## Syscall numbers (little-endian 32-bit for LI operand).
+## aarch64 and riscv64 share the asm-generic table; amd64 has its own.
+##
+## Portability notes — every entry below is a syscall that exists on all
+## three with the same semantics under the uniform P1 SYSCALL convention
+## (r0 = num, r1-r6 = args):
+## - `fork` is amd64-only; `wait4` is asm-generic 32-bit compat only.
+## Use `clone(SIGCHLD)` and `waitid` instead.
+## - `open` is amd64-only (removed from asm-generic). Use `openat` with
+## dirfd = AT_FDCWD (-100) as arg1.
+## - `clone` arg order differs: amd64 is (flags, stack, ptid, ctid, tls);
+## aarch64/riscv64 are (flags, stack, ptid, tls, ctid). Benign when
+## ptid/ctid/tls are all zero (the fork-equivalent case).
+SYS_NUM = {
+ 'aarch64': {'SYS_WRITE': 64, 'SYS_EXIT': 93, 'SYS_READ': 63, 'SYS_CLOSE': 57,
+ 'SYS_OPENAT': 56,
+ 'SYS_CLONE': 220, 'SYS_EXECVE': 221, 'SYS_WAITID': 95},
+ 'amd64': {'SYS_WRITE': 1, 'SYS_EXIT': 60, 'SYS_READ': 0, 'SYS_CLOSE': 3,
+ 'SYS_OPENAT':257,
+ 'SYS_CLONE': 56, 'SYS_EXECVE': 59, 'SYS_WAITID':247},
+ 'riscv64': {'SYS_WRITE': 64, 'SYS_EXIT': 93, 'SYS_READ': 63, 'SYS_CLOSE': 57,
+ 'SYS_OPENAT': 56,
+ 'SYS_CLONE': 220, 'SYS_EXECVE': 221, 'SYS_WAITID': 95},
+}
+
+
+## ---------- Canonical imm/offset/shamt sets -----------------------------
+## Enumerated instead of sigil-passed: M1's DEFINE substitutes hex
+## bytes verbatim, so every distinct imm value needs its own DEFINE.
+## These cover every value used across hello/demo/lisp/kaem-minimal
+## plus a little headroom. Extend when a new value appears in P1 src.
+
+## ADDI imms. NEG48/48 handle the ASCII '0' bias; the rest cover tag
+## stripping and loop counters. Full reg product × this set = 8²×N.
+ADDI_IMMS = (-48, -8, -7, -6, -5, -4, -3, -2, -1,
+ 1, 2, 3, 4, 5, 6, 7, 8, 48)
+
+## Shift amounts (for SHLI/SHRI/SARI). 32/52 implement low-N-bit masks
+## (length field extraction; 4096-slot symbol-table index); the small
+## values scale-by-N for byte offsets and fixnum encode/decode.
+SHIFT_IMMS = (1, 2, 3, 5, 16, 32, 52)
+
+## ANDI/ORI imms. Every entry must appear in AA64_LOGI_ENC.
+LOGI_IMMS = (1, 2, 3, 4, 6, 7, 8)
+
+## Memory offsets for LD/ST/LB/SB. 0/8/16/24/32 cover slot offsets in
+## N-slot frames and common struct fields; 7 is the NUL terminator
+## position inside an 8-byte zero-padded slot; -8 reaches one slot
+## below the current base.
+MEM_OFFS = (-8, 0, 7, 8, 16, 24, 32)
+
+CONDB_OPS = ('BEQ', 'BNE', 'BLT')
+SHIFT_OPS = ('SHLI', 'SHRI', 'SARI')
+LOGI_OPS = ('ANDI', 'ORI')
+MEM_OPS = ('LD', 'ST', 'LB', 'SB')
+
+
+## Curated RRR triples. The full cube is 11 ops × 8³ regs = 5632
+## entries per arch — >99% would be dead weight. Each tuple below
+## is one actually used by hello/demo/lisp/kaem-minimal. Lint
+## catches missing triples on assembly; add a line here and
+## regenerate.
+RRR_TABLE = (
+ # demo/lisp step-1 arith cube
+ ('ADD','r1','r1','r2'), ('ADD','r1','r1','r4'),
+ ('ADD','r2','r2','r6'), ('ADD','r2','r3','r1'),
+ ('SUB','r1','r1','r2'), ('SUB','r2','r2','r6'),
+ ('AND','r1','r1','r5'),
+ ('OR', 'r1','r1','r2'),
+ ('XOR','r1','r1','r2'),
+ ('MUL','r1','r1','r2'),
+ ('DIV','r1','r1','r2'),
+ ('REM','r1','r1','r5'),
+ ('SHL','r1','r1','r2'),
+ ('SHR','r1','r1','r2'),
+ ('SAR','r4','r4','r2'),
+ # alloc / pointer arithmetic
+ ('ADD','r2','r0','r1'),
+ ('ADD','r0','r0','r3'),
+ ('ADD','r2','r2','r0'),
+ ('ADD','r2','r2','r1'),
+ ('SUB','r3','r3','r0'),
+ # reader / display index+offset fold
+ ('ADD','r6','r1','r2'),
+ ('ADD','r6','r6','r0'),
+ ('ADD','r7','r1','r2'),
+ ('SUB','r2','r1','r6'),
+ ('SUB','r3','r1','r6'),
+ ('REM','r1','r1','r2'),
+ # kaem-minimal bump-pointer + accumulator updates
+ ('ADD','r1','r1','r0'),
+ ('ADD','r5','r5','r0'),
+ ('ADD','r7','r7','r0'),
+ ('SUB','r3','r3','r2'),
+ ('SUB','r6','r6','r0'),
+ # Primitive bodies (LISP.md step 10c). Convention: r1=argc,
+ # r2=argv (both input), r3=accumulator, r0=scratch/return.
+ # Variadic folds: (r3 = r3 op r0). Unary negate / bit-not:
+ # (r3 = r0 - r3) with r0 = 0 or -1. arithmetic-shift k-negate:
+ # (r0 = r1 - r0) with r1 = 0 after argc is consumed.
+ ('ADD','r3','r3','r0'),
+ ## ('SUB','r3','r3','r0') — already above (kaem-minimal row)
+ ('SUB','r3','r0','r3'),
+ ('SUB','r0','r1','r0'),
+ ('MUL','r3','r3','r0'),
+ ('DIV','r3','r3','r0'),
+ ('REM','r3','r3','r0'),
+ ('AND','r3','r3','r0'),
+ ('OR', 'r3','r3','r0'),
+ ('XOR','r3','r3','r0'),
+ ('SHL','r3','r3','r0'),
+ ('SAR','r3','r3','r0'),
+)
+
+
+## ---------- Row assembly ------------------------------------------------
+
+HEADER = """## p1_{arch}.M1 — GENERATED by p1_gen.py. Do not edit by hand.
+##
+## Shared op-table lives in p1_gen.py; each arch's encoder lowers
+## (op, register-tuple, imm) rows into native bytes. See P1.md for the
+## ISA spec and register mapping.
+"""
+
+@dataclass
+class Banner:
+ text: str
+
+
+def _imm_suf(imm):
+ return f'NEG{-imm}' if imm < 0 else f'{imm}'
+
+
def rows():
    """Build the complete, ordered row list: Banner markers + encodable rows.

    The sequence is load-bearing: emit() writes rows in exactly this
    order, and the --check mode compares the generated text byte-for-byte,
    so restructuring here must never reorder output.
    """
    out = []
    add = out.append

    # LI / LA: wide literal and label-address loads, one per P1 register.
    add(Banner('LI / LA — load 4-byte zero-extended literal or label addr'))
    for reg in P1_REGS:
        add(Li(name=f'LI_{reg.upper()}', rD=reg))
    # LI_BR loads into the hidden branch-target scratch (x17/r11/t5).
    # Every branch/call site is `LI_BR &target ; P1_<BR>`. The scratch
    # is *not* a P1 reg.
    add(Li(name='LI_BR', rD='br'))
    for reg in P1_REGS:
        add(La(name=f'LA_{reg.upper()}', rD=reg))

    # MOV: full dst×src register product, plus a MOV rD, sp per destination.
    add(Banner('MOV — full register product (src may be sp)'))
    for dst in P1_REGS:
        for src in P1_REGS:
            add(Mov(name=f'MOV_{dst.upper()}_{src.upper()}', rD=dst, rA=src))
        add(Mov(name=f'MOV_{dst.upper()}_SP', rD=dst, rA='sp'))

    # RRR: only the curated triples (a full cube would be ~5.6k rows/arch).
    add(Banner('RRR — curated triples (explicit table in p1_gen.py)'))
    for op, dst, src_a, src_b in RRR_TABLE:
        add(RRR(name=f'{op}_{dst.upper()}_{src_a.upper()}_{src_b.upper()}',
                op=op, rD=dst, rA=src_a, rB=src_b))

    # ADDI: full register product crossed with the ADDI immediate set.
    add(Banner('ADDI — full register product × ADDI_IMMS'))
    for dst in P1_REGS:
        for src in P1_REGS:
            for imm in ADDI_IMMS:
                add(AddI(name=f'ADDI_{dst.upper()}_{src.upper()}_{_imm_suf(imm)}',
                         rD=dst, rA=src, imm=imm))

    # ANDI / ORI: full register product crossed with LOGI_IMMS.
    add(Banner('ANDI / ORI — full register product × LOGI_IMMS'))
    for op in LOGI_OPS:
        for dst in P1_REGS:
            for src in P1_REGS:
                for imm in LOGI_IMMS:
                    add(LogI(name=f'{op}_{dst.upper()}_{src.upper()}_{imm}',
                             op=op, rD=dst, rA=src, imm=imm))

    # SHLI / SHRI / SARI: full register product crossed with SHIFT_IMMS.
    add(Banner('SHLI / SHRI / SARI — full register product × SHIFT_IMMS'))
    for op in SHIFT_OPS:
        for dst in P1_REGS:
            for src in P1_REGS:
                for imm in SHIFT_IMMS:
                    add(ShiftI(name=f'{op}_{dst.upper()}_{src.upper()}_{imm}',
                               op=op, rD=dst, rA=src, imm=imm))

    # Memory: LD/ST/LB/SB over the full register product and MEM_OFFS.
    add(Banner('LD / ST / LB / SB — full register product × MEM_OFFS'))
    for op in MEM_OPS:
        for rt in P1_REGS:
            for rn in P1_REGS:
                for off in MEM_OFFS:
                    add(Mem(name=f'{op}_{rt.upper()}_{rn.upper()}_{_imm_suf(off)}',
                            op=op, rT=rt, rN=rn, off=off))

    # Branches: unconditional B first, then the conditional pairs.
    add(Banner('Branches — LI_BR-indirect pattern'))
    add(B(name='B'))
    for op in CONDB_OPS:
        for src_a in P1_REGS:
            for src_b in P1_REGS:
                add(CondB(name=f'{op}_{src_a.upper()}_{src_b.upper()}',
                          op=op, rA=src_a, rB=src_b))

    # Control flow: single-slot forms, then the N-slot (k = 2..4) variants.
    add(Banner('Control: CALL/RET + single-slot and N-slot PROLOGUE/EPILOGUE/TAIL'))
    add(Prologue(name='PROLOGUE', k=1))
    add(Epilogue(name='EPILOGUE', k=1))
    add(Ret(name='RET'))
    add(Call(name='CALL'))
    add(Tail(name='TAIL', k=1))
    for slots in (2, 3, 4):
        add(Prologue(name=f'PROLOGUE_N{slots}', k=slots))
        add(Epilogue(name=f'EPILOGUE_N{slots}', k=slots))
        add(Tail(name=f'TAIL_N{slots}', k=slots))

    # SYSCALL: pre-encoded per-arch byte sequence from SYSCALL_HEX.
    add(Banner('SYSCALL — uniform "clobbers r0 only" across arches'))
    add(Literal(name='SYSCALL', hex_by_arch=SYSCALL_HEX))

    # Syscall numbers, each emitted as a LE-32 operand suitable for LI.
    add(Banner('Linux syscall numbers (per-arch table). LE-32 operands for LI.'))
    for sys_name in ('SYS_WRITE', 'SYS_EXIT', 'SYS_READ', 'SYS_CLOSE', 'SYS_OPENAT',
                     'SYS_CLONE', 'SYS_EXECVE', 'SYS_WAITID'):
        add(Literal(name=sys_name,
                    hex_by_arch={a: le32(SYS_NUM[a][sys_name]) for a in ARCHES}))

    return out
+
+
+## ---------- File emission -----------------------------------------------
+
def emit(arch: str) -> str:
    """Render the full p1_<arch>.M1 text for one architecture.

    Banners become '## ----' divider comments; every other row becomes a
    DEFINE line lowered through that arch's encoder. Raises RuntimeError
    on a duplicate macro name so collisions fail the build loudly.
    """
    encoder = ENCODERS[arch]
    lines = [HEADER.format(arch=arch).rstrip(), '']
    defined = set()
    for row in rows():
        if isinstance(row, Banner):
            # Blank line, then a divider padded with dashes to ~column 68.
            pad = '-' * max(0, 60 - len(row.text))
            lines.append('')
            lines.append('## ---- ' + row.text + ' ' + pad)
            continue
        # SYS_* rows keep their bare names; everything else gets the P1_ prefix.
        if row.name.startswith('SYS_'):
            macro = row.name
        else:
            macro = 'P1_' + row.name
        if macro in defined:
            raise RuntimeError(f'duplicate DEFINE: {macro}')
        defined.add(macro)
        lines.append(f'DEFINE {macro} {row.encode(encoder)}')
    lines.append('')  # trailing newline after join
    return '\n'.join(lines)
+
+
def main():
    """CLI entry point.

    Usage: p1_gen.py [--check] [BUILD_ROOT]

    Writes build/<arch>/p1_<arch>.M1 for every arch (BUILD_ROOT defaults
    to 'build'). With --check, nothing is written: each generated text is
    compared against the on-disk file (missing file counts as a diff) and
    the process exits 1 if any differ.
    """
    check_only = '--check' in sys.argv
    # Everything that is not a --flag is positional; first one wins.
    positionals = [arg for arg in sys.argv[1:] if not arg.startswith('--')]
    build_root = positionals[0] if positionals else 'build'

    stale = False
    for arch in ARCHES:
        out_dir = os.path.join(build_root, arch)
        out_path = os.path.join(out_dir, f'p1_{arch}.M1')
        generated = emit(arch)
        if check_only:
            try:
                with open(out_path) as fh:
                    on_disk = fh.read()
            except FileNotFoundError:
                on_disk = ''  # absent file always mismatches
            if on_disk != generated:
                sys.stderr.write(f'DIFF: {out_path}\n')
                stale = True
        else:
            os.makedirs(out_dir, exist_ok=True)
            with open(out_path, 'w') as fh:
                fh.write(generated)
            print(f'wrote {out_path} ({len(generated)} bytes)')

    if check_only and stale:
        sys.exit(1)
+
+
# Run as a script: generate (or, with --check, verify) the per-arch
# p1_<arch>.M1 defs files under the build root.
if __name__ == '__main__':
    main()