commit b9b5b0447f9729f86deac54ff643d8097c82636b
parent 49dcd6e7b024c3b4921442ba79db41bb510e5cc3
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Tue, 21 Apr 2026 08:16:46 -0700
build: only curdir is mounted into podman
Podman invocations now mount $(CURDIR):/work only. populate-upstream.sh
runs on the host, mirroring the files bootstrap.sh consumes
(hex0-seeds, hex0/1/2/catm/M0 sources, ELF-<arch>.hex2) from
$UPSTREAM/seed/stage0-posix/ into build/upstream/. PODMAN_BOOTSTRAP
collapses into the unified PODMAN.
arch/ is gone: the vendored (and subtly reformatted) ELF-*.hex2 files
are deleted — the link rule now reads them straight from
build/upstream/. p1_gen.py moves to src/ and writes its three
p1_<arch>.M1 defs files into build/<arch>/. Only original sources
live in curdir; everything derived or copied lands under build/.
Diffstat:
| M | .gitignore | | | 3 | --- |
| M | Makefile | | | 77 | +++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------- |
| M | README.md | | | 57 | ++++++++++++++++++++++++++------------------------------- |
| D | arch/ELF-aarch64.hex2 | | | 75 | --------------------------------------------------------------------------- |
| D | arch/ELF-amd64.hex2 | | | 74 | -------------------------------------------------------------------------- |
| D | arch/ELF-riscv64.hex2 | | | 74 | -------------------------------------------------------------------------- |
| D | arch/p1_gen.py | | | 1066 | ------------------------------------------------------------------------------- |
| M | bootstrap.sh | | | 19 | +++++++++++-------- |
| A | populate-upstream.sh | | | 48 | ++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | src/p1_gen.py | | | 1089 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
10 files changed, 1225 insertions(+), 1357 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1 @@
build/
-arch/p1_aarch64.M1
-arch/p1_amd64.M1
-arch/p1_riscv64.M1
diff --git a/Makefile b/Makefile
@@ -48,7 +48,11 @@ ifeq ($(PLATFORM),)
$(error ARCH '$(ARCH)' not supported — use aarch64, amd64, or riscv64)
endif
-UPSTREAM := $(abspath $(CURDIR)/../live-bootstrap)
+# Default upstream checkout path. Only consumed by populate-upstream.sh,
+# which runs on the host to populate build/upstream/ before the container
+# ever starts. Podman itself never mounts this — the container only ever
+# sees curdir, so all inputs must already live inside it.
+UPSTREAM ?= $(abspath $(CURDIR)/../live-bootstrap)
# Pinned to a manifest-list digest (not the :latest tag): podman on macOS
# only stores one image per tag locally, so cross-arch pulls under :latest
@@ -63,16 +67,22 @@ RUNTIME_IMAGE := public.ecr.aws/docker/library/alpine@sha256:5b10f432ef3da1b8d4c
OUT_DIR := build/$(ARCH)
TOOLS_DIR := $(OUT_DIR)/tools
-# Two container views:
-# PODMAN_BOOTSTRAP — toolchain build. Needs read-only access to stage0-posix
-# under ../live-bootstrap; writes only into build/$(ARCH)/tools.
-# PODMAN — assemble / link / run. Sees only the lispcc dir.
-PODMAN_BOOTSTRAP := podman run --rm --platform $(PLATFORM) \
- -v $(UPSTREAM):/work/live-bootstrap:ro \
- -v $(CURDIR):/work/lispcc \
- -w /work/lispcc \
- $(RUNTIME_IMAGE)
-
+# stage0-posix uses mixed-case arch dirs (AArch64, AMD64) that don't match
+# our lowercase ARCH. Map them so build/upstream/ mirrors upstream layout.
+ARCH_DIR_aarch64 := AArch64
+ARCH_DIR_amd64 := AMD64
+ARCH_DIR_riscv64 := riscv64
+ARCH_DIR := $(ARCH_DIR_$(ARCH))
+
+# Host-populated mirror of the upstream files we consume. Everything
+# bootstrap.sh needs (seeds, hex0/1/2 sources, catm, M0, ELF headers)
+# lands here before any podman work begins.
+UPSTREAM_DIR := build/upstream
+UPSTREAM_STAMP := $(UPSTREAM_DIR)/.stamp
+
+# Single podman view: curdir mounted at /work. Toolchain build, assembly,
+# link, and run all share this view. Keeping it narrow means nothing
+# outside the repo is visible to the container.
PODMAN := podman run --rm --platform $(PLATFORM) \
-v $(CURDIR):/work \
-w /work \
@@ -80,21 +90,36 @@ PODMAN := podman run --rm --platform $(PLATFORM) \
# --- Targets ---------------------------------------------------------------
-.PHONY: all toolchain run run-all test-lisp test-lisp-all clean
+.PHONY: all toolchain populate-upstream run run-all test-lisp test-lisp-all clean
all: $(OUT_DIR)/$(PROG)
toolchain: $(TOOLS_DIR)/M0
+populate-upstream: $(UPSTREAM_STAMP)
+
$(OUT_DIR) $(TOOLS_DIR):
mkdir -p $@
+# Mirror the upstream seed + hex0/1/2/catm/M0/ELF files we need from
+# $(UPSTREAM) into build/upstream/. Host-side so the container mount stays
+# minimal. The stamp doubles as an order marker and avoids re-copying on
+# every toolchain build.
+$(UPSTREAM_STAMP): populate-upstream.sh
+ sh populate-upstream.sh $(UPSTREAM)
+ @touch $@
+
+# Any file anyone asks for under build/upstream/ is produced by the stamp
+# rule above. Empty recipe — the file is already on disk once the stamp
+# exists, and the stamp's timestamp stands in for every file's freshness.
+$(UPSTREAM_DIR)/%: $(UPSTREAM_STAMP) ;
+
# Bootstrap M0, hex2-0, catm (and the throwaway hex0/hex1) from hex0-seed.
# One shot per arch — see bootstrap.sh for the phase-by-phase chain.
#
# Grouped target (&:) so all five outputs come from a single recipe run.
-$(TOOLS_DIR)/M0 $(TOOLS_DIR)/hex2-0 $(TOOLS_DIR)/catm $(TOOLS_DIR)/hex0 $(TOOLS_DIR)/hex1 &: bootstrap.sh | $(TOOLS_DIR)
- $(PODMAN_BOOTSTRAP) sh bootstrap.sh $(ARCH) /work/lispcc/$(TOOLS_DIR)
+$(TOOLS_DIR)/M0 $(TOOLS_DIR)/hex2-0 $(TOOLS_DIR)/catm $(TOOLS_DIR)/hex0 $(TOOLS_DIR)/hex1 &: bootstrap.sh $(UPSTREAM_STAMP) | $(TOOLS_DIR)
+ $(PODMAN) sh bootstrap.sh $(ARCH) /work/$(TOOLS_DIR)
# Assemble: lint first, then combine per-arch defs + program and feed to M0.
#
@@ -105,10 +130,10 @@ $(TOOLS_DIR)/M0 $(TOOLS_DIR)/hex2-0 $(TOOLS_DIR)/catm $(TOOLS_DIR)/hex0 $(TOOLS_
# M0 takes a single positional input (no -f flag), so we catm the two
# sources together first. The intermediate .combined.M1 is kept in OUT_DIR
# so it gets cleaned along with everything else.
-$(OUT_DIR)/$(PROG).hex2: $(PROG_SRC) arch/p1_$(ARCH).M1 lint.sh $(TOOLS_DIR)/M0 $(TOOLS_DIR)/catm | $(OUT_DIR)
- ./lint.sh arch/p1_$(ARCH).M1 $(PROG_SRC)
+$(OUT_DIR)/$(PROG).hex2: $(PROG_SRC) $(OUT_DIR)/p1_$(ARCH).M1 lint.sh $(TOOLS_DIR)/M0 $(TOOLS_DIR)/catm | $(OUT_DIR)
+ ./lint.sh $(OUT_DIR)/p1_$(ARCH).M1 $(PROG_SRC)
$(PODMAN) sh -ec ' \
- $(TOOLS_DIR)/catm $(OUT_DIR)/$(PROG).combined.M1 arch/p1_$(ARCH).M1 $(PROG_SRC) ; \
+ $(TOOLS_DIR)/catm $(OUT_DIR)/$(PROG).combined.M1 $(OUT_DIR)/p1_$(ARCH).M1 $(PROG_SRC) ; \
$(TOOLS_DIR)/M0 $(OUT_DIR)/$(PROG).combined.M1 $(OUT_DIR)/$(PROG).hex2'
# Link: prepend the ELF header and feed to hex2-0.
@@ -117,9 +142,9 @@ $(OUT_DIR)/$(PROG).hex2: $(PROG_SRC) arch/p1_$(ARCH).M1 lint.sh $(TOOLS_DIR)/M0
# base address 0x00600000 (no --base-address flag), which is why the ELF
# header references `&ELF_base` symbolically rather than baking in a
# concrete VA — the header travels to whatever base the linker chose.
-$(OUT_DIR)/$(PROG): $(OUT_DIR)/$(PROG).hex2 arch/ELF-$(ARCH).hex2 $(TOOLS_DIR)/hex2-0 $(TOOLS_DIR)/catm
+$(OUT_DIR)/$(PROG): $(OUT_DIR)/$(PROG).hex2 $(UPSTREAM_DIR)/$(ARCH_DIR)/ELF-$(ARCH).hex2 $(TOOLS_DIR)/hex2-0 $(TOOLS_DIR)/catm
$(PODMAN) sh -ec ' \
- $(TOOLS_DIR)/catm $(OUT_DIR)/$(PROG).linked.hex2 arch/ELF-$(ARCH).hex2 $(OUT_DIR)/$(PROG).hex2 ; \
+ $(TOOLS_DIR)/catm $(OUT_DIR)/$(PROG).linked.hex2 $(UPSTREAM_DIR)/$(ARCH_DIR)/ELF-$(ARCH).hex2 $(OUT_DIR)/$(PROG).hex2 ; \
$(TOOLS_DIR)/hex2-0 $(OUT_DIR)/$(PROG).linked.hex2 $(OUT_DIR)/$(PROG)'
run: $(OUT_DIR)/$(PROG)
@@ -168,12 +193,12 @@ test-lisp-all:
$(MAKE) --no-print-directory ARCH=riscv64 test-lisp
clean:
- rm -rf build/ arch/p1_aarch64.M1 arch/p1_amd64.M1 arch/p1_riscv64.M1
+ rm -rf build/
-# Generate all three per-arch DEFINE tables from arch/p1_gen.py in a
+# Generate all three per-arch DEFINE tables from src/p1_gen.py in a
# single shot. Grouped target (&:) because p1_gen.py writes all three
-# files unconditionally (next to itself, i.e. into arch/). These are
-# build artifacts — gitignored; the build regenerates them on any
-# p1_gen.py edit so there's no staleness risk.
-arch/p1_aarch64.M1 arch/p1_amd64.M1 arch/p1_riscv64.M1 &: arch/p1_gen.py
- python3 arch/p1_gen.py
+# files unconditionally. Output lands under build/<arch>/ (build/ is
+# wiped by clean, so the build regenerates on any p1_gen.py edit with
+# no staleness risk).
+build/aarch64/p1_aarch64.M1 build/amd64/p1_amd64.M1 build/riscv64/p1_riscv64.M1 &: src/p1_gen.py
+ python3 src/p1_gen.py build
diff --git a/README.md b/README.md
@@ -10,50 +10,45 @@ Goal is a 4–6× shrink in auditable LOC. See [docs/PLAN.md](docs/PLAN.md).
Stage 0: hello-world in the P1 portable pseudo-ISA (see [docs/P1.md](docs/P1.md)),
assembled and run inside a pristine alpine container on all three target
arches (aarch64, amd64, riscv64). The same `tests/hello.M1` source assembles
-for every arch; only the backing `arch/p1_<arch>.M1` defs file varies.
-Toolchain (M1, hex2) builds statically from the upstream mescc-tools C
-source.
+for every arch; only the backing `build/<arch>/p1_<arch>.M1` defs file
+varies. Toolchain (M1, hex2) builds statically from the upstream mescc-tools
+C source.
## Layout
```
-docs/ design docs (PLAN, SEED, P1, C1, LISP)
-src/ real programs (lisp.M1, kaem-minimal.M1)
-tests/ smoke programs (hello.M1, demo.M1) + fixtures
- lisp/ lisp test fixtures (*.scm + *.expected)
- kaem.run smoke input for kaem-minimal
-arch/ per-arch defs + ELF headers
- p1_gen.py generator for p1_<arch>.M1
- p1_<arch>.M1 per-arch P1 defs (gitignored, generated)
- ELF-<arch>.hex2 per-arch ELF header template
-bootstrap.sh hex0-seed → M0/hex2-0/catm toolchain build
-lint.sh M1 undefined-token guard
-Makefile podman-driven build, ARCH-parameterized
-build/<arch>/ per-arch outputs + toolchain
+docs/ design docs (PLAN, SEED, P1, C1, LISP)
+src/ real programs (lisp.M1, kaem-minimal.M1) + p1_gen.py
+tests/ smoke programs (hello.M1, demo.M1) + fixtures
+ lisp/ lisp test fixtures (*.scm + *.expected)
+ kaem.run smoke input for kaem-minimal
+bootstrap.sh hex0-seed → M0/hex2-0/catm toolchain build
+populate-upstream.sh host-side copy of upstream seeds + sources into build/upstream/
+lint.sh M1 undefined-token guard
+Makefile podman-driven build, ARCH-parameterized
+build/ all derived artifacts (gitignored)
+ upstream/ mirror of the files bootstrap.sh consumes from live-bootstrap
+ <arch>/ per-arch outputs
+ tools/ bootstrapped M0, hex2-0, catm (+ throwaway hex0/hex1)
+ p1_<arch>.M1 generated P1 defs
+ <prog> final ELF binary
```
## Build & run
-Requires podman. Non-native arches run via podman's binfmt + qemu-user
-path (works transparently on a default `podman machine` setup).
+Requires podman. Uses Alpine as the host. Non-native arches run via podman's
+binfmt + qemu-user path (works transparently on a default `podman machine`
+setup).
```
-make image # one-time: build the alpine+gcc builder image
-make # default ARCH=aarch64 → build/aarch64/hello
-make ARCH=amd64 # build/amd64/hello
-make ARCH=riscv64 # build/riscv64/hello
-make run # run build/$(ARCH)/hello in pristine alpine
make run-all # build + run on all three arches
make clean # wipe build/
```
-Two images are used: `lispcc-builder` (alpine+gcc, ~184 MB) only compiles
-M1/hex2 at host arch; `alpine:latest` pulled per target platform runs
-the assembled binary with the static toolchain mounted in.
-
## Source layout assumption
-The Makefile reaches the upstream mescc-tools C source via the parent dir
-mount (`HOST_ROOT := $(abspath $(CURDIR)/..)`), expecting
-`../live-bootstrap/seed/stage0-posix/mescc-tools/`. Override `TOOLCHAIN_SRC`
-in the Makefile if your layout differs.
+`populate-upstream.sh` runs on the host and mirrors the files bootstrap.sh
+needs from `$UPSTREAM/seed/stage0-posix/` into `build/upstream/`; the
+default is `../live-bootstrap`. Override by invoking `make UPSTREAM=/path
+populate-upstream`. Podman itself only ever mounts curdir, so everything
+the container sees must live inside the repo.
diff --git a/arch/ELF-aarch64.hex2 b/arch/ELF-aarch64.hex2
@@ -1,75 +0,0 @@
-### Copyright (C) 2016 Jeremiah Orians
-### Copyright (C) 2017 Jan Nieuwenhuizen <janneke@gnu.org>
-### Copyright (C) 2020 deesix <deesix@tuta.io>
-### This file is part of M2-Planet.
-###
-### M2-Planet is free software: you can redistribute it and/or modify
-### it under the terms of the GNU General Public License as published by
-### the Free Software Foundation, either version 3 of the License, or
-### (at your option) any later version.
-###
-### M2-Planet is distributed in the hope that it will be useful,
-### but WITHOUT ANY WARRANTY; without even the implied warranty of
-### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-### GNU General Public License for more details.
-###
-### You should have received a copy of the GNU General Public License
-### along with M2-Planet. If not, see <http://www.gnu.org/licenses/>.
-
-### stage0's hex2 format
-### !<label> 1 byte relative
-### $<label> 2 byte address
-### @<label> 2 byte relative
-### &<label> 4 byte address
-### %<label> 4 byte relative
-
-### if you wish to use this header, you need to add :ELF_end to the end of your
-### M1 or hex2 files.
-
-## ELF Header
-
-:ELF_base
-7F 45 4C 46 # e_ident[EI_MAG0-3] ELF's magic number
-
-02 # e_ident[EI_CLASS] Indicating 64 bit
-01 # e_ident[EI_DATA] Indicating little endianness
-01 # e_ident[EI_VERSION] Indicating original elf
-
-03 # e_ident[EI_OSABI] Set at 3 because FreeBSD is strict
-00 # e_ident[EI_ABIVERSION] See above
-
-00 00 00 00 00 00 00 # e_ident[EI_PAD]
-
-02 00 # e_type Indicating Executable
-B7 00 # e_machine Indicating AArch64
-01 00 00 00 # e_version Indicating original elf
-
-&_start 00 00 00 00 # e_entry Address of the entry point
-%ELF_program_headers>ELF_base 00 00 00 00 # e_phoff Address of program header table
-00 00 00 00 00 00 00 00 # e_shoff Address of section header table
-
-00 00 00 00 # e_flags
-
-40 00 # e_ehsize Indicating our 64 Byte header
-
-38 00 # e_phentsize size of a program header table
-01 00 # e_phnum number of entries in program table
-
-00 00 # e_shentsize size of a section header table
-00 00 # e_shnum number of entries in section table
-
-00 00 # e_shstrndx index of the section names
-
-
-:ELF_program_headers
-:ELF_program_header__text
-01 00 00 00 # ph_type: PT-LOAD = 1
-07 00 00 00 # ph_flags: PF-X|PF-W|PF-R = 7
-00 00 00 00 00 00 00 00 # ph_offset
-&ELF_base 00 00 00 00 # ph_vaddr
-&ELF_base 00 00 00 00 # ph_physaddr
-%ELF_end>ELF_base 00 00 00 00 # ph_filesz
-%ELF_end>ELF_base 00 00 00 00 # ph_memsz
-01 00 00 00 00 00 00 00 # ph_align
-
-:ELF_text
diff --git a/arch/ELF-amd64.hex2 b/arch/ELF-amd64.hex2
@@ -1,74 +0,0 @@
-### Copyright (C) 2016 Jeremiah Orians
-### Copyright (C) 2017 Jan Nieuwenhuizen <janneke@gnu.org>
-### This file is part of M2-Planet.
-###
-### M2-Planet is free software: you can redistribute it and/or modify
-### it under the terms of the GNU General Public License as published by
-### the Free Software Foundation, either version 3 of the License, or
-### (at your option) any later version.
-###
-### M2-Planet is distributed in the hope that it will be useful,
-### but WITHOUT ANY WARRANTY; without even the implied warranty of
-### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-### GNU General Public License for more details.
-###
-### You should have received a copy of the GNU General Public License
-### along with M2-Planet. If not, see <http://www.gnu.org/licenses/>.
-
-### stage0's hex2 format
-### !<label> 1 byte relative
-### $<label> 2 byte address
-### @<label> 2 byte relative
-### &<label> 4 byte address
-### %<label> 4 byte relative
-
-### if you wish to use this header, you need to add :ELF_end to the end of your
-### M1 or hex2 files.
-
-## ELF Header
-
-:ELF_base
-7F 45 4C 46 # e_ident[EI_MAG0-3] ELF's magic number
-
-02 # e_ident[EI_CLASS] Indicating 64 bit
-01 # e_ident[EI_DATA] Indicating little endianness
-01 # e_ident[EI_VERSION] Indicating original elf
-
-03 # e_ident[EI_OSABI] Set at 3 because FreeBSD is strict
-00 # e_ident[EI_ABIVERSION] See above
-
-00 00 00 00 00 00 00 # e_ident[EI_PAD]
-
-02 00 # e_type Indicating Executable
-3E 00 # e_machine Indicating AMD64
-01 00 00 00 # e_version Indicating original elf
-
-&_start 00 00 00 00 # e_entry Address of the entry point
-%ELF_program_headers>ELF_base 00 00 00 00 # e_phoff Address of program header table
-00 00 00 00 00 00 00 00 # e_shoff Address of section header table
-
-00 00 00 00 # e_flags
-
-40 00 # e_ehsize Indicating our 64 Byte header
-
-38 00 # e_phentsize size of a program header table
-01 00 # e_phnum number of entries in program table
-
-00 00 # e_shentsize size of a section header table
-00 00 # e_shnum number of entries in section table
-
-00 00 # e_shstrndx index of the section names
-
-
-:ELF_program_headers
-:ELF_program_header__text
-01 00 00 00 # ph_type: PT-LOAD = 1
-07 00 00 00 # ph_flags: PF-X|PF-W|PF-R = 7
-00 00 00 00 00 00 00 00 # ph_offset
-&ELF_base 00 00 00 00 # ph_vaddr
-&ELF_base 00 00 00 00 # ph_physaddr
-%ELF_end>ELF_base 00 00 00 00 # ph_filesz
-%ELF_end>ELF_base 00 00 00 00 # ph_memsz
-01 00 00 00 00 00 00 00 # ph_align
-
-:ELF_text
diff --git a/arch/ELF-riscv64.hex2 b/arch/ELF-riscv64.hex2
@@ -1,74 +0,0 @@
-### Copyright (C) 2016 Jeremiah Orians
-### Copyright (C) 2017 Jan Nieuwenhuizen <janneke@gnu.org>
-### This file is part of M2-Planet.
-###
-### M2-Planet is free software: you can redistribute it and/or modify
-### it under the terms of the GNU General Public License as published by
-### the Free Software Foundation, either version 3 of the License, or
-### (at your option) any later version.
-###
-### M2-Planet is distributed in the hope that it will be useful,
-### but WITHOUT ANY WARRANTY; without even the implied warranty of
-### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-### GNU General Public License for more details.
-###
-### You should have received a copy of the GNU General Public License
-### along with M2-Planet. If not, see <http://www.gnu.org/licenses/>.
-
-### stage0's hex2 format
-### !<label> 1 byte relative
-### $<label> 2 byte address
-### @<label> 2 byte relative
-### &<label> 4 byte address
-### %<label> 4 byte relative
-
-### if you wish to use this header, you need to add :ELF_end to the end of your
-### M1 or hex2 files.
-
-## ELF Header
-
-:ELF_base
-7F 45 4C 46 # e_ident[EI_MAG0-3] ELF's magic number
-
-02 # e_ident[EI_CLASS] Indicating 64 bit
-01 # e_ident[EI_DATA] Indicating little endianness
-01 # e_ident[EI_VERSION] Indicating original elf
-
-03 # e_ident[EI_OSABI] Set at 3 because FreeBSD is strict
-00 # e_ident[EI_ABIVERSION] See above
-
-00 00 00 00 00 00 00 # e_ident[EI_PAD]
-
-02 00 # e_type Indicating Executable
-F3 00 # e_machine Indicating RISC-V
-01 00 00 00 # e_version Indicating original elf
-
-&_start 00 00 00 00 # e_entry Address of the entry point
-%ELF_program_headers>ELF_base 00 00 00 00 # e_phoff Address of program header table
-00 00 00 00 00 00 00 00 # e_shoff Address of section header table
-
-00 00 00 00 # e_flags
-
-40 00 # e_ehsize Indicating our 64 Byte header
-
-38 00 # e_phentsize size of a program header table
-01 00 # e_phnum number of entries in program table
-
-00 00 # e_shentsize size of a section header table
-00 00 # e_shnum number of entries in section table
-
-00 00 # e_shstrndx index of the section names
-
-
-:ELF_program_headers
-:ELF_program_header__text
-01 00 00 00 # ph_type: PT-LOAD = 1
-07 00 00 00 # ph_flags: PF-X|PF-W|PF-R = 7
-00 00 00 00 00 00 00 00 # ph_offset
-&ELF_base 00 00 00 00 # ph_vaddr
-&ELF_base 00 00 00 00 # ph_physaddr
-%ELF_end>ELF_base 00 00 00 00 # ph_filesz
-%ELF_end>ELF_base 00 00 00 00 # ph_memsz
-01 00 00 00 00 00 00 00 # ph_align
-
-:ELF_text
diff --git a/arch/p1_gen.py b/arch/p1_gen.py
@@ -1,1066 +0,0 @@
-#!/usr/bin/env python3
-"""p1_gen.py — generate p1_<arch>.M1 from a per-arch encoder table.
-
-Single source of truth for the P1 DEFINE tables across all three target
-arches. Running this script rewrites p1_aarch64.M1, p1_amd64.M1, and
-p1_riscv64.M1 in place.
-
-Structure:
- * Low-level native encoders (amd_*, aa_*, rv_*) — one bank of
- helpers per arch.
- * Encoder classes AA64/AMD64/RV64 (subclasses of Encoder): one
- method per P1 op category, lowering (op, reg-tuple, imm) into
- native hex. Each arch's encoder is a coherent bundle — adding a
- new op means one new method on each of the three.
- * Op dataclasses — thin rows holding the DEFINE's name + data.
- Op.encode(enc) dispatches into enc.<op-method>() with the Op's
- fields unpacked. No per-arch branching lives in Op classes.
- * rows() — builds the output list. Non-RRR ops are emitted as the
- full register product × a curated imm/offset/shamt set. RRR
- keeps an explicit table (the full 8³ cube is 5.6k entries per
- arch, >99% dead weight). Adding a new RRR triple or a new imm
- value is a one-line edit to rows(); a new register combination
- for any other op needs no edit at all.
- * emit(arch) / main — iterate rows, ask the arch's encoder to
- lower each, write out the defs file.
-
-Running:
- $ python3 p1_gen.py # rewrite all three files
- $ python3 p1_gen.py --check # diff against current files
-"""
-
-import os
-import sys
-from dataclasses import dataclass
-from itertools import product
-from typing import Optional
-
-ARCHES = ('aarch64', 'amd64', 'riscv64')
-
-## P1 GPRs (the 8 caller/callee-split registers exposed to P1 source).
-P1_REGS = ('r0', 'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7')
-
-## ---------- Register mappings --------------------------------------------
-## P1 register name → native encoding number. The native numbers are what
-## the per-arch encoders insert into instruction fields; the human-facing
-## names (rax, x1, a2, …) never appear in this file.
-
-## 4:4 caller/callee-saved split. r0–r3 caller (native argregs); r4–r7
-## callee (native callee-saved). `br` is the hidden branch-target scratch
-## (not a P1 reg) — picked so every op's expansion clobbers only what its
-## name declares.
-NAT_AA64 = {'r0': 0, 'r1': 1, 'r2': 2, 'r3': 3,
- 'r4': 26, 'r5': 27, 'r6': 19, 'r7': 20,
- 'br': 17, # x17 (IP1, caller-saved linker scratch)
- 'sp': 31, 'xzr': 31, 'lr': 30,
- 'x21': 21, 'x22': 22, 'x23': 23, 'x8': 8}
-
-## amd64 ModRM.reg/rm + REX.R/B bit: native regnums 0..15 with r8..r15
-## setting the REX bit. We store the 4-bit native number directly.
-NAT_AMD64 = {'r0': 0, # rax
- 'r1': 7, # rdi
- 'r2': 6, # rsi
- 'r3': 2, # rdx
- 'r4': 13, # r13 (callee-saved)
- 'r5': 14, # r14 (callee-saved)
- 'r6': 3, # rbx
- 'r7': 12, # r12
- 'br': 11, # r11 — branch/call target scratch + DIV/REM r0 save
- 'sp': 4, # rsp
- 'rcx': 1, # shift-count scratch + DIV/REM rdx save (not a P1 reg)
- 'r10': 10, # syscall arg4 slot (not a P1 reg)
- 'r8': 8, # syscall arg5 slot (not a P1 reg)
- 'r9': 9, # syscall arg6 slot (not a P1 reg)
- 'r11': 11, # alias for br (some expansions spell it r11 directly)
- }
-
-NAT_RV64 = {'r0': 10, 'r1': 11, 'r2': 12, 'r3': 13,
- 'r4': 20, 'r5': 21, 'r6': 9, 'r7': 18,
- 'br': 30, # t5 (caller-saved temp)
- 'sp': 2, 'ra': 1, 'zero': 0, 'a7': 17,
- 's3': 19, 's6': 22, 's7': 23}
-
-
-## ---------- Low-level encoding helpers -----------------------------------
-
-def le32(n: int) -> str:
- return (n & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
-
-def byte(n: int) -> str:
- return f'{n & 0xFF:02X}'
-
-
-## ---------- amd64 primitive encoders ------------------------------------
-## amd64 is variable-length. Helpers below emit specific instruction
-## shapes used by the P1 expansions. REX prefix bits: W=64b, R=ModRM.reg
-## high, B=ModRM.rm high, X=SIB.index high (unused here).
-
-def rex(w, r, x, b):
- v = 0x40 | (w << 3) | (r << 2) | (x << 1) | b
- return byte(v)
-
-def modrm(mod, reg, rm):
- return byte((mod << 6) | ((reg & 7) << 3) | (rm & 7))
-
-def amd_mov_rr(dst, src):
- """mov dst, src — REX.W + 89 /r (MOV r/m64, r64)."""
- d, s = NAT_AMD64[dst], NAT_AMD64[src]
- return rex(1, s >> 3, 0, d >> 3) + '89' + modrm(3, s, d)
-
-def amd_alu_rr(op, dst, src):
- """op dst, src — 2-operand ALU. op is the opcode byte (01 add,
- 29 sub, 21 and, 09 or, 31 xor)."""
- d, s = NAT_AMD64[dst], NAT_AMD64[src]
- return rex(1, s >> 3, 0, d >> 3) + op + modrm(3, s, d)
-
-def amd_alu_ri8(ext, dst, imm):
- """op dst, imm8 (sign-extended). Opcode 83 /ext ib."""
- d = NAT_AMD64[dst]
- return rex(1, 0, 0, d >> 3) + '83' + modrm(3, ext, d) + byte(imm)
-
-def amd_alu_ri32(ext, dst, imm):
- """op dst, imm32 (sign-extended). Opcode 81 /ext id. Used when
- an immediate doesn't fit in the imm8 form (e.g., ADDI with
- values outside [-128, 127])."""
- d = NAT_AMD64[dst]
- imm_le = (imm & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
- return rex(1, 0, 0, d >> 3) + '81' + modrm(3, ext, d) + imm_le
-
-def amd_shift_ri8(ext, dst, imm):
- """shl/shr/sar dst, imm8. Opcode C1 /ext ib."""
- d = NAT_AMD64[dst]
- return rex(1, 0, 0, d >> 3) + 'C1' + modrm(3, ext, d) + byte(imm)
-
-def amd_shift_cl(ext, dst):
- """shl/shr/sar dst, cl. Opcode D3 /ext."""
- d = NAT_AMD64[dst]
- return rex(1, 0, 0, d >> 3) + 'D3' + modrm(3, ext, d)
-
-def amd_imul_rr(dst, src):
- """imul dst, src — 0F AF /r."""
- d, s = NAT_AMD64[dst], NAT_AMD64[src]
- return rex(1, d >> 3, 0, s >> 3) + '0FAF' + modrm(3, d, s)
-
-def amd_idiv(src):
- """idiv src — F7 /7 (signed div of rdx:rax by src)."""
- s = NAT_AMD64[src]
- return rex(1, 0, 0, s >> 3) + 'F7' + modrm(3, 7, s)
-
-def amd_cqo():
- """cqo — sign-extend rax into rdx:rax. 48 99."""
- return '4899'
-
-def amd_mem_rm(opcode, reg, base, disp):
- """[base+disp] <-> reg, for MOV r,r/m or MOV r/m,r (opcode=89 store, 8B load).
- disp is signed int; encodes as disp8 if in range, else disp32."""
- r, b = NAT_AMD64[reg], NAT_AMD64[base]
- prefix = rex(1, r >> 3, 0, b >> 3) + opcode
- if -128 <= disp <= 127:
- mod = 1
- d = byte(disp)
- elif b == 4: # SIB required for rsp
- mod = 2
- d = (disp & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
- else:
- mod = 2
- d = (disp & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
- # rsp as base requires SIB byte (rm=4 with no SIB is rip-relative).
- if b == 4:
- return prefix + modrm(mod, r, 4) + '24' + d
- return prefix + modrm(mod, r, b) + d
-
-def amd_mov_rm_b(reg, base, disp, store):
- """Byte load/store. 88 /r (store), 0F B6 /r (movzx load)."""
- r, b = NAT_AMD64[reg], NAT_AMD64[base]
- if -128 <= disp <= 127:
- mod = 1
- d = byte(disp)
- else:
- mod = 2
- d = (disp & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
- if store:
- # MOV r/m8, r8 — 88 /r. Requires REX to address dil/sil/bpl/spl.
- prefix = rex(1, r >> 3, 0, b >> 3) + '88'
- sib = '24' if b == 4 else ''
- rmv = 4 if b == 4 else b
- return prefix + modrm(mod, r, rmv) + sib + d
- else:
- # MOVZX r64, r/m8 — REX.W 0F B6 /r.
- prefix = rex(1, r >> 3, 0, b >> 3) + '0FB6'
- sib = '24' if b == 4 else ''
- rmv = 4 if b == 4 else b
- return prefix + modrm(mod, r, rmv) + sib + d
-
-
-## ---------- aarch64 primitive encoders ----------------------------------
-## aarch64 is fixed 4-byte insns. Helpers return the 4 bytes LE-encoded.
-
-def aa_rrr(base, rD, rA, rB):
- d, a, b = NAT_AA64[rD], NAT_AA64[rA], NAT_AA64[rB]
- return le32(base | (b << 16) | (a << 5) | d)
-
-def aa_add_imm(rD, rA, imm12, sub=False):
- """ADD/SUB (immediate, shift=0). imm12 unsigned 0..4095."""
- d, a = NAT_AA64[rD], NAT_AA64[rA]
- base = 0xD1000000 if sub else 0x91000000
- return le32(base | ((imm12 & 0xFFF) << 10) | (a << 5) | d)
-
-def aa_logical_imm(base, rD, rA, N, immr, imms):
- d, a = NAT_AA64[rD], NAT_AA64[rA]
- return le32(base | (N << 22) | (immr << 16) | (imms << 10) | (a << 5) | d)
-
-def aa_ubfm(rD, rA, immr, imms):
- """UBFM (N=1 for sf=64)."""
- d, a = NAT_AA64[rD], NAT_AA64[rA]
- return le32(0xD3400000 | (immr << 16) | (imms << 10) | (a << 5) | d)
-
-def aa_sbfm(rD, rA, immr, imms):
- """SBFM (N=1 for sf=64)."""
- d, a = NAT_AA64[rD], NAT_AA64[rA]
- return le32(0x93400000 | (immr << 16) | (imms << 10) | (a << 5) | d)
-
-def aa_ldst_uimm12(base, rT, rN, off_bytes, size_log2):
- """LDR/STR (unsigned offset). off_bytes must be a multiple of
- 2^size_log2 and non-negative. imm12 = off_bytes >> size_log2."""
- assert off_bytes >= 0 and (off_bytes % (1 << size_log2)) == 0
- imm12 = off_bytes >> size_log2
- assert 0 <= imm12 < 4096
- t, n = NAT_AA64[rT], NAT_AA64[rN]
- return le32(base | (imm12 << 10) | (n << 5) | t)
-
-def aa_ldst_unscaled(base, rT, rN, off):
- """LDUR/STUR (unscaled, signed imm9). Handles arbitrary small
- offsets — negative, or positive-but-not-a-multiple-of-the-access-
- size (e.g. LD at offset 7). imm9 range is [-256, 255]."""
- assert -256 <= off <= 255
- imm9 = off & 0x1FF
- t, n = NAT_AA64[rT], NAT_AA64[rN]
- return le32(base | (imm9 << 12) | (n << 5) | t)
-
-
-## ---------- riscv64 primitive encoders ----------------------------------
-
-def rv_r(base, rD, rA, rB):
- d, a, b = NAT_RV64[rD], NAT_RV64[rA], NAT_RV64[rB]
- return le32(base | (b << 20) | (a << 15) | (d << 7))
-
-def rv_i(base, rD, rA, imm12):
- """I-type: imm12[11:0], rs1, funct3, rd, opcode. imm12 is a signed
- int that gets masked to 12 bits."""
- d, a = NAT_RV64[rD], NAT_RV64[rA]
- return le32(base | ((imm12 & 0xFFF) << 20) | (a << 15) | (d << 7))
-
-def rv_s(base, rS, rA, imm12):
- """S-type store: imm12[11:5] rs2 rs1 funct3 imm12[4:0] opcode."""
- s, a = NAT_RV64[rS], NAT_RV64[rA]
- hi = (imm12 >> 5) & 0x7F
- lo = imm12 & 0x1F
- return le32(base | (hi << 25) | (s << 20) | (a << 15) | (lo << 7))
-
-def rv_shift_imm(base, rD, rA, shamt):
- """Shift-imm: base already has funct7 set; shamt in [0,63]."""
- d, a = NAT_RV64[rD], NAT_RV64[rA]
- return le32(base | ((shamt & 0x3F) << 20) | (a << 15) | (d << 7))
-
-
-## ---------- Per-arch op base tables -------------------------------------
-
-AA64_RRR_BASE = {
- 'ADD': 0x8B000000,
- 'SUB': 0xCB000000,
- 'AND': 0x8A000000,
- 'OR': 0xAA000000,
- 'XOR': 0xCA000000,
- 'SHL': 0x9AC02000,
- 'SHR': 0x9AC02400,
- 'SAR': 0x9AC02800,
- 'DIV': 0x9AC00C00,
-}
-AMD64_RRR_OPC = {
- 'ADD': '01', 'SUB': '29', 'AND': '21', 'OR': '09', 'XOR': '31',
-}
-RV_RRR = {
- 'ADD': 0x00000033, # funct7=0 funct3=0 opcode=0x33
- 'SUB': 0x40000033,
- 'XOR': 0x00004033,
- 'OR': 0x00006033,
- 'AND': 0x00007033,
- 'SHL': 0x00001033,
- 'SHR': 0x00005033,
- 'SAR': 0x40005033,
- 'MUL': 0x02000033,
- 'DIV': 0x02004033,
- 'REM': 0x02006033,
-}
-
-
-## aarch64 bitmask-immediate encoding for ANDI/ORI. Entries are the
-## (N, immr, imms) triples that encode each small imm as an aarch64
-## "logical immediate." Computed by hand because the full encoding
-## algorithm (contiguous-run + rotation for element sizes
-## 2/4/8/16/32/64) is substantial and we only need a handful of
-## values. Extend this table if a new imm shows up in P1 source.
-AA64_LOGI_ENC = {
- 1: (1, 0, 0), # 0b0001 — single bit at position 0
- 2: (1, 63, 0), # 0b0010 — single bit at position 1
- 3: (1, 0, 1), # 0b0011 — 2 contiguous ones
- 4: (1, 62, 0), # 0b0100 — single bit at position 2
- 6: (1, 63, 1), # 0b0110 — 2 ones rotated by 1
- 7: (1, 0, 2), # 0b0111 — 3 contiguous ones
- 8: (1, 61, 0), # 0b1000 — single bit at position 3
-}
-
-
-## Frame layout after PROLOGUE_Nk (k >= 1, rounded up so total frame
-## bytes stay 16-byte aligned on aarch64):
-## [sp + 0] = retaddr (aarch64 lr / riscv64 ra / amd64 retaddr)
-## [sp + 8] = slot 1 (callee-private scratch)
-## [sp + 16] = slot 2
-## ...
-## [sp + 8*k] = slot k
-##
-## Frame size = round_up_to_16(8 + 8*k). So k=1 → 16, k=2 → 24 → 32,
-## k=3 → 32, k=4 → 40 → 48.
-
-def prologue_frame_bytes(k: int) -> int:
- raw = 8 + 8 * k
- return (raw + 15) & ~15
-
-
-## ---------- Encoders ----------------------------------------------------
-## One class per arch. Each provides one method per P1 op category,
-## mapping (op, reg-tuple, imm) to native bytes. Op classes dispatch
-## here via `Op.encode(enc)` → `enc.<method>(fields)`.
-
-class Encoder:
- """Per-arch encoder base. Subclasses implement one method per
- op category. `arch` is used by literal() to pick the right
- pre-encoded bytes from an arch-keyed dict."""
- arch = ''
-
- def literal(self, hex_by_arch):
- return hex_by_arch[self.arch]
-
-
-class AA64(Encoder):
- arch = 'aarch64'
-
- def rrr(self, op, rD, rA, rB):
- if op == 'MUL':
- # MUL = MADD with Ra=xzr. 100 11011 000 mmmmm 0 aaaaa nnnnn ddddd
- d = NAT_AA64[rD]; a = NAT_AA64[rA]; b = NAT_AA64[rB]
- return le32(0x9B000000 | (b << 16) | (31 << 10) | (a << 5) | d)
- if op == 'REM':
- # SDIV x16, xA, xB ; MSUB xD, x16, xB, xA.
- # x16 (ARM IP0, caller-saved, not a P1 reg) is scratch so
- # REM does not hidden-clobber P1 r4 — the op modifies rD only.
- # MSUB needs bit 15 set (o0=1); without it it decodes as
- # MADD and REM returns A + (A/B)*B.
- d = NAT_AA64[rD]; a = NAT_AA64[rA]; b = NAT_AA64[rB]
- SC = 16
- sdiv = 0x9AC00C00 | (b << 16) | (a << 5) | SC
- msub = 0x9B008000 | (b << 16) | (a << 10) | (SC << 5) | d
- return le32(sdiv) + le32(msub)
- return aa_rrr(AA64_RRR_BASE[op], rD, rA, rB)
-
- def addi(self, rD, rA, imm):
- if imm >= 0:
- return aa_add_imm(rD, rA, imm, sub=False)
- return aa_add_imm(rD, rA, -imm, sub=True)
-
- def logi(self, op, rD, rA, imm):
- N, immr, imms = AA64_LOGI_ENC[imm]
- base = 0x92000000 if op == 'ANDI' else 0xB2000000 # ORI = orr
- return aa_logical_imm(base, rD, rA, N, immr, imms)
-
- def shifti(self, op, rD, rA, imm):
- if op == 'SHLI':
- return aa_ubfm(rD, rA, (-imm) & 63, 63 - imm)
- if op == 'SHRI':
- return aa_ubfm(rD, rA, imm, 63)
- if op == 'SARI':
- return aa_sbfm(rD, rA, imm, 63)
-
- def mov(self, rD, rA):
- if rA == 'sp':
- return aa_add_imm(rD, 'sp', 0, sub=False)
- # MOV xD, xA = ORR xD, xzr, xA
- d = NAT_AA64[rD]; a = NAT_AA64[rA]
- return le32(0xAA000000 | (a << 16) | (31 << 5) | d)
-
- def li(self, rD):
- # ldr wD, [pc+8] ; b +8 (caller emits 4 bytes of data next)
- d = NAT_AA64[rD]
- ldr_w_lit = 0x18000040 | d # LDR (literal) 32-bit, offset 8
- b_plus8 = 0x14000002 # B offset 8 (imm26 = 2 words = 8 bytes)
- return le32(ldr_w_lit) + le32(b_plus8)
-
- def la(self, rD):
- return self.li(rD)
-
- def mem(self, op, rT, rN, off):
- # Pick uimm12 (scaled, large range) when the offset is a
- # non-negative multiple of the access width; otherwise fall
- # back to the unscaled signed-imm9 form (covers negative
- # offsets and positive-but-misaligned ones like 7).
- BASES = {
- 'LD': (0xF9400000, 3, 0xF8400000),
- 'ST': (0xF9000000, 3, 0xF8000000),
- 'LB': (0x39400000, 0, 0x38400000),
- 'SB': (0x39000000, 0, 0x38000000),
- }
- uimm_base, size_log2, unscaled_base = BASES[op]
- scale = 1 << size_log2
- if off >= 0 and (off % scale) == 0:
- return aa_ldst_uimm12(uimm_base, rT, rN, off, size_log2)
- return aa_ldst_unscaled(unscaled_base, rT, rN, off)
-
- def b(self):
- return le32(0xD61F0000 | (NAT_AA64['br'] << 5)) # BR x17
-
- def condb(self, op, rA, rB):
- # cmp xA, xB = SUBS xzr, xA, xB (0xEB000000 base, rD=31).
- # Skip when NOT cond holds. BEQ→NE(1), BNE→EQ(0), BLT→GE(A).
- a = NAT_AA64[rA]; b_ = NAT_AA64[rB]
- cmp_ = le32(0xEB000000 | (b_ << 16) | (a << 5) | 31)
- cond = {'BEQ': 1, 'BNE': 0, 'BLT': 10}[op]
- bcond = le32(0x54000040 | cond)
- br = le32(0xD61F0000 | (NAT_AA64['br'] << 5))
- return cmp_ + bcond + br
-
- def call(self):
- return le32(0xD63F0000 | (NAT_AA64['br'] << 5)) # BLR x17
-
- def ret(self):
- return le32(0xD65F03C0) # RET (= br x30)
-
- def prologue(self, k):
- fb = prologue_frame_bytes(k)
- sub = aa_add_imm('sp', 'sp', fb, sub=True)
- str_lr = aa_ldst_uimm12(0xF9000000, 'lr', 'sp', 0, 3)
- return sub + str_lr
-
- def epilogue(self, k):
- fb = prologue_frame_bytes(k)
- ldr_lr = aa_ldst_uimm12(0xF9400000, 'lr', 'sp', 0, 3)
- add = aa_add_imm('sp', 'sp', fb, sub=False)
- return ldr_lr + add
-
- def tail(self, k):
- return self.epilogue(k) + self.b()
-
-
-class AMD64(Encoder):
- arch = 'amd64'
-
- def rrr(self, op, rD, rA, rB):
- if op == 'MUL':
- return amd_mov_rr(rD, rA) + amd_imul_rr(rD, rB)
- if op in ('DIV', 'REM'):
- # x86 idiv implicitly reads/writes rax (P1 r0) and rdx
- # (P1 r3). To keep DIV/REM clobber-free (only rD changes),
- # stash r0 into r11 and r3 into rcx — neither is a P1 reg —
- # then restore. If rA or rB alias r0/r3, read from the
- # saved copy since we've overwritten the originals.
- # Skip the final restore for whichever of r0/r3 *is* rD,
- # so rD keeps its newly computed value.
- seq = amd_mov_rr('r11', 'r0') # save r0 (rax)
- seq += amd_mov_rr('rcx', 'r3') # save r3 (rdx)
- src_a = 'r11' if rA == 'r0' else ('rcx' if rA == 'r3' else rA)
- seq += amd_mov_rr('r0', src_a) # rax = rA
- seq += amd_cqo() # rdx:rax = sign-ext rax
- src_b = 'r11' if rB == 'r0' else ('rcx' if rB == 'r3' else rB)
- seq += amd_idiv(src_b)
- seq += amd_mov_rr(rD, 'r0' if op == 'DIV' else 'r3')
- if rD != 'r3':
- seq += amd_mov_rr('r3', 'rcx')
- if rD != 'r0':
- seq += amd_mov_rr('r0', 'r11')
- return seq
- if op in ('SHL', 'SHR', 'SAR'):
- ext = {'SHL': 4, 'SHR': 5, 'SAR': 7}[op]
- seq = amd_mov_rr(rD, rA)
- seq += amd_mov_rr('rcx', rB)
- seq += amd_shift_cl(ext, rD)
- return seq
- # ADD/SUB/AND/OR/XOR: mov rD,rA ; op rD,rB
- seq = amd_mov_rr(rD, rA)
- seq += amd_alu_rr(AMD64_RRR_OPC[op], rD, rB)
- return seq
-
- def addi(self, rD, rA, imm):
- # mov rD,rA ; add rD,imm. Use imm8 form when it fits
- # ([-128, 127]); otherwise emit the imm32 form.
- seq = amd_mov_rr(rD, rA)
- if -128 <= imm <= 127:
- seq += amd_alu_ri8(0, rD, imm) # /0 = ADD
- else:
- seq += amd_alu_ri32(0, rD, imm)
- return seq
-
- def logi(self, op, rD, rA, imm):
- ext = {'ANDI': 4, 'ORI': 1}[op]
- seq = amd_mov_rr(rD, rA)
- seq += amd_alu_ri8(ext, rD, imm)
- return seq
-
- def shifti(self, op, rD, rA, imm):
- ext = {'SHLI': 4, 'SHRI': 5, 'SARI': 7}[op]
- seq = amd_mov_rr(rD, rA)
- seq += amd_shift_ri8(ext, rD, imm)
- return seq
-
- def mov(self, rD, rA):
- return amd_mov_rr(rD, rA)
-
- def li(self, rD):
- # mov <rD as r32>, imm32 — opcode B8+r (with REX.B if r8..r15)
- d = NAT_AMD64[rD]
- if d >= 8:
- return '41' + byte(0xB8 + (d & 7))
- return byte(0xB8 + d)
-
- def la(self, rD):
- return self.li(rD)
-
- def mem(self, op, rT, rN, off):
- if op == 'LD': return amd_mem_rm('8B', rT, rN, off)
- if op == 'ST': return amd_mem_rm('89', rT, rN, off)
- if op == 'LB': return amd_mov_rm_b(rT, rN, off, store=False)
- if op == 'SB': return amd_mov_rm_b(rT, rN, off, store=True)
-
- def b(self):
- return '41FFE3' # jmp r11
-
- def condb(self, op, rA, rB):
- a, b_ = NAT_AMD64[rA], NAT_AMD64[rB]
- # cmp rA, rB — opcode 39 /r with rA as r/m
- cmp_ = rex(1, b_ >> 3, 0, a >> 3) + '39' + modrm(3, b_, a)
- # jcc rel8 opcode, skip=3 (past jmp r11):
- # BEQ→JNE 75 03 ; BNE→JE 74 03 ; BLT→JGE 7D 03
- jop = {'BEQ': '75', 'BNE': '74', 'BLT': '7D'}[op]
- return cmp_ + jop + '03' + '41FFE3' # jmp r11
-
- def call(self):
- return '41FFD3' # call r11
-
- def ret(self):
- return 'C3'
-
- def prologue(self, k):
- # pop rcx ; sub rsp,fb ; push rcx. rcx is the retaddr-carry
- # scratch — caller-save, never a P1 reg. r11 (= 'br') is
- # off-limits because TAIL = EPILOGUE + `jmp r11`, and using
- # r11 here would clobber the LI_BR-loaded tail target.
- fb = prologue_frame_bytes(k)
- assert fb <= 127
- return '59' + '4883EC' + byte(fb) + '51'
-
- def epilogue(self, k):
- # Mirror of prologue: pop rcx ; add rsp,fb ; push rcx.
- fb = prologue_frame_bytes(k)
- assert fb <= 127
- return '59' + '4883C4' + byte(fb) + '51'
-
- def tail(self, k):
- return self.epilogue(k) + self.b()
-
-
-class RV64(Encoder):
- arch = 'riscv64'
-
- def rrr(self, op, rD, rA, rB):
- return rv_r(RV_RRR[op], rD, rA, rB)
-
- def addi(self, rD, rA, imm):
- return rv_i(0x00000013, rD, rA, imm)
-
- def logi(self, op, rD, rA, imm):
- base = {'ANDI': 0x00007013, 'ORI': 0x00006013}[op]
- return rv_i(base, rD, rA, imm)
-
- def shifti(self, op, rD, rA, imm):
- base = {'SHLI': 0x00001013, 'SHRI': 0x00005013, 'SARI': 0x40005013}[op]
- return rv_shift_imm(base, rD, rA, imm)
-
- def mov(self, rD, rA):
- return rv_i(0x00000013, rD, rA, 0) # addi rD, rA, 0
-
- def li(self, rD):
- # auipc rD,0 ; lwu rD,12(rD) ; jal x0,+8
- d = NAT_RV64[rD]
- auipc = 0x00000017 | (d << 7)
- lwu = 0x00006003 | (d << 7) | (d << 15) | (12 << 20)
- jal_p8 = 0x0080006F
- return le32(auipc) + le32(lwu) + le32(jal_p8)
-
- def la(self, rD):
- return self.li(rD)
-
- def mem(self, op, rT, rN, off):
- # funct3: LD=3, ST=3, LBU=4, SB=0. Opcodes: load=03, store=23.
- if op == 'LD': return rv_i(0x00003003, rT, rN, off)
- if op == 'ST': return rv_s(0x00003023, rT, rN, off)
- if op == 'LB': return rv_i(0x00004003, rT, rN, off) # LBU
- if op == 'SB': return rv_s(0x00000023, rT, rN, off)
-
- def b(self):
- return le32(0x00000067 | (NAT_RV64['br'] << 15)) # jalr x0, 0(t5)
-
- def condb(self, op, rA, rB):
- # B<inv> rA, rB, +8 ; jalr x0, 0(t5). funct3 picks the op:
- # BEQ→BNE(1), BNE→BEQ(0), BLT→BGE(5).
- a, b_ = NAT_RV64[rA], NAT_RV64[rB]
- funct3 = {'BEQ': 1, 'BNE': 0, 'BLT': 5}[op]
- insn = 0x00000063 | (funct3 << 12) | (a << 15) | (b_ << 20) | (8 << 7)
- jalr = 0x00000067 | (NAT_RV64['br'] << 15)
- return le32(insn) + le32(jalr)
-
- def call(self):
- return le32(0x000000E7 | (NAT_RV64['br'] << 15)) # jalr ra, 0(t5)
-
- def ret(self):
- return le32(0x00008067) # jalr x0, 0(ra)
-
- def prologue(self, k):
- fb = prologue_frame_bytes(k)
- sub = rv_i(0x00000013, 'sp', 'sp', -fb)
- sd = rv_s(0x00003023, 'ra', 'sp', 0)
- return sub + sd
-
- def epilogue(self, k):
- fb = prologue_frame_bytes(k)
- ld = rv_i(0x00003003, 'ra', 'sp', 0)
- add = rv_i(0x00000013, 'sp', 'sp', fb)
- return ld + add
-
- def tail(self, k):
- return self.epilogue(k) + self.b()
-
-
-ENCODERS = {'aarch64': AA64(), 'amd64': AMD64(), 'riscv64': RV64()}
-
-
-## ---------- Op dataclasses ----------------------------------------------
-## Thin wrappers: each row holds its DEFINE name + the data needed to
-## reconstruct the encoding. `encode(enc)` calls the matching method
-## on the arch's encoder.
-
-@dataclass
-class Op:
- name: str
- comment: str = ''
-
- def encode(self, enc: Encoder) -> str:
- raise NotImplementedError
-
-@dataclass
-class RRR(Op):
- op: str = ''
- rD: str = ''
- rA: str = ''
- rB: str = ''
- def encode(self, enc):
- return enc.rrr(self.op, self.rD, self.rA, self.rB)
-
-@dataclass
-class AddI(Op):
- rD: str = ''
- rA: str = ''
- imm: int = 0
- def encode(self, enc):
- return enc.addi(self.rD, self.rA, self.imm)
-
-@dataclass
-class LogI(Op):
- op: str = '' # ANDI / ORI
- rD: str = ''
- rA: str = ''
- imm: int = 0
- def encode(self, enc):
- return enc.logi(self.op, self.rD, self.rA, self.imm)
-
-@dataclass
-class ShiftI(Op):
- op: str = '' # SHLI / SHRI / SARI
- rD: str = ''
- rA: str = ''
- imm: int = 0
- def encode(self, enc):
- return enc.shifti(self.op, self.rD, self.rA, self.imm)
-
-@dataclass
-class Mov(Op):
- rD: str = ''
- rA: str = ''
- def encode(self, enc):
- return enc.mov(self.rD, self.rA)
-
-@dataclass
-class Li(Op):
- rD: str = ''
- def encode(self, enc):
- return enc.li(self.rD)
-
-@dataclass
-class La(Op):
- rD: str = ''
- def encode(self, enc):
- return enc.la(self.rD)
-
-@dataclass
-class Mem(Op):
- op: str = '' # LD / ST / LB / SB
- rT: str = ''
- rN: str = ''
- off: int = 0
- def encode(self, enc):
- return enc.mem(self.op, self.rT, self.rN, self.off)
-
-@dataclass
-class B(Op):
- def encode(self, enc):
- return enc.b()
-
-@dataclass
-class CondB(Op):
- op: str = '' # BEQ / BNE / BLT
- rA: str = ''
- rB: str = ''
- def encode(self, enc):
- return enc.condb(self.op, self.rA, self.rB)
-
-@dataclass
-class Literal(Op):
- hex_by_arch: Optional[dict] = None
- def encode(self, enc):
- return enc.literal(self.hex_by_arch)
-
-@dataclass
-class Prologue(Op):
- k: int = 1
- def encode(self, enc):
- return enc.prologue(self.k)
-
-@dataclass
-class Epilogue(Op):
- k: int = 1
- def encode(self, enc):
- return enc.epilogue(self.k)
-
-@dataclass
-class Tail(Op):
- k: int = 1
- def encode(self, enc):
- return enc.tail(self.k)
-
-@dataclass
-class Call(Op):
- def encode(self, enc):
- return enc.call()
-
-@dataclass
-class Ret(Op):
- def encode(self, enc):
- return enc.ret()
-
-
-## ---------- SYSCALL pre-encoded sequences -------------------------------
-## The one-shot syscall wrapper. Shuffles P1's r0=num, r1–r6=args into
-## each arch's native syscall ABI and clobbers only r0 on return.
-## Encoded by hand (per P1.md §"Syscall conventions").
-
-SYSCALL_HEX = {
- 'aarch64': (
- # r4/r5 now live in callee-saved natives (x26/x27), so the
- # kernel preserves them — no save/restore needed. Only r1/r2/r3
- # (in caller-saved x1/x2/x3) must be stashed across the shuffle.
- '' .join([
- le32(0xAA0003E8), # mov x8, x0 (syscall number)
- le32(0xAA0103F5), # mov x21, x1 (save r1)
- le32(0xAA0203F6), # mov x22, x2 (save r2)
- le32(0xAA0303F7), # mov x23, x3 (save r3)
- le32(0xAA1503E0), # mov x0, x21 (arg1 = r1)
- le32(0xAA1603E1), # mov x1, x22 (arg2 = r2)
- le32(0xAA1703E2), # mov x2, x23 (arg3 = r3)
- le32(0xAA1A03E3), # mov x3, x26 (arg4 = r4)
- le32(0xAA1B03E4), # mov x4, x27 (arg5 = r5)
- le32(0xAA1303E5), # mov x5, x19 (arg6 = r6)
- le32(0xD4000001), # svc #0
- le32(0xAA1503E1), # mov x1, x21 (restore r1)
- le32(0xAA1603E2), # mov x2, x22
- le32(0xAA1703E3), # mov x3, x23
- ])
- ),
- # r4=r13, r5=r14 are callee-saved natively, but syscall wants args
- # 4/5 in r10/r8. r6=rbx, but arg6 lives in r9. Three shuffle moves,
- # then syscall. The kernel preserves rdi/rsi/rdx/r12–r15/rbx, so no
- # P1 reg is clobbered beyond r0 (syscall return).
- 'amd64': '4D89EA' + '4D89F0' + '4989D9' + '0F05',
- 'riscv64': (
- # Same story as aarch64: r4/r5 in callee-saved s4/s5 (=x20/x21),
- # so we only save/restore a1/a2/a3. Scratch slots: s3, s6, s7.
- ''.join([
- le32(0x00050893), # mv a7, a0 (syscall number)
- le32(0x00058993), # mv s3, a1 (save r1)
- le32(0x00060B13), # mv s6, a2 (save r2)
- le32(0x00068B93), # mv s7, a3 (save r3)
- le32(0x00098513), # mv a0, s3 (arg1 = r1)
- le32(0x000B0593), # mv a1, s6 (arg2 = r2)
- le32(0x000B8613), # mv a2, s7 (arg3 = r3)
- le32(0x000A0693), # mv a3, s4 (arg4 = r4)
- le32(0x000A8713), # mv a4, s5 (arg5 = r5)
- le32(0x00048793), # mv a5, s1 (arg6 = r6)
- le32(0x00000073), # ecall
- le32(0x00098593), # mv a1, s3 (restore r1)
- le32(0x000B0613), # mv a2, s6
- le32(0x000B8693), # mv a3, s7
- ])
- ),
-}
-
-## Syscall numbers (little-endian 32-bit for LI operand).
-## aarch64 and riscv64 share the asm-generic table; amd64 has its own.
-##
-## Portability notes — every entry below is a syscall that exists on all
-## three with the same semantics under the uniform P1 SYSCALL convention
-## (r0 = num, r1-r6 = args):
-## - `fork` is amd64-only; `wait4` is asm-generic 32-bit compat only.
-## Use `clone(SIGCHLD)` and `waitid` instead.
-## - `open` is amd64-only (removed from asm-generic). Use `openat` with
-## dirfd = AT_FDCWD (-100) as arg1.
-## - `clone` arg order differs: amd64 is (flags, stack, ptid, ctid, tls);
-## aarch64/riscv64 are (flags, stack, ptid, tls, ctid). Benign when
-## ptid/ctid/tls are all zero (the fork-equivalent case).
-SYS_NUM = {
- 'aarch64': {'SYS_WRITE': 64, 'SYS_EXIT': 93, 'SYS_READ': 63, 'SYS_CLOSE': 57,
- 'SYS_OPENAT': 56,
- 'SYS_CLONE': 220, 'SYS_EXECVE': 221, 'SYS_WAITID': 95},
- 'amd64': {'SYS_WRITE': 1, 'SYS_EXIT': 60, 'SYS_READ': 0, 'SYS_CLOSE': 3,
- 'SYS_OPENAT':257,
- 'SYS_CLONE': 56, 'SYS_EXECVE': 59, 'SYS_WAITID':247},
- 'riscv64': {'SYS_WRITE': 64, 'SYS_EXIT': 93, 'SYS_READ': 63, 'SYS_CLOSE': 57,
- 'SYS_OPENAT': 56,
- 'SYS_CLONE': 220, 'SYS_EXECVE': 221, 'SYS_WAITID': 95},
-}
-
-
-## ---------- Canonical imm/offset/shamt sets -----------------------------
-## Enumerated instead of sigil-passed: M1's DEFINE substitutes hex
-## bytes verbatim, so every distinct imm value needs its own DEFINE.
-## These cover every value used across hello/demo/lisp/kaem-minimal
-## plus a little headroom. Extend when a new value appears in P1 src.
-
-## ADDI imms. NEG48/48 handle the ASCII '0' bias; the rest cover tag
-## stripping and loop counters. Full reg product × this set = 8²×N.
-ADDI_IMMS = (-48, -8, -7, -6, -5, -4, -3, -2, -1,
- 1, 2, 3, 4, 5, 6, 7, 8, 48)
-
-## Shift amounts (for SHLI/SHRI/SARI). 32/52 implement low-N-bit masks
-## (length field extraction; 4096-slot symbol-table index); the small
-## values scale-by-N for byte offsets and fixnum encode/decode.
-SHIFT_IMMS = (1, 2, 3, 5, 16, 32, 52)
-
-## ANDI/ORI imms. Every entry must appear in AA64_LOGI_ENC.
-LOGI_IMMS = (1, 2, 3, 4, 6, 7, 8)
-
-## Memory offsets for LD/ST/LB/SB. 0/8/16/24/32 cover slot offsets in
-## N-slot frames and common struct fields; 7 is the NUL terminator
-## position inside an 8-byte zero-padded slot; -8 reaches one slot
-## below the current base.
-MEM_OFFS = (-8, 0, 7, 8, 16, 24, 32)
-
-CONDB_OPS = ('BEQ', 'BNE', 'BLT')
-SHIFT_OPS = ('SHLI', 'SHRI', 'SARI')
-LOGI_OPS = ('ANDI', 'ORI')
-MEM_OPS = ('LD', 'ST', 'LB', 'SB')
-
-
-## Curated RRR triples. The full cube is 11 ops × 8³ regs = 5632
-## entries per arch — >99% would be dead weight. Each tuple below
-## is one actually used by hello/demo/lisp/kaem-minimal. Lint
-## catches missing triples on assembly; add a line here and
-## regenerate.
-RRR_TABLE = (
- # demo/lisp step-1 arith cube
- ('ADD','r1','r1','r2'), ('ADD','r1','r1','r4'),
- ('ADD','r2','r2','r6'), ('ADD','r2','r3','r1'),
- ('SUB','r1','r1','r2'), ('SUB','r2','r2','r6'),
- ('AND','r1','r1','r5'),
- ('OR', 'r1','r1','r2'),
- ('XOR','r1','r1','r2'),
- ('MUL','r1','r1','r2'),
- ('DIV','r1','r1','r2'),
- ('REM','r1','r1','r5'),
- ('SHL','r1','r1','r2'),
- ('SHR','r1','r1','r2'),
- ('SAR','r4','r4','r2'),
- # alloc / pointer arithmetic
- ('ADD','r2','r0','r1'),
- ('ADD','r0','r0','r3'),
- ('ADD','r2','r2','r0'),
- ('ADD','r2','r2','r1'),
- ('SUB','r3','r3','r0'),
- # reader / display index+offset fold
- ('ADD','r6','r1','r2'),
- ('ADD','r6','r6','r0'),
- ('ADD','r7','r1','r2'),
- ('SUB','r2','r1','r6'),
- ('SUB','r3','r1','r6'),
- ('REM','r1','r1','r2'),
- # kaem-minimal bump-pointer + accumulator updates
- ('ADD','r1','r1','r0'),
- ('ADD','r5','r5','r0'),
- ('ADD','r7','r7','r0'),
- ('SUB','r3','r3','r2'),
- ('SUB','r6','r6','r0'),
-)
-
-
-## ---------- Row assembly ------------------------------------------------
-
-HEADER = """## p1_{arch}.M1 — GENERATED by p1_gen.py. Do not edit by hand.
-##
-## Shared op-table lives in p1_gen.py; each arch's encoder lowers
-## (op, register-tuple, imm) rows into native bytes. See P1.md for the
-## ISA spec and register mapping.
-"""
-
-@dataclass
-class Banner:
- text: str
-
-
-def _imm_suf(imm):
- return f'NEG{-imm}' if imm < 0 else f'{imm}'
-
-
-def rows():
- R = []
-
- # --- LI / LA — wide literal and address loads ---
- R.append(Banner('LI / LA — load 4-byte zero-extended literal or label addr'))
- for rd in P1_REGS:
- R.append(Li(name=f'LI_{rd.upper()}', rD=rd))
- # LI_BR loads into the hidden branch-target scratch (x17/r11/t5).
- # Every branch/call site is `LI_BR &target ; P1_<BR>`. The scratch
- # is *not* a P1 reg.
- R.append(Li(name='LI_BR', rD='br'))
- for rd in P1_REGS:
- R.append(La(name=f'LA_{rd.upper()}', rD=rd))
-
- # --- MOV — register-to-register + MOV rD, sp ---
- R.append(Banner('MOV — full register product (src may be sp)'))
- for rd in P1_REGS:
- for ra in P1_REGS:
- R.append(Mov(name=f'MOV_{rd.upper()}_{ra.upper()}', rD=rd, rA=ra))
- R.append(Mov(name=f'MOV_{rd.upper()}_SP', rD=rd, rA='sp'))
-
- # --- RRR — curated triples (full cube would be 5.6k/arch) ---
- R.append(Banner('RRR — curated triples (explicit table in p1_gen.py)'))
- for op, d, a, b in RRR_TABLE:
- R.append(RRR(name=f'{op}_{d.upper()}_{a.upper()}_{b.upper()}',
- op=op, rD=d, rA=a, rB=b))
-
- # --- Immediate arith: ADDI × full reg product × imm set ---
- R.append(Banner('ADDI — full register product × ADDI_IMMS'))
- for d, a, imm in product(P1_REGS, P1_REGS, ADDI_IMMS):
- R.append(AddI(name=f'ADDI_{d.upper()}_{a.upper()}_{_imm_suf(imm)}',
- rD=d, rA=a, imm=imm))
-
- # --- ANDI / ORI × full reg product × LOGI_IMMS ---
- R.append(Banner('ANDI / ORI — full register product × LOGI_IMMS'))
- for op, d, a, imm in product(LOGI_OPS, P1_REGS, P1_REGS, LOGI_IMMS):
- R.append(LogI(name=f'{op}_{d.upper()}_{a.upper()}_{imm}',
- op=op, rD=d, rA=a, imm=imm))
-
- # --- SHLI / SHRI / SARI × full reg product × SHIFT_IMMS ---
- R.append(Banner('SHLI / SHRI / SARI — full register product × SHIFT_IMMS'))
- for op, d, a, imm in product(SHIFT_OPS, P1_REGS, P1_REGS, SHIFT_IMMS):
- R.append(ShiftI(name=f'{op}_{d.upper()}_{a.upper()}_{imm}',
- op=op, rD=d, rA=a, imm=imm))
-
- # --- Memory: LD/ST/LB/SB × full reg product × MEM_OFFS ---
- R.append(Banner('LD / ST / LB / SB — full register product × MEM_OFFS'))
- for op, rt, rn, off in product(MEM_OPS, P1_REGS, P1_REGS, MEM_OFFS):
- R.append(Mem(name=f'{op}_{rt.upper()}_{rn.upper()}_{_imm_suf(off)}',
- op=op, rT=rt, rN=rn, off=off))
-
- # --- Branches: BEQ/BNE/BLT × full reg product + unconditional B ---
- R.append(Banner('Branches — LI_BR-indirect pattern'))
- R.append(B(name='B'))
- for op, a, b in product(CONDB_OPS, P1_REGS, P1_REGS):
- R.append(CondB(name=f'{op}_{a.upper()}_{b.upper()}',
- op=op, rA=a, rB=b))
-
- # --- Control: CALL / RET / PROLOGUE / EPILOGUE / TAIL (Nk = 1..4) ---
- R.append(Banner('Control: CALL/RET + single-slot and N-slot PROLOGUE/EPILOGUE/TAIL'))
- R.append(Prologue(name='PROLOGUE', k=1))
- R.append(Epilogue(name='EPILOGUE', k=1))
- R.append(Ret(name='RET'))
- R.append(Call(name='CALL'))
- R.append(Tail(name='TAIL', k=1))
- for k in (2, 3, 4):
- R.append(Prologue(name=f'PROLOGUE_N{k}', k=k))
- R.append(Epilogue(name=f'EPILOGUE_N{k}', k=k))
- R.append(Tail(name=f'TAIL_N{k}', k=k))
-
- # --- SYSCALL — pre-encoded per-arch wrapper ---
- R.append(Banner('SYSCALL — uniform "clobbers r0 only" across arches'))
- R.append(Literal(name='SYSCALL', hex_by_arch=SYSCALL_HEX))
-
- # --- Syscall numbers (LE-32 immediates) ---
- R.append(Banner('Linux syscall numbers (per-arch table). LE-32 operands for LI.'))
- for name in ('SYS_WRITE', 'SYS_EXIT', 'SYS_READ', 'SYS_CLOSE', 'SYS_OPENAT',
- 'SYS_CLONE', 'SYS_EXECVE', 'SYS_WAITID'):
- R.append(Literal(name=name,
- hex_by_arch={a: le32(SYS_NUM[a][name]) for a in ARCHES}))
-
- return R
-
-
-## ---------- File emission -----------------------------------------------
-
-def emit(arch: str) -> str:
- enc = ENCODERS[arch]
- out = [HEADER.format(arch=arch).rstrip(), '']
- seen = set()
- for row in rows():
- if isinstance(row, Banner):
- out.append('')
- out.append('## ---- ' + row.text + ' ' + '-' * max(0, 60 - len(row.text)))
- continue
- name = 'P1_' + row.name if not row.name.startswith('SYS_') else row.name
- if name in seen:
- raise RuntimeError(f'duplicate DEFINE: {name}')
- seen.add(name)
- out.append(f'DEFINE {name} {row.encode(enc)}')
- out.append('')
- return '\n'.join(out)
-
-
-def main():
- here = os.path.dirname(os.path.abspath(__file__))
- check = '--check' in sys.argv
-
- had_diff = False
- for arch in ARCHES:
- path = os.path.join(here, f'p1_{arch}.M1')
- content = emit(arch)
- if check:
- with open(path) as f:
- existing = f.read()
- if existing != content:
- sys.stderr.write(f'DIFF: {path}\n')
- had_diff = True
- else:
- with open(path, 'w') as f:
- f.write(content)
- print(f'wrote {path} ({len(content)} bytes)')
-
- if check and had_diff:
- sys.exit(1)
-
-
-if __name__ == '__main__':
- main()
diff --git a/bootstrap.sh b/bootstrap.sh
@@ -9,6 +9,10 @@
# bytes, hand-assembled, shipped by stage0-posix). Nothing above M0 is built,
# which is the whole point — no C compiler is involved, not even cc_<arch>.
#
+# Inputs are read from build/upstream/, which populate-upstream.sh mirrors
+# from live-bootstrap's stage0-posix on the host. The container mounts only
+# curdir, so everything bootstrap.sh needs must already live inside it.
+#
# Phase map (stage0-posix mescc-tools-{seed,mini}-kaem.kaem phases 0-3):
# 0) hex0-seed + hex0_<A>.hex0 -> hex0
# 1) hex0 + hex1_<A>.hex0 -> hex1
@@ -34,9 +38,8 @@ case "$ARCH" in
*) echo "bootstrap.sh: unsupported arch '$ARCH'" >&2 ; exit 1 ;;
esac
-S=/work/live-bootstrap/seed/stage0-posix
+S=build/upstream
mkdir -p "$OUT"
-cd "$S"
# qemu-user amd64 workaround: the shipped hex0-seed and the hex0 it produces
# both have a program header with p_flags=0x01 (PF_X only, no PF_R). A native
@@ -47,7 +50,7 @@ cd "$S"
#
# This only affects foreign-arch builds on non-amd64 hosts; on a native amd64
# host the patch is a no-op (binary would load fine either way).
-SEED=./bootstrap-seeds/POSIX/"$A"/hex0-seed
+SEED="$S"/bootstrap-seeds/POSIX/"$A"/hex0-seed
if [ "$ARCH" = amd64 ]; then
cp "$SEED" "$OUT"/hex0-seed
printf '\5' | dd of="$OUT"/hex0-seed bs=1 seek=68 count=1 conv=notrunc status=none
@@ -55,13 +58,13 @@ if [ "$ARCH" = amd64 ]; then
SEED="$OUT"/hex0-seed
fi
-"$SEED" "$A"/hex0_"$A".hex0 "$OUT"/hex0
+"$SEED" "$S"/"$A"/hex0_"$A".hex0 "$OUT"/hex0
if [ "$ARCH" = amd64 ]; then
printf '\5' | dd of="$OUT"/hex0 bs=1 seek=68 count=1 conv=notrunc status=none
fi
-"$OUT"/hex0 "$A"/hex1_"$A".hex0 "$OUT"/hex1
-"$OUT"/hex1 "$A"/hex2_"$A".hex1 "$OUT"/hex2-0
-"$OUT"/"$CATM_ASM" "$A"/"$CATM_SRC" "$OUT"/catm
-"$OUT"/catm "$OUT"/M0.hex2 "$A"/ELF-"$ARCH".hex2 "$A"/M0_"$A".hex2
+"$OUT"/hex0 "$S"/"$A"/hex1_"$A".hex0 "$OUT"/hex1
+"$OUT"/hex1 "$S"/"$A"/hex2_"$A".hex1 "$OUT"/hex2-0
+"$OUT"/"$CATM_ASM" "$S"/"$A"/"$CATM_SRC" "$OUT"/catm
+"$OUT"/catm "$OUT"/M0.hex2 "$S"/"$A"/ELF-"$ARCH".hex2 "$S"/"$A"/M0_"$A".hex2
"$OUT"/hex2-0 "$OUT"/M0.hex2 "$OUT"/M0
diff --git a/populate-upstream.sh b/populate-upstream.sh
@@ -0,0 +1,48 @@
+#!/bin/sh
+# Copy the files bootstrap.sh needs from live-bootstrap's stage0-posix into
+# build/upstream/, mirroring the upstream directory layout. Runs on the host:
+# the podman invocations in the Makefile only mount curdir, so anything
+# bootstrap.sh needs has to land inside curdir first.
+#
+# Inputs per arch (A in AArch64|AMD64|riscv64):
+# bootstrap-seeds/POSIX/$A/hex0-seed
+# $A/hex0_$A.hex0
+# $A/hex1_$A.hex0
+# $A/hex2_$A.hex1
+# $A/catm_$A.(hex1|hex2) extension differs across arches
+# $A/M0_$A.hex2
+# $A/ELF-<arch>.hex2 used by both bootstrap.sh (M0.hex2 link) and
+# the Makefile's final program link
+#
+# Usage: populate-upstream.sh [UPSTREAM]
+# UPSTREAM: path to live-bootstrap checkout (default: ../live-bootstrap)
+set -eu
+
+UPSTREAM=${1:-../live-bootstrap}
+S="$UPSTREAM/seed/stage0-posix"
+OUT=build/upstream
+
+if [ ! -d "$S" ]; then
+ echo "populate-upstream.sh: expected '$S' to exist" >&2
+ exit 1
+fi
+
+for A in AArch64 AMD64 riscv64; do
+ case "$A" in
+ AArch64) arch=aarch64 ; CATM=catm_AArch64.hex1 ;;
+ AMD64) arch=amd64 ; CATM=catm_AMD64.hex2 ;;
+ riscv64) arch=riscv64 ; CATM=catm_riscv64.hex2 ;;
+ esac
+
+ mkdir -p "$OUT/bootstrap-seeds/POSIX/$A" "$OUT/$A"
+
+ cp "$S/bootstrap-seeds/POSIX/$A/hex0-seed" "$OUT/bootstrap-seeds/POSIX/$A/"
+ cp "$S/$A/hex0_$A.hex0" "$OUT/$A/"
+ cp "$S/$A/hex1_$A.hex0" "$OUT/$A/"
+ cp "$S/$A/hex2_$A.hex1" "$OUT/$A/"
+ cp "$S/$A/$CATM" "$OUT/$A/"
+ cp "$S/$A/M0_$A.hex2" "$OUT/$A/"
+ cp "$S/$A/ELF-$arch.hex2" "$OUT/$A/"
+done
+
+echo "populate-upstream: copied into $OUT from $UPSTREAM"
diff --git a/src/p1_gen.py b/src/p1_gen.py
@@ -0,0 +1,1089 @@
+#!/usr/bin/env python3
+"""p1_gen.py — generate p1_<arch>.M1 from a per-arch encoder table.
+
+Single source of truth for the P1 DEFINE tables across all three target
+arches. Running this script writes <build>/aarch64/p1_aarch64.M1 and the
+amd64/riscv64 siblings (default <build> = "build").
+
+Structure:
+ * Low-level native encoders (amd_*, aa_*, rv_*) — one bank of
+ helpers per arch.
+ * Encoder classes AA64/AMD64/RV64 (subclasses of Encoder): one
+ method per P1 op category, lowering (op, reg-tuple, imm) into
+ native hex. Each arch's encoder is a coherent bundle — adding a
+ new op means one new method on each of the three.
+ * Op dataclasses — thin rows holding the DEFINE's name + data.
+ Op.encode(enc) dispatches into enc.<op-method>() with the Op's
+ fields unpacked. No per-arch branching lives in Op classes.
+ * rows() — builds the output list. Non-RRR ops are emitted as the
+ full register product × a curated imm/offset/shamt set. RRR
+ keeps an explicit table (the full 8³ cube is 5.6k entries per
+ arch, >99% dead weight). Adding a new RRR triple or a new imm
+ value is a one-line edit to rows(); a new register combination
+ for any other op needs no edit at all.
+ * emit(arch) / main — iterate rows, ask the arch's encoder to
+ lower each, write out the defs file.
+
+Running:
+    $ python3 src/p1_gen.py [build-root]          # rewrite all three files
+    $ python3 src/p1_gen.py --check [build-root]  # diff against current files
+"""
+
+import os
+import sys
+from dataclasses import dataclass
+from itertools import product
+from typing import Optional
+
+ARCHES = ('aarch64', 'amd64', 'riscv64')
+
+## P1 GPRs (the 8 caller/callee-split registers exposed to P1 source).
+P1_REGS = ('r0', 'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7')
+
+## ---------- Register mappings --------------------------------------------
+## P1 register name → native encoding number. The native numbers are what
+## the per-arch encoders insert into instruction fields; the human-facing
+## names (rax, x1, a2, …) never appear in this file.
+
+## 4:4 caller/callee-saved split. r0–r3 caller (native argregs); r4–r7
+## callee (native callee-saved). `br` is the hidden branch-target scratch
+## (not a P1 reg) — picked so every op's expansion clobbers only what its
+## name declares.
+NAT_AA64 = {'r0': 0, 'r1': 1, 'r2': 2, 'r3': 3,
+ 'r4': 26, 'r5': 27, 'r6': 19, 'r7': 20,
+ 'br': 17, # x17 (IP1, caller-saved linker scratch)
+ 'sp': 31, 'xzr': 31, 'lr': 30,
+ 'x21': 21, 'x22': 22, 'x23': 23, 'x8': 8}
+
+## amd64 ModRM.reg/rm + REX.R/B bit: native regnums 0..15 with r8..r15
+## setting the REX bit. We store the 4-bit native number directly.
+NAT_AMD64 = {'r0': 0, # rax
+ 'r1': 7, # rdi
+ 'r2': 6, # rsi
+ 'r3': 2, # rdx
+ 'r4': 13, # r13 (callee-saved)
+ 'r5': 14, # r14 (callee-saved)
+ 'r6': 3, # rbx
+ 'r7': 12, # r12
+ 'br': 11, # r11 — branch/call target scratch + DIV/REM r0 save
+ 'sp': 4, # rsp
+ 'rcx': 1, # shift-count scratch + DIV/REM rdx save (not a P1 reg)
+ 'r10': 10, # syscall arg4 slot (not a P1 reg)
+ 'r8': 8, # syscall arg5 slot (not a P1 reg)
+ 'r9': 9, # syscall arg6 slot (not a P1 reg)
+ 'r11': 11, # alias for br (some expansions spell it r11 directly)
+ }
+
+NAT_RV64 = {'r0': 10, 'r1': 11, 'r2': 12, 'r3': 13,
+ 'r4': 20, 'r5': 21, 'r6': 9, 'r7': 18,
+ 'br': 30, # t5 (caller-saved temp)
+ 'sp': 2, 'ra': 1, 'zero': 0, 'a7': 17,
+ 's3': 19, 's6': 22, 's7': 23}
+
+
+## ---------- Low-level encoding helpers -----------------------------------
+
+def le32(n: int) -> str:
+ return (n & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
+
+def byte(n: int) -> str:
+ return f'{n & 0xFF:02X}'
+
+
+## ---------- amd64 primitive encoders ------------------------------------
+## amd64 is variable-length. Helpers below emit specific instruction
+## shapes used by the P1 expansions. REX prefix bits: W=64b, R=ModRM.reg
+## high, B=ModRM.rm high, X=SIB.index high (unused here).
+
+def rex(w, r, x, b):
+ v = 0x40 | (w << 3) | (r << 2) | (x << 1) | b
+ return byte(v)
+
+def modrm(mod, reg, rm):
+ return byte((mod << 6) | ((reg & 7) << 3) | (rm & 7))
+
+def amd_mov_rr(dst, src):
+ """mov dst, src — REX.W + 89 /r (MOV r/m64, r64)."""
+ d, s = NAT_AMD64[dst], NAT_AMD64[src]
+ return rex(1, s >> 3, 0, d >> 3) + '89' + modrm(3, s, d)
+
+def amd_alu_rr(op, dst, src):
+ """op dst, src — 2-operand ALU. op is the opcode byte (01 add,
+ 29 sub, 21 and, 09 or, 31 xor)."""
+ d, s = NAT_AMD64[dst], NAT_AMD64[src]
+ return rex(1, s >> 3, 0, d >> 3) + op + modrm(3, s, d)
+
+def amd_alu_ri8(ext, dst, imm):
+ """op dst, imm8 (sign-extended). Opcode 83 /ext ib."""
+ d = NAT_AMD64[dst]
+ return rex(1, 0, 0, d >> 3) + '83' + modrm(3, ext, d) + byte(imm)
+
+def amd_alu_ri32(ext, dst, imm):
+ """op dst, imm32 (sign-extended). Opcode 81 /ext id. Used when
+ an immediate doesn't fit in the imm8 form (e.g., ADDI with
+ values outside [-128, 127])."""
+ d = NAT_AMD64[dst]
+ imm_le = (imm & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
+ return rex(1, 0, 0, d >> 3) + '81' + modrm(3, ext, d) + imm_le
+
+def amd_shift_ri8(ext, dst, imm):
+ """shl/shr/sar dst, imm8. Opcode C1 /ext ib."""
+ d = NAT_AMD64[dst]
+ return rex(1, 0, 0, d >> 3) + 'C1' + modrm(3, ext, d) + byte(imm)
+
+def amd_shift_cl(ext, dst):
+ """shl/shr/sar dst, cl. Opcode D3 /ext."""
+ d = NAT_AMD64[dst]
+ return rex(1, 0, 0, d >> 3) + 'D3' + modrm(3, ext, d)
+
+def amd_imul_rr(dst, src):
+ """imul dst, src — 0F AF /r."""
+ d, s = NAT_AMD64[dst], NAT_AMD64[src]
+ return rex(1, d >> 3, 0, s >> 3) + '0FAF' + modrm(3, d, s)
+
+def amd_idiv(src):
+ """idiv src — F7 /7 (signed div of rdx:rax by src)."""
+ s = NAT_AMD64[src]
+ return rex(1, 0, 0, s >> 3) + 'F7' + modrm(3, 7, s)
+
+def amd_cqo():
+ """cqo — sign-extend rax into rdx:rax. 48 99."""
+ return '4899'
+
+def amd_mem_rm(opcode, reg, base, disp):
+    """[base+disp] <-> reg, for MOV r,r/m or MOV r/m,r (opcode=89 store, 8B load).
+    disp is signed int; encodes as disp8 if in range, else disp32."""
+    r, b = NAT_AMD64[reg], NAT_AMD64[base]
+    prefix = rex(1, r >> 3, 0, b >> 3) + opcode
+    if -128 <= disp <= 127:
+        mod = 1
+        d = byte(disp)
+    elif b == 4:  # same as the else arm; kept to flag the rsp case (SIB appended at return)
+        mod = 2
+        d = (disp & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
+    else:
+        mod = 2
+        d = (disp & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
+    # rsp base needs a SIB byte: rm=4 always means "SIB follows" (RIP-relative is mod=0,rm=5).
+    if b == 4:
+        return prefix + modrm(mod, r, 4) + '24' + d
+    return prefix + modrm(mod, r, b) + d
+
+def amd_mov_rm_b(reg, base, disp, store):
+    """Byte load/store. 88 /r (store), 0F B6 /r (movzx load)."""
+    r, b = NAT_AMD64[reg], NAT_AMD64[base]
+    if -128 <= disp <= 127:
+        mod = 1
+        d = byte(disp)
+    else:
+        mod = 2
+        d = (disp & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
+    if store:
+        # MOV r/m8, r8 — 88 /r. REX emitted so reg 4-7 mean spl/bpl/sil/dil (W ignored by byte ops).
+        prefix = rex(1, r >> 3, 0, b >> 3) + '88'
+        sib = '24' if b == 4 else ''
+        rmv = 4 if b == 4 else b
+        return prefix + modrm(mod, r, rmv) + sib + d
+    else:
+        # MOVZX r64, r/m8 — REX.W 0F B6 /r (zero-extends the byte into the full 64-bit reg).
+        prefix = rex(1, r >> 3, 0, b >> 3) + '0FB6'
+        sib = '24' if b == 4 else ''
+        rmv = 4 if b == 4 else b
+        return prefix + modrm(mod, r, rmv) + sib + d
+
+
+## ---------- aarch64 primitive encoders ----------------------------------
+## aarch64 is fixed 4-byte insns. Helpers return the 4 bytes LE-encoded.
+
+def aa_rrr(base, rD, rA, rB):
+ d, a, b = NAT_AA64[rD], NAT_AA64[rA], NAT_AA64[rB]
+ return le32(base | (b << 16) | (a << 5) | d)
+
+def aa_add_imm(rD, rA, imm12, sub=False):
+ """ADD/SUB (immediate, shift=0). imm12 unsigned 0..4095."""
+ d, a = NAT_AA64[rD], NAT_AA64[rA]
+ base = 0xD1000000 if sub else 0x91000000
+ return le32(base | ((imm12 & 0xFFF) << 10) | (a << 5) | d)
+
+def aa_logical_imm(base, rD, rA, N, immr, imms):
+ d, a = NAT_AA64[rD], NAT_AA64[rA]
+ return le32(base | (N << 22) | (immr << 16) | (imms << 10) | (a << 5) | d)
+
+def aa_ubfm(rD, rA, immr, imms):
+ """UBFM (N=1 for sf=64)."""
+ d, a = NAT_AA64[rD], NAT_AA64[rA]
+ return le32(0xD3400000 | (immr << 16) | (imms << 10) | (a << 5) | d)
+
+def aa_sbfm(rD, rA, immr, imms):
+ """SBFM (N=1 for sf=64)."""
+ d, a = NAT_AA64[rD], NAT_AA64[rA]
+ return le32(0x93400000 | (immr << 16) | (imms << 10) | (a << 5) | d)
+
+def aa_ldst_uimm12(base, rT, rN, off_bytes, size_log2):
+ """LDR/STR (unsigned offset). off_bytes must be a multiple of
+ 2^size_log2 and non-negative. imm12 = off_bytes >> size_log2."""
+ assert off_bytes >= 0 and (off_bytes % (1 << size_log2)) == 0
+ imm12 = off_bytes >> size_log2
+ assert 0 <= imm12 < 4096
+ t, n = NAT_AA64[rT], NAT_AA64[rN]
+ return le32(base | (imm12 << 10) | (n << 5) | t)
+
+def aa_ldst_unscaled(base, rT, rN, off):
+ """LDUR/STUR (unscaled, signed imm9). Handles arbitrary small
+ offsets — negative, or positive-but-not-a-multiple-of-the-access-
+ size (e.g. LD at offset 7). imm9 range is [-256, 255]."""
+ assert -256 <= off <= 255
+ imm9 = off & 0x1FF
+ t, n = NAT_AA64[rT], NAT_AA64[rN]
+ return le32(base | (imm9 << 12) | (n << 5) | t)
+
+
+## ---------- riscv64 primitive encoders ----------------------------------
+
+def rv_r(base, rD, rA, rB):
+ d, a, b = NAT_RV64[rD], NAT_RV64[rA], NAT_RV64[rB]
+ return le32(base | (b << 20) | (a << 15) | (d << 7))
+
+def rv_i(base, rD, rA, imm12):
+ """I-type: imm12[11:0], rs1, funct3, rd, opcode. imm12 is a signed
+ int that gets masked to 12 bits."""
+ d, a = NAT_RV64[rD], NAT_RV64[rA]
+ return le32(base | ((imm12 & 0xFFF) << 20) | (a << 15) | (d << 7))
+
+def rv_s(base, rS, rA, imm12):
+ """S-type store: imm12[11:5] rs2 rs1 funct3 imm12[4:0] opcode."""
+ s, a = NAT_RV64[rS], NAT_RV64[rA]
+ hi = (imm12 >> 5) & 0x7F
+ lo = imm12 & 0x1F
+ return le32(base | (hi << 25) | (s << 20) | (a << 15) | (lo << 7))
+
+def rv_shift_imm(base, rD, rA, shamt):
+ """Shift-imm: base already has funct7 set; shamt in [0,63]."""
+ d, a = NAT_RV64[rD], NAT_RV64[rA]
+ return le32(base | ((shamt & 0x3F) << 20) | (a << 15) | (d << 7))
+
+
+## ---------- Per-arch op base tables -------------------------------------
+
+AA64_RRR_BASE = {
+ 'ADD': 0x8B000000,
+ 'SUB': 0xCB000000,
+ 'AND': 0x8A000000,
+ 'OR': 0xAA000000,
+ 'XOR': 0xCA000000,
+ 'SHL': 0x9AC02000,
+ 'SHR': 0x9AC02400,
+ 'SAR': 0x9AC02800,
+ 'DIV': 0x9AC00C00,
+}
+AMD64_RRR_OPC = {
+ 'ADD': '01', 'SUB': '29', 'AND': '21', 'OR': '09', 'XOR': '31',
+}
+RV_RRR = {
+ 'ADD': 0x00000033, # funct7=0 funct3=0 opcode=0x33
+ 'SUB': 0x40000033,
+ 'XOR': 0x00004033,
+ 'OR': 0x00006033,
+ 'AND': 0x00007033,
+ 'SHL': 0x00001033,
+ 'SHR': 0x00005033,
+ 'SAR': 0x40005033,
+ 'MUL': 0x02000033,
+ 'DIV': 0x02004033,
+ 'REM': 0x02006033,
+}
+
+
+## aarch64 bitmask-immediate encoding for ANDI/ORI. Entries are the
+## (N, immr, imms) triples that encode each small imm as an aarch64
+## "logical immediate." Computed by hand because the full encoding
+## algorithm (contiguous-run + rotation for element sizes
+## 2/4/8/16/32/64) is substantial and we only need a handful of
+## values. Extend this table if a new imm shows up in P1 source.
+AA64_LOGI_ENC = {
+ 1: (1, 0, 0), # 0b0001 — single bit at position 0
+ 2: (1, 63, 0), # 0b0010 — single bit at position 1
+ 3: (1, 0, 1), # 0b0011 — 2 contiguous ones
+ 4: (1, 62, 0), # 0b0100 — single bit at position 2
+ 6: (1, 63, 1), # 0b0110 — 2 ones rotated by 1
+ 7: (1, 0, 2), # 0b0111 — 3 contiguous ones
+ 8: (1, 61, 0), # 0b1000 — single bit at position 3
+}
+
+
+## Frame layout after PROLOGUE_Nk (k >= 1, rounded up so total frame
+## bytes stay 16-byte aligned on aarch64):
+## [sp + 0] = retaddr (aarch64 lr / riscv64 ra / amd64 retaddr)
+## [sp + 8] = slot 1 (callee-private scratch)
+## [sp + 16] = slot 2
+## ...
+## [sp + 8*k] = slot k
+##
+## Frame size = round_up_to_16(8 + 8*k). So k=1 → 16, k=2 → 24 → 32,
+## k=3 → 32, k=4 → 40 → 48.
+
+def prologue_frame_bytes(k: int) -> int:
+ raw = 8 + 8 * k
+ return (raw + 15) & ~15
+
+
+## ---------- Encoders ----------------------------------------------------
+## One class per arch. Each provides one method per P1 op category,
+## mapping (op, reg-tuple, imm) to native bytes. Op classes dispatch
+## here via `Op.encode(enc)` → `enc.<method>(fields)`.
+
+class Encoder:
+ """Per-arch encoder base. Subclasses implement one method per
+ op category. `arch` is used by literal() to pick the right
+ pre-encoded bytes from an arch-keyed dict."""
+ arch = ''
+
+ def literal(self, hex_by_arch):
+ return hex_by_arch[self.arch]
+
+
+class AA64(Encoder):
+ arch = 'aarch64'
+
+ def rrr(self, op, rD, rA, rB):
+ if op == 'MUL':
+ # MUL = MADD with Ra=xzr. 100 11011 000 mmmmm 0 aaaaa nnnnn ddddd
+ d = NAT_AA64[rD]; a = NAT_AA64[rA]; b = NAT_AA64[rB]
+ return le32(0x9B000000 | (b << 16) | (31 << 10) | (a << 5) | d)
+ if op == 'REM':
+ # SDIV x16, xA, xB ; MSUB xD, x16, xB, xA.
+ # x16 (ARM IP0, caller-saved, not a P1 reg) is scratch so
+ # REM does not hidden-clobber P1 r4 — the op modifies rD only.
+ # MSUB needs bit 15 set (o0=1); without it it decodes as
+ # MADD and REM returns A + (A/B)*B.
+ d = NAT_AA64[rD]; a = NAT_AA64[rA]; b = NAT_AA64[rB]
+ SC = 16
+ sdiv = 0x9AC00C00 | (b << 16) | (a << 5) | SC
+ msub = 0x9B008000 | (b << 16) | (a << 10) | (SC << 5) | d
+ return le32(sdiv) + le32(msub)
+ return aa_rrr(AA64_RRR_BASE[op], rD, rA, rB)
+
+ def addi(self, rD, rA, imm):
+ if imm >= 0:
+ return aa_add_imm(rD, rA, imm, sub=False)
+ return aa_add_imm(rD, rA, -imm, sub=True)
+
+ def logi(self, op, rD, rA, imm):
+ N, immr, imms = AA64_LOGI_ENC[imm]
+ base = 0x92000000 if op == 'ANDI' else 0xB2000000 # ORI = orr
+ return aa_logical_imm(base, rD, rA, N, immr, imms)
+
+ def shifti(self, op, rD, rA, imm):
+ if op == 'SHLI':
+ return aa_ubfm(rD, rA, (-imm) & 63, 63 - imm)
+ if op == 'SHRI':
+ return aa_ubfm(rD, rA, imm, 63)
+ if op == 'SARI':
+ return aa_sbfm(rD, rA, imm, 63)
+
+ def mov(self, rD, rA):
+ if rA == 'sp':
+ return aa_add_imm(rD, 'sp', 0, sub=False)
+ # MOV xD, xA = ORR xD, xzr, xA
+ d = NAT_AA64[rD]; a = NAT_AA64[rA]
+ return le32(0xAA000000 | (a << 16) | (31 << 5) | d)
+
+ def li(self, rD):
+ # ldr wD, [pc+8] ; b +8 (caller emits 4 bytes of data next)
+ d = NAT_AA64[rD]
+ ldr_w_lit = 0x18000040 | d # LDR (literal) 32-bit, offset 8
+ b_plus8 = 0x14000002 # B offset 8 (imm26 = 2 words = 8 bytes)
+ return le32(ldr_w_lit) + le32(b_plus8)
+
+ def la(self, rD):
+ return self.li(rD)
+
+ def mem(self, op, rT, rN, off):
+ # Pick uimm12 (scaled, large range) when the offset is a
+ # non-negative multiple of the access width; otherwise fall
+ # back to the unscaled signed-imm9 form (covers negative
+ # offsets and positive-but-misaligned ones like 7).
+ BASES = {
+ 'LD': (0xF9400000, 3, 0xF8400000),
+ 'ST': (0xF9000000, 3, 0xF8000000),
+ 'LB': (0x39400000, 0, 0x38400000),
+ 'SB': (0x39000000, 0, 0x38000000),
+ }
+ uimm_base, size_log2, unscaled_base = BASES[op]
+ scale = 1 << size_log2
+ if off >= 0 and (off % scale) == 0:
+ return aa_ldst_uimm12(uimm_base, rT, rN, off, size_log2)
+ return aa_ldst_unscaled(unscaled_base, rT, rN, off)
+
+ def b(self):
+ return le32(0xD61F0000 | (NAT_AA64['br'] << 5)) # BR x17
+
+ def condb(self, op, rA, rB):
+ # cmp xA, xB = SUBS xzr, xA, xB (0xEB000000 base, rD=31).
+ # Skip when NOT cond holds. BEQ→NE(1), BNE→EQ(0), BLT→GE(A).
+ a = NAT_AA64[rA]; b_ = NAT_AA64[rB]
+ cmp_ = le32(0xEB000000 | (b_ << 16) | (a << 5) | 31)
+ cond = {'BEQ': 1, 'BNE': 0, 'BLT': 10}[op]
+ bcond = le32(0x54000040 | cond)
+ br = le32(0xD61F0000 | (NAT_AA64['br'] << 5))
+ return cmp_ + bcond + br
+
+ def call(self):
+ return le32(0xD63F0000 | (NAT_AA64['br'] << 5)) # BLR x17
+
+ def ret(self):
+ return le32(0xD65F03C0) # RET (= br x30)
+
+ def prologue(self, k):
+ fb = prologue_frame_bytes(k)
+ sub = aa_add_imm('sp', 'sp', fb, sub=True)
+ str_lr = aa_ldst_uimm12(0xF9000000, 'lr', 'sp', 0, 3)
+ return sub + str_lr
+
+ def epilogue(self, k):
+ fb = prologue_frame_bytes(k)
+ ldr_lr = aa_ldst_uimm12(0xF9400000, 'lr', 'sp', 0, 3)
+ add = aa_add_imm('sp', 'sp', fb, sub=False)
+ return ldr_lr + add
+
+ def tail(self, k):
+ return self.epilogue(k) + self.b()
+
+
+class AMD64(Encoder):
+ arch = 'amd64'
+
+ def rrr(self, op, rD, rA, rB):
+ if op == 'MUL':
+ return amd_mov_rr(rD, rA) + amd_imul_rr(rD, rB)
+ if op in ('DIV', 'REM'):
+ # x86 idiv implicitly reads/writes rax (P1 r0) and rdx
+ # (P1 r3). To keep DIV/REM clobber-free (only rD changes),
+ # stash r0 into r11 and r3 into rcx — neither is a P1 reg —
+ # then restore. If rA or rB alias r0/r3, read from the
+ # saved copy since we've overwritten the originals.
+ # Skip the final restore for whichever of r0/r3 *is* rD,
+ # so rD keeps its newly computed value.
+ seq = amd_mov_rr('r11', 'r0') # save r0 (rax)
+ seq += amd_mov_rr('rcx', 'r3') # save r3 (rdx)
+ src_a = 'r11' if rA == 'r0' else ('rcx' if rA == 'r3' else rA)
+ seq += amd_mov_rr('r0', src_a) # rax = rA
+ seq += amd_cqo() # rdx:rax = sign-ext rax
+ src_b = 'r11' if rB == 'r0' else ('rcx' if rB == 'r3' else rB)
+ seq += amd_idiv(src_b)
+ seq += amd_mov_rr(rD, 'r0' if op == 'DIV' else 'r3')
+ if rD != 'r3':
+ seq += amd_mov_rr('r3', 'rcx')
+ if rD != 'r0':
+ seq += amd_mov_rr('r0', 'r11')
+ return seq
+ if op in ('SHL', 'SHR', 'SAR'):
+ ext = {'SHL': 4, 'SHR': 5, 'SAR': 7}[op]
+ seq = amd_mov_rr(rD, rA)
+ seq += amd_mov_rr('rcx', rB)
+ seq += amd_shift_cl(ext, rD)
+ return seq
+ # ADD/SUB/AND/OR/XOR: mov rD,rA ; op rD,rB
+ seq = amd_mov_rr(rD, rA)
+ seq += amd_alu_rr(AMD64_RRR_OPC[op], rD, rB)
+ return seq
+
+ def addi(self, rD, rA, imm):
+ # mov rD,rA ; add rD,imm. Use imm8 form when it fits
+ # ([-128, 127]); otherwise emit the imm32 form.
+ seq = amd_mov_rr(rD, rA)
+ if -128 <= imm <= 127:
+ seq += amd_alu_ri8(0, rD, imm) # /0 = ADD
+ else:
+ seq += amd_alu_ri32(0, rD, imm)
+ return seq
+
+ def logi(self, op, rD, rA, imm):
+ ext = {'ANDI': 4, 'ORI': 1}[op]
+ seq = amd_mov_rr(rD, rA)
+ seq += amd_alu_ri8(ext, rD, imm)
+ return seq
+
+ def shifti(self, op, rD, rA, imm):
+ ext = {'SHLI': 4, 'SHRI': 5, 'SARI': 7}[op]
+ seq = amd_mov_rr(rD, rA)
+ seq += amd_shift_ri8(ext, rD, imm)
+ return seq
+
+ def mov(self, rD, rA):
+ return amd_mov_rr(rD, rA)
+
+ def li(self, rD):
+ # mov <rD as r32>, imm32 — opcode B8+r (with REX.B if r8..r15)
+ d = NAT_AMD64[rD]
+ if d >= 8:
+ return '41' + byte(0xB8 + (d & 7))
+ return byte(0xB8 + d)
+
+ def la(self, rD):
+ return self.li(rD)
+
+ def mem(self, op, rT, rN, off):
+ if op == 'LD': return amd_mem_rm('8B', rT, rN, off)
+ if op == 'ST': return amd_mem_rm('89', rT, rN, off)
+ if op == 'LB': return amd_mov_rm_b(rT, rN, off, store=False)
+ if op == 'SB': return amd_mov_rm_b(rT, rN, off, store=True)
+
+ def b(self):
+ return '41FFE3' # jmp r11
+
+ def condb(self, op, rA, rB):
+ a, b_ = NAT_AMD64[rA], NAT_AMD64[rB]
+ # cmp rA, rB — opcode 39 /r with rA as r/m
+ cmp_ = rex(1, b_ >> 3, 0, a >> 3) + '39' + modrm(3, b_, a)
+ # jcc rel8 opcode, skip=3 (past jmp r11):
+ # BEQ→JNE 75 03 ; BNE→JE 74 03 ; BLT→JGE 7D 03
+ jop = {'BEQ': '75', 'BNE': '74', 'BLT': '7D'}[op]
+ return cmp_ + jop + '03' + '41FFE3' # jmp r11
+
+ def call(self):
+ return '41FFD3' # call r11
+
+ def ret(self):
+ return 'C3'
+
+ def prologue(self, k):
+ # pop rcx ; sub rsp,fb ; push rcx. rcx is the retaddr-carry
+ # scratch — caller-save, never a P1 reg. r11 (= 'br') is
+ # off-limits because TAIL = EPILOGUE + `jmp r11`, and using
+ # r11 here would clobber the LI_BR-loaded tail target.
+ fb = prologue_frame_bytes(k)
+ assert fb <= 127
+ return '59' + '4883EC' + byte(fb) + '51'
+
+ def epilogue(self, k):
+ # Mirror of prologue: pop rcx ; add rsp,fb ; push rcx.
+ fb = prologue_frame_bytes(k)
+ assert fb <= 127
+ return '59' + '4883C4' + byte(fb) + '51'
+
+ def tail(self, k):
+ return self.epilogue(k) + self.b()
+
+
+class RV64(Encoder):
+ arch = 'riscv64'
+
+ def rrr(self, op, rD, rA, rB):
+ return rv_r(RV_RRR[op], rD, rA, rB)
+
+ def addi(self, rD, rA, imm):
+ return rv_i(0x00000013, rD, rA, imm)
+
+ def logi(self, op, rD, rA, imm):
+ base = {'ANDI': 0x00007013, 'ORI': 0x00006013}[op]
+ return rv_i(base, rD, rA, imm)
+
+ def shifti(self, op, rD, rA, imm):
+ base = {'SHLI': 0x00001013, 'SHRI': 0x00005013, 'SARI': 0x40005013}[op]
+ return rv_shift_imm(base, rD, rA, imm)
+
+ def mov(self, rD, rA):
+ return rv_i(0x00000013, rD, rA, 0) # addi rD, rA, 0
+
+ def li(self, rD):
+ # auipc rD,0 ; lwu rD,12(rD) ; jal x0,+8
+ d = NAT_RV64[rD]
+ auipc = 0x00000017 | (d << 7)
+ lwu = 0x00006003 | (d << 7) | (d << 15) | (12 << 20)
+ jal_p8 = 0x0080006F
+ return le32(auipc) + le32(lwu) + le32(jal_p8)
+
+ def la(self, rD):
+ return self.li(rD)
+
+ def mem(self, op, rT, rN, off):
+ # funct3: LD=3, ST=3, LBU=4, SB=0. Opcodes: load=03, store=23.
+ if op == 'LD': return rv_i(0x00003003, rT, rN, off)
+ if op == 'ST': return rv_s(0x00003023, rT, rN, off)
+ if op == 'LB': return rv_i(0x00004003, rT, rN, off) # LBU
+ if op == 'SB': return rv_s(0x00000023, rT, rN, off)
+
+ def b(self):
+ return le32(0x00000067 | (NAT_RV64['br'] << 15)) # jalr x0, 0(t5)
+
+ def condb(self, op, rA, rB):
+ # B<inv> rA, rB, +8 ; jalr x0, 0(t5). funct3 picks the op:
+ # BEQ→BNE(1), BNE→BEQ(0), BLT→BGE(5).
+ a, b_ = NAT_RV64[rA], NAT_RV64[rB]
+ funct3 = {'BEQ': 1, 'BNE': 0, 'BLT': 5}[op]
+ insn = 0x00000063 | (funct3 << 12) | (a << 15) | (b_ << 20) | (8 << 7)
+ jalr = 0x00000067 | (NAT_RV64['br'] << 15)
+ return le32(insn) + le32(jalr)
+
+ def call(self):
+ return le32(0x000000E7 | (NAT_RV64['br'] << 15)) # jalr ra, 0(t5)
+
+ def ret(self):
+ return le32(0x00008067) # jalr x0, 0(ra)
+
+ def prologue(self, k):
+ fb = prologue_frame_bytes(k)
+ sub = rv_i(0x00000013, 'sp', 'sp', -fb)
+ sd = rv_s(0x00003023, 'ra', 'sp', 0)
+ return sub + sd
+
+ def epilogue(self, k):
+ fb = prologue_frame_bytes(k)
+ ld = rv_i(0x00003003, 'ra', 'sp', 0)
+ add = rv_i(0x00000013, 'sp', 'sp', fb)
+ return ld + add
+
+ def tail(self, k):
+ return self.epilogue(k) + self.b()
+
+
+ENCODERS = {'aarch64': AA64(), 'amd64': AMD64(), 'riscv64': RV64()}
+
+
+## ---------- Op dataclasses ----------------------------------------------
+## Thin wrappers: each row holds its DEFINE name + the data needed to
+## reconstruct the encoding. `encode(enc)` calls the matching method
+## on the arch's encoder.
+
+@dataclass
+class Op:
+ name: str
+ comment: str = ''
+
+ def encode(self, enc: Encoder) -> str:
+ raise NotImplementedError
+
+@dataclass
+class RRR(Op):
+ op: str = ''
+ rD: str = ''
+ rA: str = ''
+ rB: str = ''
+ def encode(self, enc):
+ return enc.rrr(self.op, self.rD, self.rA, self.rB)
+
+@dataclass
+class AddI(Op):
+ rD: str = ''
+ rA: str = ''
+ imm: int = 0
+ def encode(self, enc):
+ return enc.addi(self.rD, self.rA, self.imm)
+
+@dataclass
+class LogI(Op):
+ op: str = '' # ANDI / ORI
+ rD: str = ''
+ rA: str = ''
+ imm: int = 0
+ def encode(self, enc):
+ return enc.logi(self.op, self.rD, self.rA, self.imm)
+
+@dataclass
+class ShiftI(Op):
+ op: str = '' # SHLI / SHRI / SARI
+ rD: str = ''
+ rA: str = ''
+ imm: int = 0
+ def encode(self, enc):
+ return enc.shifti(self.op, self.rD, self.rA, self.imm)
+
+@dataclass
+class Mov(Op):
+ rD: str = ''
+ rA: str = ''
+ def encode(self, enc):
+ return enc.mov(self.rD, self.rA)
+
+@dataclass
+class Li(Op):
+ rD: str = ''
+ def encode(self, enc):
+ return enc.li(self.rD)
+
+@dataclass
+class La(Op):
+ rD: str = ''
+ def encode(self, enc):
+ return enc.la(self.rD)
+
+@dataclass
+class Mem(Op):
+ op: str = '' # LD / ST / LB / SB
+ rT: str = ''
+ rN: str = ''
+ off: int = 0
+ def encode(self, enc):
+ return enc.mem(self.op, self.rT, self.rN, self.off)
+
+@dataclass
+class B(Op):
+ def encode(self, enc):
+ return enc.b()
+
+@dataclass
+class CondB(Op):
+ op: str = '' # BEQ / BNE / BLT
+ rA: str = ''
+ rB: str = ''
+ def encode(self, enc):
+ return enc.condb(self.op, self.rA, self.rB)
+
+@dataclass
+class Literal(Op):
+ hex_by_arch: Optional[dict] = None
+ def encode(self, enc):
+ return enc.literal(self.hex_by_arch)
+
+@dataclass
+class Prologue(Op):
+ k: int = 1
+ def encode(self, enc):
+ return enc.prologue(self.k)
+
+@dataclass
+class Epilogue(Op):
+ k: int = 1
+ def encode(self, enc):
+ return enc.epilogue(self.k)
+
+@dataclass
+class Tail(Op):
+ k: int = 1
+ def encode(self, enc):
+ return enc.tail(self.k)
+
+@dataclass
+class Call(Op):
+ def encode(self, enc):
+ return enc.call()
+
+@dataclass
+class Ret(Op):
+ def encode(self, enc):
+ return enc.ret()
+
+
+## ---------- SYSCALL pre-encoded sequences -------------------------------
+## The one-shot syscall wrapper. Shuffles P1's r0=num, r1–r6=args into
+## each arch's native syscall ABI and clobbers only r0 on return.
+## Encoded by hand (per P1.md §"Syscall conventions").
+
+SYSCALL_HEX = {
+ 'aarch64': (
+ # r4/r5 now live in callee-saved natives (x26/x27), so the
+ # kernel preserves them — no save/restore needed. Only r1/r2/r3
+ # (in caller-saved x1/x2/x3) must be stashed across the shuffle.
+ '' .join([
+ le32(0xAA0003E8), # mov x8, x0 (syscall number)
+ le32(0xAA0103F5), # mov x21, x1 (save r1)
+ le32(0xAA0203F6), # mov x22, x2 (save r2)
+ le32(0xAA0303F7), # mov x23, x3 (save r3)
+ le32(0xAA1503E0), # mov x0, x21 (arg1 = r1)
+ le32(0xAA1603E1), # mov x1, x22 (arg2 = r2)
+ le32(0xAA1703E2), # mov x2, x23 (arg3 = r3)
+ le32(0xAA1A03E3), # mov x3, x26 (arg4 = r4)
+ le32(0xAA1B03E4), # mov x4, x27 (arg5 = r5)
+ le32(0xAA1303E5), # mov x5, x19 (arg6 = r6)
+ le32(0xD4000001), # svc #0
+ le32(0xAA1503E1), # mov x1, x21 (restore r1)
+ le32(0xAA1603E2), # mov x2, x22
+ le32(0xAA1703E3), # mov x3, x23
+ ])
+ ),
+ # r4=r13, r5=r14 are callee-saved natively, but syscall wants args
+ # 4/5 in r10/r8. r6=rbx, but arg6 lives in r9. Three shuffle moves,
+ # then syscall. The kernel preserves rdi/rsi/rdx/r12–r15/rbx, so no
+ # P1 reg is clobbered beyond r0 (syscall return).
+ 'amd64': '4D89EA' + '4D89F0' + '4989D9' + '0F05',
+ 'riscv64': (
+ # Same story as aarch64: r4/r5 in callee-saved s4/s5 (=x20/x21),
+ # so we only save/restore a1/a2/a3. Scratch slots: s3, s6, s7.
+ ''.join([
+ le32(0x00050893), # mv a7, a0 (syscall number)
+ le32(0x00058993), # mv s3, a1 (save r1)
+ le32(0x00060B13), # mv s6, a2 (save r2)
+ le32(0x00068B93), # mv s7, a3 (save r3)
+ le32(0x00098513), # mv a0, s3 (arg1 = r1)
+ le32(0x000B0593), # mv a1, s6 (arg2 = r2)
+ le32(0x000B8613), # mv a2, s7 (arg3 = r3)
+ le32(0x000A0693), # mv a3, s4 (arg4 = r4)
+ le32(0x000A8713), # mv a4, s5 (arg5 = r5)
+ le32(0x00048793), # mv a5, s1 (arg6 = r6)
+ le32(0x00000073), # ecall
+ le32(0x00098593), # mv a1, s3 (restore r1)
+ le32(0x000B0613), # mv a2, s6
+ le32(0x000B8693), # mv a3, s7
+ ])
+ ),
+}
+
+## Syscall numbers (little-endian 32-bit for LI operand).
+## aarch64 and riscv64 share the asm-generic table; amd64 has its own.
+##
+## Portability notes — every entry below is a syscall that exists on all
+## three with the same semantics under the uniform P1 SYSCALL convention
+## (r0 = num, r1-r6 = args):
+## - `fork` is amd64-only; `wait4` is asm-generic 32-bit compat only.
+## Use `clone(SIGCHLD)` and `waitid` instead.
+## - `open` is amd64-only (removed from asm-generic). Use `openat` with
+## dirfd = AT_FDCWD (-100) as arg1.
+## - `clone` arg order differs: amd64 is (flags, stack, ptid, ctid, tls);
+## aarch64/riscv64 are (flags, stack, ptid, tls, ctid). Benign when
+## ptid/ctid/tls are all zero (the fork-equivalent case).
+SYS_NUM = {
+ 'aarch64': {'SYS_WRITE': 64, 'SYS_EXIT': 93, 'SYS_READ': 63, 'SYS_CLOSE': 57,
+ 'SYS_OPENAT': 56,
+ 'SYS_CLONE': 220, 'SYS_EXECVE': 221, 'SYS_WAITID': 95},
+ 'amd64': {'SYS_WRITE': 1, 'SYS_EXIT': 60, 'SYS_READ': 0, 'SYS_CLOSE': 3,
+ 'SYS_OPENAT':257,
+ 'SYS_CLONE': 56, 'SYS_EXECVE': 59, 'SYS_WAITID':247},
+ 'riscv64': {'SYS_WRITE': 64, 'SYS_EXIT': 93, 'SYS_READ': 63, 'SYS_CLOSE': 57,
+ 'SYS_OPENAT': 56,
+ 'SYS_CLONE': 220, 'SYS_EXECVE': 221, 'SYS_WAITID': 95},
+}
+
+
+## ---------- Canonical imm/offset/shamt sets -----------------------------
+## Enumerated instead of sigil-passed: M1's DEFINE substitutes hex
+## bytes verbatim, so every distinct imm value needs its own DEFINE.
+## These cover every value used across hello/demo/lisp/kaem-minimal
+## plus a little headroom. Extend when a new value appears in P1 src.
+
+## ADDI imms. NEG48/48 handle the ASCII '0' bias; the rest cover tag
+## stripping and loop counters. Full reg product × this set = 8²×N.
+ADDI_IMMS = (-48, -8, -7, -6, -5, -4, -3, -2, -1,
+ 1, 2, 3, 4, 5, 6, 7, 8, 48)
+
+## Shift amounts (for SHLI/SHRI/SARI). 32/52 implement low-N-bit masks
+## (length field extraction; 4096-slot symbol-table index); the small
+## values scale-by-N for byte offsets and fixnum encode/decode.
+SHIFT_IMMS = (1, 2, 3, 5, 16, 32, 52)
+
+## ANDI/ORI imms. Every entry must appear in AA64_LOGI_ENC.
+LOGI_IMMS = (1, 2, 3, 4, 6, 7, 8)
+
+## Memory offsets for LD/ST/LB/SB. 0/8/16/24/32 cover slot offsets in
+## N-slot frames and common struct fields; 7 is the NUL terminator
+## position inside an 8-byte zero-padded slot; -8 reaches one slot
+## below the current base.
+MEM_OFFS = (-8, 0, 7, 8, 16, 24, 32)
+
+CONDB_OPS = ('BEQ', 'BNE', 'BLT')
+SHIFT_OPS = ('SHLI', 'SHRI', 'SARI')
+LOGI_OPS = ('ANDI', 'ORI')
+MEM_OPS = ('LD', 'ST', 'LB', 'SB')
+
+
+## Curated RRR triples. The full cube is 11 ops × 8³ regs = 5632
+## entries per arch — >99% would be dead weight. Each tuple below
+## is one actually used by hello/demo/lisp/kaem-minimal. Lint
+## catches missing triples on assembly; add a line here and
+## regenerate.
+RRR_TABLE = (
+ # demo/lisp step-1 arith cube
+ ('ADD','r1','r1','r2'), ('ADD','r1','r1','r4'),
+ ('ADD','r2','r2','r6'), ('ADD','r2','r3','r1'),
+ ('SUB','r1','r1','r2'), ('SUB','r2','r2','r6'),
+ ('AND','r1','r1','r5'),
+ ('OR', 'r1','r1','r2'),
+ ('XOR','r1','r1','r2'),
+ ('MUL','r1','r1','r2'),
+ ('DIV','r1','r1','r2'),
+ ('REM','r1','r1','r5'),
+ ('SHL','r1','r1','r2'),
+ ('SHR','r1','r1','r2'),
+ ('SAR','r4','r4','r2'),
+ # alloc / pointer arithmetic
+ ('ADD','r2','r0','r1'),
+ ('ADD','r0','r0','r3'),
+ ('ADD','r2','r2','r0'),
+ ('ADD','r2','r2','r1'),
+ ('SUB','r3','r3','r0'),
+ # reader / display index+offset fold
+ ('ADD','r6','r1','r2'),
+ ('ADD','r6','r6','r0'),
+ ('ADD','r7','r1','r2'),
+ ('SUB','r2','r1','r6'),
+ ('SUB','r3','r1','r6'),
+ ('REM','r1','r1','r2'),
+ # kaem-minimal bump-pointer + accumulator updates
+ ('ADD','r1','r1','r0'),
+ ('ADD','r5','r5','r0'),
+ ('ADD','r7','r7','r0'),
+ ('SUB','r3','r3','r2'),
+ ('SUB','r6','r6','r0'),
+ # Primitive bodies (LISP.md step 10c). Convention: r1=argc,
+ # r2=argv (both input), r3=accumulator, r0=scratch/return.
+ # Variadic folds: (r3 = r3 op r0). Unary negate / bit-not:
+ # (r3 = r0 - r3) with r0 = 0 or -1. arithmetic-shift k-negate:
+ # (r0 = r1 - r0) with r1 = 0 after argc is consumed.
+ ('ADD','r3','r3','r0'),
+ ## ('SUB','r3','r3','r0') — already above (kaem-minimal row)
+ ('SUB','r3','r0','r3'),
+ ('SUB','r0','r1','r0'),
+ ('MUL','r3','r3','r0'),
+ ('DIV','r3','r3','r0'),
+ ('REM','r3','r3','r0'),
+ ('AND','r3','r3','r0'),
+ ('OR', 'r3','r3','r0'),
+ ('XOR','r3','r3','r0'),
+ ('SHL','r3','r3','r0'),
+ ('SAR','r3','r3','r0'),
+)
+
+
+## ---------- Row assembly ------------------------------------------------
+
+HEADER = """## p1_{arch}.M1 — GENERATED by p1_gen.py. Do not edit by hand.
+##
+## Shared op-table lives in p1_gen.py; each arch's encoder lowers
+## (op, register-tuple, imm) rows into native bytes. See P1.md for the
+## ISA spec and register mapping.
+"""
+
+@dataclass
+class Banner:
+ text: str
+
+
+def _imm_suf(imm):
+ return f'NEG{-imm}' if imm < 0 else f'{imm}'
+
+
def rows():
    """Build the complete, ordered row list: Banner markers + encodable rows.

    The sequence is load-bearing: emit() writes rows in exactly this
    order, and the --check mode compares the generated text byte-for-byte,
    so restructuring here must never reorder output.
    """
    out = []
    add = out.append

    # LI / LA: wide literal and label-address loads, one per P1 register.
    add(Banner('LI / LA — load 4-byte zero-extended literal or label addr'))
    for reg in P1_REGS:
        add(Li(name=f'LI_{reg.upper()}', rD=reg))
    # LI_BR loads into the hidden branch-target scratch (x17/r11/t5).
    # Every branch/call site is `LI_BR &target ; P1_<BR>`. The scratch
    # is *not* a P1 reg.
    add(Li(name='LI_BR', rD='br'))
    for reg in P1_REGS:
        add(La(name=f'LA_{reg.upper()}', rD=reg))

    # MOV: full dst×src register product, plus a MOV rD, sp per destination.
    add(Banner('MOV — full register product (src may be sp)'))
    for dst in P1_REGS:
        for src in P1_REGS:
            add(Mov(name=f'MOV_{dst.upper()}_{src.upper()}', rD=dst, rA=src))
        add(Mov(name=f'MOV_{dst.upper()}_SP', rD=dst, rA='sp'))

    # RRR: only the curated triples (a full cube would be ~5.6k rows/arch).
    add(Banner('RRR — curated triples (explicit table in p1_gen.py)'))
    for op, dst, src_a, src_b in RRR_TABLE:
        add(RRR(name=f'{op}_{dst.upper()}_{src_a.upper()}_{src_b.upper()}',
                op=op, rD=dst, rA=src_a, rB=src_b))

    # ADDI: full register product crossed with the ADDI immediate set.
    add(Banner('ADDI — full register product × ADDI_IMMS'))
    for dst in P1_REGS:
        for src in P1_REGS:
            for imm in ADDI_IMMS:
                add(AddI(name=f'ADDI_{dst.upper()}_{src.upper()}_{_imm_suf(imm)}',
                         rD=dst, rA=src, imm=imm))

    # ANDI / ORI: full register product crossed with LOGI_IMMS.
    add(Banner('ANDI / ORI — full register product × LOGI_IMMS'))
    for op in LOGI_OPS:
        for dst in P1_REGS:
            for src in P1_REGS:
                for imm in LOGI_IMMS:
                    add(LogI(name=f'{op}_{dst.upper()}_{src.upper()}_{imm}',
                             op=op, rD=dst, rA=src, imm=imm))

    # SHLI / SHRI / SARI: full register product crossed with SHIFT_IMMS.
    add(Banner('SHLI / SHRI / SARI — full register product × SHIFT_IMMS'))
    for op in SHIFT_OPS:
        for dst in P1_REGS:
            for src in P1_REGS:
                for imm in SHIFT_IMMS:
                    add(ShiftI(name=f'{op}_{dst.upper()}_{src.upper()}_{imm}',
                               op=op, rD=dst, rA=src, imm=imm))

    # Memory: LD/ST/LB/SB over the full register product and MEM_OFFS.
    add(Banner('LD / ST / LB / SB — full register product × MEM_OFFS'))
    for op in MEM_OPS:
        for rt in P1_REGS:
            for rn in P1_REGS:
                for off in MEM_OFFS:
                    add(Mem(name=f'{op}_{rt.upper()}_{rn.upper()}_{_imm_suf(off)}',
                            op=op, rT=rt, rN=rn, off=off))

    # Branches: unconditional B first, then the conditional pairs.
    add(Banner('Branches — LI_BR-indirect pattern'))
    add(B(name='B'))
    for op in CONDB_OPS:
        for src_a in P1_REGS:
            for src_b in P1_REGS:
                add(CondB(name=f'{op}_{src_a.upper()}_{src_b.upper()}',
                          op=op, rA=src_a, rB=src_b))

    # Control flow: single-slot forms, then the N-slot (k = 2..4) variants.
    add(Banner('Control: CALL/RET + single-slot and N-slot PROLOGUE/EPILOGUE/TAIL'))
    add(Prologue(name='PROLOGUE', k=1))
    add(Epilogue(name='EPILOGUE', k=1))
    add(Ret(name='RET'))
    add(Call(name='CALL'))
    add(Tail(name='TAIL', k=1))
    for slots in (2, 3, 4):
        add(Prologue(name=f'PROLOGUE_N{slots}', k=slots))
        add(Epilogue(name=f'EPILOGUE_N{slots}', k=slots))
        add(Tail(name=f'TAIL_N{slots}', k=slots))

    # SYSCALL: pre-encoded per-arch byte sequence from SYSCALL_HEX.
    add(Banner('SYSCALL — uniform "clobbers r0 only" across arches'))
    add(Literal(name='SYSCALL', hex_by_arch=SYSCALL_HEX))

    # Syscall numbers, each emitted as a LE-32 operand suitable for LI.
    add(Banner('Linux syscall numbers (per-arch table). LE-32 operands for LI.'))
    for sys_name in ('SYS_WRITE', 'SYS_EXIT', 'SYS_READ', 'SYS_CLOSE', 'SYS_OPENAT',
                     'SYS_CLONE', 'SYS_EXECVE', 'SYS_WAITID'):
        add(Literal(name=sys_name,
                    hex_by_arch={a: le32(SYS_NUM[a][sys_name]) for a in ARCHES}))

    return out
+
+
+## ---------- File emission -----------------------------------------------
+
def emit(arch: str) -> str:
    """Render the full p1_<arch>.M1 text for one architecture.

    Banners become '## ----' divider comments; every other row becomes a
    DEFINE line lowered through that arch's encoder. Raises RuntimeError
    on a duplicate macro name so collisions fail the build loudly.
    """
    encoder = ENCODERS[arch]
    lines = [HEADER.format(arch=arch).rstrip(), '']
    defined = set()
    for row in rows():
        if isinstance(row, Banner):
            # Blank line, then a divider padded with dashes to ~column 68.
            pad = '-' * max(0, 60 - len(row.text))
            lines.append('')
            lines.append('## ---- ' + row.text + ' ' + pad)
            continue
        # SYS_* rows keep their bare names; everything else gets the P1_ prefix.
        if row.name.startswith('SYS_'):
            macro = row.name
        else:
            macro = 'P1_' + row.name
        if macro in defined:
            raise RuntimeError(f'duplicate DEFINE: {macro}')
        defined.add(macro)
        lines.append(f'DEFINE {macro} {row.encode(encoder)}')
    lines.append('')  # trailing newline after join
    return '\n'.join(lines)
+
+
def main():
    """CLI entry point.

    Usage: p1_gen.py [--check] [BUILD_ROOT]

    Writes build/<arch>/p1_<arch>.M1 for every arch (BUILD_ROOT defaults
    to 'build'). With --check, nothing is written: each generated text is
    compared against the on-disk file (missing file counts as a diff) and
    the process exits 1 if any differ.
    """
    check_only = '--check' in sys.argv
    # Everything that is not a --flag is positional; first one wins.
    positionals = [arg for arg in sys.argv[1:] if not arg.startswith('--')]
    build_root = positionals[0] if positionals else 'build'

    stale = False
    for arch in ARCHES:
        out_dir = os.path.join(build_root, arch)
        out_path = os.path.join(out_dir, f'p1_{arch}.M1')
        generated = emit(arch)
        if check_only:
            try:
                with open(out_path) as fh:
                    on_disk = fh.read()
            except FileNotFoundError:
                on_disk = ''  # absent file always mismatches
            if on_disk != generated:
                sys.stderr.write(f'DIFF: {out_path}\n')
                stale = True
        else:
            os.makedirs(out_dir, exist_ok=True)
            with open(out_path, 'w') as fh:
                fh.write(generated)
            print(f'wrote {out_path} ({len(generated)} bytes)')

    if check_only and stale:
        sys.exit(1)
+
+
# Run as a script: generate (or, with --check, verify) the per-arch
# p1_<arch>.M1 defs files under the build root.
if __name__ == '__main__':
    main()