commit aa0c83900e7fb42f42a0361f46597e9bc5aabbaf
parent a570d90096e6f998b88f626c4ea05d870e80e84f
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 24 Apr 2026 07:54:52 -0700
Add riscv64 and amd64 P1v2 backends with multi-arch test harness
- p1/P1-riscv64.M1pp, p1/P1-amd64.M1pp: new M1pp-form P1v2 backends
mirroring p1/P1-aarch64.M1pp. Native register picks follow docs/P1.md's
64-bit mapping table. rv64 lowers every op to a single R/I/S-type
instruction or a source-before-destination sequence, so no aliasing
hazards are possible. amd64 handles variable-length encoding (REX,
ModRM, SIB) and the native syscall/shift register constraints
(rcx/r11 clobbered by syscall; CL-only shift counts; rdx implicit in
idiv) via stack saves and a hidden `scratch` / `rbp` pair.
- m1pp/build-p1.sh: per-arch pipeline (host-native m1pp -> M1 -> hex2
with the matching stage0 ELF header). The m1pp expander is
architecture-neutral, so one host-compiled binary drives all targets.
- tests/p1/test.sh: rewritten to iterate over every fixture for every
backend under podman/qemu (`--arch` narrows to one target). Cross-arch
consistency falls out because every arch must match the same
<name>.expected stdout.
- tests/p1/p1-call.P1: exercises ENTER / LEAVE / CALL / RET / MOV / ADDI
on a program with a helper subroutine.
- tests/p1/p1-aliasing.P1: regression coverage for three amd64 contract
violations fixed in this commit. Each test picks a register pattern
that makes the bug observable as a wrong output byte:
1. DIV/REM with rd == rb == a2 (rdx). Two bugs in one sequence --
`idiv rb` ran after `cqo` clobbered rdx (so idiv divided by the
sign-extension of rax instead of the original rb), and the rdx
restore ran after the rd write (clobbering the quotient when
rd == a2). Fixed by stashing rb to `scratch` before `cqo` and
reordering the restore to precede the rd write.
2. SHL/SHR/SAR reg-count with rd == a3 (rcx). Mirror of (1) -- ra
was read from rcx after the count overwrite, and the rcx restore
ran after the rd write. Fixed by the same reordering.
3. ANDI/ORI with imm > 127 unconditionally used `83 /ext ib`, whose
sign-extension turned `ANDI rd, 255` into AND with -1. Widened to
`81 /ext id` for imms outside [-128, 127].
- Makefile: new `test-p1` target invoking tests/p1/test.sh.
15 passed, 0 failed across aarch64 / amd64 / riscv64 on hello, double,
argc_exit, p1-call, p1-aliasing.
Diffstat:
9 files changed, 1672 insertions(+), 84 deletions(-)
diff --git a/Makefile b/Makefile
@@ -120,7 +120,7 @@ IMAGE_STAMP := $(OUT_DIR)/.image
# --- Targets ---------------------------------------------------------------
-.PHONY: all toolchain populate-upstream run run-all test-lisp test-lisp-all test-m1pp clean
+.PHONY: all toolchain populate-upstream run run-all test-lisp test-lisp-all test-m1pp test-p1 clean
all: $(OUT_DIR)/$(PROG)
@@ -312,6 +312,13 @@ test-lisp-all:
test-m1pp: build/p1v2/aarch64/p1_aarch64.M1 $(TOOLS_DIR)/M0 | $(IMAGE_STAMP)
sh m1pp/test.sh
+# P1-language fixtures under tests/p1/ run through the M1pp-driven
+# pipeline (p1/P1-<arch>.M1pp + p1/P1.M1pp) for every backend arch.
+# Cross-arch consistency falls out because every arch must match the
+# same <name>.expected output.
+test-p1:
+ sh tests/p1/test.sh
+
# P1v2 DEFINE table for aarch64. Generated by p1/p1_gen.py from
# p1/aarch64.py. Used by the m1pp port (build/m1pp/) — distinct from the
# legacy build/aarch64/p1_aarch64.M1 used by PROG=hello/lisp/m1m.
diff --git a/m1pp/build-p1.sh b/m1pp/build-p1.sh
@@ -0,0 +1,94 @@
+#!/bin/sh
+## build-p1.sh -- build a P1v2 .M1pp source for the M1pp-based P1 pipeline.
+##
+## Pipeline:
+## 1. catm p1/P1-<arch>.M1pp + p1/P1.M1pp + <source.M1pp> -> combined.M1pp
+## 2. m1pp combined.M1pp -> expanded.M1 (macros -> raw hex + labels)
+## 3. M1 expanded.M1 -> prog.hex2 (stringify literals, pass hex through)
+## 4. catm ELF-<arch>.hex2 + prog.hex2 -> linked.hex2
+## 5. hex2 linked.hex2 -> raw ELF
+## 6. chmod + trim to p_filesz, deposit at <output>
+##
+## Usage: m1pp/build-p1.sh <arch> <source.M1pp> <output>
+## arch: aarch64 | riscv64 | amd64
+##
+## The m1pp expander itself is architecture-neutral; a single host-native
+## m1pp binary (see below) drives all target arches. M1/hex2 likewise run
+## natively on the host via build/native-tools/.
+
+set -eu
+
+if [ "$#" -ne 3 ]; then
+ echo "usage: $0 <arch> <source.M1pp> <output>" >&2
+ exit 2
+fi
+
+ARCH=$1
+SRC=$2
+OUT=$3
+
+case "$ARCH" in
+ aarch64) ELF_HDR=build/upstream/AArch64/ELF-aarch64.hex2 ;;
+ amd64) ELF_HDR=build/upstream/AMD64/ELF-amd64.hex2 ;;
+ riscv64) ELF_HDR=build/upstream/riscv64/ELF-riscv64.hex2 ;;
+ *) echo "build-p1.sh: unsupported arch '$ARCH'" >&2; exit 1 ;;
+esac
+
+REPO=$(cd "$(dirname "$0")/.." && pwd)
+cd "$REPO"
+
+FRONTEND=p1/P1.M1pp
+BACKEND=p1/P1-$ARCH.M1pp
+
+for f in "$BACKEND" "$FRONTEND" "$ELF_HDR" "$SRC"; do
+ if [ ! -e "$f" ]; then
+ echo "build-p1.sh: missing input: $f" >&2
+ exit 1
+ fi
+done
+
+## Host-compiled m1pp has 64K-token / 512K-text buffers, versus the ~4K /
+## 32K cap in the aarch64 self-hosted m1pp.M1. The combined backend +
+## frontend + source easily blows past the 4K-token cap, so use native.
+NATIVE_M1PP=build/native-tools/m1pp
+NATIVE_M1=build/native-tools/M1
+NATIVE_HEX2=build/native-tools/hex2
+if [ ! -x "$NATIVE_M1PP" ]; then
+ : "${CC:=cc}"
+ mkdir -p build/native-tools
+ $CC -O2 -std=c99 -o "$NATIVE_M1PP" m1pp/m1pp.c
+fi
+if [ ! -x "$NATIVE_M1" ] || [ ! -x "$NATIVE_HEX2" ]; then
+ sh m1pp/build-native-tools.sh
+fi
+
+NAME=$(basename "$SRC" .M1pp)
+WORK=build/p1v2-m1pp/$ARCH/$NAME.work
+mkdir -p "$WORK" "$(dirname "$OUT")"
+
+COMBINED=$WORK/combined.M1pp
+EXPANDED=$WORK/expanded.M1
+PROG_HEX2=$WORK/prog.hex2
+LINKED=$WORK/linked.hex2
+RAW=$WORK/prog.raw
+
+cat "$BACKEND" "$FRONTEND" "$SRC" > "$COMBINED"
+
+"$NATIVE_M1PP" "$COMBINED" "$EXPANDED"
+
+"$NATIVE_M1" --architecture "$ARCH" --little-endian \
+ -f "$EXPANDED" -o "$PROG_HEX2"
+
+cat "$ELF_HDR" "$PROG_HEX2" > "$LINKED"
+
+"$NATIVE_HEX2" --architecture "$ARCH" --little-endian \
+ --base-address 0x600000 \
+ -f "$LINKED" -o "$RAW"
+
+## Trim trailing zero padding past p_filesz (a little-endian u64 at byte
+## offset 96 in the ELF64 program header; reading only the low 4 bytes is
+## safe for these small binaries). The hex2 output
+## zero-fills to p_memsz; the kernel zero-fills the BSS gap at load time
+## so we can chop anything past p_filesz on disk.
+size=$(od -An -tu4 -N4 -j96 "$RAW" | tr -d ' ')
+head -c "$size" "$RAW" > "$OUT"
+chmod 0700 "$OUT"
diff --git a/p1/P1-amd64.M1pp b/p1/P1-amd64.M1pp
@@ -0,0 +1,844 @@
+# P1-amd64.M1pp -- P1v2 amd64 backend expressed in m1macro.
+#
+# Mirrors p1/P1-aarch64.M1pp; native register picks follow docs/P1.md's
+# 64-bit mapping table. amd64 is variable-length, so every op emits its
+# prefix bytes (REX / opcode) directly via the m1pp `!(…)` single-byte
+# builtin; 4-byte immediates still go through `%(…)`.
+#
+# Hidden backend regs:
+# br = r15 -- branch-target scratch
+# scratch = r9 -- per-expansion scratch (rcx shift alias save, etc.)
+# rax -- syscall number / return slot + spill buffer
+# rbp -- spill buffer when rcx needs saving for SHL/SHR/SAR
+
+# ---- Native register numbers --------------------------------------------
+#
+# Macros emit the 4-bit native regnum 0..15. Callers use `(& N 7)` for the
+# ModRM/SIB low 3 bits and `(>> N 3)` for the REX high bit.
+
+%macro amd_reg_a0()
+7
+%endm
+%macro amd_reg_a1()
+6
+%endm
+%macro amd_reg_a2()
+2
+%endm
+%macro amd_reg_a3()
+1
+%endm
+%macro amd_reg_t0()
+10
+%endm
+%macro amd_reg_t1()
+11
+%endm
+%macro amd_reg_t2()
+8
+%endm
+%macro amd_reg_s0()
+3
+%endm
+%macro amd_reg_s1()
+12
+%endm
+%macro amd_reg_s2()
+13
+%endm
+%macro amd_reg_s3()
+14
+%endm
+%macro amd_reg_sp()
+4
+%endm
+%macro amd_reg_rax()
+0
+%endm
+%macro amd_reg_rcx()
+1
+%endm
+%macro amd_reg_rdx()
+2
+%endm
+%macro amd_reg_rbx()
+3
+%endm
+%macro amd_reg_rsp()
+4
+%endm
+%macro amd_reg_rbp()
+5
+%endm
+%macro amd_reg_rsi()
+6
+%endm
+%macro amd_reg_rdi()
+7
+%endm
+%macro amd_reg_r8()
+8
+%endm
+%macro amd_reg_r9()
+9
+%endm
+%macro amd_reg_r10()
+10
+%endm
+%macro amd_reg_r11()
+11
+%endm
+%macro amd_reg_r12()
+12
+%endm
+%macro amd_reg_r13()
+13
+%endm
+%macro amd_reg_r14()
+14
+%endm
+%macro amd_reg_r15()
+15
+%endm
+%macro amd_reg_br()
+15
+%endm
+%macro amd_reg_scratch()
+9
+%endm
+
+%macro amd_reg(r)
+%amd_reg_##r()
+%endm
+
+# ---- REX / ModRM helpers ------------------------------------------------
+
+# REX.WB: W=1 for 64-bit, B=(r>>3) to extend ModRM.rm / SIB.base.
+%macro amd_rex_wb(r)
+!((| 0x48 (& (>> %amd_reg(r) 3) 1)))
+%endm
+
+# REX.WRB: W=1, R=(rg>>3), B=(rm>>3). Used whenever a ModRM.reg field is
+# in use together with a ModRM.rm field.
+%macro amd_rex_wrb(rg, rm)
+!((| 0x48 (| (<< (& (>> %amd_reg(rg) 3) 1) 2) (& (>> %amd_reg(rm) 3) 1))))
+%endm
+
+# ModRM byte for register/register: mod=3, reg=rg low3, rm=rm low3.
+%macro amd_modrm_rr(rg, rm)
+!((| 0xC0 (| (<< (& %amd_reg(rg) 7) 3) (& %amd_reg(rm) 7))))
+%endm
+
+# ModRM /ext, rm: mod=3, reg=ext, rm=low3(rm). ext is 0..7.
+%macro amd_modrm_ext_r(ext, rm)
+!((| 0xC0 (| (<< ext 3) (& %amd_reg(rm) 7))))
+%endm
+
+# ---- Memory-addressing ModRM (+ SIB + disp) ----------------------------
+#
+# [base + disp] with `reg` in ModRM.reg. Bases whose low 3 bits are 100 —
+# rsp and r12 — must go through a SIB byte; all others use the plain
+# encoding. disp selects mod=1 (disp8) when it fits in [-128,127], else
+# mod=2 (disp32). We never emit mod=0 / no-disp; the extra byte is fine.
+
+%macro amd_modrm_disp8_plain(reg, base, disp)
+!((| 0x40 (| (<< (& %amd_reg(reg) 7) 3) (& %amd_reg(base) 7))))
+!((& disp 0xFF))
+%endm
+
+%macro amd_modrm_disp32_plain(reg, base, disp)
+!((| 0x80 (| (<< (& %amd_reg(reg) 7) 3) (& %amd_reg(base) 7))))
+%((& disp 0xFFFFFFFF))
+%endm
+
+%macro amd_modrm_disp8_sib(reg, disp)
+!((| 0x44 (<< (& %amd_reg(reg) 7) 3)))
+!(0x24)
+!((& disp 0xFF))
+%endm
+
+%macro amd_modrm_disp32_sib(reg, disp)
+!((| 0x84 (<< (& %amd_reg(reg) 7) 3)))
+!(0x24)
+%((& disp 0xFFFFFFFF))
+%endm
+
+%macro amd_modrm_disp_plain(reg, base, disp)
+%select((>= disp -128),
+ %select((<= disp 127),
+ %amd_modrm_disp8_plain(reg, base, disp),
+ %amd_modrm_disp32_plain(reg, base, disp)),
+ %amd_modrm_disp32_plain(reg, base, disp))
+%endm
+
+%macro amd_modrm_disp_sib(reg, disp)
+%select((>= disp -128),
+ %select((<= disp 127),
+ %amd_modrm_disp8_sib(reg, disp),
+ %amd_modrm_disp32_sib(reg, disp)),
+ %amd_modrm_disp32_sib(reg, disp))
+%endm
+
+%macro amd_modrm_disp(reg, base, disp)
+%select((= (& %amd_reg(base) 7) 4),
+ %amd_modrm_disp_sib(reg, disp),
+ %amd_modrm_disp_plain(reg, base, disp))
+%endm
+
+# ---- Register / arithmetic primitives ----------------------------------
+
+# mov dst, src -- 48 89 /r (modrm form: source in reg, dest in rm).
+%macro amd_mov_rr(dst, src)
+%amd_rex_wrb(src, dst)
+!(0x89)
+%amd_modrm_rr(src, dst)
+%endm
+
+# op dst, src for ADD/SUB/AND/OR/XOR (same shape, different opcode byte).
+%macro amd_alu_rr(opcode, dst, src)
+%amd_rex_wrb(src, dst)
+!(opcode)
+%amd_modrm_rr(src, dst)
+%endm
+
+# op dst, imm8 -- 48 83 /ext ib.
+%macro amd_alu_ri8(ext, dst, imm)
+%amd_rex_wb(dst)
+!(0x83)
+%amd_modrm_ext_r(ext, dst)
+!((& imm 0xFF))
+%endm
+
+# op dst, imm32 -- 48 81 /ext id.
+%macro amd_alu_ri32(ext, dst, imm)
+%amd_rex_wb(dst)
+!(0x81)
+%amd_modrm_ext_r(ext, dst)
+%((& imm 0xFFFFFFFF))
+%endm
+
+# shift dst, imm8 -- 48 C1 /ext ib. (ext: SHL=4, SHR=5, SAR=7)
+%macro amd_shift_ri8(ext, dst, imm)
+%amd_rex_wb(dst)
+!(0xC1)
+%amd_modrm_ext_r(ext, dst)
+!((& imm 0x3F))
+%endm
+
+# shift dst, cl -- 48 D3 /ext.
+%macro amd_shift_cl(ext, dst)
+%amd_rex_wb(dst)
+!(0xD3)
+%amd_modrm_ext_r(ext, dst)
+%endm
+
+# imul dst, src -- 48 0F AF /r. Canonical form is IMUL r64, r/m64:
+# destination in ModRM.reg, source in ModRM.rm (reversed vs. the /r ALU stores).
+%macro amd_imul_rr(dst, src)
+%amd_rex_wrb(dst, src)
+!(0x0F)
+!(0xAF)
+%amd_modrm_rr(dst, src)
+%endm
+
+# idiv src -- 48 F7 /7.
+%macro amd_idiv_r(src)
+%amd_rex_wb(src)
+!(0xF7)
+%amd_modrm_ext_r(7, src)
+%endm
+
+# cqo -- 48 99 (sign-extend rax into rdx:rax).
+%macro amd_cqo()
+!(0x48)
+!(0x99)
+%endm
+
+# push / pop r64. 50+r / 58+r; REX.B=0x41 if r8-r15.
+%macro amd_push(r)
+%select((>= %amd_reg(r) 8),
+ %amd_push_hi(r),
+ %amd_push_lo(r))
+%endm
+%macro amd_push_lo(r)
+!((| 0x50 %amd_reg(r)))
+%endm
+%macro amd_push_hi(r)
+!(0x41)
+!((| 0x50 (& %amd_reg(r) 7)))
+%endm
+
+%macro amd_pop(r)
+%select((>= %amd_reg(r) 8),
+ %amd_pop_hi(r),
+ %amd_pop_lo(r))
+%endm
+%macro amd_pop_lo(r)
+!((| 0x58 %amd_reg(r)))
+%endm
+%macro amd_pop_hi(r)
+!(0x41)
+!((| 0x58 (& %amd_reg(r) 7)))
+%endm
+
+# mov r32, imm32 -- B8+r id. Low-register form skips REX; r8-r15 need
+# REX.B=0x41. The 4-byte literal the caller emits is zero-extended into
+# the full 64-bit register, matching the LA / LA_BR literal-pool contract.
+%macro amd_mov_imm32_prefix(rd)
+%select((>= %amd_reg(rd) 8),
+ %amd_mov_imm32_prefix_hi(rd),
+ %amd_mov_imm32_prefix_lo(rd))
+%endm
+%macro amd_mov_imm32_prefix_lo(rd)
+!((| 0xB8 %amd_reg(rd)))
+%endm
+%macro amd_mov_imm32_prefix_hi(rd)
+!(0x41)
+!((| 0xB8 (& %amd_reg(rd) 7)))
+%endm
+
+# mov r64, imm64 -- REX.W [+ REX.B] B8+r followed by 8 bytes of literal.
+%macro amd_mov_imm64_prefix(rd)
+%amd_rex_wb(rd)
+!((| 0xB8 (& %amd_reg(rd) 7)))
+%endm
+
+# ---- Memory ops ---------------------------------------------------------
+
+# mov rT, [rN + off] 48 8B /r modrm-with-disp
+%macro amd_mem_LD(rt, rn, off)
+%amd_rex_wrb(rt, rn)
+!(0x8B)
+%amd_modrm_disp(rt, rn, off)
+%endm
+
+# mov [rN + off], rT 48 89 /r
+%macro amd_mem_ST(rt, rn, off)
+%amd_rex_wrb(rt, rn)
+!(0x89)
+%amd_modrm_disp(rt, rn, off)
+%endm
+
+# mov [rN + off], rT8 48 88 /r (any REX prefix -- here REX.W -- selects the
+# dil/sil/bpl/spl byte encodings instead of ah/ch/dh/bh.)
+%macro amd_mem_SB(rt, rn, off)
+%amd_rex_wrb(rt, rn)
+!(0x88)
+%amd_modrm_disp(rt, rn, off)
+%endm
+
+# movzx rT, byte ptr [rN + off] -- 48 0F B6 /r
+%macro amd_mem_LB(rt, rn, off)
+%amd_rex_wrb(rt, rn)
+!(0x0F)
+!(0xB6)
+%amd_modrm_disp(rt, rn, off)
+%endm
+
+# ---- Control flow primitives -------------------------------------------
+
+# jmp r/m64 -- FF /4
+# call r/m64 -- FF /2
+# ret -- C3
+# syscall -- 0F 05
+# cmp rA, rB -- 48 39 /r (modrm: rB in reg, rA in rm)
+# test rA, rA -- 48 85 /r
+# Jcc rel8 -- 7x ib
+
+%macro amd_jmp_r(r)
+%select((>= %amd_reg(r) 8),
+ %amd_jmp_r_hi(r),
+ %amd_jmp_r_lo(r))
+%endm
+%macro amd_jmp_r_lo(r)
+!(0xFF)
+!((| 0xE0 (& %amd_reg(r) 7)))
+%endm
+%macro amd_jmp_r_hi(r)
+!(0x41)
+!(0xFF)
+!((| 0xE0 (& %amd_reg(r) 7)))
+%endm
+
+%macro amd_call_r(r)
+%select((>= %amd_reg(r) 8),
+ %amd_call_r_hi(r),
+ %amd_call_r_lo(r))
+%endm
+%macro amd_call_r_lo(r)
+!(0xFF)
+!((| 0xD0 (& %amd_reg(r) 7)))
+%endm
+%macro amd_call_r_hi(r)
+!(0x41)
+!(0xFF)
+!((| 0xD0 (& %amd_reg(r) 7)))
+%endm
+
+%macro amd_ret()
+!(0xC3)
+%endm
+
+%macro amd_syscall()
+!(0x0F)
+!(0x05)
+%endm
+
+# cmp rA, rB -- 48 39 /r (modrm: rB in reg, rA in rm).
+%macro amd_cmp_rr(ra, rb)
+%amd_rex_wrb(rb, ra)
+!(0x39)
+%amd_modrm_rr(rb, ra)
+%endm
+
+%macro amd_test_rr(ra, rb)
+%amd_rex_wrb(rb, ra)
+!(0x85)
+%amd_modrm_rr(rb, ra)
+%endm
+
+# ---- P1 register-register op lowering ----------------------------------
+#
+# For ADD/SUB/AND/OR/XOR we honor rD=rB aliasing — the naive `mov rD,rA ;
+# op rD,rB` would clobber rB before the op reads it. Route rB through the
+# scratch reg when that aliasing shows up.
+
+%macro amd_rrr_simple_ADD(rd, ra, rb)
+%amd_rrr_simple(0x01, rd, ra, rb)
+%endm
+%macro amd_rrr_simple_SUB(rd, ra, rb)
+%amd_rrr_simple(0x29, rd, ra, rb)
+%endm
+%macro amd_rrr_simple_AND(rd, ra, rb)
+%amd_rrr_simple(0x21, rd, ra, rb)
+%endm
+%macro amd_rrr_simple_OR(rd, ra, rb)
+%amd_rrr_simple(0x09, rd, ra, rb)
+%endm
+%macro amd_rrr_simple_XOR(rd, ra, rb)
+%amd_rrr_simple(0x31, rd, ra, rb)
+%endm
+
+%macro amd_rrr_simple(opcode, rd, ra, rb)
+%select((= %amd_reg(rd) %amd_reg(rb)),
+ %amd_rrr_simple_via_scratch(opcode, rd, ra, rb),
+ %amd_rrr_simple_direct(opcode, rd, ra, rb))
+%endm
+
+%macro amd_rrr_simple_direct(opcode, rd, ra, rb)
+%amd_mov_rr(rd, ra)
+%amd_alu_rr(opcode, rd, rb)
+%endm
+
+%macro amd_rrr_simple_via_scratch(opcode, rd, ra, rb)
+%amd_mov_rr(scratch, rb)
+%amd_mov_rr(rd, ra)
+%amd_alu_rr(opcode, rd, scratch)
+%endm
+
+%macro amd_rrr_MUL(rd, ra, rb)
+%select((= %amd_reg(rd) %amd_reg(rb)),
+ %amd_rrr_MUL_via_scratch(rd, ra, rb),
+ %amd_rrr_MUL_direct(rd, ra, rb))
+%endm
+%macro amd_rrr_MUL_direct(rd, ra, rb)
+%amd_mov_rr(rd, ra)
+%amd_imul_rr(rd, rb)
+%endm
+%macro amd_rrr_MUL_via_scratch(rd, ra, rb)
+%amd_mov_rr(scratch, rb)
+%amd_mov_rr(rd, ra)
+%amd_imul_rr(rd, scratch)
+%endm
+
+# DIV / REM clobber rax and rdx natively. rax is not a P1 register, so
+# we clobber it freely; rdx IS P1 a2, so we stash it to rbp (also outside
+# the P1 mapping) for the lifetime of the op.
+#
+# Aliasing-safety plan, same for DIV and REM:
+# 1. rbp = rdx -- saved a2, also serves as "original rb
+# if rb == a2" via the scratch copy
+# 2. scratch = rb -- read rb while rdx still holds its
+# original value (in case rb == a2)
+# 3. rax = ra -- ra == a2 reads original rdx for the
+# same reason; cqo hasn't run yet
+# 4. cqo ; idiv scratch -- divide
+# 5. rdx = rbp (restore) BEFORE -- so `mov rd, rax/rdx` below can
+# writing rd legitimately overwrite rdx when
+# rd == a2 without losing the result
+# 6. mov rd, rax -- DIV quotient
+# or capture rdx -> rax first,
+# then rd = rax -- REM remainder (capture dodges the
+# restore overwriting the remainder)
+
+%macro amd_rrr_DIV(rd, ra, rb)
+%amd_mov_rr(rbp, rdx)
+%amd_mov_rr(scratch, rb)
+%amd_mov_rr(rax, ra)
+%amd_cqo()
+%amd_idiv_r(scratch)
+%amd_mov_rr(rdx, rbp)
+%amd_mov_rr(rd, rax)
+%endm
+
+%macro amd_rrr_REM(rd, ra, rb)
+%amd_mov_rr(rbp, rdx)
+%amd_mov_rr(scratch, rb)
+%amd_mov_rr(rax, ra)
+%amd_cqo()
+%amd_idiv_r(scratch)
+%amd_mov_rr(rax, rdx)
+%amd_mov_rr(rdx, rbp)
+%amd_mov_rr(rd, rax)
+%endm
+
+# SHL / SHR / SAR with reg count. x86 reads the count from CL only, so
+# staging goes through rcx — which IS P1 a3. Save rcx to rbp for the
+# duration.
+#
+# Ordering is load-bearing:
+# 1. rbp = rcx -- save a3
+# 2. scratch = ra -- read ra BEFORE we overwrite rcx;
+# otherwise `ra == a3` reads the count
+# we just staged
+# 3. rcx = rb -- count into cl
+# 4. shift scratch, cl -- do the work
+# 5. rcx = rbp (restore) BEFORE -- so `mov rd, scratch` below can
+# writing rd legitimately overwrite rcx when
+# rd == a3 without losing the result
+# 6. mov rd, scratch
+
+%macro amd_rrr_SHL(rd, ra, rb)
+%amd_rrr_shift(4, rd, ra, rb)
+%endm
+%macro amd_rrr_SHR(rd, ra, rb)
+%amd_rrr_shift(5, rd, ra, rb)
+%endm
+%macro amd_rrr_SAR(rd, ra, rb)
+%amd_rrr_shift(7, rd, ra, rb)
+%endm
+
+%macro amd_rrr_shift(ext, rd, ra, rb)
+%amd_mov_rr(rbp, rcx)
+%amd_mov_rr(scratch, ra)
+%amd_mov_rr(rcx, rb)
+%amd_shift_cl(ext, scratch)
+%amd_mov_rr(rcx, rbp)
+%amd_mov_rr(rd, scratch)
+%endm
+
+%macro amd_rrr_op(op, rd, ra, rb)
+%amd_rrr_##op(rd, ra, rb)
+%endm
+
+%macro amd_rrr_ADD(rd, ra, rb)
+%amd_rrr_simple_ADD(rd, ra, rb)
+%endm
+%macro amd_rrr_SUB(rd, ra, rb)
+%amd_rrr_simple_SUB(rd, ra, rb)
+%endm
+%macro amd_rrr_AND(rd, ra, rb)
+%amd_rrr_simple_AND(rd, ra, rb)
+%endm
+%macro amd_rrr_OR(rd, ra, rb)
+%amd_rrr_simple_OR(rd, ra, rb)
+%endm
+%macro amd_rrr_XOR(rd, ra, rb)
+%amd_rrr_simple_XOR(rd, ra, rb)
+%endm
+
+# ---- P1 operation lowering ---------------------------------------------
+
+%macro p1_li(rd)
+%amd_mov_imm64_prefix(rd)
+%endm
+
+%macro p1_la(rd)
+%amd_mov_imm32_prefix(rd)
+%endm
+
+%macro p1_labr()
+%amd_mov_imm32_prefix(br)
+%endm
+
+%macro p1_mov(rd, rs)
+%amd_mov_rr(rd, rs)
+%endm
+
+%macro p1_rrr(op, rd, ra, rb)
+%amd_rrr_op(op, rd, ra, rb)
+%endm
+
+%macro p1_addi(rd, ra, imm)
+%amd_mov_rr(rd, ra)
+%select((>= imm -128),
+ %select((<= imm 127),
+ %amd_alu_ri8(0, rd, imm),
+ %amd_alu_ri32(0, rd, imm)),
+ %amd_alu_ri32(0, rd, imm))
+%endm
+
+# AND/OR with imm: 83 /ext ib sign-extends imm8 to 64 bits. That works for
+# imm in [-128, 127] (and for -1 as a convenient all-ones mask), but breaks
+# for positive imms >= 128 — ANDI with 255 would become AND with
+# 0xFFFFFFFFFFFFFFFF. Widen to the imm32 form when imm8 would misencode.
+%macro p1_logi_ANDI(rd, ra, imm)
+%amd_mov_rr(rd, ra)
+%select((>= imm -128),
+ %select((<= imm 127),
+ %amd_alu_ri8(4, rd, imm),
+ %amd_alu_ri32(4, rd, imm)),
+ %amd_alu_ri32(4, rd, imm))
+%endm
+%macro p1_logi_ORI(rd, ra, imm)
+%amd_mov_rr(rd, ra)
+%select((>= imm -128),
+ %select((<= imm 127),
+ %amd_alu_ri8(1, rd, imm),
+ %amd_alu_ri32(1, rd, imm)),
+ %amd_alu_ri32(1, rd, imm))
+%endm
+%macro p1_logi(op, rd, ra, imm)
+%p1_logi_##op(rd, ra, imm)
+%endm
+
+%macro p1_shifti_SHLI(rd, ra, imm)
+%amd_mov_rr(rd, ra)
+%amd_shift_ri8(4, rd, imm)
+%endm
+%macro p1_shifti_SHRI(rd, ra, imm)
+%amd_mov_rr(rd, ra)
+%amd_shift_ri8(5, rd, imm)
+%endm
+%macro p1_shifti_SARI(rd, ra, imm)
+%amd_mov_rr(rd, ra)
+%amd_shift_ri8(7, rd, imm)
+%endm
+%macro p1_shifti(op, rd, ra, imm)
+%p1_shifti_##op(rd, ra, imm)
+%endm
+
+%macro p1_mem_LD(rt, rn, off)
+%amd_mem_LD(rt, rn, off)
+%endm
+%macro p1_mem_ST(rt, rn, off)
+%amd_mem_ST(rt, rn, off)
+%endm
+%macro p1_mem_LB(rt, rn, off)
+%amd_mem_LB(rt, rn, off)
+%endm
+%macro p1_mem_SB(rt, rn, off)
+%amd_mem_SB(rt, rn, off)
+%endm
+%macro p1_mem(op, rt, rn, off)
+%p1_mem_##op(rt, rn, off)
+%endm
+
+%macro p1_ldarg(rd, slot)
+%amd_mem_LD(scratch, sp, 8)
+%amd_mem_LD(rd, scratch, (+ 16 (* 8 slot)))
+%endm
+
+%macro p1_b()
+%amd_jmp_r(br)
+%endm
+
+%macro p1_br(rs)
+%amd_jmp_r(rs)
+%endm
+
+%macro p1_call()
+%amd_call_r(br)
+%endm
+
+%macro p1_callr(rs)
+%amd_call_r(rs)
+%endm
+
+%macro p1_ret()
+%amd_ret()
+%endm
+
+# LEAVE
+# r9 = [sp + 0] -- retaddr into scratch
+# rax = [sp + 8] -- saved caller sp into rax (an unused native reg)
+# sp = rax -- unwind to caller sp
+# push r9 -- reinstall retaddr so RET returns correctly
+%macro p1_leave()
+%amd_mem_LD(scratch, sp, 0)
+%amd_mem_LD(rax, sp, 8)
+%amd_mov_rr(sp, rax)
+%amd_push(scratch)
+%endm
+
+%macro p1_tail()
+%p1_leave()
+%amd_jmp_r(br)
+%endm
+
+%macro p1_tailr(rs)
+%p1_leave()
+%amd_jmp_r(rs)
+%endm
+
+# Conditional-branch lowering:
+# compare / test
+# Jcc_inverse +3 skip the 3-byte `jmp r15`
+# jmp r15 P1 branch-taken path
+#
+# Invert codes: BEQ->JNE(75), BNE->JE(74), BLT->JGE(7D), BLTU->JAE(73),
+# BLTZ->JGE(7D), BEQZ->JNE(75), BNEZ->JE(74).
+
+%macro p1_condb_BEQ(ra, rb)
+%amd_cmp_rr(ra, rb)
+!(0x75)
+!(0x03)
+%amd_jmp_r(br)
+%endm
+%macro p1_condb_BNE(ra, rb)
+%amd_cmp_rr(ra, rb)
+!(0x74)
+!(0x03)
+%amd_jmp_r(br)
+%endm
+%macro p1_condb_BLT(ra, rb)
+%amd_cmp_rr(ra, rb)
+!(0x7D)
+!(0x03)
+%amd_jmp_r(br)
+%endm
+%macro p1_condb_BLTU(ra, rb)
+%amd_cmp_rr(ra, rb)
+!(0x73)
+!(0x03)
+%amd_jmp_r(br)
+%endm
+%macro p1_condb(op, ra, rb)
+%p1_condb_##op(ra, rb)
+%endm
+
+%macro p1_condbz_BEQZ(ra)
+%amd_test_rr(ra, ra)
+!(0x75)
+!(0x03)
+%amd_jmp_r(br)
+%endm
+%macro p1_condbz_BNEZ(ra)
+%amd_test_rr(ra, ra)
+!(0x74)
+!(0x03)
+%amd_jmp_r(br)
+%endm
+%macro p1_condbz_BLTZ(ra)
+%amd_test_rr(ra, ra)
+!(0x7D)
+!(0x03)
+%amd_jmp_r(br)
+%endm
+%macro p1_condbz(op, ra)
+%p1_condbz_##op(ra)
+%endm
+
+# ENTER size
+#
+# CALL on amd64 pushed the retaddr, so on entry:
+# rsp = caller_sp - 8
+# [rsp] = retaddr
+#
+# We want the standard frame:
+# [sp + 0] = retaddr
+# [sp + 8] = saved caller_sp
+# [sp + 16 .. 16 + size - 1] = locals
+# total frame = round_up(16, 16 + size)
+#
+# Pop retaddr into scratch, save caller_sp into rax (unused by P1),
+# allocate frame, restore retaddr at [sp], store caller_sp at [sp+8].
+%macro p1_enter(size)
+%amd_pop(scratch)
+%amd_mov_rr(rax, sp)
+%amd_alu_ri32(5, sp, (& (+ (+ 16 size) 15) -16))
+%amd_mem_ST(scratch, sp, 0)
+%amd_mem_ST(rax, sp, 8)
+%endm
+
+%macro p1_entry()
+# :_start stub per the P1v2 program-entry model. Linux amd64 puts argc
+# at [rsp] and argv starting at [rsp+8]. Load argc into a0 (rdi),
+# compute &argv[0] into a1 (rsi), call p1_main under the one-word
+# direct-result convention, then issue sys_exit with p1_main's return
+# value in a0.
+:_start
+%amd_mem_LD(a0, sp, 0)
+%amd_mov_rr(a1, sp)
+%amd_alu_ri8(0, a1, 8)
+%amd_mov_imm32_prefix(br)
+&p1_main
+%amd_call_r(br)
+# mov eax, 60 (sys_exit); syscall. P1 a0 (native rdi) already holds
+# p1_main's return value.
+!(0xB8)
+%(60)
+!(0x0F)
+!(0x05)
+%endm
+
+%macro p1_syscall()
+# P1: a0=num, a1..a3,t0,s0,s1 = args 0..5. Linux amd64: rax=num,
+# rdi/rsi/rdx/r10/r8/r9 = args 0..5, return in rax; syscall also
+# clobbers rcx and r11.
+#
+# Plan: push the P1 registers whose native slots get overwritten or
+# syscall-clobbered — rsi (a1), rdx (a2), rcx (a3), r11 (t1), r8 (t2) —
+# then shuffle into the native slots, issue syscall, restore, and move
+# the return value (rax) into a0 (rdi).
+%amd_push(rsi)
+%amd_push(rdx)
+%amd_push(rcx)
+%amd_push(r11)
+%amd_push(r8)
+
+%amd_mov_rr(rax, rdi)
+%amd_mem_LD(rdi, sp, 32)
+%amd_mem_LD(rsi, sp, 24)
+%amd_mem_LD(rdx, sp, 16)
+%amd_mov_rr(r8, rbx)
+%amd_mov_rr(r9, r12)
+
+!(0x0F)
+!(0x05)
+
+%amd_pop(r8)
+%amd_pop(r11)
+%amd_pop(rcx)
+%amd_pop(rdx)
+%amd_pop(rsi)
+
+%amd_mov_rr(rdi, rax)
+%endm
+
+# ---- Linux amd64 syscall number data words ------------------------------
+
+%macro p1_sys_read()
+$(0)
+%endm
+%macro p1_sys_write()
+$(1)
+%endm
+%macro p1_sys_close()
+$(3)
+%endm
+%macro p1_sys_openat()
+$(257)
+%endm
+%macro p1_sys_exit()
+$(60)
+%endm
+%macro p1_sys_clone()
+$(56)
+%endm
+%macro p1_sys_execve()
+$(59)
+%endm
+%macro p1_sys_waitid()
+$(247)
+%endm
diff --git a/p1/P1-riscv64.M1pp b/p1/P1-riscv64.M1pp
@@ -0,0 +1,535 @@
+# P1-riscv64.M1pp -- P1v2 riscv64 backend expressed in m1macro.
+#
+# Mirrors p1/P1-aarch64.M1pp; same macro surface, different encodings.
+# Native register picks follow docs/P1.md's 64-bit mapping table.
+#
+# Hidden backend regs:
+# br = t6 (x31) -- dedicated branch-target mechanism
+# scratch = t5 (x30) -- per-expansion scratch, never live across ops
+# save0 = t4 (x29) -- transient across SYSCALL only
+# save1 = t3 (x28)
+# save2 = a6 (x16)
+# saved_fp = fp (x8) -- used by ENTER/LEAVE to capture caller sp
+# a7 = x17 -- Linux riscv64 syscall-number slot
+# a4 = x14 -- syscall arg4 slot
+# a5 = x15 -- syscall arg5 slot
+
+# ---- Native register numbers --------------------------------------------
+
+%macro rv_reg_a0()
+10
+%endm
+%macro rv_reg_a1()
+11
+%endm
+%macro rv_reg_a2()
+12
+%endm
+%macro rv_reg_a3()
+13
+%endm
+%macro rv_reg_a4()
+14
+%endm
+%macro rv_reg_a5()
+15
+%endm
+%macro rv_reg_a6()
+16
+%endm
+%macro rv_reg_a7()
+17
+%endm
+%macro rv_reg_t0()
+5
+%endm
+%macro rv_reg_t1()
+6
+%endm
+%macro rv_reg_t2()
+7
+%endm
+%macro rv_reg_s0()
+9
+%endm
+%macro rv_reg_s1()
+18
+%endm
+%macro rv_reg_s2()
+19
+%endm
+%macro rv_reg_s3()
+20
+%endm
+%macro rv_reg_sp()
+2
+%endm
+%macro rv_reg_zero()
+0
+%endm
+%macro rv_reg_ra()
+1
+%endm
+%macro rv_reg_fp()
+8
+%endm
+%macro rv_reg_br()
+31
+%endm
+%macro rv_reg_scratch()
+30
+%endm
+%macro rv_reg_save0()
+29
+%endm
+%macro rv_reg_save1()
+28
+%endm
+%macro rv_reg_save2()
+16
+%endm
+
+%macro rv_reg(r)
+%rv_reg_##r()
+%endm
+
+%macro rv_is_sp_a0()
+0
+%endm
+%macro rv_is_sp_a1()
+0
+%endm
+%macro rv_is_sp_a2()
+0
+%endm
+%macro rv_is_sp_a3()
+0
+%endm
+%macro rv_is_sp_a4()
+0
+%endm
+%macro rv_is_sp_a5()
+0
+%endm
+%macro rv_is_sp_a6()
+0
+%endm
+%macro rv_is_sp_a7()
+0
+%endm
+%macro rv_is_sp_t0()
+0
+%endm
+%macro rv_is_sp_t1()
+0
+%endm
+%macro rv_is_sp_t2()
+0
+%endm
+%macro rv_is_sp_s0()
+0
+%endm
+%macro rv_is_sp_s1()
+0
+%endm
+%macro rv_is_sp_s2()
+0
+%endm
+%macro rv_is_sp_s3()
+0
+%endm
+%macro rv_is_sp_sp()
+1
+%endm
+%macro rv_is_sp_zero()
+0
+%endm
+%macro rv_is_sp_ra()
+0
+%endm
+%macro rv_is_sp_fp()
+0
+%endm
+%macro rv_is_sp_br()
+0
+%endm
+%macro rv_is_sp_scratch()
+0
+%endm
+%macro rv_is_sp_save0()
+0
+%endm
+%macro rv_is_sp_save1()
+0
+%endm
+%macro rv_is_sp_save2()
+0
+%endm
+
+%macro rv_is_sp(r)
+%rv_is_sp_##r()
+%endm
+
+# ---- Low-level instruction encoders --------------------------------------
+
+# R-type: funct7[31:25] rs2[24:20] rs1[19:15] funct3[14:12] rd[11:7] opcode[6:0]
+%macro rv_r_type(base, rd, ra, rb)
+%((| base (<< %rv_reg(rb) 20) (<< %rv_reg(ra) 15) (<< %rv_reg(rd) 7)))
+%endm
+
+# I-type: imm[31:20] rs1[19:15] funct3[14:12] rd[11:7] opcode[6:0]
+%macro rv_i_type(base, rd, ra, imm12)
+%((| base (<< (& imm12 0xFFF) 20) (<< %rv_reg(ra) 15) (<< %rv_reg(rd) 7)))
+%endm
+
+# S-type: imm[31:25] rs2[24:20] rs1[19:15] funct3[14:12] imm[11:7] opcode[6:0]
+%macro rv_s_type(base, rs, ra, imm12)
+%((| base (<< (& (>> imm12 5) 0x7F) 25) (<< %rv_reg(rs) 20) (<< %rv_reg(ra) 15) (<< (& imm12 0x1F) 7)))
+%endm
+
+# B-type: imm[12|10:5] rs2 rs1 funct3 imm[4:1|11] opcode. 12-bit signed,
+# imm[0] always 0. For the hardcoded skip-over-jalr we only need a fixed
+# positive offset (8 bytes = 2 insns), so inline the resulting bit pattern.
+%macro rv_b_type_skip8(base, ra, rb)
+# imm value 8 -> imm[11:0] = 0000_0000_0100. Bits of encoded imm:
+# imm[12]=0, imm[10:5]=0, imm[4:1]=0100 (=4), imm[11]=0.
+# encoded bits: [31:25]=0, [11:7]= (imm[4:1] << 1) | imm[11] = (4<<1)|0 = 8.
+%((| base (<< %rv_reg(rb) 20) (<< %rv_reg(ra) 15) (<< 8 7)))
+%endm
+
+%macro rv_addi(rd, ra, imm12)
+%rv_i_type(0x00000013, rd, ra, imm12)
+%endm
+
+%macro rv_ld(rd, ra, imm12)
+%rv_i_type(0x00003003, rd, ra, imm12)
+%endm
+
+%macro rv_sd(rs, ra, imm12)
+%rv_s_type(0x00003023, rs, ra, imm12)
+%endm
+
+%macro rv_lbu(rd, ra, imm12)
+%rv_i_type(0x00004003, rd, ra, imm12)
+%endm
+
+%macro rv_sb(rs, ra, imm12)
+%rv_s_type(0x00000023, rs, ra, imm12)
+%endm
+
+%macro rv_lwu(rd, ra, imm12)
+%rv_i_type(0x00006003, rd, ra, imm12)
+%endm
+
+%macro rv_mov_rr(dst, src)
+%rv_addi(dst, src, 0)
+%endm
+
+%macro rv_slli(rd, ra, shamt)
+%((| 0x00001013 (<< (& shamt 0x3F) 20) (<< %rv_reg(ra) 15) (<< %rv_reg(rd) 7)))
+%endm
+
+%macro rv_srli(rd, ra, shamt)
+%((| 0x00005013 (<< (& shamt 0x3F) 20) (<< %rv_reg(ra) 15) (<< %rv_reg(rd) 7)))
+%endm
+
+%macro rv_srai(rd, ra, shamt)
+%((| 0x40005013 (<< (& shamt 0x3F) 20) (<< %rv_reg(ra) 15) (<< %rv_reg(rd) 7)))
+%endm
+
+%macro rv_jalr(rd, rs, imm12)
+%((| 0x00000067 (<< (& imm12 0xFFF) 20) (<< %rv_reg(rs) 15) (<< %rv_reg(rd) 7)))
+%endm
+
+%macro rv_ecall()
+%(0x00000073)
+%endm
+
+# 64-bit literal-pool prefix for LI:
+# auipc rd, 0 pc-relative base
+# ld rd, 12(rd) load 8-byte literal from pc+12
+# jal x0, 12 offset is from the jal itself: 4 (jal) + 8 (literal).
+# The 8 bytes that follow in source become the literal.
+%macro rv_lit64_prefix(rd)
+%((| 0x00000017 (<< %rv_reg(rd) 7)))
+%((| 0x00C03003 (<< %rv_reg(rd) 15) (<< %rv_reg(rd) 7)))
+%(0x00C0006F)
+%endm
+
+# 32-bit literal-pool prefix for LA / LA_BR:
+# auipc rd, 0
+# lwu rd, 12(rd) zero-extend 4-byte literal from pc+12
+# jal x0, 8 offset is from the jal itself: 4 (jal) + 4 (literal).
+# lwu zero-extends into the full 64-bit register, so 4 bytes is enough for
+# any address in the stage0 layout. Lets source use `&label` directly
+# without padding to 8 bytes.
+%macro rv_lit32_prefix(rd)
+%((| 0x00000017 (<< %rv_reg(rd) 7)))
+%((| 0x00C06003 (<< %rv_reg(rd) 15) (<< %rv_reg(rd) 7)))
+%(0x0080006F)
+%endm
+
+# Memory op fallback: offsets outside the signed 12-bit range would need
+# the offset materialized in `scratch` via a LUI+ADDI sequence. For stage0
+# programs the curated offsets stay inside -2048..2047, so the fallback is unused;
+# still emit a defensive failure to flag any future overflow.
+# (In practice none of the LD/ST off values in p1_gen.py exceed the
+# signed 12-bit range, so no fallback path is wired in here.)
+
+# ---- P1 register-register op lowering -----------------------------------
+
+%macro rv_rrr_ADD(rd, ra, rb)
+%rv_r_type(0x00000033, rd, ra, rb)
+%endm
+%macro rv_rrr_SUB(rd, ra, rb)
+%rv_r_type(0x40000033, rd, ra, rb)
+%endm
+%macro rv_rrr_AND(rd, ra, rb)
+%rv_r_type(0x00007033, rd, ra, rb)
+%endm
+%macro rv_rrr_OR(rd, ra, rb)
+%rv_r_type(0x00006033, rd, ra, rb)
+%endm
+%macro rv_rrr_XOR(rd, ra, rb)
+%rv_r_type(0x00004033, rd, ra, rb)
+%endm
+%macro rv_rrr_SHL(rd, ra, rb)
+%rv_r_type(0x00001033, rd, ra, rb)
+%endm
+%macro rv_rrr_SHR(rd, ra, rb)
+%rv_r_type(0x00005033, rd, ra, rb)
+%endm
+%macro rv_rrr_SAR(rd, ra, rb)
+%rv_r_type(0x40005033, rd, ra, rb)
+%endm
+%macro rv_rrr_MUL(rd, ra, rb)
+%rv_r_type(0x02000033, rd, ra, rb)
+%endm
+%macro rv_rrr_DIV(rd, ra, rb)
+%rv_r_type(0x02004033, rd, ra, rb)
+%endm
+%macro rv_rrr_REM(rd, ra, rb)
+%rv_r_type(0x02006033, rd, ra, rb)
+%endm
+
+%macro rv_rrr_op(op, rd, ra, rb)
+%rv_rrr_##op(rd, ra, rb)
+%endm
+
+# ---- P1 operation lowering -----------------------------------------------
+
+%macro p1_li(rd)
+%rv_lit64_prefix(rd)
+%endm
+
+%macro p1_la(rd)
+%rv_lit32_prefix(rd)
+%endm
+
+%macro p1_labr()
+%rv_lit32_prefix(br)
+%endm
+
+%macro p1_mov(rd, rs)
+%rv_mov_rr(rd, rs)
+%endm
+
+%macro p1_rrr(op, rd, ra, rb)
+%rv_rrr_op(op, rd, ra, rb)
+%endm
+
+%macro p1_addi(rd, ra, imm)
+%rv_addi(rd, ra, imm)
+%endm
+
+%macro p1_logi_ANDI(rd, ra, imm)
+%rv_i_type(0x00007013, rd, ra, imm)
+%endm
+%macro p1_logi_ORI(rd, ra, imm)
+%rv_i_type(0x00006013, rd, ra, imm)
+%endm
+%macro p1_logi(op, rd, ra, imm)
+%p1_logi_##op(rd, ra, imm)
+%endm
+
+%macro p1_shifti_SHLI(rd, ra, imm)
+%rv_slli(rd, ra, imm)
+%endm
+%macro p1_shifti_SHRI(rd, ra, imm)
+%rv_srli(rd, ra, imm)
+%endm
+%macro p1_shifti_SARI(rd, ra, imm)
+%rv_srai(rd, ra, imm)
+%endm
+%macro p1_shifti(op, rd, ra, imm)
+%p1_shifti_##op(rd, ra, imm)
+%endm
+
+%macro p1_mem_LD(rt, rn, off)
+%rv_ld(rt, rn, off)
+%endm
+%macro p1_mem_ST(rt, rn, off)
+%rv_sd(rt, rn, off)
+%endm
+%macro p1_mem_LB(rt, rn, off)
+%rv_lbu(rt, rn, off)
+%endm
+%macro p1_mem_SB(rt, rn, off)
+%rv_sb(rt, rn, off)
+%endm
+%macro p1_mem(op, rt, rn, off)
+%p1_mem_##op(rt, rn, off)
+%endm
+
+%macro p1_ldarg(rd, slot)
+%rv_ld(scratch, sp, 8)
+%rv_ld(rd, scratch, (+ 16 (* 8 slot)))
+%endm
+
+%macro p1_b()
+%rv_jalr(zero, br, 0)
+%endm
+
+%macro p1_br(rs)
+%rv_jalr(zero, rs, 0)
+%endm
+
+%macro p1_call()
+%rv_jalr(ra, br, 0)
+%endm
+
+%macro p1_callr(rs)
+%rv_jalr(ra, rs, 0)
+%endm
+
+%macro p1_ret()
+%rv_jalr(zero, ra, 0)
+%endm
+
+%macro p1_leave()
+%rv_ld(ra, sp, 0)
+%rv_ld(fp, sp, 8)
+%rv_mov_rr(sp, fp)
+%endm
+
+%macro p1_tail()
+%p1_leave()
+%rv_jalr(zero, br, 0)
+%endm
+
+%macro p1_tailr(rs)
+%p1_leave()
+%rv_jalr(zero, rs, 0)
+%endm
+
+# Conditional branch: emit a native B-type with the INVERTED condition
+# and a +8 offset. When the P1 condition is false, the native branch
+# hops over the following `%p1_b()` jalr (one insn below) and falls
+# through; when true, the jalr takes the P1 branch through `br`.
+%macro p1_condb_BEQ(ra, rb)
+%rv_b_type_skip8(0x00001063, ra, rb)
+%p1_b()
+%endm
+%macro p1_condb_BNE(ra, rb)
+%rv_b_type_skip8(0x00000063, ra, rb)
+%p1_b()
+%endm
+%macro p1_condb_BLT(ra, rb)
+%rv_b_type_skip8(0x00005063, ra, rb)
+%p1_b()
+%endm
+%macro p1_condb_BLTU(ra, rb)
+%rv_b_type_skip8(0x00007063, ra, rb)
+%p1_b()
+%endm
+%macro p1_condb(op, ra, rb)
+%p1_condb_##op(ra, rb)
+%endm
+
+%macro p1_condbz_BEQZ(ra)
+%rv_b_type_skip8(0x00001063, ra, zero)
+%p1_b()
+%endm
+%macro p1_condbz_BNEZ(ra)
+%rv_b_type_skip8(0x00000063, ra, zero)
+%p1_b()
+%endm
+%macro p1_condbz_BLTZ(ra)
+%rv_b_type_skip8(0x00005063, ra, zero)
+%p1_b()
+%endm
+%macro p1_condbz(op, ra)
+%p1_condbz_##op(ra)
+%endm
+
+%macro p1_enter(size)
+%rv_addi(sp, sp, (- 0 (& (+ (+ 16 size) 15) -16)))
+%rv_sd(ra, sp, 0)
+%rv_addi(fp, sp, (& (+ (+ 16 size) 15) -16))
+%rv_sd(fp, sp, 8)
+%endm
+
+%macro p1_entry()
+# :_start stub per the P1v2 program-entry model. Linux riscv64 puts argc
+# at [sp] and argv starting at [sp+8], matching the generic SysV entry
+# stack. Load argc into a0, compute &argv[0] into a1, call p1_main under
+# the one-word direct-result convention, then issue sys_exit with the
+# returned status.
+:_start
+%rv_ld(a0, sp, 0)
+%rv_addi(a1, sp, 8)
+%rv_lit32_prefix(br)
+&p1_main
+%rv_jalr(ra, br, 0)
+%rv_addi(a7, zero, 93)
+%rv_ecall()
+%endm
+
+%macro p1_syscall()
+# P1: a0=number, a1,a2,a3,t0,s0,s1 = args 0..5.
+# Linux riscv64: a7=number, a0..a5 = args 0..5, return in a0.
+# SYSCALL clobbers only P1 a0; restore a1/a2/a3 after ecall.
+# Native a4/a5 (x14/x15) aren't P1-exposed; we use them as syscall arg
+# slots and don't need to save them.
+%rv_mov_rr(save0, a1)
+%rv_mov_rr(save1, a2)
+%rv_mov_rr(save2, a3)
+%rv_mov_rr(a7, a0)
+%rv_mov_rr(a0, save0)
+%rv_mov_rr(a1, save1)
+%rv_mov_rr(a2, save2)
+%rv_mov_rr(a3, t0)
+%rv_mov_rr(a4, s0)
+%rv_mov_rr(a5, s1)
+%rv_ecall()
+%rv_mov_rr(a1, save0)
+%rv_mov_rr(a2, save1)
+%rv_mov_rr(a3, save2)
+%endm
+
+# ---- Linux riscv64 syscall number data words -----------------------------
+
+%macro p1_sys_read()
+$(63)
+%endm
+%macro p1_sys_write()
+$(64)
+%endm
+%macro p1_sys_close()
+$(57)
+%endm
+%macro p1_sys_openat()
+$(56)
+%endm
+%macro p1_sys_exit()
+$(93)
+%endm
+%macro p1_sys_clone()
+$(220)
+%endm
+%macro p1_sys_execve()
+$(221)
+%endm
+%macro p1_sys_waitid()
+$(95)
+%endm
diff --git a/tests/p1/p1-aliasing.P1 b/tests/p1/p1-aliasing.P1
@@ -0,0 +1,72 @@
+# tests/p1/p1-aliasing.P1 -- regression coverage for the amd64 backend's
+# register-aliasing corner cases. Each test exercises a native encoding
+# that, if mishandled, clobbers a non-rD P1 register and produces a value
+# that differs from what aarch64 / riscv64 return. The combined result
+# is written to stdout as a single byte + newline so the harness can
+# diff it against `*` (0x2A = decimal 42).
+#
+# Test 1: ANDI with imm > 127. amd64 `83 /4 ib` sign-extends its byte
+# immediate to 64 bits, so naive `ANDI rd, 255` AND's against -1
+# (= all ones) rather than 255. Fixed by widening to `81 /4 id` for
+# imms outside [-128, 127].
+#
+# Test 2: SHL / SHR / SAR (reg count) where rd == P1 a3 (native rcx).
+# amd64 stages the count through rcx; the naive ordering restores rcx
+# from its save-slot AFTER writing rd, clobbering the result when
+# rd == a3.
+#
+# Test 3: DIV / REM where rd == rb == P1 a2 (native rdx). amd64's
+# idiv writes rdx, and the save-slot restore has to happen BEFORE the
+# mov-rd so that when rd == a2 the restore doesn't overwrite the
+# quotient. Plus, the divisor has to be stashed before cqo overwrites
+# rdx, or `idiv rdx` divides by the sign-extension of rax.
+
+:p1_main
+ %enter(0)
+
+ # Test 1: ANDI imm=255. 0x2345 & 0xFF = 0x45 (69).
+ %li(a0) $(0x2345)
+ %andi(a0, a0, 255)
+ %mov(s0, a0)
+ %addi(s0, s0, -61) # s0 = 8
+
+ # Test 2: SHL with rd aliasing rb (both a3), and ra == a0.
+ # a0=1, a3=3. Correct: a3 = 1 << 3 = 8. Buggy (rcx restore wins): a3 = 3.
+ %li(a0) $(1)
+ %li(a3) $(3)
+ %shl(a3, a0, a3)
+ %sub(s0, a3, s0) # s0 = a3 - 8. Correct: 0. Buggy: -5.
+
+ # Test 3: DIV with rd aliasing rb (both a2), and ra == a0.
+ # a0=182, a2=7. Correct: a2 = 182 / 7 = 26. Buggy (idiv uses rdx after
+ # cqo; or mov-rdx-restore clobbers quotient): a2 = 7 or 1.
+ %li(a0) $(182)
+ %li(a2) $(7)
+ %div(a2, a0, a2)
+ %add(s0, s0, a2) # s0 += a2. Correct: 26. Buggy: varies.
+
+ # Compose the output byte. Add 16 so the correct result maps to '*'
+ # (0x2A = 42). Wrong results land on different bytes.
+ %addi(s0, s0, 16)
+
+ # Store s0 as the first byte of msg_buf, then write msg_buf[0..2].
+ %la(t0) &msg_buf
+ %sb(s0, t0, 0)
+
+ %li(a0) %sys_write()
+ %li(a1) $(1)
+ %la(a2) &msg_buf
+ %li(a3) $(2)
+ %syscall()
+
+ %li(a0) $(0)
+ %leave()
+ %ret()
+
+# Two-byte output scratch: [0] = computed byte, [1] = newline. The space
+# placeholder gets overwritten by SB before the write syscall.
+:msg_buf
+"
+"
+
+:ELF_end
diff --git a/tests/p1/p1-aliasing.expected b/tests/p1/p1-aliasing.expected
@@ -0,0 +1 @@
+*
diff --git a/tests/p1/p1-call.P1 b/tests/p1/p1-call.P1
@@ -0,0 +1,48 @@
+# tests/p1/p1-call.P1 -- exercise ENTER, LEAVE, CALL, RET, MOV, ADDI
+# across a nontrivial P1 program. Calls a `write_msg` subroutine twice
+# and returns argc + 1 as the exit status so we also verify the argv-
+# aware _start stub (argc is always >= 1).
+
+:p1_main
+ # Build a frame and stash argc in s0 so write_msg can trash a0.
+ %enter(0)
+ %mov(s0, a0)
+
+ # write_msg("A\n")
+ %la(a0) &msg_a
+ %li(a1) $(2)
+ %la_br() &write_msg
+ %call()
+
+ # write_msg("BC\n")
+ %la(a0) &msg_bc
+ %li(a1) $(3)
+ %la_br() &write_msg
+ %call()
+
+ # exit status = argc + 1 (so it's always >= 2).
+ %addi(a0, s0, 1)
+ %leave()
+ %ret()
+
+# write_msg(buf=a0, len=a1) -> void
+:write_msg
+ %enter(0)
+ # Shift (a0, a1) -> (a2, a3); fd goes in a1, number in a0.
+ %mov(a2, a0)
+ %mov(a3, a1)
+ %li(a0) %sys_write()
+ %li(a1) $(1)
+ %syscall()
+ %leave()
+ %ret()
+
+:msg_a
+"A
+"
+
+:msg_bc
+"BC
+"
+
+:ELF_end
diff --git a/tests/p1/p1-call.expected b/tests/p1/p1-call.expected
@@ -0,0 +1,2 @@
+A
+BC
diff --git a/tests/p1/test.sh b/tests/p1/test.sh
@@ -1,59 +1,53 @@
#!/bin/sh
-## tests/p1/test.sh — run the P1-language test suite.
+## tests/p1/test.sh -- run the P1-language test suite.
##
-## A P1 fixture is `<name>.P1`. The runner concatenates the P1 frontend
-## (p1/P1-aarch64.M1pp + p1/P1.M1pp) with the fixture, expands the result
-## with the M1 build of m1pp, then hands the resulting .M1 source to
-## m1pp/build.sh which lints / preprocesses / M0-assembles / ELF-links it
-## into an aarch64 binary. The binary is executed inside the standard
-## distroless-busybox container; its stdout is diffed against
-## `<name>.expected`.
+## A P1 fixture is `<name>.P1`. For each fixture with a `<name>.expected`
+## sibling, the runner builds the program for every backend arch via
+## m1pp/build-p1.sh (which handles m1pp expansion, M1 stringification, and
+## hex2 linking against the per-arch ELF header), runs it under the
+## matching podman platform, and diffs stdout against the expectation.
+## Cross-arch consistency falls out by transitivity.
##
## Filenames starting with `_` are skipped (parked).
##
-## Usage: tests/p1/test.sh [fixture-name ...]
-## No args: every non-`_` fixture under tests/p1/.
+## Usage: tests/p1/test.sh [--arch ARCH] [fixture-name ...]
+## No args: every non-`_` fixture under tests/p1/ for every arch.
+## --arch: restrict to a single arch (aarch64 | amd64 | riscv64).
set -eu
REPO=$(cd "$(dirname "$0")/../.." && pwd)
-PLATFORM=linux/arm64
-IMAGE=localhost/distroless-busybox:latest
-
-EXPANDER_BIN=build/m1pp/m1pp
-EXPANDER_SRC=m1pp/m1pp.M1
-EXPANDER_DEFS=build/p1v2/aarch64/p1_aarch64.M1
-EXPANDER_BUILT=0
-
cd "$REPO"
-## Rebuild m1pp only if its inputs are newer than the cached binary.
-## The full build is ~110s; skipping it when unchanged is the iteration win.
-expander_up_to_date() {
- [ -x "$EXPANDER_BIN" ] || return 1
- [ "$EXPANDER_BIN" -nt "$EXPANDER_SRC" ] || return 1
- [ "$EXPANDER_BIN" -nt "$EXPANDER_DEFS" ] || return 1
- return 0
-}
-
-build_expander() {
- if [ "$EXPANDER_BUILT" = 0 ]; then
- if expander_up_to_date; then
- echo " (m1pp up to date, skipping rebuild)"
- else
- sh m1pp/build.sh "$EXPANDER_SRC" "$EXPANDER_BIN" >/dev/null 2>&1 || {
- echo "FATAL: failed to build $EXPANDER_SRC" >&2
- sh m1pp/build.sh "$EXPANDER_SRC" "$EXPANDER_BIN" 2>&1 | sed 's/^/ /' >&2
- exit 1
- }
- fi
- EXPANDER_BUILT=1
- fi
-}
+ALL_ARCHES="aarch64 amd64 riscv64"
+
+PLATFORM_aarch64=linux/arm64
+PLATFORM_amd64=linux/amd64
+PLATFORM_riscv64=linux/riscv64
+
+IMAGE_aarch64=public.ecr.aws/docker/library/alpine@sha256:378c4c5418f7493bd500ad21ffb43818d0689daaad43e3261859fb417d1481a0
+IMAGE_amd64=public.ecr.aws/docker/library/alpine@sha256:4d889c14e7d5a73929ab00be2ef8ff22437e7cbc545931e52554a7b00e123d8b
+IMAGE_riscv64=public.ecr.aws/docker/library/alpine@sha256:667d07bf2f6239f094f64b5682c8ffbe24c9f3139b1fb854f85caf931a3d7439
+
+ARCHES="$ALL_ARCHES"
+NAMES=""
+while [ "$#" -gt 0 ]; do
+ case "$1" in
+ --arch)
+ shift
+ ARCHES="$1"
+ ;;
+ --arch=*)
+ ARCHES="${1#--arch=}"
+ ;;
+ *)
+ NAMES="$NAMES $1"
+ ;;
+ esac
+ shift
+done
-if [ "$#" -gt 0 ]; then
- NAMES="$*"
-else
+if [ -z "$NAMES" ]; then
NAMES=$(ls tests/p1/ 2>/dev/null \
| sed -n 's/^\([^_][^.]*\)\.P1$/\1/p' \
| sort -u)
@@ -79,50 +73,41 @@ for name in $NAMES; do
continue
fi
- build_expander
+ expected_content=$(cat "$expected")
- work=build/p1-tests/$name
- mkdir -p "$work"
- combined=$work/combined.M1pp
- expanded=$work/$name.M1
- binary=$work/$name
+ for arch in $ARCHES; do
+ eval platform=\$PLATFORM_$arch
+ eval image=\$IMAGE_$arch
+ if [ -z "$platform" ]; then
+ echo " SKIP [$arch] $name (unknown arch)"
+ continue
+ fi
- cat p1/P1-aarch64.M1pp p1/P1.M1pp "$fixture" > "$combined"
+ binary=build/p1-tests/$arch/$name
- if ! podman run --rm --pull=never --platform "$PLATFORM" \
- -v "$REPO":/work -w /work "$IMAGE" \
- "./$EXPANDER_BIN" "$combined" "$expanded" >/dev/null 2>&1; then
- echo " FAIL $name (m1pp expansion failed)"
- podman run --rm --pull=never --platform "$PLATFORM" \
- -v "$REPO":/work -w /work "$IMAGE" \
- "./$EXPANDER_BIN" "$combined" "$expanded" 2>&1 | sed 's/^/ /'
- fail=$((fail + 1))
- continue
- fi
+ if ! sh m1pp/build-p1.sh "$arch" "$fixture" "$binary" >/dev/null 2>&1; then
+ echo " FAIL [$arch] $name (build failed)"
+ sh m1pp/build-p1.sh "$arch" "$fixture" "$binary" 2>&1 | sed 's/^/ /'
+ fail=$((fail + 1))
+ continue
+ fi
- if ! sh m1pp/build.sh "$expanded" "$binary" >/dev/null 2>&1; then
- echo " FAIL $name (m1pp/build.sh failed)"
- sh m1pp/build.sh "$expanded" "$binary" 2>&1 | sed 's/^/ /'
- fail=$((fail + 1))
- continue
- fi
+ actual=$(podman run --rm --pull=never --platform "$platform" \
+ -v "$REPO":/work -w /work "$image" \
+ "./$binary" 2>&1 || true)
- actual=$(podman run --rm --pull=never --platform "$PLATFORM" \
- -v "$REPO":/work -w /work "$IMAGE" \
- "./$binary" 2>&1 || true)
- expected_content=$(cat "$expected")
-
- if [ "$actual" = "$expected_content" ]; then
- echo " PASS $name"
- pass=$((pass + 1))
- else
- echo " FAIL $name"
- echo " --- expected ---"
- printf '%s\n' "$expected_content" | sed 's/^/ /'
- echo " --- actual ---"
- printf '%s\n' "$actual" | sed 's/^/ /'
- fail=$((fail + 1))
- fi
+ if [ "$actual" = "$expected_content" ]; then
+ echo " PASS [$arch] $name"
+ pass=$((pass + 1))
+ else
+ echo " FAIL [$arch] $name"
+ echo " --- expected ---"
+ printf '%s\n' "$expected_content" | sed 's/^/ /'
+ echo " --- actual ---"
+ printf '%s\n' "$actual" | sed 's/^/ /'
+ fail=$((fail + 1))
+ fi
+ done
done
echo "$pass passed, $fail failed"