commit 7e769ab87300ff635a3aaf9227db798f9a642ad6
parent b024f5b4fca4986de572255327cb70230dd51001
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 24 Apr 2026 15:03:38 -0700
Simplify P1-amd64.M1pp helpers; drop native reg tables from P1.md
Introduces `amd_maybe_rex_b` to collapse the _lo/_hi/dispatch triplets
on push/pop/jmp_r/call_r/mov_imm32_prefix, and a per-name `amd_is_sp`
predicate to collapse the 48-macro per-(op,base) `p1_mem` fan-out into
a single dispatcher. Native register assignment is unchanged (still
C-ABI-aligned for args 0-3).
Also removes the per-arch native register mapping tables from
docs/P1.md — the P1↔native mapping is backend-internal and was
creating the illusion of a portable contract. Target-notes bullets
generalized accordingly.
Diffstat:
| M | docs/P1.md | | | 58 | +++++++++++----------------------------------------------- |
| M | p1/P1-amd64.M1pp | | | 309 | ++++++++++++++++++++++--------------------------------------------------------- |
2 files changed, 95 insertions(+), 272 deletions(-)
diff --git a/docs/P1.md b/docs/P1.md
@@ -512,53 +512,17 @@ runtime interface document.
- `a0` is argument 0, the one-word direct return-value register, the low word
of the two-word direct return pair, and the indirect-result buffer pointer.
-- On aarch64, riscv64, arm32, and rv32, that matches the native integer/pointer
- ABI directly.
-- On amd64, the backend must translate between portable `a0` and native
- return register `rax` at call and return boundaries. For the two-word direct
- return, the backend must also translate `a1` against native `rdx`.
-- On amd64, `LDARG` must account for the return address pushed by the native
- `call` instruction. On aarch64, riscv64, arm32, and rv32, it maps more
- directly to the entry `sp` plus the backend's standard frame/header policy.
+- Some target call conventions hand results back in registers that differ
+  from the native equivalent of `a0` or `a1`; such targets must translate
+  between portable and native return registers at call and return boundaries.
+- On targets whose native call instruction pushes a return address, `LDARG`
+ must account for that slot. On other targets, `LDARG` maps more directly
+ to entry `sp` plus the backend's standard frame/header policy.
- `br` is implemented as a dedicated hidden native register on every target.
-- On arm32, `t1` and `t2` map to natively callee-saved registers; the backend
- is responsible for preserving them across function boundaries in accordance
- with the native ABI, even though P1 treats them as caller-saved.
+- Each backend chooses the native register that holds each P1 register.
+  Those choices are backend-private and may differ from native ABI
+  conventions; backends may preserve P1 caller-saved registers that happen
+  to land in natively callee-saved registers as a matter of backend
+  policy.
- Frame-pointer use is backend policy, not part of the P1 v2 architectural
register set.
-
-### Native register mapping
-
-#### 64-bit targets
-
-| P1 | amd64 | aarch64 | riscv64 |
-|------|-------|---------|---------|
-| `a0` | `rdi` | `x0` | `a0` |
-| `a1` | `rsi` | `x1` | `a1` |
-| `a2` | `rdx` | `x2` | `a2` |
-| `a3` | `rcx` | `x3` | `a3` |
-| `t0` | `r10` | `x9` | `t0` |
-| `t1` | `r11` | `x10` | `t1` |
-| `t2` | `r8` | `x11` | `t2` |
-| `s0` | `rbx` | `x19` | `s1` |
-| `s1` | `r12` | `x20` | `s2` |
-| `s2` | `r13` | `x21` | `s3` |
-| `s3` | `r14` | `x22` | `s4` |
-| `sp` | `rsp` | `sp` | `sp` |
-
-#### 32-bit targets
-
-| P1 | arm32 | rv32 |
-|------|-------|-------|
-| `a0` | `r0` | `a0` |
-| `a1` | `r1` | `a1` |
-| `a2` | `r2` | `a2` |
-| `a3` | `r3` | `a3` |
-| `t0` | `r12` | `t0` |
-| `t1` | `r6` | `t1` |
-| `t2` | `r7` | `t2` |
-| `s0` | `r4` | `s1` |
-| `s1` | `r5` | `s2` |
-| `s2` | `r8` | `s3` |
-| `s3` | `r9` | `s4` |
-| `sp` | `sp` | `sp` |
diff --git a/p1/P1-amd64.M1pp b/p1/P1-amd64.M1pp
@@ -1,15 +1,16 @@
# P1-amd64.M1pp -- P1v2 amd64 backend expressed in m1macro.
#
-# Mirrors p1/P1-aarch64.M1pp; native register picks follow docs/P1.md's
-# 64-bit mapping table. amd64 is variable-length, so every op emits its
-# prefix bytes (REX / opcode) directly via the m1pp `!(…)` single-byte
-# builtin; 4-byte immediates still go through `%(…)`.
+# Mirrors p1/P1-aarch64.M1pp. Native register mapping is backend-private;
+# see the amd_reg_* table below. amd64 is variable-length, so every op
+# emits its prefix bytes (REX / opcode) directly via the m1pp `!(…)`
+# single-byte builtin; 4-byte immediates still go through `%(…)`.
#
# Hidden backend regs:
-# br = r15 -- branch-target scratch
-# scratch = r9 -- per-expansion scratch (rcx shift alias save, etc.)
-# rax -- syscall number / return slot + spill buffer
-# rbp -- spill buffer when rcx needs saving for SHL/SHR/SAR
+# br = r15 -- branch-target mechanism
+# scratch = r9 -- per-expansion scratch (e.g. rcx save slot for SHIFT)
+# rax -- syscall number / return slot + retaddr spill
+# rbp -- spill slot for rcx / rdx when SHIFT and DIV/REM need
+# to preserve a3 / a2
# ---- Native register numbers --------------------------------------------
#
@@ -111,8 +112,69 @@
%amd_reg_##r()
%endm
+# Per-P1-name `is this sp?` predicate. Used by p1_mem to decide whether
+# the supplied offset needs the +16 frame-header adjustment.
+
+%macro amd_is_sp_a0()
+0
+%endm
+%macro amd_is_sp_a1()
+0
+%endm
+%macro amd_is_sp_a2()
+0
+%endm
+%macro amd_is_sp_a3()
+0
+%endm
+%macro amd_is_sp_t0()
+0
+%endm
+%macro amd_is_sp_t1()
+0
+%endm
+%macro amd_is_sp_t2()
+0
+%endm
+%macro amd_is_sp_s0()
+0
+%endm
+%macro amd_is_sp_s1()
+0
+%endm
+%macro amd_is_sp_s2()
+0
+%endm
+%macro amd_is_sp_s3()
+0
+%endm
+%macro amd_is_sp_sp()
+1
+%endm
+
+%macro amd_is_sp(r)
+%amd_is_sp_##r()
+%endm
+
# ---- REX / ModRM helpers ------------------------------------------------
+# Short one-byte REX.B prefix (no W). Used by opcodes that don't need 64-bit
+# width — push/pop/jmp r/call r/mov r,imm32 — when the target reg is r8-r15.
+%macro amd_rex_b_short()
+!(0x41)
+%endm
+
+# No-op sentinel for %select branches that shouldn't emit anything.
+%macro amd_nobytes()
+%endm
+
+# Emit REX.B (0x41) iff r is r8-r15. Used by the short-prefix opcodes above.
+%macro amd_maybe_rex_b(r)
+%select((>= %amd_reg(r) 8),
+ %amd_rex_b_short(),
+ %amd_nobytes())
+%endm
+
# REX.WB: W=1 for 64-bit, B=(r>>3) to extend ModRM.rm / SIB.base.
%macro amd_rex_wb(r)
!((| 0x48 (& (>> %amd_reg(r) 3) 1)))
@@ -256,28 +318,12 @@
# push / pop r64. 50+r / 58+r; REX.B=0x41 if r8-r15.
%macro amd_push(r)
-%select((>= %amd_reg(r) 8),
- %amd_push_hi(r),
- %amd_push_lo(r))
-%endm
-%macro amd_push_lo(r)
-!((| 0x50 %amd_reg(r)))
-%endm
-%macro amd_push_hi(r)
-!(0x41)
+%amd_maybe_rex_b(r)
!((| 0x50 (& %amd_reg(r) 7)))
%endm
%macro amd_pop(r)
-%select((>= %amd_reg(r) 8),
- %amd_pop_hi(r),
- %amd_pop_lo(r))
-%endm
-%macro amd_pop_lo(r)
-!((| 0x58 %amd_reg(r)))
-%endm
-%macro amd_pop_hi(r)
-!(0x41)
+%amd_maybe_rex_b(r)
!((| 0x58 (& %amd_reg(r) 7)))
%endm
@@ -285,15 +331,7 @@
# REX.B=0x41. The 4-byte literal the caller emits is zero-extended into
# the full 64-bit register, matching the LA / LA_BR literal-pool contract.
%macro amd_mov_imm32_prefix(rd)
-%select((>= %amd_reg(rd) 8),
- %amd_mov_imm32_prefix_hi(rd),
- %amd_mov_imm32_prefix_lo(rd))
-%endm
-%macro amd_mov_imm32_prefix_lo(rd)
-!((| 0xB8 %amd_reg(rd)))
-%endm
-%macro amd_mov_imm32_prefix_hi(rd)
-!(0x41)
+%amd_maybe_rex_b(rd)
!((| 0xB8 (& %amd_reg(rd) 7)))
%endm
@@ -346,31 +384,13 @@
# Jcc rel8 -- 7x ib
%macro amd_jmp_r(r)
-%select((>= %amd_reg(r) 8),
- %amd_jmp_r_hi(r),
- %amd_jmp_r_lo(r))
-%endm
-%macro amd_jmp_r_lo(r)
-!(0xFF)
-!((| 0xE0 (& %amd_reg(r) 7)))
-%endm
-%macro amd_jmp_r_hi(r)
-!(0x41)
+%amd_maybe_rex_b(r)
!(0xFF)
!((| 0xE0 (& %amd_reg(r) 7)))
%endm
%macro amd_call_r(r)
-%select((>= %amd_reg(r) 8),
- %amd_call_r_hi(r),
- %amd_call_r_lo(r))
-%endm
-%macro amd_call_r_lo(r)
-!(0xFF)
-!((| 0xD0 (& %amd_reg(r) 7)))
-%endm
-%macro amd_call_r_hi(r)
-!(0x41)
+%amd_maybe_rex_b(r)
!(0xFF)
!((| 0xD0 (& %amd_reg(r) 7)))
%endm
@@ -661,178 +681,17 @@
%p1_shifti_##op(rd, ra, imm)
%endm
-# p1_mem dispatches on (op, base). When the base is sp, portable sp is the
-# frame-local base — 16 bytes above native rsp — so the physical access needs
-# the supplied portable offset plus 16. For any other base, the portable and
-# native offset coincide. Internal backend callers that need raw native-rsp
-# access (p1_enter, p1_eret, _start stub, p1_ldarg, p1_syscall) use
-# amd_mem_LD/amd_mem_ST directly and bypass this translation.
-
-%macro p1_mem_LD_sp(rt, off)
-%amd_mem_LD(rt, sp, (+ off 16))
-%endm
-%macro p1_mem_ST_sp(rt, off)
-%amd_mem_ST(rt, sp, (+ off 16))
-%endm
-%macro p1_mem_LB_sp(rt, off)
-%amd_mem_LB(rt, sp, (+ off 16))
-%endm
-%macro p1_mem_SB_sp(rt, off)
-%amd_mem_SB(rt, sp, (+ off 16))
-%endm
-
-%macro p1_mem_LD(rt, rn, off)
-%p1_mem_LD_##rn(rt, off)
-%endm
-%macro p1_mem_ST(rt, rn, off)
-%p1_mem_ST_##rn(rt, off)
-%endm
-%macro p1_mem_LB(rt, rn, off)
-%p1_mem_LB_##rn(rt, off)
-%endm
-%macro p1_mem_SB(rt, rn, off)
-%p1_mem_SB_##rn(rt, off)
-%endm
-
-# Non-sp bases for each op -- plain native load/store with portable offset.
-%macro p1_mem_LD_a0(rt, off)
-%amd_mem_LD(rt, a0, off)
-%endm
-%macro p1_mem_LD_a1(rt, off)
-%amd_mem_LD(rt, a1, off)
-%endm
-%macro p1_mem_LD_a2(rt, off)
-%amd_mem_LD(rt, a2, off)
-%endm
-%macro p1_mem_LD_a3(rt, off)
-%amd_mem_LD(rt, a3, off)
-%endm
-%macro p1_mem_LD_t0(rt, off)
-%amd_mem_LD(rt, t0, off)
-%endm
-%macro p1_mem_LD_t1(rt, off)
-%amd_mem_LD(rt, t1, off)
-%endm
-%macro p1_mem_LD_t2(rt, off)
-%amd_mem_LD(rt, t2, off)
-%endm
-%macro p1_mem_LD_s0(rt, off)
-%amd_mem_LD(rt, s0, off)
-%endm
-%macro p1_mem_LD_s1(rt, off)
-%amd_mem_LD(rt, s1, off)
-%endm
-%macro p1_mem_LD_s2(rt, off)
-%amd_mem_LD(rt, s2, off)
-%endm
-%macro p1_mem_LD_s3(rt, off)
-%amd_mem_LD(rt, s3, off)
-%endm
-
-%macro p1_mem_ST_a0(rt, off)
-%amd_mem_ST(rt, a0, off)
-%endm
-%macro p1_mem_ST_a1(rt, off)
-%amd_mem_ST(rt, a1, off)
-%endm
-%macro p1_mem_ST_a2(rt, off)
-%amd_mem_ST(rt, a2, off)
-%endm
-%macro p1_mem_ST_a3(rt, off)
-%amd_mem_ST(rt, a3, off)
-%endm
-%macro p1_mem_ST_t0(rt, off)
-%amd_mem_ST(rt, t0, off)
-%endm
-%macro p1_mem_ST_t1(rt, off)
-%amd_mem_ST(rt, t1, off)
-%endm
-%macro p1_mem_ST_t2(rt, off)
-%amd_mem_ST(rt, t2, off)
-%endm
-%macro p1_mem_ST_s0(rt, off)
-%amd_mem_ST(rt, s0, off)
-%endm
-%macro p1_mem_ST_s1(rt, off)
-%amd_mem_ST(rt, s1, off)
-%endm
-%macro p1_mem_ST_s2(rt, off)
-%amd_mem_ST(rt, s2, off)
-%endm
-%macro p1_mem_ST_s3(rt, off)
-%amd_mem_ST(rt, s3, off)
-%endm
-
-%macro p1_mem_LB_a0(rt, off)
-%amd_mem_LB(rt, a0, off)
-%endm
-%macro p1_mem_LB_a1(rt, off)
-%amd_mem_LB(rt, a1, off)
-%endm
-%macro p1_mem_LB_a2(rt, off)
-%amd_mem_LB(rt, a2, off)
-%endm
-%macro p1_mem_LB_a3(rt, off)
-%amd_mem_LB(rt, a3, off)
-%endm
-%macro p1_mem_LB_t0(rt, off)
-%amd_mem_LB(rt, t0, off)
-%endm
-%macro p1_mem_LB_t1(rt, off)
-%amd_mem_LB(rt, t1, off)
-%endm
-%macro p1_mem_LB_t2(rt, off)
-%amd_mem_LB(rt, t2, off)
-%endm
-%macro p1_mem_LB_s0(rt, off)
-%amd_mem_LB(rt, s0, off)
-%endm
-%macro p1_mem_LB_s1(rt, off)
-%amd_mem_LB(rt, s1, off)
-%endm
-%macro p1_mem_LB_s2(rt, off)
-%amd_mem_LB(rt, s2, off)
-%endm
-%macro p1_mem_LB_s3(rt, off)
-%amd_mem_LB(rt, s3, off)
-%endm
-
-%macro p1_mem_SB_a0(rt, off)
-%amd_mem_SB(rt, a0, off)
-%endm
-%macro p1_mem_SB_a1(rt, off)
-%amd_mem_SB(rt, a1, off)
-%endm
-%macro p1_mem_SB_a2(rt, off)
-%amd_mem_SB(rt, a2, off)
-%endm
-%macro p1_mem_SB_a3(rt, off)
-%amd_mem_SB(rt, a3, off)
-%endm
-%macro p1_mem_SB_t0(rt, off)
-%amd_mem_SB(rt, t0, off)
-%endm
-%macro p1_mem_SB_t1(rt, off)
-%amd_mem_SB(rt, t1, off)
-%endm
-%macro p1_mem_SB_t2(rt, off)
-%amd_mem_SB(rt, t2, off)
-%endm
-%macro p1_mem_SB_s0(rt, off)
-%amd_mem_SB(rt, s0, off)
-%endm
-%macro p1_mem_SB_s1(rt, off)
-%amd_mem_SB(rt, s1, off)
-%endm
-%macro p1_mem_SB_s2(rt, off)
-%amd_mem_SB(rt, s2, off)
-%endm
-%macro p1_mem_SB_s3(rt, off)
-%amd_mem_SB(rt, s3, off)
-%endm
+# p1_mem -- portable-offset memory access. When the base is sp, portable
+# sp is the frame-local base (16 bytes above native rsp), so the physical
+# access needs the supplied offset plus 16. For any other base, portable
+# and native offsets coincide. Internal backend callers that need raw
+# native-rsp access (p1_enter, p1_eret, _start stub, p1_ldarg, p1_syscall)
+# use amd_mem_LD/amd_mem_ST directly and bypass this translation.
%macro p1_mem(op, rt, rn, off)
-%p1_mem_##op(rt, rn, off)
+%select((= %amd_is_sp(rn) 1),
+ %amd_mem_##op(rt, rn, (+ off 16)),
+ %amd_mem_##op(rt, rn, off))
%endm
%macro p1_ldarg(rd, slot)