commit 7e769ab87300ff635a3aaf9227db798f9a642ad6
parent b024f5b4fca4986de572255327cb70230dd51001
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 24 Apr 2026 15:03:38 -0700
Simplify P1-amd64.M1pp helpers; drop native reg tables from P1.md
Introduces `amd_maybe_rex_b` to collapse the _lo/_hi/dispatch triplets
on push/pop/jmp_r/call_r/mov_imm32_prefix, and a per-name `amd_is_sp`
predicate to collapse the 48-macro per-(op,base) `p1_mem` fan-out into
a single dispatcher. Native register assignment is unchanged (still
C-ABI-aligned for args 0-3).
Also removes the per-arch native register mapping tables from
docs/P1.md — the P1↔native mapping is backend-internal and was
creating the illusion of a portable contract. Target-notes bullets
generalized accordingly.
Diffstat:
| M | docs/P1.md | | | 58 | +++++++++++----------------------------------------------- |
| M | p1/P1-amd64.M1pp | | | 309 | ++++++++++++++++++++++--------------------------------------------------------- |
2 files changed, 95 insertions(+), 272 deletions(-)
diff --git a/docs/P1.md b/docs/P1.md
@@ -512,53 +512,17 @@ runtime interface document.
- `a0` is argument 0, the one-word direct return-value register, the low word
of the two-word direct return pair, and the indirect-result buffer pointer.
-- On aarch64, riscv64, arm32, and rv32, that matches the native integer/pointer
- ABI directly.
-- On amd64, the backend must translate between portable `a0` and native
- return register `rax` at call and return boundaries. For the two-word direct
- return, the backend must also translate `a1` against native `rdx`.
-- On amd64, `LDARG` must account for the return address pushed by the native
- `call` instruction. On aarch64, riscv64, arm32, and rv32, it maps more
- directly to the entry `sp` plus the backend's standard frame/header policy.
+- Some target call conventions hand results back in registers that differ
+  from the native equivalent of `a0` or `a1`; such targets must translate
+  between portable and native return registers at call and return boundaries.
+- On targets whose native call instruction pushes a return address, `LDARG`
+ must account for that slot. On other targets, `LDARG` maps more directly
+ to entry `sp` plus the backend's standard frame/header policy.
- `br` is implemented as a dedicated hidden native register on every target.
-- On arm32, `t1` and `t2` map to natively callee-saved registers; the backend
- is responsible for preserving them across function boundaries in accordance
- with the native ABI, even though P1 treats them as caller-saved.
+- Each backend chooses the native register that holds each P1 register.
+  Those choices are backend-private and may differ from native ABI
+  conventions; backends may preserve P1 caller-saved registers that happen
+  to land in natively callee-saved registers as a matter of backend
+  policy.
- Frame-pointer use is backend policy, not part of the P1 v2 architectural
register set.
-
-### Native register mapping
-
-#### 64-bit targets
-
-| P1 | amd64 | aarch64 | riscv64 |
-|------|-------|---------|---------|
-| `a0` | `rdi` | `x0` | `a0` |
-| `a1` | `rsi` | `x1` | `a1` |
-| `a2` | `rdx` | `x2` | `a2` |
-| `a3` | `rcx` | `x3` | `a3` |
-| `t0` | `r10` | `x9` | `t0` |
-| `t1` | `r11` | `x10` | `t1` |
-| `t2` | `r8` | `x11` | `t2` |
-| `s0` | `rbx` | `x19` | `s1` |
-| `s1` | `r12` | `x20` | `s2` |
-| `s2` | `r13` | `x21` | `s3` |
-| `s3` | `r14` | `x22` | `s4` |
-| `sp` | `rsp` | `sp` | `sp` |
-
-#### 32-bit targets
-
-| P1 | arm32 | rv32 |
-|------|-------|-------|
-| `a0` | `r0` | `a0` |
-| `a1` | `r1` | `a1` |
-| `a2` | `r2` | `a2` |
-| `a3` | `r3` | `a3` |
-| `t0` | `r12` | `t0` |
-| `t1` | `r6` | `t1` |
-| `t2` | `r7` | `t2` |
-| `s0` | `r4` | `s1` |
-| `s1` | `r5` | `s2` |
-| `s2` | `r8` | `s3` |
-| `s3` | `r9` | `s4` |
-| `sp` | `sp` | `sp` |
diff --git a/p1/P1-amd64.M1pp b/p1/P1-amd64.M1pp
@@ -1,15 +1,16 @@
# P1-amd64.M1pp -- P1v2 amd64 backend expressed in m1macro.
#
-# Mirrors p1/P1-aarch64.M1pp; native register picks follow docs/P1.md's
-# 64-bit mapping table. amd64 is variable-length, so every op emits its
-# prefix bytes (REX / opcode) directly via the m1pp `!(…)` single-byte
-# builtin; 4-byte immediates still go through `%(…)`.
+# Mirrors p1/P1-aarch64.M1pp. Native register mapping is backend-private;
+# see the amd_reg_* table below. amd64 is variable-length, so every op
+# emits its prefix bytes (REX / opcode) directly via the m1pp `!(…)`
+# single-byte builtin; 4-byte immediates still go through `%(…)`.
#
# Hidden backend regs:
-# br = r15 -- branch-target scratch
-# scratch = r9 -- per-expansion scratch (rcx shift alias save, etc.)
-# rax -- syscall number / return slot + spill buffer
-# rbp -- spill buffer when rcx needs saving for SHL/SHR/SAR
+# br = r15 -- branch-target mechanism
+# scratch = r9 -- per-expansion scratch (e.g. rcx save slot for SHIFT)
+# rax -- syscall number / return slot + retaddr spill
+# rbp -- spill slot for rcx / rdx when SHIFT and DIV/REM need
+# to preserve a3 / a2
# ---- Native register numbers --------------------------------------------
#
@@ -111,8 +112,69 @@
%amd_reg_##r()
%endm
+# Per-P1-name `is this sp?` predicate. Used by p1_mem to decide whether
+# the supplied offset needs the +16 frame-header adjustment.
+
+%macro amd_is_sp_a0()
+0
+%endm
+%macro amd_is_sp_a1()
+0
+%endm
+%macro amd_is_sp_a2()
+0
+%endm
+%macro amd_is_sp_a3()
+0
+%endm
+%macro amd_is_sp_t0()
+0
+%endm
+%macro amd_is_sp_t1()
+0
+%endm
+%macro amd_is_sp_t2()
+0
+%endm
+%macro amd_is_sp_s0()
+0
+%endm
+%macro amd_is_sp_s1()
+0
+%endm
+%macro amd_is_sp_s2()
+0
+%endm
+%macro amd_is_sp_s3()
+0
+%endm
+%macro amd_is_sp_sp()
+1
+%endm
+
+%macro amd_is_sp(r)
+%amd_is_sp_##r()
+%endm
+
# ---- REX / ModRM helpers ------------------------------------------------
+# Short one-byte REX.B prefix (no W). Used by opcodes that don't need 64-bit
+# width — push/pop/jmp r/call r/mov r,imm32 — when the target reg is r8-r15.
+%macro amd_rex_b_short()
+!(0x41)
+%endm
+
+# No-op sentinel for %select branches that shouldn't emit anything.
+%macro amd_nobytes()
+%endm
+
+# Emit REX.B (0x41) iff r is r8-r15. Used by the short-prefix opcodes above.
+%macro amd_maybe_rex_b(r)
+%select((>= %amd_reg(r) 8),
+ %amd_rex_b_short(),
+ %amd_nobytes())
+%endm
+
# REX.WB: W=1 for 64-bit, B=(r>>3) to extend ModRM.rm / SIB.base.
%macro amd_rex_wb(r)
!((| 0x48 (& (>> %amd_reg(r) 3) 1)))
@@ -256,28 +318,12 @@
# push / pop r64. 50+r / 58+r; REX.B=0x41 if r8-r15.
%macro amd_push(r)
-%select((>= %amd_reg(r) 8),
- %amd_push_hi(r),
- %amd_push_lo(r))
-%endm
-%macro amd_push_lo(r)
-!((| 0x50 %amd_reg(r)))
-%endm
-%macro amd_push_hi(r)
-!(0x41)
+%amd_maybe_rex_b(r)
!((| 0x50 (& %amd_reg(r) 7)))
%endm
%macro amd_pop(r)
-%select((>= %amd_reg(r) 8),
- %amd_pop_hi(r),
- %amd_pop_lo(r))
-%endm
-%macro amd_pop_lo(r)
-!((| 0x58 %amd_reg(r)))
-%endm
-%macro amd_pop_hi(r)
-!(0x41)
+%amd_maybe_rex_b(r)
!((| 0x58 (& %amd_reg(r) 7)))
%endm
@@ -285,15 +331,7 @@
# REX.B=0x41. The 4-byte literal the caller emits is zero-extended into
# the full 64-bit register, matching the LA / LA_BR literal-pool contract.
%macro amd_mov_imm32_prefix(rd)
-%select((>= %amd_reg(rd) 8),
- %amd_mov_imm32_prefix_hi(rd),
- %amd_mov_imm32_prefix_lo(rd))
-%endm
-%macro amd_mov_imm32_prefix_lo(rd)
-!((| 0xB8 %amd_reg(rd)))
-%endm
-%macro amd_mov_imm32_prefix_hi(rd)
-!(0x41)
+%amd_maybe_rex_b(rd)
!((| 0xB8 (& %amd_reg(rd) 7)))
%endm
@@ -346,31 +384,13 @@
# Jcc rel8 -- 7x ib
%macro amd_jmp_r(r)
-%select((>= %amd_reg(r) 8),
- %amd_jmp_r_hi(r),
- %amd_jmp_r_lo(r))
-%endm
-%macro amd_jmp_r_lo(r)
-!(0xFF)
-!((| 0xE0 (& %amd_reg(r) 7)))
-%endm
-%macro amd_jmp_r_hi(r)
-!(0x41)
+%amd_maybe_rex_b(r)
!(0xFF)
!((| 0xE0 (& %amd_reg(r) 7)))
%endm
%macro amd_call_r(r)
-%select((>= %amd_reg(r) 8),
- %amd_call_r_hi(r),
- %amd_call_r_lo(r))
-%endm
-%macro amd_call_r_lo(r)
-!(0xFF)
-!((| 0xD0 (& %amd_reg(r) 7)))
-%endm
-%macro amd_call_r_hi(r)
-!(0x41)
+%amd_maybe_rex_b(r)
!(0xFF)
!((| 0xD0 (& %amd_reg(r) 7)))
%endm
@@ -661,178 +681,17 @@
%p1_shifti_##op(rd, ra, imm)
%endm
-# p1_mem dispatches on (op, base). When the base is sp, portable sp is the
-# frame-local base — 16 bytes above native rsp — so the physical access needs
-# the supplied portable offset plus 16. For any other base, the portable and
-# native offset coincide. Internal backend callers that need raw native-rsp
-# access (p1_enter, p1_eret, _start stub, p1_ldarg, p1_syscall) use
-# amd_mem_LD/amd_mem_ST directly and bypass this translation.
-
-%macro p1_mem_LD_sp(rt, off)
-%amd_mem_LD(rt, sp, (+ off 16))
-%endm
-%macro p1_mem_ST_sp(rt, off)
-%amd_mem_ST(rt, sp, (+ off 16))
-%endm
-%macro p1_mem_LB_sp(rt, off)
-%amd_mem_LB(rt, sp, (+ off 16))
-%endm
-%macro p1_mem_SB_sp(rt, off)
-%amd_mem_SB(rt, sp, (+ off 16))
-%endm
-
-%macro p1_mem_LD(rt, rn, off)
-%p1_mem_LD_##rn(rt, off)
-%endm
-%macro p1_mem_ST(rt, rn, off)
-%p1_mem_ST_##rn(rt, off)
-%endm
-%macro p1_mem_LB(rt, rn, off)
-%p1_mem_LB_##rn(rt, off)
-%endm
-%macro p1_mem_SB(rt, rn, off)
-%p1_mem_SB_##rn(rt, off)
-%endm
-
-# Non-sp bases for each op -- plain native load/store with portable offset.
-%macro p1_mem_LD_a0(rt, off)
-%amd_mem_LD(rt, a0, off)
-%endm
-%macro p1_mem_LD_a1(rt, off)
-%amd_mem_LD(rt, a1, off)
-%endm
-%macro p1_mem_LD_a2(rt, off)
-%amd_mem_LD(rt, a2, off)
-%endm
-%macro p1_mem_LD_a3(rt, off)
-%amd_mem_LD(rt, a3, off)
-%endm
-%macro p1_mem_LD_t0(rt, off)
-%amd_mem_LD(rt, t0, off)
-%endm
-%macro p1_mem_LD_t1(rt, off)
-%amd_mem_LD(rt, t1, off)
-%endm
-%macro p1_mem_LD_t2(rt, off)
-%amd_mem_LD(rt, t2, off)
-%endm
-%macro p1_mem_LD_s0(rt, off)
-%amd_mem_LD(rt, s0, off)
-%endm
-%macro p1_mem_LD_s1(rt, off)
-%amd_mem_LD(rt, s1, off)
-%endm
-%macro p1_mem_LD_s2(rt, off)
-%amd_mem_LD(rt, s2, off)
-%endm
-%macro p1_mem_LD_s3(rt, off)
-%amd_mem_LD(rt, s3, off)
-%endm
-
-%macro p1_mem_ST_a0(rt, off)
-%amd_mem_ST(rt, a0, off)
-%endm
-%macro p1_mem_ST_a1(rt, off)
-%amd_mem_ST(rt, a1, off)
-%endm
-%macro p1_mem_ST_a2(rt, off)
-%amd_mem_ST(rt, a2, off)
-%endm
-%macro p1_mem_ST_a3(rt, off)
-%amd_mem_ST(rt, a3, off)
-%endm
-%macro p1_mem_ST_t0(rt, off)
-%amd_mem_ST(rt, t0, off)
-%endm
-%macro p1_mem_ST_t1(rt, off)
-%amd_mem_ST(rt, t1, off)
-%endm
-%macro p1_mem_ST_t2(rt, off)
-%amd_mem_ST(rt, t2, off)
-%endm
-%macro p1_mem_ST_s0(rt, off)
-%amd_mem_ST(rt, s0, off)
-%endm
-%macro p1_mem_ST_s1(rt, off)
-%amd_mem_ST(rt, s1, off)
-%endm
-%macro p1_mem_ST_s2(rt, off)
-%amd_mem_ST(rt, s2, off)
-%endm
-%macro p1_mem_ST_s3(rt, off)
-%amd_mem_ST(rt, s3, off)
-%endm
-
-%macro p1_mem_LB_a0(rt, off)
-%amd_mem_LB(rt, a0, off)
-%endm
-%macro p1_mem_LB_a1(rt, off)
-%amd_mem_LB(rt, a1, off)
-%endm
-%macro p1_mem_LB_a2(rt, off)
-%amd_mem_LB(rt, a2, off)
-%endm
-%macro p1_mem_LB_a3(rt, off)
-%amd_mem_LB(rt, a3, off)
-%endm
-%macro p1_mem_LB_t0(rt, off)
-%amd_mem_LB(rt, t0, off)
-%endm
-%macro p1_mem_LB_t1(rt, off)
-%amd_mem_LB(rt, t1, off)
-%endm
-%macro p1_mem_LB_t2(rt, off)
-%amd_mem_LB(rt, t2, off)
-%endm
-%macro p1_mem_LB_s0(rt, off)
-%amd_mem_LB(rt, s0, off)
-%endm
-%macro p1_mem_LB_s1(rt, off)
-%amd_mem_LB(rt, s1, off)
-%endm
-%macro p1_mem_LB_s2(rt, off)
-%amd_mem_LB(rt, s2, off)
-%endm
-%macro p1_mem_LB_s3(rt, off)
-%amd_mem_LB(rt, s3, off)
-%endm
-
-%macro p1_mem_SB_a0(rt, off)
-%amd_mem_SB(rt, a0, off)
-%endm
-%macro p1_mem_SB_a1(rt, off)
-%amd_mem_SB(rt, a1, off)
-%endm
-%macro p1_mem_SB_a2(rt, off)
-%amd_mem_SB(rt, a2, off)
-%endm
-%macro p1_mem_SB_a3(rt, off)
-%amd_mem_SB(rt, a3, off)
-%endm
-%macro p1_mem_SB_t0(rt, off)
-%amd_mem_SB(rt, t0, off)
-%endm
-%macro p1_mem_SB_t1(rt, off)
-%amd_mem_SB(rt, t1, off)
-%endm
-%macro p1_mem_SB_t2(rt, off)
-%amd_mem_SB(rt, t2, off)
-%endm
-%macro p1_mem_SB_s0(rt, off)
-%amd_mem_SB(rt, s0, off)
-%endm
-%macro p1_mem_SB_s1(rt, off)
-%amd_mem_SB(rt, s1, off)
-%endm
-%macro p1_mem_SB_s2(rt, off)
-%amd_mem_SB(rt, s2, off)
-%endm
-%macro p1_mem_SB_s3(rt, off)
-%amd_mem_SB(rt, s3, off)
-%endm
+# p1_mem -- portable-offset memory access. When the base is sp, portable
+# sp is the frame-local base (16 bytes above native rsp), so the physical
+# access needs the supplied offset plus 16. For any other base, portable
+# and native offsets coincide. Internal backend callers that need raw
+# native-rsp access (p1_enter, p1_eret, _start stub, p1_ldarg, p1_syscall)
+# use amd_mem_LD/amd_mem_ST directly and bypass this translation.
%macro p1_mem(op, rt, rn, off)
-%p1_mem_##op(rt, rn, off)
+%select((= %amd_is_sp(rn) 1),
+ %amd_mem_##op(rt, rn, (+ off 16)),
+ %amd_mem_##op(rt, rn, off))
%endm
%macro p1_ldarg(rd, slot)