commit de399adbe43b14b14b58c0f55c521fc29bd08328
parent 5f7a29d41cc6ab83345fbdec6befa62971970c20
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Mon, 20 Apr 2026 17:01:51 -0700
P1 promote PROLOGUE/EPILOGUE, add TAIL stack check and neg-imm12 mem test
Tighten the demo's ISA coverage in three places flagged during review:
- Promote PROLOGUE/EPILOGUE to first-class ops in §"Instruction set"
and a dedicated §"Semantics" bullet; they were already used by
demo.M1 but only documented obliquely under CALL/RET.
- Add MOV rD, sp (read-only) so demo.M1 can snapshot sp before and
after tranche 5. A TAIL that skips its epilogue now folds the
16-byte frame leak into the accumulator on aarch64/riscv64 — the
previous test's "broken TAIL = broken B" path silently passed.
- Exercise signed imm12 on LD/ST via a NEG8-offset round-trip through
a new scratch_mid label, mirroring ADDI NEG1/NEG3's proof that the
signed-imm encoding works on arith.
Five new per-arch DEFINEs (MOV_R{2,6}_SP, SUB_R2_R2_R6, LD/ST_R1_R4_NEG8);
aarch64 uses STUR/LDUR for the negative offset since the unsigned-offset
LDR/STR forms can't express one.
Diffstat:
5 files changed, 101 insertions(+), 16 deletions(-)
diff --git a/P1.md b/P1.md
@@ -107,7 +107,7 @@ ADDI rD, rA, !imm ANDI rD, rA, !imm ORI rD, rA, !imm
SHLI rD, rA, !imm SHRI rD, rA, !imm SARI rD, rA, !imm
# Moves
-MOV rD, rA # reg-to-reg
+MOV rD, rA # reg-to-reg (rA may be sp)
LI rD, %label # load 64-bit literal from pool
LA rD, %label # load PC-relative address
@@ -120,6 +120,7 @@ B %label # unconditional branch
BEQ rA, rB, %label BNE rA, rB, %label
BLT rA, rB, %label # signed less-than
CALL %label RET
+PROLOGUE EPILOGUE # frame setup / teardown (see Semantics)
TAIL %label # tail call: epilogue + B %label
# System
@@ -146,15 +147,25 @@ SYSCALL # num in r0, args r1-r6, ret in r0
register; range is therefore unbounded within the 4 GiB address space.
Native-encoded branches (with tighter range limits) are an optional
future optimization.
-- `CALL` pushes a return address, jumps to target. `RET` pops and jumps.
- Prologue convention: function entry does `ST lr, sp, 0; ADDI sp, sp, -16`
- (or arch-specific equivalent); epilogue reverses.
-- `TAIL %label` is a tail call — it performs the callee's standard
- epilogue (restore `lr` from `[sp+0]`, pop the frame) and then branches
- unconditionally to `%label`, reusing the caller's return address
- instead of pushing a new frame. The current function must be using the
- standard prologue. Interpreter `eval` loops rely on `TAIL` to recurse
- on sub-expressions without growing the stack.
+- `MOV rD, rA` copies `rA` into `rD`. The source may be `sp` (read the
+ current stack pointer into a GPR — used e.g. for stack-balance assertions
+ around a call tree). The reverse (`MOV sp, rA`) is not provided; `sp`
+ is only mutated by `PROLOGUE`/`EPILOGUE`.
+- `CALL %label` pushes a return address (via the arch's native mechanism
+  or the callee-emitted `PROLOGUE`, see below) and jumps. `RET` pops and
+  jumps.
+- `PROLOGUE` / `EPILOGUE` set up and tear down a 16-byte frame, spilling
+ the return address to `[sp + 0]` on entry and reloading it on exit. On
+ amd64 the native `CALL` already pushes the retaddr, so both are a NOP;
+ on aarch64/riscv64 they do real work spilling `lr`/`ra`. After
+ `PROLOGUE`, `[sp + 0]` holds the caller's return address uniformly
+ across all three arches.
+- `TAIL %label` is a tail call — it performs the current function's
+ standard epilogue (restore `lr` from `[sp+0]`, pop the frame) and then
+ branches unconditionally to `%label`, reusing the caller's return
+ address instead of pushing a new frame. The current function must be
+ using the standard prologue. Interpreter `eval` loops rely on `TAIL`
+ to recurse on sub-expressions without growing the stack.
- `SYSCALL` is a single opcode in P1 source. Each arch's defs file expands it
to the native syscall sequence, including the register shuffle from P1's
`r0`=num, `r1`–`r6`=args convention into the platform's native convention
@@ -186,12 +197,12 @@ Per-arch defs count (immediates handled by sigil, not enumerated):
removing trivially-equivalent tuples.
- 6 immediate arith × 8² = 384. Each entry uses an immediate sigil (`!imm`),
so the immediate value itself is not enumerated.
-- 3 move ops × 8 or 8² = ~80.
+- 3 move ops × 8 or 8² (+8 for the `MOV rD, sp` variant) = ~88.
- 4 memory ops × 8² = 256. Offsets use `!imm` sigil.
- 3 conditional branches × 8² = 192.
-- Singletons (`B`, `CALL`, `RET`, `TAIL`, `SYSCALL`) = 5.
+- Singletons (`B`, `CALL`, `RET`, `PROLOGUE`, `EPILOGUE`, `TAIL`, `SYSCALL`) = 7.
-Total ≈ 1200 defines per arch. Template-generated.
+Total ≈ 1210 defines per arch. Template-generated.
## Syscall conventions
diff --git a/demo.M1 b/demo.M1
@@ -19,14 +19,22 @@
## (No SUBI/XORI/MULI in P1 — see PLAN.md §"Feature floor".)
## Tranche 3: LA + memory round-trip. LD/ST (64-bit) and LB/SB
## (8-bit zero-extended). 32-bit LW/SW dropped from the ISA —
-## synthesize from LD + mask/shift if needed.
+## synthesize from LD + mask/shift if needed. Signed imm12 on
+## LD/ST is exercised via a NEG8-offset round-trip through
+## scratch_mid, mirroring the ADDI NEG1/NEG3 signed-imm test.
## Tranche 4: r7-indirect branches. B, BEQ, BNE, BLT each tested
## in BOTH taken and fall-through directions so an inverted
## condition encoding doesn't pass. BLT additionally tested with
## a negative operand (-1 < 0 signed) to prove signed semantics.
## BLTU/BGEU/BGE dropped from the ISA — see P1.md.
## Tranche 5: CALL / RET / TAIL / PROLOGUE / EPILOGUE. Nested CALL
-## stresses PROLOGUE lr-save; TAIL unwinds the frame.
+## stresses PROLOGUE lr-save; TAIL unwinds the frame. Stack
+## balance is additionally verified by snapshotting sp via
+## MOV_R6_SP before tranche 5 and comparing after — a TAIL that
+## omits its epilogue leaks fn_parent_tail's 16-byte frame and
+## the delta folds into the accumulator. On amd64 PROLOGUE/
+## EPILOGUE are NOPs so the delta is always 0; the check bites
+## on aarch64/riscv64 where prologue does real sp work.
##
## Run-and-verify:
## make PROG=demo ARCH=<arch> run && echo "exit=$?"
@@ -125,6 +133,24 @@
'00000000'
P1_LB_R1_R4_16 # r1 = zext [scratch+16] -> 5
+ ## Negative imm12 on the memory forms. LA to scratch_mid (= scratch+16)
+ ## then round-trip a distinct sentinel (13) via [r4 + -8] = scratch+8.
+ ## Proves the signed offset encoding sign-extends on LD/ST the same
+ ## way ADDI NEG1/NEG3 proves it on arith. A symmetric "both sides
+ ## miscompiled to the same wrong offset" bug could still false-pass,
+ ## but the common cases (sign bit dropped, imm zero-extended) blow
+ ## up via segfault or a mismatched round-trip value.
+ P1_LA_R4
+ &scratch_mid
+ P1_LI_R1
+ '0D000000' # r1 = 13 (distinct sentinel)
+ P1_ST_R1_R4_NEG8 # [scratch+8] = 13 (overwrites +8 slot's old 8)
+ P1_LI_R1
+ '00000000'
+ P1_LD_R1_R4_NEG8 # r1 = [scratch+8] -> 13
+ P1_ADDI_R1_R1_NEG3 # r1 = 13 - 3 = 10
+ P1_SHRI_R1_R1_1 # r1 = 10 >> 1 = 5
+
## Tranche 4: branches. r2=0, r3=1 to start; each subtest resets
## as needed. Taken-path subtests clobber r1 on fall-through;
## fall-through subtests clobber r1 on incorrect branch.
@@ -235,6 +261,14 @@
## return-to-_start address and we crash or hang. The function
## bodies live inline below the subtests, guarded by a P1_B over
## them so we don't fall through after the last subtest.
+ ##
+ ## Stack-balance discriminator for TAIL: snapshot sp into r6
+ ## (callee-saved) before any CALL. A correctly-paired call tree
+ ## nets to sp_after == sp_before. A TAIL that skips its epilogue
+ ## leaks fn_parent_tail's 16-byte frame — the delta is folded
+ ## into the accumulator below via SUB r1, r1, delta.
+ P1_MOV_R6_SP # r6 = sp snapshot (pre-tranche)
+
P1_LI_R7
&fn_identity
P1_CALL # nested-CALL test: returns r1 unchanged
@@ -267,6 +301,9 @@
P1_TAIL
:b5_end
+ P1_MOV_R2_SP # r2 = sp snapshot (post-tranche)
+ P1_SUB_R2_R2_R6 # r2 = sp_after - sp_before (0 if balanced)
+ P1_SUB_R1_R1_R2 # r1 -= delta; unchanged (= 5) iff balanced
P1_MOV_R6_R1 # r6 = 5 (callee-saved, survives syscalls)
@@ -320,10 +357,12 @@
## 32 bytes reserved for tranche 3 memory round-trip. The LOAD segment
## is RWX (see ELF-<arch>.hex2 ph_flags=7) so we can store into this
-## region at runtime.
+## region at runtime. scratch_mid = scratch+16, the base address for
+## the negative-imm12 LD/ST test ([scratch_mid + -8] = [scratch+8]).
:scratch
'0000000000000000'
'0000000000000000'
+:scratch_mid
'0000000000000000'
'0000000000000000'
diff --git a/p1_aarch64.M1 b/p1_aarch64.M1
@@ -74,6 +74,11 @@ DEFINE SYS_EXIT 5D000000
DEFINE P1_MOV_R1_R6 E10313AA ## mov x1, x19
DEFINE P1_MOV_R6_R1 F30301AA ## mov x19, x1
+## MOV rD, sp -> add xD, sp, #0 (canonical "mov rD, sp" form).
+## Used by the tranche-5 stack-balance discriminator.
+DEFINE P1_MOV_R6_SP F3030091 ## mov x19, sp
+DEFINE P1_MOV_R2_SP E2030091 ## mov x2, sp
+
## ADD rD, rA, rB -> add xD, xA, xB
DEFINE P1_ADD_R1_R1_R2 2100028B ## add x1, x1, x2
DEFINE P1_ADD_R1_R1_R4 2100048B ## add x1, x1, x4
@@ -81,6 +86,7 @@ DEFINE P1_ADD_R2_R2_R6 4200138B ## add x2, x2, x19 (syscall-arg comp
## SUB / AND / OR / XOR — base opcodes CB / 8A / AA / CA.
DEFINE P1_SUB_R1_R1_R2 210002CB ## sub x1, x1, x2
+DEFINE P1_SUB_R2_R2_R6 420013CB ## sub x2, x2, x19 (tranche-5 sp delta)
DEFINE P1_AND_R1_R1_R5 2100058A ## and x1, x1, x5
DEFINE P1_OR_R1_R1_R2 210002AA ## orr x1, x1, x2
DEFINE P1_XOR_R1_R1_R2 210002CA ## eor x1, x1, x2
@@ -150,6 +156,12 @@ DEFINE P1_LD_R1_R4_8 810440F9 ## ldr x1, [x4, #8]
DEFINE P1_SB_R1_R4_16 81400039 ## strb w1, [x4, #16]
DEFINE P1_LB_R1_R4_16 81404039 ## ldrb w1, [x4, #16] (zero-ext)
+## Negative imm12: aarch64's unsigned-offset LDR/STR can't encode a
+## negative offset, so use the unscaled-offset forms STUR/LDUR, which
+## carry a signed 9-bit immediate. The two's-complement imm9 encoding of -8 is 0x1F8.
+DEFINE P1_ST_R1_R4_NEG8 81801FF8 ## stur x1, [x4, #-8]
+DEFINE P1_LD_R1_R4_NEG8 81805FF8 ## ldur x1, [x4, #-8]
+
## ---- Tranche 4: branches (r7-indirect, no hex2_word needed) ------------
## Conditional branches compare ra vs rb, then jump to the address in r7.
diff --git a/p1_amd64.M1 b/p1_amd64.M1
@@ -61,6 +61,13 @@ DEFINE SYS_EXIT 3C000000
DEFINE P1_MOV_R1_R6 4889DF ## mov rdi, rbx
DEFINE P1_MOV_R6_R1 4889FB ## mov rbx, rdi
+## MOV rD, sp — read rsp into a GPR. Used by the tranche-5 stack-
+## balance discriminator. On amd64 PROLOGUE/EPILOGUE are NOPs and
+## CALL/RET self-balance, so this check is a no-op here; still
+## encoded so the same demo assembles unchanged.
+DEFINE P1_MOV_R6_SP 4889E3 ## mov rbx, rsp
+DEFINE P1_MOV_R2_SP 4889E6 ## mov rsi, rsp
+
## ADD rD, rA, rB -> mov rD,rA ; add rD,rB
DEFINE P1_ADD_R1_R1_R2 4889FF4801F7 ## mov rdi,rdi ; add rdi,rsi
DEFINE P1_ADD_R1_R1_R4 4889FF4C01D7 ## mov rdi,rdi ; add rdi,r10
@@ -68,6 +75,7 @@ DEFINE P1_ADD_R2_R2_R6 4889F64801DE ## mov rsi,rsi ; add rsi,rbx (syscal
## SUB / AND / OR / XOR — 2-insn form, leading mov rdi,rdi kept.
DEFINE P1_SUB_R1_R1_R2 4889FF4829F7 ## mov rdi,rdi ; sub rdi,rsi
+DEFINE P1_SUB_R2_R2_R6 4889F64829DE ## mov rsi,rsi ; sub rsi,rbx (tranche-5 sp delta)
DEFINE P1_XOR_R1_R1_R2 4889FF4831F7 ## mov rdi,rdi ; xor rdi,rsi
DEFINE P1_OR_R1_R1_R2 4889FF4809F7 ## mov rdi,rdi ; or rdi,rsi
DEFINE P1_AND_R1_R1_R5 4889FF4C21C7 ## mov rdi,rdi ; and rdi,r8
@@ -129,6 +137,10 @@ DEFINE P1_LD_R1_R4_8 498B7A08 ## mov rdi, [r10+8] (disp8)
DEFINE P1_SB_R1_R4_16 41887A10 ## mov [r10+16], dil
DEFINE P1_LB_R1_R4_16 490FB67A10 ## movzx rdi, byte [r10+16]
+## Negative disp8: x86-64 disp8 is signed, so -8 = 0xF8.
+DEFINE P1_ST_R1_R4_NEG8 49897AF8 ## mov [r10-8], rdi
+DEFINE P1_LD_R1_R4_NEG8 498B7AF8 ## mov rdi, [r10-8]
+
## ---- Tranche 4: branches (r7-indirect, no hex2_word needed) ------------
## Pattern: cmp ra,rb ; short native jcc over a jmp-through-r7.
diff --git a/p1_riscv64.M1 b/p1_riscv64.M1
@@ -67,12 +67,18 @@ DEFINE SYS_EXIT 5D000000
DEFINE P1_MOV_R1_R6 93850400 ## mv a1, s1
DEFINE P1_MOV_R6_R1 93840500 ## mv s1, a1
+## MOV rD, sp -> addi rD, sp, 0 (sp = x2).
+## Used by the tranche-5 stack-balance discriminator.
+DEFINE P1_MOV_R6_SP 93040100 ## addi s1, sp, 0
+DEFINE P1_MOV_R2_SP 13060100 ## addi a2, sp, 0
+
## ADD / SUB / AND / OR / XOR — R-type, funct3 picks the op.
DEFINE P1_ADD_R1_R1_R2 B385C500 ## add a1, a1, a2
DEFINE P1_ADD_R1_R1_R4 B385E500 ## add a1, a1, a4
DEFINE P1_ADD_R2_R2_R6 33069600 ## add a2, a2, s1 (syscall-arg computation)
DEFINE P1_SUB_R1_R1_R2 B385C540 ## sub a1, a1, a2 (funct7=0x20)
+DEFINE P1_SUB_R2_R2_R6 33069640 ## sub a2, a2, s1 (tranche-5 sp delta)
DEFINE P1_XOR_R1_R1_R2 B3C5C500 ## xor a1, a1, a2 (funct3=4)
DEFINE P1_OR_R1_R1_R2 B3E5C500 ## or a1, a1, a2 (funct3=6)
DEFINE P1_AND_R1_R1_R5 B3F5F500 ## and a1, a1, a5 (funct3=7)
@@ -124,6 +130,11 @@ DEFINE P1_LD_R1_R4_8 83358700 ## ld a1, 8(a4)
DEFINE P1_SB_R1_R4_16 2308B700 ## sb a1, 16(a4)
DEFINE P1_LB_R1_R4_16 83450701 ## lbu a1, 16(a4)
+## Negative imm12: RISC-V load/store immediates are signed 12-bit,
+## so -8 encodes as 0xFF8 (12-bit two's complement).
+DEFINE P1_ST_R1_R4_NEG8 233CB7FE ## sd a1, -8(a4)
+DEFINE P1_LD_R1_R4_NEG8 833587FF ## ld a1, -8(a4)
+
## ---- Tranche 4: branches (r7-indirect, no hex2_word needed) ------------
## RISC-V has B-type conditional branches with a scattered immediate; writing