commit de399adbe43b14b14b58c0f55c521fc29bd08328
parent 5f7a29d41cc6ab83345fbdec6befa62971970c20
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Mon, 20 Apr 2026 17:01:51 -0700
P1 promote PROLOGUE/EPILOGUE, add TAIL stack check and neg-imm12 mem test
Tighten the demo's ISA coverage in three places flagged during review:
- Promote PROLOGUE/EPILOGUE to first-class ops in §"Instruction set"
and a dedicated §"Semantics" bullet; they were already used by
demo.M1 but only documented obliquely under CALL/RET.
- Add MOV rD, sp (read-only) so demo.M1 can snapshot sp before and
after tranche 5. A TAIL that skips its epilogue now folds the
16-byte frame leak into the accumulator on aarch64/riscv64 — the
previous test's "broken TAIL = broken B" path silently passed.
- Exercise signed imm12 on LD/ST via a NEG8-offset round-trip through
a new scratch_mid label, mirroring ADDI NEG1/NEG3's proof that the
signed-imm encoding works on arith.
Five new per-arch DEFINEs (MOV_R{2,6}_SP, SUB_R2_R2_R6, LD/ST_R1_R4_NEG8);
aarch64 uses STUR/LDUR for the negative offset since the unsigned-offset
LDR/STR forms can't express one.
Diffstat:
5 files changed, 101 insertions(+), 16 deletions(-)
diff --git a/P1.md b/P1.md
@@ -107,7 +107,7 @@ ADDI rD, rA, !imm ANDI rD, rA, !imm ORI rD, rA, !imm
SHLI rD, rA, !imm SHRI rD, rA, !imm SARI rD, rA, !imm
# Moves
-MOV rD, rA # reg-to-reg
+MOV rD, rA # reg-to-reg (rA may be sp)
LI rD, %label # load 64-bit literal from pool
LA rD, %label # load PC-relative address
@@ -120,6 +120,7 @@ B %label # unconditional branch
BEQ rA, rB, %label BNE rA, rB, %label
BLT rA, rB, %label # signed less-than
CALL %label RET
+PROLOGUE EPILOGUE # frame setup / teardown (see Semantics)
TAIL %label # tail call: epilogue + B %label
# System
@@ -146,15 +147,25 @@ SYSCALL # num in r0, args r1-r6, ret in r0
register; range is therefore unbounded within the 4 GiB address space.
Native-encoded branches (with tighter range limits) are an optional
future optimization.
-- `CALL` pushes a return address, jumps to target. `RET` pops and jumps.
- Prologue convention: function entry does `ST lr, sp, 0; ADDI sp, sp, -16`
- (or arch-specific equivalent); epilogue reverses.
-- `TAIL %label` is a tail call — it performs the callee's standard
- epilogue (restore `lr` from `[sp+0]`, pop the frame) and then branches
- unconditionally to `%label`, reusing the caller's return address
- instead of pushing a new frame. The current function must be using the
- standard prologue. Interpreter `eval` loops rely on `TAIL` to recurse
- on sub-expressions without growing the stack.
+- `MOV rD, rA` copies `rA` into `rD`. The source may be `sp` (read the
+ current stack pointer into a GPR — used e.g. for stack-balance assertions
+ around a call tree). The reverse (`MOV sp, rA`) is not provided; `sp`
+ is only mutated by `PROLOGUE`/`EPILOGUE`.
+- `CALL %label` pushes a return address (via the arch's native mechanism
+  or the callee-emitted `PROLOGUE`, see below) and jumps. `RET` pops and
+  jumps.
+- `PROLOGUE` / `EPILOGUE` set up and tear down a 16-byte frame, spilling
+ the return address to `[sp + 0]` on entry and reloading it on exit. On
+ amd64 the native `CALL` already pushes the retaddr, so both are a NOP;
+ on aarch64/riscv64 they do real work spilling `lr`/`ra`. After
+ `PROLOGUE`, `[sp + 0]` holds the caller's return address uniformly
+ across all three arches.
+- `TAIL %label` is a tail call — it performs the current function's
+ standard epilogue (restore `lr` from `[sp+0]`, pop the frame) and then
+ branches unconditionally to `%label`, reusing the caller's return
+ address instead of pushing a new frame. The current function must be
+ using the standard prologue. Interpreter `eval` loops rely on `TAIL`
+ to recurse on sub-expressions without growing the stack.
- `SYSCALL` is a single opcode in P1 source. Each arch's defs file expands it
to the native syscall sequence, including the register shuffle from P1's
`r0`=num, `r1`–`r6`=args convention into the platform's native convention
@@ -186,12 +197,12 @@ Per-arch defs count (immediates handled by sigil, not enumerated):
removing trivially-equivalent tuples.
- 6 immediate arith × 8² = 384. Each entry uses an immediate sigil (`!imm`),
so the immediate value itself is not enumerated.
-- 3 move ops × 8 or 8² = ~80.
+- 3 move ops × 8 or 8² (+8 for the `MOV rD, sp` variant) = ~88.
- 4 memory ops × 8² = 256. Offsets use `!imm` sigil.
- 3 conditional branches × 8² = 192.
-- Singletons (`B`, `CALL`, `RET`, `TAIL`, `SYSCALL`) = 5.
+- Singletons (`B`, `CALL`, `RET`, `PROLOGUE`, `EPILOGUE`, `TAIL`, `SYSCALL`) = 7.
-Total ≈ 1200 defines per arch. Template-generated.
+Total ≈ 1210 defines per arch. Template-generated.
## Syscall conventions
diff --git a/demo.M1 b/demo.M1
@@ -19,14 +19,22 @@
## (No SUBI/XORI/MULI in P1 — see PLAN.md §"Feature floor".)
## Tranche 3: LA + memory round-trip. LD/ST (64-bit) and LB/SB
## (8-bit zero-extended). 32-bit LW/SW dropped from the ISA —
-## synthesize from LD + mask/shift if needed.
+## synthesize from LD + mask/shift if needed. Signed imm12 on
+## LD/ST is exercised via a NEG8-offset round-trip through
+## scratch_mid, mirroring the ADDI NEG1/NEG3 signed-imm test.
## Tranche 4: r7-indirect branches. B, BEQ, BNE, BLT each tested
## in BOTH taken and fall-through directions so an inverted
## condition encoding doesn't pass. BLT additionally tested with
## a negative operand (-1 < 0 signed) to prove signed semantics.
## BLTU/BGEU/BGE dropped from the ISA — see P1.md.
## Tranche 5: CALL / RET / TAIL / PROLOGUE / EPILOGUE. Nested CALL
-## stresses PROLOGUE lr-save; TAIL unwinds the frame.
+## stresses PROLOGUE lr-save; TAIL unwinds the frame. Stack
+## balance is additionally verified by snapshotting sp via
+## MOV_R6_SP before tranche 5 and comparing after — a TAIL that
+## omits its epilogue leaks fn_parent_tail's 16-byte frame and
+## the delta folds into the accumulator. On amd64 PROLOGUE/
+## EPILOGUE are NOPs so the delta is always 0; the check bites
+## on aarch64/riscv64 where prologue does real sp work.
##
## Run-and-verify:
## make PROG=demo ARCH=<arch> run && echo "exit=$?"
@@ -125,6 +133,24 @@
'00000000'
P1_LB_R1_R4_16 # r1 = zext [scratch+16] -> 5
+ ## Negative imm12 on the memory forms. LA to scratch_mid (= scratch+16)
+ ## then round-trip a distinct sentinel (13) via [r4 + -8] = scratch+8.
+ ## Proves the signed offset encoding sign-extends on LD/ST the same
+ ## way ADDI NEG1/NEG3 proves it on arith. A symmetric "both sides
+ ## miscompiled to the same wrong offset" bug could still false-pass,
+ ## but the common cases (sign bit dropped, imm zero-extended) blow
+ ## up via segfault or a mismatched round-trip value.
+ P1_LA_R4
+ &scratch_mid
+ P1_LI_R1
+ '0D000000' # r1 = 13 (distinct sentinel)
+ P1_ST_R1_R4_NEG8 # [scratch+8] = 13 (overwrites +8 slot's old 8)
+ P1_LI_R1
+ '00000000'
+ P1_LD_R1_R4_NEG8 # r1 = [scratch+8] -> 13
+ P1_ADDI_R1_R1_NEG3 # r1 = 13 - 3 = 10
+ P1_SHRI_R1_R1_1 # r1 = 10 >> 1 = 5
+
## Tranche 4: branches. r2=0, r3=1 to start; each subtest resets
## as needed. Taken-path subtests clobber r1 on fall-through;
## fall-through subtests clobber r1 on incorrect branch.
@@ -235,6 +261,14 @@
## return-to-_start address and we crash or hang. The function
## bodies live inline below the subtests, guarded by a P1_B over
## them so we don't fall through after the last subtest.
+ ##
+ ## Stack-balance discriminator for TAIL: snapshot sp into r6
+ ## (callee-saved) before any CALL. A correctly-paired call tree
+ ## nets to sp_after == sp_before. A TAIL that skips its epilogue
+ ## leaks fn_parent_tail's 16-byte frame — the delta is folded
+ ## into the accumulator below via SUB r1, r1, delta.
+ P1_MOV_R6_SP # r6 = sp snapshot (pre-tranche)
+
P1_LI_R7
&fn_identity
P1_CALL # nested-CALL test: returns r1 unchanged
@@ -267,6 +301,9 @@
P1_TAIL
:b5_end
+ P1_MOV_R2_SP # r2 = sp snapshot (post-tranche)
+ P1_SUB_R2_R2_R6 # r2 = sp_after - sp_before (0 if balanced)
+ P1_SUB_R1_R1_R2 # r1 -= delta; unchanged (= 5) iff balanced
P1_MOV_R6_R1 # r6 = 5 (callee-saved, survives syscalls)
@@ -320,10 +357,12 @@
## 32 bytes reserved for tranche 3 memory round-trip. The LOAD segment
## is RWX (see ELF-<arch>.hex2 ph_flags=7) so we can store into this
-## region at runtime.
+## region at runtime. scratch_mid = scratch+16, the base address for
+## the negative-imm12 LD/ST test ([scratch_mid + -8] = [scratch+8]).
:scratch
'0000000000000000'
'0000000000000000'
+:scratch_mid
'0000000000000000'
'0000000000000000'
diff --git a/p1_aarch64.M1 b/p1_aarch64.M1
@@ -74,6 +74,11 @@ DEFINE SYS_EXIT 5D000000
DEFINE P1_MOV_R1_R6 E10313AA ## mov x1, x19
DEFINE P1_MOV_R6_R1 F30301AA ## mov x19, x1
+## MOV rD, sp -> add xD, sp, #0 (canonical "mov rD, sp" form).
+## Used by the tranche-5 stack-balance discriminator.
+DEFINE P1_MOV_R6_SP F3030091 ## mov x19, sp
+DEFINE P1_MOV_R2_SP E2030091 ## mov x2, sp
+
## ADD rD, rA, rB -> add xD, xA, xB
DEFINE P1_ADD_R1_R1_R2 2100028B ## add x1, x1, x2
DEFINE P1_ADD_R1_R1_R4 2100048B ## add x1, x1, x4
@@ -81,6 +86,7 @@ DEFINE P1_ADD_R2_R2_R6 4200138B ## add x2, x2, x19 (syscall-arg comp
## SUB / AND / OR / XOR — base opcodes CB / 8A / AA / CA.
DEFINE P1_SUB_R1_R1_R2 210002CB ## sub x1, x1, x2
+DEFINE P1_SUB_R2_R2_R6 420013CB ## sub x2, x2, x19 (tranche-5 sp delta)
DEFINE P1_AND_R1_R1_R5 2100058A ## and x1, x1, x5
DEFINE P1_OR_R1_R1_R2 210002AA ## orr x1, x1, x2
DEFINE P1_XOR_R1_R1_R2 210002CA ## eor x1, x1, x2
@@ -150,6 +156,12 @@ DEFINE P1_LD_R1_R4_8 810440F9 ## ldr x1, [x4, #8]
DEFINE P1_SB_R1_R4_16 81400039 ## strb w1, [x4, #16]
DEFINE P1_LB_R1_R4_16 81404039 ## ldrb w1, [x4, #16] (zero-ext)
+## Negative imm12: aarch64's unsigned-offset LDR/STR can't encode a
+## negative offset, so use the unscaled-offset forms STUR/LDUR, which
+## carry a signed 9-bit immediate. The two's-complement imm9 encoding of -8 is 0x1F8.
+DEFINE P1_ST_R1_R4_NEG8 81801FF8 ## stur x1, [x4, #-8]
+DEFINE P1_LD_R1_R4_NEG8 81805FF8 ## ldur x1, [x4, #-8]
+
## ---- Tranche 4: branches (r7-indirect, no hex2_word needed) ------------
## Conditional branches compare ra vs rb, then jump to the address in r7.
diff --git a/p1_amd64.M1 b/p1_amd64.M1
@@ -61,6 +61,13 @@ DEFINE SYS_EXIT 3C000000
DEFINE P1_MOV_R1_R6 4889DF ## mov rdi, rbx
DEFINE P1_MOV_R6_R1 4889FB ## mov rbx, rdi
+## MOV rD, sp — read rsp into a GPR. Used by the tranche-5 stack-
+## balance discriminator. On amd64 PROLOGUE/EPILOGUE are NOPs and
+## CALL/RET self-balance, so this check is a no-op here; still
+## encoded so the same demo assembles unchanged.
+DEFINE P1_MOV_R6_SP 4889E3 ## mov rbx, rsp
+DEFINE P1_MOV_R2_SP 4889E6 ## mov rsi, rsp
+
## ADD rD, rA, rB -> mov rD,rA ; add rD,rB
DEFINE P1_ADD_R1_R1_R2 4889FF4801F7 ## mov rdi,rdi ; add rdi,rsi
DEFINE P1_ADD_R1_R1_R4 4889FF4C01D7 ## mov rdi,rdi ; add rdi,r10
@@ -68,6 +75,7 @@ DEFINE P1_ADD_R2_R2_R6 4889F64801DE ## mov rsi,rsi ; add rsi,rbx (syscal
## SUB / AND / OR / XOR — 2-insn form, leading mov rdi,rdi kept.
DEFINE P1_SUB_R1_R1_R2 4889FF4829F7 ## mov rdi,rdi ; sub rdi,rsi
+DEFINE P1_SUB_R2_R2_R6 4889F64829DE ## mov rsi,rsi ; sub rsi,rbx (tranche-5 sp delta)
DEFINE P1_XOR_R1_R1_R2 4889FF4831F7 ## mov rdi,rdi ; xor rdi,rsi
DEFINE P1_OR_R1_R1_R2 4889FF4809F7 ## mov rdi,rdi ; or rdi,rsi
DEFINE P1_AND_R1_R1_R5 4889FF4C21C7 ## mov rdi,rdi ; and rdi,r8
@@ -129,6 +137,10 @@ DEFINE P1_LD_R1_R4_8 498B7A08 ## mov rdi, [r10+8] (disp8)
DEFINE P1_SB_R1_R4_16 41887A10 ## mov [r10+16], dil
DEFINE P1_LB_R1_R4_16 490FB67A10 ## movzx rdi, byte [r10+16]
+## Negative disp8: x86-64 disp8 is signed, so -8 = 0xF8.
+DEFINE P1_ST_R1_R4_NEG8 49897AF8 ## mov [r10-8], rdi
+DEFINE P1_LD_R1_R4_NEG8 498B7AF8 ## mov rdi, [r10-8]
+
## ---- Tranche 4: branches (r7-indirect, no hex2_word needed) ------------
## Pattern: cmp ra,rb ; short native jcc over a jmp-through-r7.
diff --git a/p1_riscv64.M1 b/p1_riscv64.M1
@@ -67,12 +67,18 @@ DEFINE SYS_EXIT 5D000000
DEFINE P1_MOV_R1_R6 93850400 ## mv a1, s1
DEFINE P1_MOV_R6_R1 93840500 ## mv s1, a1
+## MOV rD, sp -> addi rD, sp, 0 (sp = x2).
+## Used by the tranche-5 stack-balance discriminator.
+DEFINE P1_MOV_R6_SP 93040100 ## addi s1, sp, 0
+DEFINE P1_MOV_R2_SP 13060100 ## addi a2, sp, 0
+
## ADD / SUB / AND / OR / XOR — R-type, funct3 picks the op.
DEFINE P1_ADD_R1_R1_R2 B385C500 ## add a1, a1, a2
DEFINE P1_ADD_R1_R1_R4 B385E500 ## add a1, a1, a4
DEFINE P1_ADD_R2_R2_R6 33069600 ## add a2, a2, s1 (syscall-arg computation)
DEFINE P1_SUB_R1_R1_R2 B385C540 ## sub a1, a1, a2 (funct7=0x20)
+DEFINE P1_SUB_R2_R2_R6 33069640 ## sub a2, a2, s1 (tranche-5 sp delta)
DEFINE P1_XOR_R1_R1_R2 B3C5C500 ## xor a1, a1, a2 (funct3=4)
DEFINE P1_OR_R1_R1_R2 B3E5C500 ## or a1, a1, a2 (funct3=6)
DEFINE P1_AND_R1_R1_R5 B3F5F500 ## and a1, a1, a5 (funct3=7)
@@ -124,6 +130,11 @@ DEFINE P1_LD_R1_R4_8 83358700 ## ld a1, 8(a4)
DEFINE P1_SB_R1_R4_16 2308B700 ## sb a1, 16(a4)
DEFINE P1_LB_R1_R4_16 83450701 ## lbu a1, 16(a4)
+## Negative imm12: RISC-V load/store immediates are signed 12-bit,
+## so -8 encodes as 0xFF8 (12-bit two's complement).
+DEFINE P1_ST_R1_R4_NEG8 233CB7FE ## sd a1, -8(a4)
+DEFINE P1_LD_R1_R4_NEG8 833587FF ## ld a1, -8(a4)
+
## ---- Tranche 4: branches (r7-indirect, no hex2_word needed) ------------
## RISC-V has B-type conditional branches with a scattered immediate; writing