commit c97fe9fc116a451582d73ddcba70c6eecef2c08c
parent 29dd4df7b3d09e614c5999efd71cbe9651d2ef02
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Mon, 20 Apr 2026 16:36:01 -0700
P1 prune ISA and rewrite demo with discriminating tests
Drop BLTU/BGEU/BGE and LW/SW from the P1 ISA: seed programs with
tagged-cell pointers don't need unsigned compares, and byte/64-bit
loads are enough once LW is synthesizable from LD + mask. Combinatoric
footprint drops from ~1500 to ~1200 defines/arch.
Rewrite demo.M1 so every op is applied with non-identity operands
whose result is unique vs. neighbor ops on the same inputs, so a
mis-encoded op poisons the accumulator and fails the exit-code check.
Branch tranche now tests both taken and fall-through directions for
each conditional, plus a signed-vs-unsigned discriminator on BLT via
r4=-1. SAR/SARI discriminated on a negative value (SHR/SHRI would
disagree). LD/ST exercised at both offset=0 and offset=8; LI_R6
roundtripped through MOV_R1_R6 so the otherwise-unused LI_R6 encoding
is covered.
Per-arch defs rewritten to match: dropped encodings for the removed
ops; added the new register/immediate tuples the demo consumes.
Diffstat:
| M | P1.md | | | 22 | ++++++++++++++-------- |
| M | PLAN.md | | | 10 | ++++++---- |
| M | demo.M1 | | | 243 | ++++++++++++++++++++++++++++++++++++++++++++++++------------------------------- |
| M | p1_aarch64.M1 | | | 112 | ++++++++++++++++++++++++++++++++++++++----------------------------------------- |
| M | p1_amd64.M1 | | | 105 | ++++++++++++++++++++++++++++++++++++------------------------------------------- |
| M | p1_riscv64.M1 | | | 84 | ++++++++++++++++++++++++++++++++++++------------------------------------------- |
6 files changed, 309 insertions(+), 267 deletions(-)
diff --git a/P1.md b/P1.md
@@ -113,14 +113,12 @@ LA rD, %label # load PC-relative address
# Memory (offset is signed 12-bit)
LD rD, rA, !off ST rS, rA, !off # 64-bit
-LW rD, rA, !off SW rS, rA, !off # 32-bit zero-extended / truncated
LB rD, rA, !off SB rS, rA, !off # 8-bit zero-extended / truncated
# Control flow
B %label # unconditional branch
BEQ rA, rB, %label BNE rA, rB, %label
-BLT rA, rB, %label BGE rA, rB, %label # signed
-BLTU rA, rB, %label BGEU rA, rB, %label # unsigned
+BLT rA, rB, %label # signed less-than
CALL %label RET
TAIL %label # tail call: epilogue + B %label
@@ -133,8 +131,16 @@ SYSCALL # num in r0, args r1-r6, ret in r0
- All arithmetic is on 64-bit values. `SHL`/`SHR`/`SAR` take shift amount in
the low 6 bits of `rB` (or the `!imm` for immediate forms).
- `DIV` is signed, truncated toward zero. `REM` matches `DIV`.
-- `LW`/`LB` zero-extend the loaded value into the 64-bit destination.
- (Signed-extending variants — `LWS`, `LBS` — can be added later if needed.)
+- `LB` zero-extends the loaded value into the 64-bit destination.
+ (A signed-extending variant `LBS` can be added later if needed. 32-bit
+ `LW`/`SW` are deliberately omitted — emulate with `LD`+`ANDI`/shift and
+ `ST` through a 64-bit scratch when needed.)
+- Unsigned comparisons (`BLTU`/`BGEU`) are not in the ISA: seed programs
+ with tagged-cell pointers only need signed comparisons. Synthesize from
+ `BLT` via operand-bias if unsigned compare is ever required.
+- `BGE rA, rB, %L` is not in the ISA: synthesize as
+ `BLT rA, rB, %skip; B %L; :skip` (the r7-indirect branch pattern makes
+ the skip cheap). `BLT rB, rA, %L` handles the strict-greater case.
- Branch offsets are PC-relative. In the v0.1 spike they are realized by
loading the target address via `LI` into `r7` and jumping through the
register; range is therefore unbounded within the 4 GiB address space.
@@ -181,11 +187,11 @@ Per-arch defs count (immediates handled by sigil, not enumerated):
- 6 immediate arith × 8² = 384. Each entry uses an immediate sigil (`!imm`),
so the immediate value itself is not enumerated.
- 3 move ops × 8 or 8² = ~80.
-- 6 memory ops × 8² = 384. Offsets use `!imm` sigil.
-- 7 branches × 8² = 448.
+- 4 memory ops × 8² = 256. Offsets use `!imm` sigil.
+- 3 conditional branches × 8² = 192.
- Singletons (`B`, `CALL`, `RET`, `TAIL`, `SYSCALL`) = 5.
-Total ≈ 1500 defines per arch. Template-generated.
+Total ≈ 1200 defines per arch. Template-generated.
## Syscall conventions
diff --git a/PLAN.md b/PLAN.md
@@ -150,10 +150,12 @@ with any future seed-stage program.
## Resolutions
-- **Narrow loads: zero-extend only.** P1 keeps `LB`/`LW` zero-extending;
- no `LBS`/`LWS` added. Fixnums live in full 64-bit tagged cells, so
- the interpreter never needs a sign-extended narrow load — byte/ASCII
- access is unsigned, and arithmetic happens on 64-bit values already.
+- **Narrow loads: zero-extend only, 8-bit only.** P1 keeps `LB`
+ zero-extending; no `LBS` added, and 32-bit `LW`/`SW` are out of the
+ ISA entirely (emulate through 64-bit `LD`/`ST` + mask/shift if ever
+ needed). Fixnums live in full 64-bit tagged cells, so the interpreter
+ never needs a sign-extended or 32-bit load — byte/ASCII access is
+ unsigned, and arithmetic happens on 64-bit values already.
- **Static code size: accept the 2× tax.** P1's destructive-expansion
rule on amd64 roughly doubles instruction count vs. hand-tuned amd64.
Matches P1's "deliberately dumb" contract (see `P1.md`). Interpreter
diff --git a/demo.M1 b/demo.M1
@@ -1,87 +1,139 @@
## P1 broader-ISA demo — portable across aarch64, amd64, riscv64.
##
-## Exercises the P1 ISA in tranches (see P1.md §"ISA"). Each op is
-## applied in a way that preserves the running value r1=5, so a
-## miscoded op produces the wrong exit status.
+## Exercises the P1 ISA in tranches (see P1.md §"Instruction set").
+## Each op is applied with non-identity operands so a mis-encoded op
+## produces a detectably-wrong final r1. If every op in every tranche
+## is correct, r1 ends at 5, stdout is "P1 = 5\n", exit is 5.
##
-## Tranche 1: LI, MOV, SYSCALL, plus every reg-reg-reg arith op:
-## ADD, SUB, AND, OR, XOR, MUL, DIV, REM, SHL, SHR, SAR.
-## Tranche 2: immediate forms
-## ADDI, ANDI, ORI, SHLI, SHRI, SARI
-## (no SUBI/XORI/MULI in P1 — see PLAN.md §"Feature floor").
-## Tranche 3: LA + memory round-trip
-## LA, ST/LD (64b), SW/LW (32b zero-ext), SB/LB (8b zero-ext)
-## Tranche 4: r7-indirect branches
-## B, BEQ, BNE, BLT, BGE, BLTU, BGEU — 7 taken-path subtests + 1 fall-through
-## Tranche 5 (current): CALL / RET / TAIL / PROLOGUE / EPILOGUE
-## Nested CALL (stresses PROLOGUE lr-save) + TAIL (must unwind before B)
-##
-## Constants for the identity chain:
-## r2 = 0 identity for ADD/SUB/XOR/OR/SHL/SHR/SAR
-## r3 = 1 identity for MUL/DIV
-## r5 = 7 identity for AND (5 & 7 = 5) and REM (5 % 7 = 5)
-## r1 holds the running value — starts at 5 and stays 5.
+## Tranche 1: reg-reg-reg arith (11 ops). Each step's result is
+## unique vs. neighbor ops on the same operands, e.g.
+## ADD(5,3)=8 SUB(8,3)=5 XOR(5,3)=6 OR(6,3)=7 AND(7,5)=5
+## MUL(5,3)=15 DIV(15,3)=5 REM(5,7)=5 SHL(5,3)=40 SHR(40,3)=5.
+## SAR vs SHR agree on positive values; SAR is separately tested
+## on a negative value in r4 (SAR(-1,3)=-1, SHR would give huge).
+## Tranche 2: immediate forms. ADDI tested with BOTH positive and
+## negative imm12 (proves signed-immediate encoding). SHLI/SHRI
+## with non-zero shift amount. ANDI/ORI with overlapping-bit
+## masks so the result is distinguishable from ADDI/XORI. SARI
+## tested on negative value via r4.
+## (No SUBI/XORI/MULI in P1 — see PLAN.md §"Feature floor".)
+## Tranche 3: LA + memory round-trip. LD/ST (64-bit) and LB/SB
+## (8-bit zero-extended). 32-bit LW/SW dropped from the ISA —
+## synthesize from LD + mask/shift if needed.
+## Tranche 4: r7-indirect branches. B, BEQ, BNE, BLT each tested
+## in BOTH taken and fall-through directions so an inverted
+## condition encoding doesn't pass. BLT additionally tested with
+## a negative operand (-1 < 0 signed) to prove signed semantics.
+## BLTU/BGEU/BGE dropped from the ISA — see P1.md.
+## Tranche 5: CALL / RET / TAIL / PROLOGUE / EPILOGUE. Nested CALL
+## stresses PROLOGUE lr-save; TAIL unwinds the frame.
##
## Run-and-verify:
## make PROG=demo ARCH=<arch> run && echo "exit=$?"
## expected stdout: "P1 = 5\n" expected exit: 5
:_start
+ ## Setup: r1 is the running accumulator; r2/r5 are arith partners.
+ ## r1 is seeded via the LI_R6 → MOV_R1_R6 roundtrip so the LI_R6
+ ## encoding gets exercised — otherwise r6 is only written by
+ ## MOV_R6_R1 in the exit path. Clobbering r1 to 0 between the two
+ ## ops means a broken MOV_R1_R6 (wrong source reg) leaves r1=0 and
+ ## poisons the chain; a broken LI_R6 (wrong dest reg or wrong
+ ## literal-slot offset) puts the wrong value in r6 and MOV
+ ## propagates it.
+ P1_LI_R6
+ '05000000' # r6 = 5 via LI_R6 (discriminator)
P1_LI_R1
- '05000000'
+ '00000000' # clobber r1 before the MOV
+ P1_MOV_R1_R6 # r1 = r6 = 5
P1_LI_R2
- '00000000'
- P1_LI_R3
- '01000000'
+ '03000000' # r2 = 3 (arith partner & shift amount)
P1_LI_R5
- '07000000'
-
- P1_ADD_R1_R1_R2 # 5 + 0 = 5
- P1_SUB_R1_R1_R2 # 5 - 0 = 5
- P1_XOR_R1_R1_R2 # 5 ^ 0 = 5
- P1_OR_R1_R1_R2 # 5 | 0 = 5
- P1_AND_R1_R1_R5 # 5 & 7 = 5
- P1_MUL_R1_R1_R3 # 5 * 1 = 5
- P1_DIV_R1_R1_R3 # 5 / 1 = 5
+ '05000000' # r5 = 5 (AND partner)
+
+ ## Tranche 1: reg-reg-reg arith.
+ P1_ADD_R1_R1_R2 # 5 + 3 = 8
+ P1_SUB_R1_R1_R2 # 8 - 3 = 5
+ P1_XOR_R1_R1_R2 # 5 ^ 3 = 6
+ P1_OR_R1_R1_R2 # 6 | 3 = 7
+ P1_AND_R1_R1_R5 # 7 & 5 = 5
+ P1_MUL_R1_R1_R2 # 5 * 3 = 15
+ P1_DIV_R1_R1_R2 # 15 / 3 = 5
+ P1_LI_R5
+ '07000000' # r5 = 7 for REM
P1_REM_R1_R1_R5 # 5 % 7 = 5
- P1_SHL_R1_R1_R2 # 5 << 0 = 5
- P1_SHR_R1_R1_R2 # 5 >> 0 = 5 (logical)
- P1_SAR_R1_R1_R2 # 5 >> 0 = 5 (arithmetic)
+ P1_SHL_R1_R1_R2 # 5 << 3 = 40
+ P1_SHR_R1_R1_R2 # 40 >> 3 = 5
- ## Tranche 2: immediate forms. Chain 5 -> 8 -> 4 -> 5 -> 5 -> 5 -> 5.
+ ## SAR discriminator: on positive values SAR and SHR agree, so
+ ## put a negative value in r4 and check that SAR preserves the
+ ## sign bits. Then fold r4 into r1 so a misbehaving SAR poisons
+ ## the accumulator.
+ P1_LI_R4
+ '00000000' # r4 = 0
+ P1_ADDI_R4_R4_NEG1 # r4 = 0 + (-1) = -1 (sign-extended 64-bit)
+ P1_SAR_R4_R4_R2 # r4 = -1 >> 3 = -1 (SHR would be 0x1FFF...FFFF)
+ P1_ADD_R1_R1_R4 # r1 = 5 + (-1) = 4
+ P1_ADDI_R1_R1_1 # r1 = 4 + 1 = 5
+
+ ## Tranche 2: immediate forms. Chain 5 → 8 → 5 → 10 → 5 → 4 → 5
+ ## plus a SARI discriminator via r4.
P1_ADDI_R1_R1_3 # 5 + 3 = 8
- P1_SHRI_R1_R1_1 # 8 >> 1 = 4
- P1_ORI_R1_R1_1 # 4 | 1 = 5
- P1_ANDI_R1_R1_7 # 5 & 7 = 5
- P1_SHLI_R1_R1_0 # 5 << 0 = 5
- P1_SARI_R1_R1_0 # 5 >> 0 = 5 (arithmetic)
+ P1_ADDI_R1_R1_NEG3 # 8 + (-3) = 5 (signed imm12)
+ P1_SHLI_R1_R1_1 # 5 << 1 = 10 (non-zero shift)
+ P1_SHRI_R1_R1_1 # 10 >> 1 = 5
+ P1_ANDI_R1_R1_6 # 5 & 6 = 4 (0b101 & 0b110 = 0b100)
+ P1_ORI_R1_R1_1 # 4 | 1 = 5 (aarch64 bitmask-immediate
+ # requires valid-mask imm; 5 is invalid, 1 is
+ # valid. Tranche 1 OR_R1_R1_R2 (6|3=7) already
+ # discriminates OR from ADD/XOR on overlapping
+ # bits; ORI here just tests the imm-ORR encoding
+ # exists and doesn't crash, plus distinguishes
+ # from AND (4&1=0 would fail the chain).)
- ## Tranche 3: memory round-trip. For each access width, store r1,
+ P1_LI_R4
+ '00000000'
+ P1_ADDI_R4_R4_NEG1 # r4 = -1
+ P1_SARI_R4_R4_1 # r4 = -1 >> 1 = -1 (SHRI would be 0x7FFF...FFFF)
+ P1_ADD_R1_R1_R4 # r1 = 5 + (-1) = 4
+ P1_ADDI_R1_R1_1 # r1 = 5
+
+ ## Tranche 3: memory round-trip. For each width, store r1,
## clobber r1 to 0, then reload. A broken ST/LD leaves r1 != 5.
P1_LA_R4
&scratch
- P1_ST_R1_R4_0 # [scratch+0..8] = r1 (= 5)
+ P1_ST_R1_R4_0 # [scratch+0..8] = r1 (= 5)
P1_LI_R1
'00000000'
P1_LD_R1_R4_0 # r1 = [scratch+0..8] -> 5
- P1_SW_R1_R4_8 # [scratch+8..12] = r1 (low 4 bytes)
+ ## Non-zero imm12 on the 64-bit forms. Distinct value 8 at +8 so
+ ## an LD_R4_8 that silently aliased to +0 would read 5 and fail the
+ ## SUB step; a broken ST_R4_8 leaves [+8]=0 (scratch init) which the
+ ## LD observes. The +0 round-trip above anchors the imm12=0 case.
+ P1_LI_R1
+ '08000000' # distinct from 5
+ P1_ST_R1_R4_8 # [scratch+8..16] = 8
P1_LI_R1
'00000000'
- P1_LW_R1_R4_8 # r1 = zext [scratch+8..12] -> 5
+ P1_LD_R1_R4_8 # r1 = [scratch+8..16] -> 8
+ P1_SUB_R1_R1_R2 # r1 = 8 - 3 = 5 (r2 still 3 from setup)
- P1_SB_R1_R4_16 # [scratch+16] = r1 (low byte)
+ P1_SB_R1_R4_16 # [scratch+16] = r1 (low byte)
P1_LI_R1
'00000000'
P1_LB_R1_R4_16 # r1 = zext [scratch+16] -> 5
- ## Tranche 4: branches. r2=0, r3=1 already, r5=7 already.
- ## Pattern for each taken-path test:
- ## set r7 = &b4_N_ok ; <branch cond met> ; clobber r1->0 ; :b4_N_ok
- ## If the branch correctly fires, the clobber is skipped.
- ##
- ## B — unconditional, jumps to &b4_1_ok.
+ ## Tranche 4: branches. r2=0, r3=1 to start; each subtest resets
+ ## as needed. Taken-path subtests clobber r1 on fall-through;
+ ## fall-through subtests clobber r1 on incorrect branch.
+ P1_LI_R2
+ '00000000' # r2 = 0
+ P1_LI_R3
+ '01000000' # r3 = 1
+
+ ## B — unconditional. Correct: jump to b4_1_ok, skipping clobber.
P1_LI_R7
&b4_1_ok
P1_B
@@ -89,97 +141,100 @@
'00000000'
:b4_1_ok
- ## BEQ r2,r3,r7 — 0 == 0? set r3 back to 0 first, branch, then restore.
+ ## BEQ taken: r3=0 so r2==r3.
P1_LI_R3
- '00000000' # r3 = 0 so r2 == r3
+ '00000000'
P1_LI_R7
&b4_2_ok
- P1_BEQ_R2_R3_R7 # 0 == 0, branch taken
+ P1_BEQ_R2_R3_R7 # 0 == 0, taken
P1_LI_R1
'00000000'
:b4_2_ok
P1_LI_R3
'01000000' # restore r3 = 1
- ## BNE r2,r3,r7 — 0 != 1? yes, branch taken.
+ ## BEQ fall-through: 0 != 1, branch must NOT fire. If it
+ ## (incorrectly) fires we jump to b4_3_bad and clobber r1.
+ P1_LI_R7
+ &b4_3_bad
+ P1_BEQ_R2_R3_R7 # 0 == 1? no, fall through
P1_LI_R7
&b4_3_ok
- P1_BNE_R2_R3_R7
+ P1_B
+:b4_3_bad
P1_LI_R1
'00000000'
:b4_3_ok
- ## BLT r2,r3,r7 — 0 < 1 (signed)? yes, branch taken.
+ ## BNE taken: 0 != 1.
P1_LI_R7
&b4_4_ok
- P1_BLT_R2_R3_R7
+ P1_BNE_R2_R3_R7 # 0 != 1, taken
P1_LI_R1
'00000000'
:b4_4_ok
- ## BGE r3,r2,r7 — needs a3 >= a2. Swap: load r2=1 r3=0 for this test.
- P1_LI_R2
- '01000000'
+ ## BNE fall-through: r3=0 so r2==r3; branch must NOT fire.
P1_LI_R3
'00000000'
P1_LI_R7
+ &b4_5_bad
+ P1_BNE_R2_R3_R7 # 0 != 0? no, fall through
+ P1_LI_R7
&b4_5_ok
- P1_BGE_R2_R3_R7 # 1 >= 0 (signed), branch taken
+ P1_B
+:b4_5_bad
P1_LI_R1
'00000000'
:b4_5_ok
- P1_LI_R2
- '00000000' # restore r2 = 0
P1_LI_R3
'01000000' # restore r3 = 1
- ## BLTU r2,r3,r7 — 0 < 1 (unsigned)? yes, branch taken.
+ ## BLT taken: 0 < 1 (signed).
P1_LI_R7
&b4_6_ok
- P1_BLTU_R2_R3_R7
+ P1_BLT_R2_R3_R7 # 0 < 1, taken
P1_LI_R1
'00000000'
:b4_6_ok
- ## BGEU r2,r3,r7 — needs a2 >= a3 (unsigned). Swap r2/r3.
+ ## BLT fall-through: 1 < 0 is false.
P1_LI_R2
- '01000000'
+ '01000000' # r2 = 1
P1_LI_R3
- '00000000'
+ '00000000' # r3 = 0
+ P1_LI_R7
+ &b4_7_bad
+ P1_BLT_R2_R3_R7 # 1 < 0? no, fall through
P1_LI_R7
&b4_7_ok
- P1_BGEU_R2_R3_R7 # 1 >= 0 (unsigned), branch taken
+ P1_B
+:b4_7_bad
P1_LI_R1
'00000000'
:b4_7_ok
+
+ ## BLT signed discrimination: -1 < 0 must be taken. If BLT were
+ ## accidentally unsigned, -1 as 0xFFFF...FFFF > 0 and the branch
+ ## would not fire.
P1_LI_R2
+ '00000000' # r2 = 0
+ P1_LI_R4
'00000000'
- P1_LI_R3
- '01000000'
-
- ## Fall-through test: BEQ with r2=0, r3=1 (unequal). The branch must NOT
- ## fire. If it does fire (incorrectly), we jump to &b4_ft_bad which
- ## clobbers r1. Correct behavior falls through the branch, then an
- ## unconditional B jumps past the clobber to &b4_ft_good.
- P1_LI_R7
- &b4_ft_bad
- P1_BEQ_R2_R3_R7 # 0 == 1? no, fall through
+ P1_ADDI_R4_R4_NEG1 # r4 = -1 (sign-extended)
P1_LI_R7
- &b4_ft_good
- P1_B # skip the bad-path clobber
-:b4_ft_bad
+ &b4_8_ok
+ P1_BLT_R4_R2_R7 # -1 < 0 (signed)? yes, taken
P1_LI_R1
'00000000'
-:b4_ft_good
-
- ## Restore r1 = 5 is implicit — we never clobbered it on the happy path.
+:b4_8_ok
## Tranche 5: CALL / RET / PROLOGUE / EPILOGUE / TAIL.
- ## fn_identity does its own nested CALL to fn_inner — if PROLOGUE doesn't
- ## spill lr correctly, the inner CALL clobbers the return-to-_start
- ## address and we crash or hang. The function bodies live inline below
- ## the subtests, guarded by a P1_B over them so we don't fall through
- ## into them after the last subtest.
+ ## fn_identity does its own nested CALL to fn_inner — if PROLOGUE
+ ## doesn't spill lr correctly, the inner CALL clobbers the
+ ## return-to-_start address and we crash or hang. The function
+ ## bodies live inline below the subtests, guarded by a P1_B over
+ ## them so we don't fall through after the last subtest.
P1_LI_R7
&fn_identity
P1_CALL # nested-CALL test: returns r1 unchanged
@@ -249,7 +304,7 @@
'01000000'
P1_SYSCALL
- ## exit(r6) — exit status = computed result
+ ## exit(r6) — exit status = computed result
P1_LI_R0
SYS_EXIT
P1_MOV_R1_R6
diff --git a/p1_aarch64.M1 b/p1_aarch64.M1
@@ -1,7 +1,7 @@
## P1 pseudo-ISA — aarch64 backing defs (v0.1 spike)
##
## Implements the subset needed by hello.M1 and demo.M1: LI, SYSCALL,
-## plus a handful of MOV/ADD/SUB register tuples. The full 1500-entry
+## the demo's arith/imm/memory/branch/call tuples. The full ~1200-entry
## table described in P1.md is generator-driven; what's here is the
## spike's hand-written sliver.
## See P1.md for the full ISA and register mapping.
@@ -23,8 +23,7 @@
##
## The 4-byte form is deliberate for the spike: it pairs with hex2's
## `&label` sigil unchanged. A proper 64-bit LI via a PC-relative LDR X
-## literal is left for the aarch64 hex2_word extensions described in
-## P1.md §"What needs to be added".
+## literal is left for later.
DEFINE P1_LI_R0 4000001802000014
DEFINE P1_LI_R1 4100001802000014
@@ -64,90 +63,90 @@ DEFINE SYS_WRITE 40000000
DEFINE SYS_EXIT 5D000000
-## ---- MOV / ADD / SUB (demo.M1 subset) ------------------------------------
-## One native insn per op. Named after the P1-register tuple they realize.
-## Add entries here as new programs need them; a generator will replace the
-## hand-maintenance once more than a handful of tuples are in use.
-
-## MOV rD, rA -> orr xD, xzr, xA
-DEFINE P1_MOV_R6_R3 F30303AA ## mov x19, x3
-DEFINE P1_MOV_R1_R6 E10313AA ## mov x1, x19
-
-## ADD rD, rA, rB -> add xD, xA, xB
-DEFINE P1_ADD_R3_R1_R2 2300028B ## add x3, x1, x2
-DEFINE P1_ADD_R2_R2_R6 4200138B ## add x2, x2, x19
-
-## SUB rD, rA, rB -> sub xD, xA, xB
-DEFINE P1_SUB_R3_R3_R4 630004CB ## sub x3, x3, x4
-
-
## ---- Tranche 1: full arith reg-reg-reg ----------------------------------
-## Identity-chain tuples used by demo.M1 to exercise every P1 arith op
-## without branches or memory. r1 is the running accumulator; each op
-## uses an identity partner so the correct output is always r1.
-## aarch64 has 3-operand forms for all of these, so one insn per op.
+## Non-identity discriminator chain used by demo.M1 to exercise every
+## P1 arith op. r1 is the running accumulator; each step's result is
+## unique vs. its neighbor ops on the chosen operands (see demo.M1).
+## aarch64 has 3-operand forms for all of these — one insn per op.
## REM has no native op — expands to sdiv+msub through scratch reg x4.
## MOV rD, rA -> orr xD, xzr, xA
+DEFINE P1_MOV_R1_R6 E10313AA ## mov x1, x19
DEFINE P1_MOV_R6_R1 F30301AA ## mov x19, x1
-## ADD / SUB / AND / OR / XOR — base opcodes 8B / CB / 8A / AA / CA.
+## ADD rD, rA, rB -> add xD, xA, xB
DEFINE P1_ADD_R1_R1_R2 2100028B ## add x1, x1, x2
+DEFINE P1_ADD_R1_R1_R4 2100048B ## add x1, x1, x4
+DEFINE P1_ADD_R2_R2_R6 4200138B ## add x2, x2, x19 (syscall-arg computation)
+
+## SUB / AND / OR / XOR — base opcodes CB / 8A / AA / CA.
DEFINE P1_SUB_R1_R1_R2 210002CB ## sub x1, x1, x2
DEFINE P1_AND_R1_R1_R5 2100058A ## and x1, x1, x5
DEFINE P1_OR_R1_R1_R2 210002AA ## orr x1, x1, x2
DEFINE P1_XOR_R1_R1_R2 210002CA ## eor x1, x1, x2
-## MUL rD, rA, rB -> madd xD, xA, xB, xzr
-DEFINE P1_MUL_R1_R1_R3 217C039B ## mul x1, x1, x3
+## MUL rD, rA, rB -> mul xD, xA, xB (= madd xD, xA, xB, xzr)
+DEFINE P1_MUL_R1_R1_R2 217C029B ## mul x1, x1, x2
## DIV rD, rA, rB -> sdiv xD, xA, xB
-DEFINE P1_DIV_R1_R1_R3 210CC39A ## sdiv x1, x1, x3
+DEFINE P1_DIV_R1_R1_R2 210CC29A ## sdiv x1, x1, x2
## REM rD, rA, rB -> sdiv x4, xA, xB ; msub xD, x4, xB, xA
## Two insns; x4 is a caller-saved scratch (= P1 r4). Demo keeps any
## live r4 value dead across this op.
DEFINE P1_REM_R1_R1_R5 240CC59A8184059B ## sdiv x4,x1,x5; msub x1,x4,x5,x1
-## SHL / SHR / SAR -> lslv / lsrv / asrv (64-bit variants).
+## SHL / SHR -> lslv / lsrv (64-bit).
DEFINE P1_SHL_R1_R1_R2 2120C29A ## lsl x1, x1, x2
DEFINE P1_SHR_R1_R1_R2 2124C29A ## lsr x1, x1, x2
-DEFINE P1_SAR_R1_R1_R2 2128C29A ## asr x1, x1, x2
+
+## SAR rD, rA, rB -> asrv xD, xA, xB. Discriminator lives on r4
+## (negative value) — see demo.M1 tranche 1.
+DEFINE P1_SAR_R4_R4_R2 8428C29A ## asr x4, x4, x2
## ---- Tranche 2: immediate arith ---------------------------------------
-## Tuples exercised by demo.M1: chain 5 -> 8 -> 4 -> 5 -> 5 -> 5 -> 5.
+## Demo chain: 5 → 8 → 5 → 10 → 5 → 4 → 5 plus a SARI discriminator
+## on r4 (negative). All imms are valid bitmask-immediates where
+## applicable (see P1.md); 5 is NOT a valid bitmask so ORI uses 1.
##
-## Bitwise-imm ops (AND/OR) use aarch64's logical-immediate encoding:
-## (N, immr, imms) encodes a run of consecutive 1s rotated by immr.
-## For 64-bit patterns N=1 and imms = (ones_count - 1). Immediates:
-## imm=1 -> N=1 imms=0 immr=0 (one 1, no rotation)
-## imm=7 -> N=1 imms=2 immr=0 (three consecutive 1s)
## Shift-imm ops are UBFM/SBFM aliases:
## LSL #n -> UBFM immr=-n mod 64, imms=63-n
## LSR #n -> UBFM immr=n, imms=63
## ASR #n -> SBFM immr=n, imms=63
+## Bitwise-imm ops use aarch64's logical-immediate encoding
+## (N:immr:imms over 13 bits).
+
+## aarch64 has no native "ADDI with negative imm" form — model P1's
+## signed ADDI by emitting ADD (imm) for non-negative values and
+## SUB (imm) for negative ones. Both flavors sign-extend the imm12
+## into the 64-bit destination.
+DEFINE P1_ADDI_R1_R1_3 210C0091 ## add x1, x1, #3
+DEFINE P1_ADDI_R1_R1_1 21040091 ## add x1, x1, #1
+DEFINE P1_ADDI_R1_R1_NEG3 210C00D1 ## sub x1, x1, #3
+DEFINE P1_ADDI_R4_R4_NEG1 840400D1 ## sub x4, x4, #1
+
+DEFINE P1_SHLI_R1_R1_1 21F87FD3 ## lsl x1, x1, #1 (UBFM immr=63, imms=62)
+DEFINE P1_SHRI_R1_R1_1 21FC41D3 ## lsr x1, x1, #1 (UBFM immr=1, imms=63)
+
+DEFINE P1_ANDI_R1_R1_6 21047F92 ## and x1, x1, #6 (N=1, immr=63, imms=1)
+DEFINE P1_ORI_R1_R1_1 210040B2 ## orr x1, x1, #1
-DEFINE P1_ADDI_R1_R1_3 210C0091 ## add x1, x1, #3
-DEFINE P1_ANDI_R1_R1_7 21084092 ## and x1, x1, #7
-DEFINE P1_ORI_R1_R1_1 210040B2 ## orr x1, x1, #1
-DEFINE P1_SHLI_R1_R1_0 21FC40D3 ## lsl x1, x1, #0 (UBFM #0,#63)
-DEFINE P1_SHRI_R1_R1_1 21FC41D3 ## lsr x1, x1, #1 (UBFM #1,#63)
-DEFINE P1_SARI_R1_R1_0 21FC4093 ## asr x1, x1, #0 (SBFM #0,#63)
+DEFINE P1_SARI_R4_R4_1 84FC4193 ## asr x4, x4, #1 (SBFM immr=1, imms=63)
## ---- Tranche 3: LA + memory ops ---------------------------------------
## LA is LI in the spike — both load a 4-byte zero-extended literal,
-## which is enough to address the ELF (base 0x00600000 < 2^32). Extending
-## LA to a full 64-bit load is future work (P1.md §"What needs added").
+## which is enough to address the ELF (base 0x00600000 < 2^32).
DEFINE P1_LA_R4 4400001802000014 ## ldr w4, [pc+8] ; b +8 ; <4 bytes>
-## Unsigned-offset forms of LDR/STR at 8/4/1-byte widths.
-## imm12 is scaled by access size, so e.g. LDR-w offset 8 uses imm12=2.
+## Unsigned-offset forms of LDR/STR at 8/1-byte widths. imm12 is scaled
+## by access size, so e.g. LDR-B offset 16 uses imm12=16.
+## 32-bit LW/SW dropped from the ISA.
DEFINE P1_ST_R1_R4_0 810000F9 ## str x1, [x4, #0]
DEFINE P1_LD_R1_R4_0 810040F9 ## ldr x1, [x4, #0]
-DEFINE P1_SW_R1_R4_8 810800B9 ## str w1, [x4, #8]
-DEFINE P1_LW_R1_R4_8 810840B9 ## ldr w1, [x4, #8] (zero-ext)
+DEFINE P1_ST_R1_R4_8 810400F9 ## str x1, [x4, #8] (imm12=1, scaled ×8)
+DEFINE P1_LD_R1_R4_8 810440F9 ## ldr x1, [x4, #8]
DEFINE P1_SB_R1_R4_16 81400039 ## strb w1, [x4, #16]
DEFINE P1_LB_R1_R4_16 81404039 ## ldrb w1, [x4, #16] (zero-ext)
@@ -162,18 +161,15 @@ DEFINE P1_LB_R1_R4_16 81404039 ## ldrb w1, [x4, #16] (zero-ext)
##
## Conditions for "skip if NOT cond":
## BEQ -> B.NE (cond 1) BNE -> B.EQ (cond 0)
-## BLT -> B.GE (cond A) BGE -> B.LT (cond B)
-## BLTU-> B.HS (cond 2) BGEU -> B.LO (cond 3)
-## CMP x2, x3 = SUBS xzr, x2, x3 = 0xEB03005F (Rm=3, Rn=2, Rd=31).
-## BR x20 = 0xD61F0280.
+## BLT -> B.GE (cond A)
+## Unsigned branches (BLTU/BGEU/BGE) dropped from the ISA — see P1.md.
+## CMP xN, xM = SUBS xzr, xN, xM. BR x20 = 0xD61F0280.
DEFINE P1_B 80021FD6
-DEFINE P1_BEQ_R2_R3_R7 5F0003EB4100005480021FD6 ## cmp ; b.ne +8 ; br x20
-DEFINE P1_BNE_R2_R3_R7 5F0003EB4000005480021FD6 ## cmp ; b.eq +8 ; br x20
-DEFINE P1_BLT_R2_R3_R7 5F0003EB4A00005480021FD6 ## cmp ; b.ge +8 ; br x20
-DEFINE P1_BGE_R2_R3_R7 5F0003EB4B00005480021FD6 ## cmp ; b.lt +8 ; br x20
-DEFINE P1_BLTU_R2_R3_R7 5F0003EB4200005480021FD6 ## cmp ; b.hs +8 ; br x20
-DEFINE P1_BGEU_R2_R3_R7 5F0003EB4300005480021FD6 ## cmp ; b.lo +8 ; br x20
+DEFINE P1_BEQ_R2_R3_R7 5F0003EB4100005480021FD6 ## cmp x2,x3 ; b.ne +8 ; br x20
+DEFINE P1_BNE_R2_R3_R7 5F0003EB4000005480021FD6 ## cmp x2,x3 ; b.eq +8 ; br x20
+DEFINE P1_BLT_R2_R3_R7 5F0003EB4A00005480021FD6 ## cmp x2,x3 ; b.ge +8 ; br x20
+DEFINE P1_BLT_R4_R2_R7 9F0002EB4A00005480021FD6 ## cmp x4,x2 ; b.ge +8 ; br x20
## ---- Tranche 5: CALL / RET / PROLOGUE / EPILOGUE / TAIL -----------------
diff --git a/p1_amd64.M1 b/p1_amd64.M1
@@ -1,7 +1,6 @@
## P1 pseudo-ISA — amd64 backing defs (v0.1 spike)
##
-## Implements the subset needed by hello.M1 and demo.M1: LI, SYSCALL,
-## plus a handful of MOV/ADD/SUB register tuples. See P1.md.
+## Implements the subset needed by hello.M1 and demo.M1. See P1.md.
##
## Register mapping (P1 → amd64):
## r0 → rax , r1 → rdi , r2 → rsi , r3 → rdx
@@ -37,11 +36,6 @@ DEFINE P1_LI_R7 41BC # mov r12d, imm32
## Expansion:
## mov r9, rbx ; 49 89 D9
## syscall ; 0F 05
-##
-## The shuffle runs unconditionally — syscalls that ignore arg 6 don't
-## care that r9 got overwritten with rbx (both are caller-saved on the
-## platform ABI; the kernel reads only the registers the specific
-## syscall cares about).
DEFINE P1_SYSCALL 4989D90F05
@@ -51,109 +45,107 @@ DEFINE SYS_WRITE 01000000
DEFINE SYS_EXIT 3C000000
-## ---- MOV / ADD / SUB (demo.M1 subset) ------------------------------------
+## ---- Tranche 1: full arith reg-reg-reg ----------------------------------
## amd64 lacks a 3-operand arithmetic form, so every "rD = rA op rB"
## expands unconditionally to two native insns:
## mov rD_native, rA_native
## <op> rD_native, rB_native
## When rD == rA the leading mov is a no-op write, kept anyway — P1 is
## deliberately unoptimized (P1.md §"Non-goals").
-
-## MOV rD, rA -> mov rD_native, rA_native (single insn)
-DEFINE P1_MOV_R6_R3 4889D3 ## mov rbx, rdx
-DEFINE P1_MOV_R1_R6 4889DF ## mov rdi, rbx
-
-## ADD rD, rA, rB -> mov rD,rA ; add rD,rB
-DEFINE P1_ADD_R3_R1_R2 4889FA4801F2 ## mov rdx,rdi ; add rdx,rsi
-DEFINE P1_ADD_R2_R2_R6 4889F64801DE ## mov rsi,rsi ; add rsi,rbx
-
-## SUB rD, rA, rB -> mov rD,rA ; sub rD,rB
-DEFINE P1_SUB_R3_R3_R4 4889D24C29D2 ## mov rdx,rdx ; sub rdx,r10
-
-
-## ---- Tranche 1: full arith reg-reg-reg ----------------------------------
-## Identity-chain tuples used by demo.M1. r1=rdi runs through each op.
## x86-64 oddities:
## - Shifts need count in cl; three-insn form mov rD,rA; mov rcx,rB; shl.
## - IDIV needs dividend in rdx:rax. We save rdx to rcx before CQO so
## both the divisor (when it is rdx) and the caller's r3 survive.
## MOV rD, rA -> mov rD_native, rA_native
+DEFINE P1_MOV_R1_R6 4889DF ## mov rdi, rbx
DEFINE P1_MOV_R6_R1 4889FB ## mov rbx, rdi
-## ADD / SUB / AND / OR / XOR — 2-insn form, leading mov rdi,rdi kept.
+## ADD rD, rA, rB -> mov rD,rA ; add rD,rB
DEFINE P1_ADD_R1_R1_R2 4889FF4801F7 ## mov rdi,rdi ; add rdi,rsi
+DEFINE P1_ADD_R1_R1_R4 4889FF4C01D7 ## mov rdi,rdi ; add rdi,r10
+DEFINE P1_ADD_R2_R2_R6 4889F64801DE ## mov rsi,rsi ; add rsi,rbx (syscall arg)
+
+## SUB / AND / OR / XOR — 2-insn form, leading mov rdi,rdi kept.
DEFINE P1_SUB_R1_R1_R2 4889FF4829F7 ## mov rdi,rdi ; sub rdi,rsi
DEFINE P1_XOR_R1_R1_R2 4889FF4831F7 ## mov rdi,rdi ; xor rdi,rsi
DEFINE P1_OR_R1_R1_R2 4889FF4809F7 ## mov rdi,rdi ; or rdi,rsi
DEFINE P1_AND_R1_R1_R5 4889FF4C21C7 ## mov rdi,rdi ; and rdi,r8
-## MUL rD, rA, rB -> mov rD,rA ; imul rD,rB (IMUL r64,r/m64 = 0F AF)
-DEFINE P1_MUL_R1_R1_R3 4889FF480FAFFA ## mov rdi,rdi ; imul rdi,rdx
+## MUL rD, rA, rB -> mov rD,rA ; imul rD,rB (IMUL r64,r/m64 = 0F AF)
+DEFINE P1_MUL_R1_R1_R2 4889FF480FAFFE ## mov rdi,rdi ; imul rdi,rsi
-## DIV rD, rA, rB -> divisor in rdx (r3). Save rdx to rcx so the
-## divisor survives CQO's clobber of rdx, and so r3 is restored after.
-## mov rcx,rdx ; mov rax,rdi ; cqo ; idiv rcx ; mov rdi,rax ; mov rdx,rcx
-DEFINE P1_DIV_R1_R1_R3 4889D14889F8489948F7F94889C74889CA
+## DIV rD, rA, rB -> divisor in rsi (r2). CQO still clobbers rdx,
+## so save/restore caller's r3 through rcx.
+## mov rcx,rdx ; mov rax,rdi ; cqo ; idiv rsi ; mov rdi,rax ; mov rdx,rcx
+DEFINE P1_DIV_R1_R1_R2 4889D14889F8489948F7FE4889C74889CA
-## REM rD, rA, rB -> divisor in r8 (r5). CQO still clobbers rdx, so
-## save/restore r3 through rcx.
+## REM rD, rA, rB -> divisor in r8 (r5). Same rdx-save dance.
## mov rcx,rdx ; mov rax,rdi ; cqo ; idiv r8 ; mov rdi,rdx ; mov rdx,rcx
DEFINE P1_REM_R1_R1_R5 4889D14889F8489949F7F84889D74889CA
-## SHL / SHR / SAR -> mov rD,rA ; mov rcx,rB ; shl/shr/sar rD,cl
+## SHL / SHR -> mov rD,rA ; mov rcx,rB ; shl/shr rD,cl
## Opcode D3 /n with REX.W: /4=SHL, /5=SHR, /7=SAR.
DEFINE P1_SHL_R1_R1_R2 4889FF4889F148D3E7 ## mov rdi,rdi; mov rcx,rsi; shl rdi,cl
DEFINE P1_SHR_R1_R1_R2 4889FF4889F148D3EF ## mov rdi,rdi; mov rcx,rsi; shr rdi,cl
-DEFINE P1_SAR_R1_R1_R2 4889FF4889F148D3FF ## mov rdi,rdi; mov rcx,rsi; sar rdi,cl
+
+## SAR — lives on r4 (r10) with negative input; see demo.M1 tranche 1.
+## mov r10,r10 ; mov rcx,rsi ; sar r10,cl
+DEFINE P1_SAR_R4_R4_R2 4D89D24889F149D3FA
## ---- Tranche 2: immediate arith ---------------------------------------
-## mov rdi,rdi ; <op> rdi, imm8 (sign-extended imm8 forms via opcode 83;
+## Pattern: mov rD,rA ; <op> rD, imm8 (sign-extended imm8 via opcode 83;
## shifts via C1). /n is the opcode-extension field in ModRM.reg.
+## Negative ADDI uses the same `add r/m64, imm8` with a sign-extended
+## imm8 — x86 handles the sign natively.
-DEFINE P1_ADDI_R1_R1_3 4889FF4883C703 ## add rdi, 3 (83 /0 ib)
-DEFINE P1_ANDI_R1_R1_7 4889FF4883E707 ## and rdi, 7 (83 /4 ib)
-DEFINE P1_ORI_R1_R1_1 4889FF4883CF01 ## or rdi, 1 (83 /1 ib)
-DEFINE P1_SHLI_R1_R1_0 4889FF48C1E700 ## shl rdi, 0 (C1 /4 ib)
+DEFINE P1_ADDI_R1_R1_3 4889FF4883C703 ## add rdi, 3 (83 /0 ib)
+DEFINE P1_ADDI_R1_R1_1 4889FF4883C701 ## add rdi, 1
+DEFINE P1_ADDI_R1_R1_NEG3 4889FF4883C7FD ## add rdi, -3 (imm8 FD = -3)
+DEFINE P1_ADDI_R4_R4_NEG1 4D89D24983C2FF ## mov r10,r10 ; add r10, -1
+
+DEFINE P1_SHLI_R1_R1_1 4889FF48C1E701 ## shl rdi, 1 (C1 /4 ib)
DEFINE P1_SHRI_R1_R1_1 4889FF48C1EF01 ## shr rdi, 1 (C1 /5 ib)
-DEFINE P1_SARI_R1_R1_0 4889FF48C1FF00 ## sar rdi, 0 (C1 /7 ib)
+
+DEFINE P1_ANDI_R1_R1_6 4889FF4883E706 ## and rdi, 6 (83 /4 ib)
+DEFINE P1_ORI_R1_R1_1 4889FF4883CF01 ## or rdi, 1 (83 /1 ib)
+
+DEFINE P1_SARI_R4_R4_1 4D89D249C1FA01 ## mov r10,r10 ; sar r10, 1
## ---- Tranche 3: LA + memory ops ---------------------------------------
## LA is LI in the spike (addresses fit in 32 bits → zero-extends cleanly
-## through the mov-to-r32 form). r4 is r10, so base-reg encoding uses
-## REX.B=1 and ModRM rm=010. No SIB byte needed (r10's low3 bits = 010).
+## through the mov-to-r32 form). r4 is r10.
DEFINE P1_LA_R4 41BA ## mov r10d, imm32
-## Plain MOV r/m, r / MOV r, r/m with 8-bit displacement.
-## REX: W=1 for 64-bit moves; B=1 always (r10 is the base register).
+## Plain MOV r/m, r / MOV r, r/m with 8-bit displacement. REX.W=1 for
+## 64-bit moves; REX.B=1 always (r10 is the base register).
+## 32-bit LW/SW dropped from the ISA.
DEFINE P1_ST_R1_R4_0 49893A ## mov [r10], rdi
DEFINE P1_LD_R1_R4_0 498B3A ## mov rdi, [r10]
-DEFINE P1_SW_R1_R4_8 41897A08 ## mov [r10+8], edi
-DEFINE P1_LW_R1_R4_8 418B7A08 ## mov edi, [r10+8] (zero-ext)
+DEFINE P1_ST_R1_R4_8 49897A08 ## mov [r10+8], rdi (disp8)
+DEFINE P1_LD_R1_R4_8 498B7A08 ## mov rdi, [r10+8] (disp8)
DEFINE P1_SB_R1_R4_16 41887A10 ## mov [r10+16], dil
DEFINE P1_LB_R1_R4_16 490FB67A10 ## movzx rdi, byte [r10+16]
## ---- Tranche 4: branches (r7-indirect, no hex2_word needed) ------------
-## Same pattern as aarch64: cmp ra,rb ; short native jcc over a jmp-through-r7.
-## If cond is false the native "skip" jcc fires (opposite of the P1 cond) and
-## steps past the 3-byte `jmp r12`, falling through. If cond is true we take
-## the jmp r12 to the address the caller stashed in r7.
+## Pattern: cmp ra,rb ; short native jcc over a jmp-through-r7.
+## If cond is false the native "skip" jcc fires (opposite of the P1 cond)
+## and steps past the 3-byte `jmp r12`, falling through. If cond is true
+## we take the jmp r12 to the address the caller stashed in r7.
## P1_B is just `jmp r12` unconditionally.
##
## CMP rsi, rdx = 48 39 D6 (REX.W, opcode 39 /r, ModRM: 11 010 110).
## JMP r12 = 41 FF E4 (REX.B, opcode FF /4, ModRM: 11 100 100).
-## jcc rel8 opcodes (skip when NOT cond): JE=74 JNE=75 JL=7C JGE=7D JB=72 JAE=73.
+## jcc rel8 opcodes (skip when NOT cond): JE=74 JNE=75 JL=7C JGE=7D.
DEFINE P1_B 41FFE4 ## jmp r12
DEFINE P1_BEQ_R2_R3_R7 4839D6750341FFE4 ## cmp ; jne +3 ; jmp r12
DEFINE P1_BNE_R2_R3_R7 4839D6740341FFE4 ## cmp ; je +3 ; jmp r12
-DEFINE P1_BLT_R2_R3_R7 4839D67D0341FFE4 ## cmp ; jge +3 ; jmp r12
-DEFINE P1_BGE_R2_R3_R7 4839D67C0341FFE4 ## cmp ; jl +3 ; jmp r12
-DEFINE P1_BLTU_R2_R3_R7 4839D6730341FFE4 ## cmp ; jae +3 ; jmp r12
-DEFINE P1_BGEU_R2_R3_R7 4839D6720341FFE4 ## cmp ; jb +3 ; jmp r12
+DEFINE P1_BLT_R2_R3_R7 4839D67D0341FFE4 ## cmp rsi,rdx ; jge +3 ; jmp r12
+DEFINE P1_BLT_R4_R2_R7 4939F27D0341FFE4 ## cmp r10,rsi ; jge +3 ; jmp r12
## ---- Tranche 5: CALL / RET / PROLOGUE / EPILOGUE / TAIL -----------------
@@ -164,8 +156,7 @@ DEFINE P1_BGEU_R2_R3_R7 4839D6720341FFE4 ## cmp ; jb +3 ; jmp r12
## work spilling/reloading lr.
##
## CALL expects the target pre-loaded into r7 (= r12); expands to `call r12`.
-## TAIL = EPILOGUE + unconditional B (= `jmp r12`), so the caller of the
-## tail-calling function receives the return from the tail target directly.
+## TAIL = EPILOGUE + unconditional B (= `jmp r12`).
DEFINE P1_PROLOGUE 90 ## nop (retaddr already on stack)
DEFINE P1_EPILOGUE 90 ## nop
diff --git a/p1_riscv64.M1 b/p1_riscv64.M1
@@ -1,7 +1,6 @@
## P1 pseudo-ISA — riscv64 backing defs (v0.1 spike)
##
-## Implements the subset needed by hello.M1 and demo.M1: LI, SYSCALL,
-## plus a handful of MOV/ADD/SUB register tuples. See P1.md.
+## Implements the subset needed by hello.M1 and demo.M1. See P1.md.
##
## Register mapping (P1 → RISC-V):
## r0 → a0 (x10) , r1 → a1 (x11) , r2 → a2 (x12) , r3 → a3 (x13)
@@ -52,9 +51,6 @@ DEFINE P1_LI_R7 170900000369C9006F008000
## mv a4, a5 ; P1 r5 -> native arg5
## mv a5, s1 ; P1 r6 -> native arg6
## ecall
-##
-## Unconditional — unused shuffles read caller-saved scratch, and the
-## kernel reads only the regs each syscall cares about.
DEFINE P1_SYSCALL 9308050013850500930506001386060093060700138707009387040073000000
@@ -63,56 +59,55 @@ DEFINE SYS_WRITE 40000000
DEFINE SYS_EXIT 5D000000
-## ---- MOV / ADD / SUB (demo.M1 subset) ------------------------------------
-## One native insn per op.
-
-## MOV rD, rA -> addi rD, rA, 0 (the `mv` pseudo)
-DEFINE P1_MOV_R6_R3 93840600 ## mv s1, a3
-DEFINE P1_MOV_R1_R6 93850400 ## mv a1, s1
-
-## ADD rD, rA, rB -> add rD, rA, rB
-DEFINE P1_ADD_R3_R1_R2 B386C500 ## add a3, a1, a2
-DEFINE P1_ADD_R2_R2_R6 33069600 ## add a2, a2, s1
-
-## SUB rD, rA, rB -> sub rD, rA, rB
-DEFINE P1_SUB_R3_R3_R4 B386E640 ## sub a3, a3, a4
-
-
## ---- Tranche 1: full arith reg-reg-reg ----------------------------------
-## Identity-chain tuples used by demo.M1. All one-insn R-type ops.
-## MUL/DIV/REM are M-extension ops (standard on rv64gc).
+## Non-identity discriminator chain used by demo.M1. All one-insn R-type
+## ops. MUL/DIV/REM are M-extension ops (standard on rv64gc).
-## MOV rD, rA -> addi rD, rA, 0
+## MOV rD, rA -> addi rD, rA, 0 (the `mv` pseudo)
+DEFINE P1_MOV_R1_R6 93850400 ## mv a1, s1
DEFINE P1_MOV_R6_R1 93840500 ## mv s1, a1
## ADD / SUB / AND / OR / XOR — R-type, funct3 picks the op.
DEFINE P1_ADD_R1_R1_R2 B385C500 ## add a1, a1, a2
+DEFINE P1_ADD_R1_R1_R4 B385E500 ## add a1, a1, a4
+DEFINE P1_ADD_R2_R2_R6 33069600 ## add a2, a2, s1 (syscall-arg computation)
+
DEFINE P1_SUB_R1_R1_R2 B385C540 ## sub a1, a1, a2 (funct7=0x20)
DEFINE P1_XOR_R1_R1_R2 B3C5C500 ## xor a1, a1, a2 (funct3=4)
DEFINE P1_OR_R1_R1_R2 B3E5C500 ## or a1, a1, a2 (funct3=6)
DEFINE P1_AND_R1_R1_R5 B3F5F500 ## and a1, a1, a5 (funct3=7)
## MUL / DIV / REM — M extension, funct7=1.
-DEFINE P1_MUL_R1_R1_R3 B385D502 ## mul a1, a1, a3
-DEFINE P1_DIV_R1_R1_R3 B3C5D502 ## div a1, a1, a3 (funct3=4)
+DEFINE P1_MUL_R1_R1_R2 B385C502 ## mul a1, a1, a2
+DEFINE P1_DIV_R1_R1_R2 B3C5C502 ## div a1, a1, a2 (funct3=4)
DEFINE P1_REM_R1_R1_R5 B3E5F502 ## rem a1, a1, a5 (funct3=6)
-## SHL / SHR / SAR -> sll / srl / sra.
+## SHL / SHR -> sll / srl.
DEFINE P1_SHL_R1_R1_R2 B395C500 ## sll a1, a1, a2 (funct3=1)
DEFINE P1_SHR_R1_R1_R2 B3D5C500 ## srl a1, a1, a2 (funct3=5)
-DEFINE P1_SAR_R1_R1_R2 B3D5C540 ## sra a1, a1, a2 (funct7=0x20)
+
+## SAR rD, rA, rB -> sra xD, xA, xB. Discriminator lives on r4
+## (negative value) — see demo.M1 tranche 1.
+DEFINE P1_SAR_R4_R4_R2 3357C740 ## sra a4, a4, a2 (funct7=0x20)
## ---- Tranche 2: immediate arith ---------------------------------------
## I-type / shift-immediate forms. Shift amount is 6 bits (shamt6) for
-## rv64. SRAI sets bit 30 to distinguish from SRLI.
+## rv64. SRAI sets bit 30 to distinguish from SRLI. Negative ADDI uses
+## sign-extended imm12 directly — no separate "subi" needed.
-DEFINE P1_ADDI_R1_R1_3 93853500 ## addi a1, a1, 3
-DEFINE P1_ANDI_R1_R1_7 93F57500 ## andi a1, a1, 7 (funct3=7)
-DEFINE P1_ORI_R1_R1_1 93E51500 ## ori a1, a1, 1 (funct3=6)
-DEFINE P1_SHLI_R1_R1_0 93950500 ## slli a1, a1, 0 (funct3=1)
+DEFINE P1_ADDI_R1_R1_3 93853500 ## addi a1, a1, 3
+DEFINE P1_ADDI_R1_R1_1 93851500 ## addi a1, a1, 1
+DEFINE P1_ADDI_R1_R1_NEG3 9385D5FF ## addi a1, a1, -3
+DEFINE P1_ADDI_R4_R4_NEG1 1307F7FF ## addi a4, a4, -1
+
+DEFINE P1_SHLI_R1_R1_1 93951500 ## slli a1, a1, 1 (funct3=1)
DEFINE P1_SHRI_R1_R1_1 93D51500 ## srli a1, a1, 1 (funct3=5)
-DEFINE P1_SARI_R1_R1_0 93D50540 ## srai a1, a1, 0 (funct7=0x20)
+
+DEFINE P1_ANDI_R1_R1_6 93F56500 ## andi a1, a1, 6 (funct3=7)
+DEFINE P1_ORI_R1_R1_1 93E51500 ## ori a1, a1, 1 (funct3=6)
+
+DEFINE P1_SARI_R4_R4_1 13571740 ## srai a4, a4, 1 (imm[11:6]=0x10)
## ---- Tranche 3: LA + memory ops ---------------------------------------
@@ -120,12 +115,12 @@ DEFINE P1_SARI_R1_R1_0 93D50540 ## srai a1, a1, 0 (funct7=0x20)
DEFINE P1_LA_R4 170700000367C7006F008000 ## auipc a4,0; lwu a4,12(a4); jal x0,+8
## LOAD (opcode 0x03) / STORE (opcode 0x23) with signed 12-bit offset.
-## For P1: LD=64b, LW=32b zero-ext (= LWU), LB=8b zero-ext (= LBU).
-## ST=SD (64b), SW=32b, SB=8b.
+## For P1: LD=64b, LB=8b zero-ext (= LBU). ST=SD (64b), SB=8b.
+## 32-bit LW/SW dropped from the ISA.
DEFINE P1_ST_R1_R4_0 2330B700 ## sd a1, 0(a4)
DEFINE P1_LD_R1_R4_0 83350700 ## ld a1, 0(a4)
-DEFINE P1_SW_R1_R4_8 2324B700 ## sw a1, 8(a4)
-DEFINE P1_LW_R1_R4_8 83658700 ## lwu a1, 8(a4)
+DEFINE P1_ST_R1_R4_8 2334B700 ## sd a1, 8(a4)
+DEFINE P1_LD_R1_R4_8 83358700 ## ld a1, 8(a4)
DEFINE P1_SB_R1_R4_16 2308B700 ## sb a1, 16(a4)
DEFINE P1_LB_R1_R4_16 83450701 ## lbu a1, 16(a4)
@@ -137,20 +132,17 @@ DEFINE P1_LB_R1_R4_16 83450701 ## lbu a1, 16(a4)
## the unconditional `jalr x0, 0(s2)` when the P1 condition is NOT met.
##
## Fixed offset: the conditional branches below all use imm=8 (skip past the
-## 4-byte JALR on the false path). B-type imm=8 encodes to (imm[12|10:5]=0,
-## imm[4:1|11]=4, so imms spread as funct7 bits 0x00 and rd bits 0x4).
-## Branch instruction byte strings below were assembled and verified by hand.
+## 4-byte JALR on the false path). B-type imm=8 encodes to [11:7]=0x08,
+## [31:25]=0x00.
##
## P1_B is just `jalr x0, 0(s2)` — unconditional jump to address in r7.
-## All comparisons use a2 (P1 r2) vs a3 (P1 r3).
+## BLTU/BGEU (unsigned) and signed BGE dropped from the ISA — see P1.md.
DEFINE P1_B 67000900 ## jalr x0, 0(s2)
DEFINE P1_BEQ_R2_R3_R7 6314D60067000900 ## bne a2,a3,+8 ; jalr x0,0(s2)
-DEFINE P1_BNE_R2_R3_R7 6304D60067000900 ## beq ; jalr
-DEFINE P1_BLT_R2_R3_R7 6354D60067000900 ## bge ; jalr
-DEFINE P1_BGE_R2_R3_R7 6344D60067000900 ## blt ; jalr
-DEFINE P1_BLTU_R2_R3_R7 6374D60067000900 ## bgeu; jalr
-DEFINE P1_BGEU_R2_R3_R7 6364D60067000900 ## bltu; jalr
+DEFINE P1_BNE_R2_R3_R7 6304D60067000900 ## beq a2,a3,+8 ; jalr
+DEFINE P1_BLT_R2_R3_R7 6354D60067000900 ## bge a2,a3,+8 ; jalr
+DEFINE P1_BLT_R4_R2_R7 6354C70067000900 ## bge a4,a2,+8 ; jalr
## ---- Tranche 5: CALL / RET / PROLOGUE / EPILOGUE / TAIL -----------------