commit 29dd4df7b3d09e614c5999efd71cbe9651d2ef02
parent c352715fd06e83f2dc4ebd6094943a4c5626a203
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Mon, 20 Apr 2026 15:35:35 -0700
P1 extend demo.M1 to cover all ops
Diffstat:
| M | demo.M1 | | | 311 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------- |
| M | p1_aarch64.M1 | | | 116 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| M | p1_amd64.M1 | | | 103 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| M | p1_riscv64.M1 | | | 93 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- |
4 files changed, 568 insertions(+), 55 deletions(-)
diff --git a/demo.M1 b/demo.M1
@@ -1,65 +1,259 @@
## P1 broader-ISA demo — portable across aarch64, amd64, riscv64.
##
-## Computes (3 + 4) - 2 = 5 in registers, prints "P1 = 5\n", then
-## exits with status 5. Exercises LI, ADD, SUB, MOV, and SYSCALL.
+## Exercises the P1 ISA in tranches (see P1.md §"ISA"). Each op is
+## applied in a way that preserves the running value r1=5, so a
+## miscoded op produces the wrong exit status.
+##
+## Tranche 1: LI, MOV, SYSCALL, plus every reg-reg-reg arith op:
+## ADD, SUB, AND, OR, XOR, MUL, DIV, REM, SHL, SHR, SAR.
+## Tranche 2: immediate forms
+## ADDI, ANDI, ORI, SHLI, SHRI, SARI
+## (no SUBI/XORI/MULI in P1 — see PLAN.md §"Feature floor").
+## Tranche 3: LA + memory round-trip
+## LA, ST/LD (64b), SW/LW (32b zero-ext), SB/LB (8b zero-ext)
+## Tranche 4: r7-indirect branches
+## B, BEQ, BNE, BLT, BGE, BLTU, BGEU — 7 taken-path subtests + 1 fall-through
+## Tranche 5 (current): CALL / RET / TAIL / PROLOGUE / EPILOGUE
+## Nested CALL (stresses PROLOGUE lr-save) + TAIL (must unwind before B)
+##
+## Constants for the identity chain:
+## r2 = 0 identity for ADD/SUB/XOR/OR/SHL/SHR/SAR
+## r3 = 1 identity for MUL/DIV
+## r5 = 7 identity for AND (5 & 7 = 5) and REM (5 % 7 = 5)
+## r1 holds the running value — starts at 5 and stays 5.
##
## Run-and-verify:
## make PROG=demo ARCH=<arch> run && echo "exit=$?"
## expected stdout: "P1 = 5\n" expected exit: 5
:_start
- ## Compute result = (3 + 4) - 2 = 5, stash in r6.
- ## r6 maps to a callee-saved native reg on every arch (x19 / rbx /
- ## s1), so it survives SYSCALL's argument shuffle.
- P1_LI_R1
- '03000000' # r1 = 3
- P1_LI_R2
- '04000000' # r2 = 4
- P1_ADD_R3_R1_R2 # r3 = r1 + r2 (= 7)
- P1_LI_R4
- '02000000' # r4 = 2
- P1_SUB_R3_R3_R4 # r3 = r3 - r4 (= 5)
- P1_MOV_R6_R3 # r6 = r3 (save across syscalls)
-
- ## write(1, &prefix, 5) — "P1 = "
- P1_LI_R0
- SYS_WRITE
- P1_LI_R1
- '01000000'
- P1_LI_R2
- &prefix
- P1_LI_R3
- '05000000'
- P1_SYSCALL
-
- ## write(1, &digits + r6, 1) — the computed digit ('5')
- P1_LI_R0
- SYS_WRITE
- P1_LI_R1
- '01000000'
- P1_LI_R2
- &digits
- P1_ADD_R2_R2_R6 # r2 = &digits + 5
- P1_LI_R3
- '01000000'
- P1_SYSCALL
-
- ## write(1, &newline, 1)
- P1_LI_R0
- SYS_WRITE
- P1_LI_R1
- '01000000'
- P1_LI_R2
- &newline
- P1_LI_R3
- '01000000'
- P1_SYSCALL
-
- ## exit(r6) — exit status = computed result
- P1_LI_R0
- SYS_EXIT
- P1_MOV_R1_R6
- P1_SYSCALL
+ P1_LI_R1
+ '05000000'
+ P1_LI_R2
+ '00000000'
+ P1_LI_R3
+ '01000000'
+ P1_LI_R5
+ '07000000'
+
+ P1_ADD_R1_R1_R2 # 5 + 0 = 5
+ P1_SUB_R1_R1_R2 # 5 - 0 = 5
+ P1_XOR_R1_R1_R2 # 5 ^ 0 = 5
+ P1_OR_R1_R1_R2 # 5 | 0 = 5
+ P1_AND_R1_R1_R5 # 5 & 7 = 5
+ P1_MUL_R1_R1_R3 # 5 * 1 = 5
+ P1_DIV_R1_R1_R3 # 5 / 1 = 5
+ P1_REM_R1_R1_R5 # 5 % 7 = 5
+ P1_SHL_R1_R1_R2 # 5 << 0 = 5
+ P1_SHR_R1_R1_R2 # 5 >> 0 = 5 (logical)
+ P1_SAR_R1_R1_R2 # 5 >> 0 = 5 (arithmetic)
+
+ ## Tranche 2: immediate forms. Chain 5 -> 8 -> 4 -> 5 -> 5 -> 5 -> 5.
+ P1_ADDI_R1_R1_3 # 5 + 3 = 8
+ P1_SHRI_R1_R1_1 # 8 >> 1 = 4
+ P1_ORI_R1_R1_1 # 4 | 1 = 5
+ P1_ANDI_R1_R1_7 # 5 & 7 = 5
+ P1_SHLI_R1_R1_0 # 5 << 0 = 5
+ P1_SARI_R1_R1_0 # 5 >> 0 = 5 (arithmetic)
+
+ ## Tranche 3: memory round-trip. For each access width, store r1,
+ ## clobber r1 to 0, then reload. A broken ST/LD leaves r1 != 5.
+ P1_LA_R4
+ &scratch
+
+ P1_ST_R1_R4_0 # [scratch+0..8] = r1 (= 5)
+ P1_LI_R1
+ '00000000'
+ P1_LD_R1_R4_0 # r1 = [scratch+0..8] -> 5
+
+ P1_SW_R1_R4_8 # [scratch+8..12] = r1 (low 4 bytes)
+ P1_LI_R1
+ '00000000'
+ P1_LW_R1_R4_8 # r1 = zext [scratch+8..12] -> 5
+
+ P1_SB_R1_R4_16 # [scratch+16] = r1 (low byte)
+ P1_LI_R1
+ '00000000'
+ P1_LB_R1_R4_16 # r1 = zext [scratch+16] -> 5
+
+ ## Tranche 4: branches. r2=0, r3=1 already, r5=7 already.
+ ## Pattern for each taken-path test:
+ ## set r7 = &b4_N_ok ; <branch cond met> ; clobber r1->0 ; :b4_N_ok
+ ## If the branch correctly fires, the clobber is skipped.
+ ##
+ ## B — unconditional, jumps to &b4_1_ok.
+ P1_LI_R7
+ &b4_1_ok
+ P1_B
+ P1_LI_R1
+ '00000000'
+:b4_1_ok
+
+ ## BEQ r2,r3,r7 — 0 == 0? set r3 to 0 first (it starts as 1), branch, then restore.
+ P1_LI_R3
+ '00000000' # r3 = 0 so r2 == r3
+ P1_LI_R7
+ &b4_2_ok
+ P1_BEQ_R2_R3_R7 # 0 == 0, branch taken
+ P1_LI_R1
+ '00000000'
+:b4_2_ok
+ P1_LI_R3
+ '01000000' # restore r3 = 1
+
+ ## BNE r2,r3,r7 — 0 != 1? yes, branch taken.
+ P1_LI_R7
+ &b4_3_ok
+ P1_BNE_R2_R3_R7
+ P1_LI_R1
+ '00000000'
+:b4_3_ok
+
+ ## BLT r2,r3,r7 — 0 < 1 (signed)? yes, branch taken.
+ P1_LI_R7
+ &b4_4_ok
+ P1_BLT_R2_R3_R7
+ P1_LI_R1
+ '00000000'
+:b4_4_ok
+
+ ## BGE r2,r3,r7 — needs r2 >= r3 (signed). Swap: load r2=1, r3=0 for this test.
+ P1_LI_R2
+ '01000000'
+ P1_LI_R3
+ '00000000'
+ P1_LI_R7
+ &b4_5_ok
+ P1_BGE_R2_R3_R7 # 1 >= 0 (signed), branch taken
+ P1_LI_R1
+ '00000000'
+:b4_5_ok
+ P1_LI_R2
+ '00000000' # restore r2 = 0
+ P1_LI_R3
+ '01000000' # restore r3 = 1
+
+ ## BLTU r2,r3,r7 — 0 < 1 (unsigned)? yes, branch taken.
+ P1_LI_R7
+ &b4_6_ok
+ P1_BLTU_R2_R3_R7
+ P1_LI_R1
+ '00000000'
+:b4_6_ok
+
+ ## BGEU r2,r3,r7 — needs r2 >= r3 (unsigned). Swap r2/r3.
+ P1_LI_R2
+ '01000000'
+ P1_LI_R3
+ '00000000'
+ P1_LI_R7
+ &b4_7_ok
+ P1_BGEU_R2_R3_R7 # 1 >= 0 (unsigned), branch taken
+ P1_LI_R1
+ '00000000'
+:b4_7_ok
+ P1_LI_R2
+ '00000000'
+ P1_LI_R3
+ '01000000'
+
+ ## Fall-through test: BEQ with r2=0, r3=1 (unequal). The branch must NOT
+ ## fire. If it does fire (incorrectly), we jump to &b4_ft_bad which
+ ## clobbers r1. Correct behavior falls through the branch, then an
+ ## unconditional B jumps past the clobber to &b4_ft_good.
+ P1_LI_R7
+ &b4_ft_bad
+ P1_BEQ_R2_R3_R7 # 0 == 1? no, fall through
+ P1_LI_R7
+ &b4_ft_good
+ P1_B # skip the bad-path clobber
+:b4_ft_bad
+ P1_LI_R1
+ '00000000'
+:b4_ft_good
+
+ ## Restore r1 = 5 is implicit — we never clobbered it on the happy path.
+
+ ## Tranche 5: CALL / RET / PROLOGUE / EPILOGUE / TAIL.
+ ## fn_identity does its own nested CALL to fn_inner — if PROLOGUE doesn't
+ ## spill lr correctly, the inner CALL clobbers the return-to-_start
+ ## address and we crash or hang. The function bodies live inline below
+ ## the subtests, guarded by a P1_B over them so we don't fall through
+ ## into them after the last subtest.
+ P1_LI_R7
+ &fn_identity
+ P1_CALL # nested-CALL test: returns r1 unchanged
+
+ P1_LI_R7
+ &fn_parent_tail
+ P1_CALL # TAIL test: fn_identity RETs to here
+
+ P1_LI_R7
+ &b5_end
+ P1_B # skip over the inlined function bodies
+
+:fn_inner
+ P1_PROLOGUE
+ P1_EPILOGUE
+ P1_RET
+
+:fn_identity
+ P1_PROLOGUE
+ P1_LI_R7
+ &fn_inner
+ P1_CALL
+ P1_EPILOGUE
+ P1_RET
+
+:fn_parent_tail
+ P1_PROLOGUE
+ P1_LI_R7
+ &fn_identity
+ P1_TAIL
+
+:b5_end
+
+ P1_MOV_R6_R1 # r6 = 5 (callee-saved, survives syscalls)
+
+ ## write(1, &prefix, 5) — "P1 = "
+ P1_LI_R0
+ SYS_WRITE
+ P1_LI_R1
+ '01000000'
+ P1_LI_R2
+ &prefix
+ P1_LI_R3
+ '05000000'
+ P1_SYSCALL
+
+ ## write(1, &digits + r6, 1) — the computed digit ('5')
+ P1_LI_R0
+ SYS_WRITE
+ P1_LI_R1
+ '01000000'
+ P1_LI_R2
+ &digits
+ P1_ADD_R2_R2_R6 # r2 = &digits + 5
+ P1_LI_R3
+ '01000000'
+ P1_SYSCALL
+
+ ## write(1, &newline, 1)
+ P1_LI_R0
+ SYS_WRITE
+ P1_LI_R1
+ '01000000'
+ P1_LI_R2
+ &newline
+ P1_LI_R3
+ '01000000'
+ P1_SYSCALL
+
+ ## exit(r6) — exit status = computed result
+ P1_LI_R0
+ SYS_EXIT
+ P1_MOV_R1_R6
+ P1_SYSCALL
:prefix
"P1 = "
@@ -69,4 +263,13 @@
"
"
+## 32 bytes reserved for tranche 3 memory round-trip. The LOAD segment
+## is RWX (see ELF-<arch>.hex2 ph_flags=7) so we can store into this
+## region at runtime.
+:scratch
+'0000000000000000'
+'0000000000000000'
+'0000000000000000'
+'0000000000000000'
+
:ELF_end
diff --git a/p1_aarch64.M1 b/p1_aarch64.M1
@@ -79,3 +79,119 @@ DEFINE P1_ADD_R2_R2_R6 4200138B ## add x2, x2, x19
## SUB rD, rA, rB -> sub xD, xA, xB
DEFINE P1_SUB_R3_R3_R4 630004CB ## sub x3, x3, x4
+
+
+## ---- Tranche 1: full arith reg-reg-reg ----------------------------------
+## Identity-chain tuples used by demo.M1 to exercise every P1 arith op
+## without branches or memory. r1 is the running accumulator; each op
+## uses an identity partner so the correct output is always r1.
+## aarch64 has 3-operand forms for all of these, so one insn per op.
+## REM has no native op — expands to sdiv+msub through scratch reg x4.
+
+## MOV rD, rA -> orr xD, xzr, xA
+DEFINE P1_MOV_R6_R1 F30301AA ## mov x19, x1
+
+## ADD / SUB / AND / OR / XOR — base opcodes 8B / CB / 8A / AA / CA.
+DEFINE P1_ADD_R1_R1_R2 2100028B ## add x1, x1, x2
+DEFINE P1_SUB_R1_R1_R2 210002CB ## sub x1, x1, x2
+DEFINE P1_AND_R1_R1_R5 2100058A ## and x1, x1, x5
+DEFINE P1_OR_R1_R1_R2 210002AA ## orr x1, x1, x2
+DEFINE P1_XOR_R1_R1_R2 210002CA ## eor x1, x1, x2
+
+## MUL rD, rA, rB -> madd xD, xA, xB, xzr
+DEFINE P1_MUL_R1_R1_R3 217C039B ## mul x1, x1, x3
+
+## DIV rD, rA, rB -> sdiv xD, xA, xB
+DEFINE P1_DIV_R1_R1_R3 210CC39A ## sdiv x1, x1, x3
+
+## REM rD, rA, rB -> sdiv x4, xA, xB ; msub xD, x4, xB, xA
+## Two insns; x4 is a caller-saved scratch (= P1 r4). The demo ensures
+## r4 holds no live value across this op.
+DEFINE P1_REM_R1_R1_R5 240CC59A8184059B ## sdiv x4,x1,x5; msub x1,x4,x5,x1
+
+## SHL / SHR / SAR -> lslv / lsrv / asrv (64-bit variants).
+DEFINE P1_SHL_R1_R1_R2 2120C29A ## lsl x1, x1, x2
+DEFINE P1_SHR_R1_R1_R2 2124C29A ## lsr x1, x1, x2
+DEFINE P1_SAR_R1_R1_R2 2128C29A ## asr x1, x1, x2
+
+
+## ---- Tranche 2: immediate arith ---------------------------------------
+## Tuples exercised by demo.M1: chain 5 -> 8 -> 4 -> 5 -> 5 -> 5 -> 5.
+##
+## Bitwise-imm ops (AND/OR) use aarch64's logical-immediate encoding:
+## (N, immr, imms) encodes a run of consecutive 1s rotated by immr.
+## For 64-bit patterns N=1 and imms = (ones_count - 1). Immediates:
+## imm=1 -> N=1 imms=0 immr=0 (one 1, no rotation)
+## imm=7 -> N=1 imms=2 immr=0 (three consecutive 1s)
+## Shift-imm ops are UBFM/SBFM aliases:
+## LSL #n -> UBFM immr=-n mod 64, imms=63-n
+## LSR #n -> UBFM immr=n, imms=63
+## ASR #n -> SBFM immr=n, imms=63
+
+DEFINE P1_ADDI_R1_R1_3 210C0091 ## add x1, x1, #3
+DEFINE P1_ANDI_R1_R1_7 21084092 ## and x1, x1, #7
+DEFINE P1_ORI_R1_R1_1 210040B2 ## orr x1, x1, #1
+DEFINE P1_SHLI_R1_R1_0 21FC40D3 ## lsl x1, x1, #0 (UBFM #0,#63)
+DEFINE P1_SHRI_R1_R1_1 21FC41D3 ## lsr x1, x1, #1 (UBFM #1,#63)
+DEFINE P1_SARI_R1_R1_0 21FC4093 ## asr x1, x1, #0 (SBFM #0,#63)
+
+
+## ---- Tranche 3: LA + memory ops ---------------------------------------
+## LA is LI in the spike — both load a 4-byte zero-extended literal,
+## which is enough to address the ELF (base 0x00600000 < 2^32). Extending
+## LA to a full 64-bit load is future work (P1.md §"What needs added").
+DEFINE P1_LA_R4 4400001802000014 ## ldr w4, [pc+8] ; b +8 ; <4 bytes>
+
+## Unsigned-offset forms of LDR/STR at 8/4/1-byte widths.
+## imm12 is scaled by access size, so e.g. LDR-w offset 8 uses imm12=2.
+DEFINE P1_ST_R1_R4_0 810000F9 ## str x1, [x4, #0]
+DEFINE P1_LD_R1_R4_0 810040F9 ## ldr x1, [x4, #0]
+DEFINE P1_SW_R1_R4_8 810800B9 ## str w1, [x4, #8]
+DEFINE P1_LW_R1_R4_8 810840B9 ## ldr w1, [x4, #8] (zero-ext)
+DEFINE P1_SB_R1_R4_16 81400039 ## strb w1, [x4, #16]
+DEFINE P1_LB_R1_R4_16 81404039 ## ldrb w1, [x4, #16] (zero-ext)
+
+
+## ---- Tranche 4: branches (r7-indirect, no hex2_word needed) ------------
+## Conditional branches compare ra vs rb, then jump to the address in r7.
+## Pattern: cmp xRa, xRb ; b.<INV> +8 ; br x20
+## - If cond is false we take the +8 branch, skipping over the BR.
+## - If cond is true we execute the BR x20 jump to r7.
+## Caller loads the target into r7 (via P1_LI_R7 &label) beforehand.
+## Unconditional P1_B is just BR x20.
+##
+## Conditions for "skip if NOT cond":
+## BEQ -> B.NE (cond 1) BNE -> B.EQ (cond 0)
+## BLT -> B.GE (cond A) BGE -> B.LT (cond B)
+## BLTU-> B.HS (cond 2) BGEU -> B.LO (cond 3)
+## CMP x2, x3 = SUBS xzr, x2, x3 = 0xEB03005F (Rm=3, Rn=2, Rd=31).
+## BR x20 = 0xD61F0280.
+
+DEFINE P1_B 80021FD6
+DEFINE P1_BEQ_R2_R3_R7 5F0003EB4100005480021FD6 ## cmp ; b.ne +8 ; br x20
+DEFINE P1_BNE_R2_R3_R7 5F0003EB4000005480021FD6 ## cmp ; b.eq +8 ; br x20
+DEFINE P1_BLT_R2_R3_R7 5F0003EB4A00005480021FD6 ## cmp ; b.ge +8 ; br x20
+DEFINE P1_BGE_R2_R3_R7 5F0003EB4B00005480021FD6 ## cmp ; b.lt +8 ; br x20
+DEFINE P1_BLTU_R2_R3_R7 5F0003EB4200005480021FD6 ## cmp ; b.hs +8 ; br x20
+DEFINE P1_BGEU_R2_R3_R7 5F0003EB4300005480021FD6 ## cmp ; b.lo +8 ; br x20
+
+
+## ---- Tranche 5: CALL / RET / PROLOGUE / EPILOGUE / TAIL -----------------
+## CALL is an r7-indirect branch-and-link, same pattern as branches: caller
+## loads &target into r7, then `blr x20` sets x30 = PC+4 and jumps.
+## RET jumps through x30 (the native `ret` pseudo).
+##
+## PROLOGUE / EPILOGUE explicitly spill and reload x30 on aarch64 so that
+## nested CALLs can't clobber the caller's return address. After PROLOGUE,
+## [sp+0] holds the return address, matching P1.md's uniform convention.
+##
+## TAIL = EPILOGUE + B. Calling convention: load &target into r7 first,
+## then TAIL performs the callee's own epilogue (restoring the parent's
+## caller-retaddr into x30) and jumps to r7. When the tail target later
+## RETs, control returns to the parent's caller.
+
+DEFINE P1_PROLOGUE FF4300D1FE0300F9 ## sub sp,#16 ; str x30,[sp]
+DEFINE P1_EPILOGUE FE0340F9FF430091 ## ldr x30,[sp] ; add sp,#16
+DEFINE P1_RET C0035FD6 ## ret (br x30)
+DEFINE P1_CALL 80023FD6 ## blr x20
+DEFINE P1_TAIL FE0340F9FF43009180021FD6 ## epilogue ; br x20
diff --git a/p1_amd64.M1 b/p1_amd64.M1
@@ -69,3 +69,106 @@ DEFINE P1_ADD_R2_R2_R6 4889F64801DE ## mov rsi,rsi ; add rsi,rbx
## SUB rD, rA, rB -> mov rD,rA ; sub rD,rB
DEFINE P1_SUB_R3_R3_R4 4889D24C29D2 ## mov rdx,rdx ; sub rdx,r10
+
+
+## ---- Tranche 1: full arith reg-reg-reg ----------------------------------
+## Identity-chain tuples used by demo.M1. r1=rdi runs through each op.
+## x86-64 oddities:
+## - Shifts need count in cl; three-insn form mov rD,rA; mov rcx,rB; shl.
+## - IDIV needs dividend in rdx:rax. We save rdx to rcx before CQO so
+## both the divisor (when it is rdx) and the caller's r3 survive.
+
+## MOV rD, rA -> mov rD_native, rA_native
+DEFINE P1_MOV_R6_R1 4889FB ## mov rbx, rdi
+
+## ADD / SUB / AND / OR / XOR — 2-insn form, leading mov rdi,rdi kept.
+DEFINE P1_ADD_R1_R1_R2 4889FF4801F7 ## mov rdi,rdi ; add rdi,rsi
+DEFINE P1_SUB_R1_R1_R2 4889FF4829F7 ## mov rdi,rdi ; sub rdi,rsi
+DEFINE P1_XOR_R1_R1_R2 4889FF4831F7 ## mov rdi,rdi ; xor rdi,rsi
+DEFINE P1_OR_R1_R1_R2 4889FF4809F7 ## mov rdi,rdi ; or rdi,rsi
+DEFINE P1_AND_R1_R1_R5 4889FF4C21C7 ## mov rdi,rdi ; and rdi,r8
+
+## MUL rD, rA, rB -> mov rD,rA ; imul rD,rB (IMUL r64,r/m64 = 0F AF)
+DEFINE P1_MUL_R1_R1_R3 4889FF480FAFFA ## mov rdi,rdi ; imul rdi,rdx
+
+## DIV rD, rA, rB -> divisor in rdx (r3). Save rdx to rcx so the
+## divisor survives CQO's clobber of rdx, and so r3 is restored after.
+## mov rcx,rdx ; mov rax,rdi ; cqo ; idiv rcx ; mov rdi,rax ; mov rdx,rcx
+DEFINE P1_DIV_R1_R1_R3 4889D14889F8489948F7F94889C74889CA
+
+## REM rD, rA, rB -> divisor in r8 (r5). CQO still clobbers rdx, so
+## save/restore r3 through rcx.
+## mov rcx,rdx ; mov rax,rdi ; cqo ; idiv r8 ; mov rdi,rdx ; mov rdx,rcx
+DEFINE P1_REM_R1_R1_R5 4889D14889F8489949F7F84889D74889CA
+
+## SHL / SHR / SAR -> mov rD,rA ; mov rcx,rB ; shl/shr/sar rD,cl
+## Opcode D3 /n with REX.W: /4=SHL, /5=SHR, /7=SAR.
+DEFINE P1_SHL_R1_R1_R2 4889FF4889F148D3E7 ## mov rdi,rdi; mov rcx,rsi; shl rdi,cl
+DEFINE P1_SHR_R1_R1_R2 4889FF4889F148D3EF ## mov rdi,rdi; mov rcx,rsi; shr rdi,cl
+DEFINE P1_SAR_R1_R1_R2 4889FF4889F148D3FF ## mov rdi,rdi; mov rcx,rsi; sar rdi,cl
+
+
+## ---- Tranche 2: immediate arith ---------------------------------------
+## mov rdi,rdi ; <op> rdi, imm8 (sign-extended imm8 forms via opcode 83;
+## shifts via C1). /n is the opcode-extension field in ModRM.reg.
+
+DEFINE P1_ADDI_R1_R1_3 4889FF4883C703 ## add rdi, 3 (83 /0 ib)
+DEFINE P1_ANDI_R1_R1_7 4889FF4883E707 ## and rdi, 7 (83 /4 ib)
+DEFINE P1_ORI_R1_R1_1 4889FF4883CF01 ## or rdi, 1 (83 /1 ib)
+DEFINE P1_SHLI_R1_R1_0 4889FF48C1E700 ## shl rdi, 0 (C1 /4 ib)
+DEFINE P1_SHRI_R1_R1_1 4889FF48C1EF01 ## shr rdi, 1 (C1 /5 ib)
+DEFINE P1_SARI_R1_R1_0 4889FF48C1FF00 ## sar rdi, 0 (C1 /7 ib)
+
+
+## ---- Tranche 3: LA + memory ops ---------------------------------------
+## LA is LI in the spike (addresses fit in 32 bits → zero-extends cleanly
+## through the mov-to-r32 form). r4 is r10, so base-reg encoding uses
+## REX.B=1 and ModRM rm=010. No SIB byte needed (r10's low3 bits = 010).
+DEFINE P1_LA_R4 41BA ## mov r10d, imm32
+
+## Plain MOV r/m, r / MOV r, r/m with 8-bit displacement.
+## REX: W=1 for 64-bit moves; B=1 always (r10 is the base register).
+DEFINE P1_ST_R1_R4_0 49893A ## mov [r10], rdi
+DEFINE P1_LD_R1_R4_0 498B3A ## mov rdi, [r10]
+DEFINE P1_SW_R1_R4_8 41897A08 ## mov [r10+8], edi
+DEFINE P1_LW_R1_R4_8 418B7A08 ## mov edi, [r10+8] (zero-ext)
+DEFINE P1_SB_R1_R4_16 41887A10 ## mov [r10+16], dil
+DEFINE P1_LB_R1_R4_16 490FB67A10 ## movzx rdi, byte [r10+16]
+
+
+## ---- Tranche 4: branches (r7-indirect, no hex2_word needed) ------------
+## Same pattern as aarch64: cmp ra,rb ; short native jcc over a jmp-through-r7.
+## If cond is false the native "skip" jcc fires (opposite of the P1 cond) and
+## steps past the 3-byte `jmp r12`, falling through. If cond is true we take
+## the jmp r12 to the address the caller stashed in r7.
+## P1_B is just `jmp r12` unconditionally.
+##
+## CMP rsi, rdx = 48 39 D6 (REX.W, opcode 39 /r, ModRM: 11 010 110).
+## JMP r12 = 41 FF E4 (REX.B, opcode FF /4, ModRM: 11 100 100).
+## jcc rel8 opcodes (skip when NOT cond): JE=74 JNE=75 JL=7C JGE=7D JB=72 JAE=73.
+
+DEFINE P1_B 41FFE4 ## jmp r12
+DEFINE P1_BEQ_R2_R3_R7 4839D6750341FFE4 ## cmp ; jne +3 ; jmp r12
+DEFINE P1_BNE_R2_R3_R7 4839D6740341FFE4 ## cmp ; je +3 ; jmp r12
+DEFINE P1_BLT_R2_R3_R7 4839D67D0341FFE4 ## cmp ; jge +3 ; jmp r12
+DEFINE P1_BGE_R2_R3_R7 4839D67C0341FFE4 ## cmp ; jl +3 ; jmp r12
+DEFINE P1_BLTU_R2_R3_R7 4839D6730341FFE4 ## cmp ; jae +3 ; jmp r12
+DEFINE P1_BGEU_R2_R3_R7 4839D6720341FFE4 ## cmp ; jb +3 ; jmp r12
+
+
+## ---- Tranche 5: CALL / RET / PROLOGUE / EPILOGUE / TAIL -----------------
+## amd64's native CALL already pushes the return address to the stack, and
+## RET pops it. So PROLOGUE/EPILOGUE are no-ops here — a single NOP keeps
+## them non-empty (handy in disasm, and sidesteps any M0 quirks around an
+## empty DEFINE value). On the other two arches PROLOGUE/EPILOGUE do real
+## work spilling/reloading lr.
+##
+## CALL expects the target pre-loaded into r7 (= r12); expands to `call r12`.
+## TAIL = EPILOGUE + unconditional B (= `jmp r12`), so the caller of the
+## tail-calling function receives the return from the tail target directly.
+
+DEFINE P1_PROLOGUE 90 ## nop (retaddr already on stack)
+DEFINE P1_EPILOGUE 90 ## nop
+DEFINE P1_RET C3 ## ret
+DEFINE P1_CALL 41FFD4 ## call r12
+DEFINE P1_TAIL 9041FFE4 ## epilogue(nop) ; jmp r12
diff --git a/p1_riscv64.M1 b/p1_riscv64.M1
@@ -55,7 +55,7 @@ DEFINE P1_LI_R7 170900000369C9006F008000
##
## Unconditional — unused shuffles read caller-saved scratch, and the
## kernel reads only the regs each syscall cares about.
-DEFINE P1_SYSCALL 9308050013850500930506001386060093060700138707009307040073000000
+DEFINE P1_SYSCALL 9308050013850500930506001386060093060700138707009387040073000000
## Linux syscall numbers (riscv64 uses the generic table — same as aarch64).
@@ -76,3 +76,94 @@ DEFINE P1_ADD_R2_R2_R6 33069600 ## add a2, a2, s1
## SUB rD, rA, rB -> sub rD, rA, rB
DEFINE P1_SUB_R3_R3_R4 B386E640 ## sub a3, a3, a4
+
+
+## ---- Tranche 1: full arith reg-reg-reg ----------------------------------
+## Identity-chain tuples used by demo.M1. All one-insn R-type ops.
+## MUL/DIV/REM are M-extension ops (standard on rv64gc).
+
+## MOV rD, rA -> addi rD, rA, 0
+DEFINE P1_MOV_R6_R1 93840500 ## mv s1, a1
+
+## ADD / SUB / AND / OR / XOR — R-type, funct3 picks the op.
+DEFINE P1_ADD_R1_R1_R2 B385C500 ## add a1, a1, a2
+DEFINE P1_SUB_R1_R1_R2 B385C540 ## sub a1, a1, a2 (funct7=0x20)
+DEFINE P1_XOR_R1_R1_R2 B3C5C500 ## xor a1, a1, a2 (funct3=4)
+DEFINE P1_OR_R1_R1_R2 B3E5C500 ## or a1, a1, a2 (funct3=6)
+DEFINE P1_AND_R1_R1_R5 B3F5F500 ## and a1, a1, a5 (funct3=7)
+
+## MUL / DIV / REM — M extension, funct7=1.
+DEFINE P1_MUL_R1_R1_R3 B385D502 ## mul a1, a1, a3
+DEFINE P1_DIV_R1_R1_R3 B3C5D502 ## div a1, a1, a3 (funct3=4)
+DEFINE P1_REM_R1_R1_R5 B3E5F502 ## rem a1, a1, a5 (funct3=6)
+
+## SHL / SHR / SAR -> sll / srl / sra.
+DEFINE P1_SHL_R1_R1_R2 B395C500 ## sll a1, a1, a2 (funct3=1)
+DEFINE P1_SHR_R1_R1_R2 B3D5C500 ## srl a1, a1, a2 (funct3=5)
+DEFINE P1_SAR_R1_R1_R2 B3D5C540 ## sra a1, a1, a2 (funct7=0x20)
+
+
+## ---- Tranche 2: immediate arith ---------------------------------------
+## I-type / shift-immediate forms. Shift amount is 6 bits (shamt6) for
+## rv64. SRAI sets bit 30 to distinguish from SRLI.
+
+DEFINE P1_ADDI_R1_R1_3 93853500 ## addi a1, a1, 3
+DEFINE P1_ANDI_R1_R1_7 93F57500 ## andi a1, a1, 7 (funct3=7)
+DEFINE P1_ORI_R1_R1_1 93E51500 ## ori a1, a1, 1 (funct3=6)
+DEFINE P1_SHLI_R1_R1_0 93950500 ## slli a1, a1, 0 (funct3=1)
+DEFINE P1_SHRI_R1_R1_1 93D51500 ## srli a1, a1, 1 (funct3=5)
+DEFINE P1_SARI_R1_R1_0 93D50540 ## srai a1, a1, 0 (funct7=0x20)
+
+
+## ---- Tranche 3: LA + memory ops ---------------------------------------
+## LA is LI in the spike.
+DEFINE P1_LA_R4 170700000367C7006F008000 ## auipc a4,0; lwu a4,12(a4); jal x0,+8
+
+## LOAD (opcode 0x03) / STORE (opcode 0x23) with signed 12-bit offset.
+## For P1: LD=64b, LW=32b zero-ext (= LWU), LB=8b zero-ext (= LBU).
+## ST=SD (64b), SW=32b, SB=8b.
+DEFINE P1_ST_R1_R4_0 2330B700 ## sd a1, 0(a4)
+DEFINE P1_LD_R1_R4_0 83350700 ## ld a1, 0(a4)
+DEFINE P1_SW_R1_R4_8 2324B700 ## sw a1, 8(a4)
+DEFINE P1_LW_R1_R4_8 83658700 ## lwu a1, 8(a4)
+DEFINE P1_SB_R1_R4_16 2308B700 ## sb a1, 16(a4)
+DEFINE P1_LB_R1_R4_16 83450701 ## lbu a1, 16(a4)
+
+
+## ---- Tranche 4: branches (r7-indirect, no hex2_word needed) ------------
+## RISC-V has B-type conditional branches with a scattered immediate; writing
+## literal byte strings for arbitrary offsets is painful without hex2_word.
+## Sidestep it with the r7-indirect pattern: a fixed-offset branch that skips
+## the unconditional `jalr x0, 0(s2)` when the P1 condition is NOT met.
+##
+## Fixed offset: the conditional branches below all use imm=8 (skip past the
+## 4-byte JALR on the false path). B-type imm=8 encodes as imm[12|10:5]=0
+## (funct7 field = 0x00) and imm[4:1|11]=01000 (rd field = 0x08).
+## Branch instruction byte strings below were assembled and verified by hand.
+##
+## P1_B is just `jalr x0, 0(s2)` — unconditional jump to address in r7.
+## All comparisons use a2 (P1 r2) vs a3 (P1 r3).
+
+DEFINE P1_B 67000900 ## jalr x0, 0(s2)
+DEFINE P1_BEQ_R2_R3_R7 6314D60067000900 ## bne a2,a3,+8 ; jalr x0,0(s2)
+DEFINE P1_BNE_R2_R3_R7 6304D60067000900 ## beq ; jalr
+DEFINE P1_BLT_R2_R3_R7 6354D60067000900 ## bge ; jalr
+DEFINE P1_BGE_R2_R3_R7 6344D60067000900 ## blt ; jalr
+DEFINE P1_BLTU_R2_R3_R7 6374D60067000900 ## bgeu; jalr
+DEFINE P1_BGEU_R2_R3_R7 6364D60067000900 ## bltu; jalr
+
+
+## ---- Tranche 5: CALL / RET / PROLOGUE / EPILOGUE / TAIL -----------------
+## CALL is JALR through s2 (= P1 r7), saving PC+4 into ra. Caller loads
+## &target into r7 beforehand. RET is the canonical `ret` pseudo
+## (JALR x0, 0(ra)).
+##
+## PROLOGUE / EPILOGUE save and restore ra around nested calls; after
+## PROLOGUE, [sp+0] holds the return address (matching P1.md §"return
+## address lives in [sp+0] after prologue"). TAIL = EPILOGUE + B.
+
+DEFINE P1_PROLOGUE 130101FF23301100 ## addi sp,sp,-16 ; sd ra,0(sp)
+DEFINE P1_EPILOGUE 8330010013010101 ## ld ra,0(sp) ; addi sp,sp,16
+DEFINE P1_RET 67800000 ## jalr x0, 0(ra)
+DEFINE P1_CALL E7000900 ## jalr ra, 0(s2)
+DEFINE P1_TAIL 833001001301010167000900 ## epilogue ; jalr x0, 0(s2)