commit 29dd4df7b3d09e614c5999efd71cbe9651d2ef02
parent c352715fd06e83f2dc4ebd6094943a4c5626a203
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Mon, 20 Apr 2026 15:35:35 -0700
P1 extend demo.M1 to cover all ops
Diffstat:
| M | demo.M1 | | | 311 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------- |
| M | p1_aarch64.M1 | | | 116 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| M | p1_amd64.M1 | | | 103 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| M | p1_riscv64.M1 | | | 93 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- |
4 files changed, 568 insertions(+), 55 deletions(-)
diff --git a/demo.M1 b/demo.M1
@@ -1,65 +1,259 @@
## P1 broader-ISA demo — portable across aarch64, amd64, riscv64.
##
-## Computes (3 + 4) - 2 = 5 in registers, prints "P1 = 5\n", then
-## exits with status 5. Exercises LI, ADD, SUB, MOV, and SYSCALL.
+## Exercises the P1 ISA in tranches (see P1.md §"ISA"). Each op is
+## applied in a way that preserves the running value r1=5, so a
+## miscoded op produces the wrong exit status.
+##
+## Tranche 1: LI, MOV, SYSCALL, plus every reg-reg-reg arith op:
+## ADD, SUB, AND, OR, XOR, MUL, DIV, REM, SHL, SHR, SAR.
+## Tranche 2: immediate forms
+## ADDI, ANDI, ORI, SHLI, SHRI, SARI
+## (no SUBI/XORI/MULI in P1 — see PLAN.md §"Feature floor").
+## Tranche 3: LA + memory round-trip
+## LA, ST/LD (64b), SW/LW (32b zero-ext), SB/LB (8b zero-ext)
+## Tranche 4: r7-indirect branches
+## B, BEQ, BNE, BLT, BGE, BLTU, BGEU — 7 taken-path subtests + 1 fall-through
+## Tranche 5 (current): CALL / RET / TAIL / PROLOGUE / EPILOGUE
+## Nested CALL (stresses PROLOGUE lr-save) + TAIL (must unwind before B)
+##
+## Constants for the identity chain:
+## r2 = 0 identity for ADD/SUB/XOR/OR/SHL/SHR/SAR
+## r3 = 1 identity for MUL/DIV
+## r5 = 7 identity for AND (5 & 7 = 5) and REM (5 % 7 = 5)
+## r1 holds the running value — starts at 5 and stays 5.
##
## Run-and-verify:
## make PROG=demo ARCH=<arch> run && echo "exit=$?"
## expected stdout: "P1 = 5\n" expected exit: 5
:_start
- ## Compute result = (3 + 4) - 2 = 5, stash in r6.
- ## r6 maps to a callee-saved native reg on every arch (x19 / rbx /
- ## s1), so it survives SYSCALL's argument shuffle.
- P1_LI_R1
- '03000000' # r1 = 3
- P1_LI_R2
- '04000000' # r2 = 4
- P1_ADD_R3_R1_R2 # r3 = r1 + r2 (= 7)
- P1_LI_R4
- '02000000' # r4 = 2
- P1_SUB_R3_R3_R4 # r3 = r3 - r4 (= 5)
- P1_MOV_R6_R3 # r6 = r3 (save across syscalls)
-
- ## write(1, &prefix, 5) — "P1 = "
- P1_LI_R0
- SYS_WRITE
- P1_LI_R1
- '01000000'
- P1_LI_R2
- &prefix
- P1_LI_R3
- '05000000'
- P1_SYSCALL
-
- ## write(1, &digits + r6, 1) — the computed digit ('5')
- P1_LI_R0
- SYS_WRITE
- P1_LI_R1
- '01000000'
- P1_LI_R2
- &digits
- P1_ADD_R2_R2_R6 # r2 = &digits + 5
- P1_LI_R3
- '01000000'
- P1_SYSCALL
-
- ## write(1, &newline, 1)
- P1_LI_R0
- SYS_WRITE
- P1_LI_R1
- '01000000'
- P1_LI_R2
- &newline
- P1_LI_R3
- '01000000'
- P1_SYSCALL
-
- ## exit(r6) — exit status = computed result
- P1_LI_R0
- SYS_EXIT
- P1_MOV_R1_R6
- P1_SYSCALL
+ P1_LI_R1
+ '05000000'
+ P1_LI_R2
+ '00000000'
+ P1_LI_R3
+ '01000000'
+ P1_LI_R5
+ '07000000'
+
+ P1_ADD_R1_R1_R2 # 5 + 0 = 5
+ P1_SUB_R1_R1_R2 # 5 - 0 = 5
+ P1_XOR_R1_R1_R2 # 5 ^ 0 = 5
+ P1_OR_R1_R1_R2 # 5 | 0 = 5
+ P1_AND_R1_R1_R5 # 5 & 7 = 5
+ P1_MUL_R1_R1_R3 # 5 * 1 = 5
+ P1_DIV_R1_R1_R3 # 5 / 1 = 5
+ P1_REM_R1_R1_R5 # 5 % 7 = 5
+ P1_SHL_R1_R1_R2 # 5 << 0 = 5
+ P1_SHR_R1_R1_R2 # 5 >> 0 = 5 (logical)
+ P1_SAR_R1_R1_R2 # 5 >> 0 = 5 (arithmetic)
+
+ ## Tranche 2: immediate forms. Chain 5 -> 8 -> 4 -> 5 -> 5 -> 5 -> 5.
+ P1_ADDI_R1_R1_3 # 5 + 3 = 8
+ P1_SHRI_R1_R1_1 # 8 >> 1 = 4
+ P1_ORI_R1_R1_1 # 4 | 1 = 5
+ P1_ANDI_R1_R1_7 # 5 & 7 = 5
+ P1_SHLI_R1_R1_0 # 5 << 0 = 5
+ P1_SARI_R1_R1_0 # 5 >> 0 = 5 (arithmetic)
+
+ ## Tranche 3: memory round-trip. For each access width, store r1,
+ ## clobber r1 to 0, then reload. A broken ST/LD leaves r1 != 5.
+ P1_LA_R4
+ &scratch
+
+ P1_ST_R1_R4_0 # [scratch+0..8] = r1 (= 5)
+ P1_LI_R1
+ '00000000'
+ P1_LD_R1_R4_0 # r1 = [scratch+0..8] -> 5
+
+ P1_SW_R1_R4_8 # [scratch+8..12] = r1 (low 4 bytes)
+ P1_LI_R1
+ '00000000'
+ P1_LW_R1_R4_8 # r1 = zext [scratch+8..12] -> 5
+
+ P1_SB_R1_R4_16 # [scratch+16] = r1 (low byte)
+ P1_LI_R1
+ '00000000'
+ P1_LB_R1_R4_16 # r1 = zext [scratch+16] -> 5
+
+ ## Tranche 4: branches. r2=0, r3=1 already, r5=7 already.
+ ## Pattern for each taken-path test:
+ ## set r7 = &b4_N_ok ; <branch cond met> ; clobber r1->0 ; :b4_N_ok
+ ## If the branch correctly fires, the clobber is skipped.
+ ##
+ ## B — unconditional, jumps to &b4_1_ok.
+ P1_LI_R7
+ &b4_1_ok
+ P1_B
+ P1_LI_R1
+ '00000000'
+:b4_1_ok
+
+ ## BEQ r2,r3,r7 — 0 == 0? set r3 to 0 first (it starts as 1), branch, then restore.
+ P1_LI_R3
+ '00000000' # r3 = 0 so r2 == r3
+ P1_LI_R7
+ &b4_2_ok
+ P1_BEQ_R2_R3_R7 # 0 == 0, branch taken
+ P1_LI_R1
+ '00000000'
+:b4_2_ok
+ P1_LI_R3
+ '01000000' # restore r3 = 1
+
+ ## BNE r2,r3,r7 — 0 != 1? yes, branch taken.
+ P1_LI_R7
+ &b4_3_ok
+ P1_BNE_R2_R3_R7
+ P1_LI_R1
+ '00000000'
+:b4_3_ok
+
+ ## BLT r2,r3,r7 — 0 < 1 (signed)? yes, branch taken.
+ P1_LI_R7
+ &b4_4_ok
+ P1_BLT_R2_R3_R7
+ P1_LI_R1
+ '00000000'
+:b4_4_ok
+
+ ## BGE r2,r3,r7 — needs r2 >= r3 (signed). Swap: load r2=1, r3=0 for this test.
+ P1_LI_R2
+ '01000000'
+ P1_LI_R3
+ '00000000'
+ P1_LI_R7
+ &b4_5_ok
+ P1_BGE_R2_R3_R7 # 1 >= 0 (signed), branch taken
+ P1_LI_R1
+ '00000000'
+:b4_5_ok
+ P1_LI_R2
+ '00000000' # restore r2 = 0
+ P1_LI_R3
+ '01000000' # restore r3 = 1
+
+ ## BLTU r2,r3,r7 — 0 < 1 (unsigned)? yes, branch taken.
+ P1_LI_R7
+ &b4_6_ok
+ P1_BLTU_R2_R3_R7
+ P1_LI_R1
+ '00000000'
+:b4_6_ok
+
+ ## BGEU r2,r3,r7 — needs r2 >= r3 (unsigned). Swap r2/r3.
+ P1_LI_R2
+ '01000000'
+ P1_LI_R3
+ '00000000'
+ P1_LI_R7
+ &b4_7_ok
+ P1_BGEU_R2_R3_R7 # 1 >= 0 (unsigned), branch taken
+ P1_LI_R1
+ '00000000'
+:b4_7_ok
+ P1_LI_R2
+ '00000000'
+ P1_LI_R3
+ '01000000'
+
+ ## Fall-through test: BEQ with r2=0, r3=1 (unequal). The branch must NOT
+ ## fire. If it does fire (incorrectly), we jump to &b4_ft_bad which
+ ## clobbers r1. Correct behavior falls through the branch, then an
+ ## unconditional B jumps past the clobber to &b4_ft_good.
+ P1_LI_R7
+ &b4_ft_bad
+ P1_BEQ_R2_R3_R7 # 0 == 1? no, fall through
+ P1_LI_R7
+ &b4_ft_good
+ P1_B # skip the bad-path clobber
+:b4_ft_bad
+ P1_LI_R1
+ '00000000'
+:b4_ft_good
+
+ ## Restore r1 = 5 is implicit — we never clobbered it on the happy path.
+
+ ## Tranche 5: CALL / RET / PROLOGUE / EPILOGUE / TAIL.
+ ## fn_identity does its own nested CALL to fn_inner — if PROLOGUE doesn't
+ ## spill lr correctly, the inner CALL clobbers the return-to-_start
+ ## address and we crash or hang. The function bodies live inline below
+ ## the subtests, guarded by a P1_B over them so we don't fall through
+ ## into them after the last subtest.
+ P1_LI_R7
+ &fn_identity
+ P1_CALL # nested-CALL test: returns r1 unchanged
+
+ P1_LI_R7
+ &fn_parent_tail
+ P1_CALL # TAIL test: fn_identity RETs to here
+
+ P1_LI_R7
+ &b5_end
+ P1_B # skip over the inlined function bodies
+
+:fn_inner
+ P1_PROLOGUE
+ P1_EPILOGUE
+ P1_RET
+
+:fn_identity
+ P1_PROLOGUE
+ P1_LI_R7
+ &fn_inner
+ P1_CALL
+ P1_EPILOGUE
+ P1_RET
+
+:fn_parent_tail
+ P1_PROLOGUE
+ P1_LI_R7
+ &fn_identity
+ P1_TAIL
+
+:b5_end
+
+ P1_MOV_R6_R1 # r6 = 5 (callee-saved, survives syscalls)
+
+ ## write(1, &prefix, 5) — "P1 = "
+ P1_LI_R0
+ SYS_WRITE
+ P1_LI_R1
+ '01000000'
+ P1_LI_R2
+ &prefix
+ P1_LI_R3
+ '05000000'
+ P1_SYSCALL
+
+ ## write(1, &digits + r6, 1) — the computed digit ('5')
+ P1_LI_R0
+ SYS_WRITE
+ P1_LI_R1
+ '01000000'
+ P1_LI_R2
+ &digits
+ P1_ADD_R2_R2_R6 # r2 = &digits + 5
+ P1_LI_R3
+ '01000000'
+ P1_SYSCALL
+
+ ## write(1, &newline, 1)
+ P1_LI_R0
+ SYS_WRITE
+ P1_LI_R1
+ '01000000'
+ P1_LI_R2
+ &newline
+ P1_LI_R3
+ '01000000'
+ P1_SYSCALL
+
+ ## exit(r6) — exit status = computed result
+ P1_LI_R0
+ SYS_EXIT
+ P1_MOV_R1_R6
+ P1_SYSCALL
:prefix
"P1 = "
@@ -69,4 +263,13 @@
"
"
+## 32 bytes reserved for tranche 3 memory round-trip. The LOAD segment
+## is RWX (see ELF-<arch>.hex2 ph_flags=7) so we can store into this
+## region at runtime.
+:scratch
+'0000000000000000'
+'0000000000000000'
+'0000000000000000'
+'0000000000000000'
+
:ELF_end
diff --git a/p1_aarch64.M1 b/p1_aarch64.M1
@@ -79,3 +79,119 @@ DEFINE P1_ADD_R2_R2_R6 4200138B ## add x2, x2, x19
## SUB rD, rA, rB -> sub xD, xA, xB
DEFINE P1_SUB_R3_R3_R4 630004CB ## sub x3, x3, x4
+
+
+## ---- Tranche 1: full arith reg-reg-reg ----------------------------------
+## Identity-chain tuples used by demo.M1 to exercise every P1 arith op
+## without branches or memory. r1 is the running accumulator; each op
+## uses an identity partner so the correct output is always r1.
+## aarch64 has 3-operand forms for all of these, so one insn per op.
+## REM has no native op — expands to sdiv+msub through scratch reg x4.
+
+## MOV rD, rA -> orr xD, xzr, xA
+DEFINE P1_MOV_R6_R1 F30301AA ## mov x19, x1
+
+## ADD / SUB / AND / OR / XOR — base opcodes 8B / CB / 8A / AA / CA.
+DEFINE P1_ADD_R1_R1_R2 2100028B ## add x1, x1, x2
+DEFINE P1_SUB_R1_R1_R2 210002CB ## sub x1, x1, x2
+DEFINE P1_AND_R1_R1_R5 2100058A ## and x1, x1, x5
+DEFINE P1_OR_R1_R1_R2 210002AA ## orr x1, x1, x2
+DEFINE P1_XOR_R1_R1_R2 210002CA ## eor x1, x1, x2
+
+## MUL rD, rA, rB -> madd xD, xA, xB, xzr
+DEFINE P1_MUL_R1_R1_R3 217C039B ## mul x1, x1, x3
+
+## DIV rD, rA, rB -> sdiv xD, xA, xB
+DEFINE P1_DIV_R1_R1_R3 210CC39A ## sdiv x1, x1, x3
+
+## REM rD, rA, rB -> sdiv x4, xA, xB ; msub xD, x4, xB, xA
+## Two insns; x4 is a caller-saved scratch (= P1 r4). The demo ensures
+## r4 holds no live value across this op.
+DEFINE P1_REM_R1_R1_R5 240CC59A8184059B ## sdiv x4,x1,x5; msub x1,x4,x5,x1
+
+## SHL / SHR / SAR -> lslv / lsrv / asrv (64-bit variants).
+DEFINE P1_SHL_R1_R1_R2 2120C29A ## lsl x1, x1, x2
+DEFINE P1_SHR_R1_R1_R2 2124C29A ## lsr x1, x1, x2
+DEFINE P1_SAR_R1_R1_R2 2128C29A ## asr x1, x1, x2
+
+
+## ---- Tranche 2: immediate arith ---------------------------------------
+## Tuples exercised by demo.M1: chain 5 -> 8 -> 4 -> 5 -> 5 -> 5 -> 5.
+##
+## Bitwise-imm ops (AND/OR) use aarch64's logical-immediate encoding:
+## (N, immr, imms) encodes a run of consecutive 1s rotated by immr.
+## For 64-bit patterns N=1 and imms = (ones_count - 1). Immediates:
+## imm=1 -> N=1 imms=0 immr=0 (one 1, no rotation)
+## imm=7 -> N=1 imms=2 immr=0 (three consecutive 1s)
+## Shift-imm ops are UBFM/SBFM aliases:
+## LSL #n -> UBFM immr=-n mod 64, imms=63-n
+## LSR #n -> UBFM immr=n, imms=63
+## ASR #n -> SBFM immr=n, imms=63
+
+DEFINE P1_ADDI_R1_R1_3 210C0091 ## add x1, x1, #3
+DEFINE P1_ANDI_R1_R1_7 21084092 ## and x1, x1, #7
+DEFINE P1_ORI_R1_R1_1 210040B2 ## orr x1, x1, #1
+DEFINE P1_SHLI_R1_R1_0 21FC40D3 ## lsl x1, x1, #0 (UBFM #0,#63)
+DEFINE P1_SHRI_R1_R1_1 21FC41D3 ## lsr x1, x1, #1 (UBFM #1,#63)
+DEFINE P1_SARI_R1_R1_0 21FC4093 ## asr x1, x1, #0 (SBFM #0,#63)
+
+
+## ---- Tranche 3: LA + memory ops ---------------------------------------
+## LA is LI in the spike — both load a 4-byte zero-extended literal,
+## which is enough to address the ELF (base 0x00600000 < 2^32). Extending
+## LA to a full 64-bit load is future work (P1.md §"What needs added").
+DEFINE P1_LA_R4 4400001802000014 ## ldr w4, [pc+8] ; b +8 ; <4 bytes>
+
+## Unsigned-offset forms of LDR/STR at 8/4/1-byte widths.
+## imm12 is scaled by access size, so e.g. LDR-w offset 8 uses imm12=2.
+DEFINE P1_ST_R1_R4_0 810000F9 ## str x1, [x4, #0]
+DEFINE P1_LD_R1_R4_0 810040F9 ## ldr x1, [x4, #0]
+DEFINE P1_SW_R1_R4_8 810800B9 ## str w1, [x4, #8]
+DEFINE P1_LW_R1_R4_8 810840B9 ## ldr w1, [x4, #8] (zero-ext)
+DEFINE P1_SB_R1_R4_16 81400039 ## strb w1, [x4, #16]
+DEFINE P1_LB_R1_R4_16 81404039 ## ldrb w1, [x4, #16] (zero-ext)
+
+
+## ---- Tranche 4: branches (r7-indirect, no hex2_word needed) ------------
+## Conditional branches compare ra vs rb, then jump to the address in r7.
+## Pattern: cmp xRa, xRb ; b.<INV> +8 ; br x20
+## - If cond is false we take the +8 branch, skipping over the BR.
+## - If cond is true we execute the BR x20 jump to r7.
+## Caller loads the target into r7 (via P1_LI_R7 &label) beforehand.
+## Unconditional P1_B is just BR x20.
+##
+## Conditions for "skip if NOT cond":
+## BEQ -> B.NE (cond 1) BNE -> B.EQ (cond 0)
+## BLT -> B.GE (cond A) BGE -> B.LT (cond B)
+## BLTU-> B.HS (cond 2) BGEU -> B.LO (cond 3)
+## CMP x2, x3 = SUBS xzr, x2, x3 = 0xEB03005F (Rm=3, Rn=2, Rd=31).
+## BR x20 = 0xD61F0280.
+
+DEFINE P1_B 80021FD6
+DEFINE P1_BEQ_R2_R3_R7 5F0003EB4100005480021FD6 ## cmp ; b.ne +8 ; br x20
+DEFINE P1_BNE_R2_R3_R7 5F0003EB4000005480021FD6 ## cmp ; b.eq +8 ; br x20
+DEFINE P1_BLT_R2_R3_R7 5F0003EB4A00005480021FD6 ## cmp ; b.ge +8 ; br x20
+DEFINE P1_BGE_R2_R3_R7 5F0003EB4B00005480021FD6 ## cmp ; b.lt +8 ; br x20
+DEFINE P1_BLTU_R2_R3_R7 5F0003EB4200005480021FD6 ## cmp ; b.hs +8 ; br x20
+DEFINE P1_BGEU_R2_R3_R7 5F0003EB4300005480021FD6 ## cmp ; b.lo +8 ; br x20
+
+
+## ---- Tranche 5: CALL / RET / PROLOGUE / EPILOGUE / TAIL -----------------
+## CALL is an r7-indirect branch-and-link, same pattern as branches: caller
+## loads &target into r7, then `blr x20` sets x30 = PC+4 and jumps.
+## RET jumps through x30 (the native `ret` pseudo).
+##
+## PROLOGUE / EPILOGUE explicitly spill and reload x30 on aarch64 so that
+## nested CALLs can't clobber the caller's return address. After PROLOGUE,
+## [sp+0] holds the return address, matching P1.md's uniform convention.
+##
+## TAIL = EPILOGUE + B. Calling convention: load &target into r7 first,
+## then TAIL performs the callee's own epilogue (restoring the parent's
+## caller-retaddr into x30) and jumps to r7. When the tail target later
+## RETs, control returns to the parent's caller.
+
+DEFINE P1_PROLOGUE FF4300D1FE0300F9 ## sub sp,#16 ; str x30,[sp]
+DEFINE P1_EPILOGUE FE0340F9FF430091 ## ldr x30,[sp] ; add sp,#16
+DEFINE P1_RET C0035FD6 ## ret (br x30)
+DEFINE P1_CALL 80023FD6 ## blr x20
+DEFINE P1_TAIL FE0340F9FF43009180021FD6 ## epilogue ; br x20
diff --git a/p1_amd64.M1 b/p1_amd64.M1
@@ -69,3 +69,106 @@ DEFINE P1_ADD_R2_R2_R6 4889F64801DE ## mov rsi,rsi ; add rsi,rbx
## SUB rD, rA, rB -> mov rD,rA ; sub rD,rB
DEFINE P1_SUB_R3_R3_R4 4889D24C29D2 ## mov rdx,rdx ; sub rdx,r10
+
+
+## ---- Tranche 1: full arith reg-reg-reg ----------------------------------
+## Identity-chain tuples used by demo.M1. r1=rdi runs through each op.
+## x86-64 oddities:
+## - Shifts need count in cl; three-insn form mov rD,rA; mov rcx,rB; shl.
+## - IDIV needs dividend in rdx:rax. We save rdx to rcx before CQO so
+## both the divisor (when it is rdx) and the caller's r3 survive.
+
+## MOV rD, rA -> mov rD_native, rA_native
+DEFINE P1_MOV_R6_R1 4889FB ## mov rbx, rdi
+
+## ADD / SUB / AND / OR / XOR — 2-insn form, leading mov rdi,rdi kept.
+DEFINE P1_ADD_R1_R1_R2 4889FF4801F7 ## mov rdi,rdi ; add rdi,rsi
+DEFINE P1_SUB_R1_R1_R2 4889FF4829F7 ## mov rdi,rdi ; sub rdi,rsi
+DEFINE P1_XOR_R1_R1_R2 4889FF4831F7 ## mov rdi,rdi ; xor rdi,rsi
+DEFINE P1_OR_R1_R1_R2 4889FF4809F7 ## mov rdi,rdi ; or rdi,rsi
+DEFINE P1_AND_R1_R1_R5 4889FF4C21C7 ## mov rdi,rdi ; and rdi,r8
+
+## MUL rD, rA, rB -> mov rD,rA ; imul rD,rB (IMUL r64,r/m64 = 0F AF)
+DEFINE P1_MUL_R1_R1_R3 4889FF480FAFFA ## mov rdi,rdi ; imul rdi,rdx
+
+## DIV rD, rA, rB -> divisor in rdx (r3). Save rdx to rcx so the
+## divisor survives CQO's clobber of rdx, and so r3 is restored after.
+## mov rcx,rdx ; mov rax,rdi ; cqo ; idiv rcx ; mov rdi,rax ; mov rdx,rcx
+DEFINE P1_DIV_R1_R1_R3 4889D14889F8489948F7F94889C74889CA
+
+## REM rD, rA, rB -> divisor in r8 (r5). CQO still clobbers rdx, so
+## save/restore r3 through rcx.
+## mov rcx,rdx ; mov rax,rdi ; cqo ; idiv r8 ; mov rdi,rdx ; mov rdx,rcx
+DEFINE P1_REM_R1_R1_R5 4889D14889F8489949F7F84889D74889CA
+
+## SHL / SHR / SAR -> mov rD,rA ; mov rcx,rB ; shl/shr/sar rD,cl
+## Opcode D3 /n with REX.W: /4=SHL, /5=SHR, /7=SAR.
+DEFINE P1_SHL_R1_R1_R2 4889FF4889F148D3E7 ## mov rdi,rdi; mov rcx,rsi; shl rdi,cl
+DEFINE P1_SHR_R1_R1_R2 4889FF4889F148D3EF ## mov rdi,rdi; mov rcx,rsi; shr rdi,cl
+DEFINE P1_SAR_R1_R1_R2 4889FF4889F148D3FF ## mov rdi,rdi; mov rcx,rsi; sar rdi,cl
+
+
+## ---- Tranche 2: immediate arith ---------------------------------------
+## mov rdi,rdi ; <op> rdi, imm8 (sign-extended imm8 forms via opcode 83;
+## shifts via C1). /n is the opcode-extension field in ModRM.reg.
+
+DEFINE P1_ADDI_R1_R1_3 4889FF4883C703 ## add rdi, 3 (83 /0 ib)
+DEFINE P1_ANDI_R1_R1_7 4889FF4883E707 ## and rdi, 7 (83 /4 ib)
+DEFINE P1_ORI_R1_R1_1 4889FF4883CF01 ## or rdi, 1 (83 /1 ib)
+DEFINE P1_SHLI_R1_R1_0 4889FF48C1E700 ## shl rdi, 0 (C1 /4 ib)
+DEFINE P1_SHRI_R1_R1_1 4889FF48C1EF01 ## shr rdi, 1 (C1 /5 ib)
+DEFINE P1_SARI_R1_R1_0 4889FF48C1FF00 ## sar rdi, 0 (C1 /7 ib)
+
+
+## ---- Tranche 3: LA + memory ops ---------------------------------------
+## LA is LI in the spike (addresses fit in 32 bits → zero-extends cleanly
+## through the mov-to-r32 form). r4 is r10, so base-reg encoding uses
+## REX.B=1 and ModRM rm=010. No SIB byte needed (r10's low3 bits = 010).
+DEFINE P1_LA_R4 41BA ## mov r10d, imm32
+
+## Plain MOV r/m, r / MOV r, r/m with 8-bit displacement.
+## REX: W=1 for 64-bit moves; B=1 always (r10 is the base register).
+DEFINE P1_ST_R1_R4_0 49893A ## mov [r10], rdi
+DEFINE P1_LD_R1_R4_0 498B3A ## mov rdi, [r10]
+DEFINE P1_SW_R1_R4_8 41897A08 ## mov [r10+8], edi
+DEFINE P1_LW_R1_R4_8 418B7A08 ## mov edi, [r10+8] (zero-ext)
+DEFINE P1_SB_R1_R4_16 41887A10 ## mov [r10+16], dil
+DEFINE P1_LB_R1_R4_16 490FB67A10 ## movzx rdi, byte [r10+16]
+
+
+## ---- Tranche 4: branches (r7-indirect, no hex2_word needed) ------------
+## Same pattern as aarch64: cmp ra,rb ; short native jcc over a jmp-through-r7.
+## If cond is false the native "skip" jcc fires (opposite of the P1 cond) and
+## steps past the 3-byte `jmp r12`, falling through. If cond is true we take
+## the jmp r12 to the address the caller stashed in r7.
+## P1_B is just `jmp r12` unconditionally.
+##
+## CMP rsi, rdx = 48 39 D6 (REX.W, opcode 39 /r, ModRM: 11 010 110).
+## JMP r12 = 41 FF E4 (REX.B, opcode FF /4, ModRM: 11 100 100).
+## jcc rel8 opcodes (skip when NOT cond): JE=74 JNE=75 JL=7C JGE=7D JB=72 JAE=73.
+
+DEFINE P1_B 41FFE4 ## jmp r12
+DEFINE P1_BEQ_R2_R3_R7 4839D6750341FFE4 ## cmp ; jne +3 ; jmp r12
+DEFINE P1_BNE_R2_R3_R7 4839D6740341FFE4 ## cmp ; je +3 ; jmp r12
+DEFINE P1_BLT_R2_R3_R7 4839D67D0341FFE4 ## cmp ; jge +3 ; jmp r12
+DEFINE P1_BGE_R2_R3_R7 4839D67C0341FFE4 ## cmp ; jl +3 ; jmp r12
+DEFINE P1_BLTU_R2_R3_R7 4839D6730341FFE4 ## cmp ; jae +3 ; jmp r12
+DEFINE P1_BGEU_R2_R3_R7 4839D6720341FFE4 ## cmp ; jb +3 ; jmp r12
+
+
+## ---- Tranche 5: CALL / RET / PROLOGUE / EPILOGUE / TAIL -----------------
+## amd64's native CALL already pushes the return address to the stack, and
+## RET pops it. So PROLOGUE/EPILOGUE are no-ops here — a single NOP keeps
+## them non-empty (handy in disasm, and sidesteps any M0 quirks around an
+## empty DEFINE value). On the other two arches PROLOGUE/EPILOGUE do real
+## work spilling/reloading lr.
+##
+## CALL expects the target pre-loaded into r7 (= r12); expands to `call r12`.
+## TAIL = EPILOGUE + unconditional B (= `jmp r12`), so the caller of the
+## tail-calling function receives the return from the tail target directly.
+
+DEFINE P1_PROLOGUE 90 ## nop (retaddr already on stack)
+DEFINE P1_EPILOGUE 90 ## nop
+DEFINE P1_RET C3 ## ret
+DEFINE P1_CALL 41FFD4 ## call r12
+DEFINE P1_TAIL 9041FFE4 ## epilogue(nop) ; jmp r12
diff --git a/p1_riscv64.M1 b/p1_riscv64.M1
@@ -55,7 +55,7 @@ DEFINE P1_LI_R7 170900000369C9006F008000
##
## Unconditional — unused shuffles read caller-saved scratch, and the
## kernel reads only the regs each syscall cares about.
-DEFINE P1_SYSCALL 9308050013850500930506001386060093060700138707009307040073000000
+DEFINE P1_SYSCALL 9308050013850500930506001386060093060700138707009387040073000000
## Linux syscall numbers (riscv64 uses the generic table — same as aarch64).
@@ -76,3 +76,94 @@ DEFINE P1_ADD_R2_R2_R6 33069600 ## add a2, a2, s1
## SUB rD, rA, rB -> sub rD, rA, rB
DEFINE P1_SUB_R3_R3_R4 B386E640 ## sub a3, a3, a4
+
+
+## ---- Tranche 1: full arith reg-reg-reg ----------------------------------
+## Identity-chain tuples used by demo.M1. All one-insn R-type ops.
+## MUL/DIV/REM are M-extension ops (standard on rv64gc).
+
+## MOV rD, rA -> addi rD, rA, 0
+DEFINE P1_MOV_R6_R1 93840500 ## mv s1, a1
+
+## ADD / SUB / AND / OR / XOR — R-type, funct3 picks the op.
+DEFINE P1_ADD_R1_R1_R2 B385C500 ## add a1, a1, a2
+DEFINE P1_SUB_R1_R1_R2 B385C540 ## sub a1, a1, a2 (funct7=0x20)
+DEFINE P1_XOR_R1_R1_R2 B3C5C500 ## xor a1, a1, a2 (funct3=4)
+DEFINE P1_OR_R1_R1_R2 B3E5C500 ## or a1, a1, a2 (funct3=6)
+DEFINE P1_AND_R1_R1_R5 B3F5F500 ## and a1, a1, a5 (funct3=7)
+
+## MUL / DIV / REM — M extension, funct7=1.
+DEFINE P1_MUL_R1_R1_R3 B385D502 ## mul a1, a1, a3
+DEFINE P1_DIV_R1_R1_R3 B3C5D502 ## div a1, a1, a3 (funct3=4)
+DEFINE P1_REM_R1_R1_R5 B3E5F502 ## rem a1, a1, a5 (funct3=6)
+
+## SHL / SHR / SAR -> sll / srl / sra.
+DEFINE P1_SHL_R1_R1_R2 B395C500 ## sll a1, a1, a2 (funct3=1)
+DEFINE P1_SHR_R1_R1_R2 B3D5C500 ## srl a1, a1, a2 (funct3=5)
+DEFINE P1_SAR_R1_R1_R2 B3D5C540 ## sra a1, a1, a2 (funct7=0x20)
+
+
+## ---- Tranche 2: immediate arith ---------------------------------------
+## I-type / shift-immediate forms. Shift amount is 6 bits (shamt6) for
+## rv64. SRAI sets bit 30 to distinguish from SRLI.
+
+DEFINE P1_ADDI_R1_R1_3 93853500 ## addi a1, a1, 3
+DEFINE P1_ANDI_R1_R1_7 93F57500 ## andi a1, a1, 7 (funct3=7)
+DEFINE P1_ORI_R1_R1_1 93E51500 ## ori a1, a1, 1 (funct3=6)
+DEFINE P1_SHLI_R1_R1_0 93950500 ## slli a1, a1, 0 (funct3=1)
+DEFINE P1_SHRI_R1_R1_1 93D51500 ## srli a1, a1, 1 (funct3=5)
+DEFINE P1_SARI_R1_R1_0 93D50540 ## srai a1, a1, 0 (funct7=0x20)
+
+
+## ---- Tranche 3: LA + memory ops ---------------------------------------
+## LA is LI in the spike.
+DEFINE P1_LA_R4 170700000367C7006F008000 ## auipc a4,0; lwu a4,12(a4); jal x0,+8
+
+## LOAD (opcode 0x03) / STORE (opcode 0x23) with signed 12-bit offset.
+## For P1: LD=64b, LW=32b zero-ext (= LWU), LB=8b zero-ext (= LBU).
+## ST=SD (64b), SW=32b, SB=8b.
+DEFINE P1_ST_R1_R4_0 2330B700 ## sd a1, 0(a4)
+DEFINE P1_LD_R1_R4_0 83350700 ## ld a1, 0(a4)
+DEFINE P1_SW_R1_R4_8 2324B700 ## sw a1, 8(a4)
+DEFINE P1_LW_R1_R4_8 83658700 ## lwu a1, 8(a4)
+DEFINE P1_SB_R1_R4_16 2308B700 ## sb a1, 16(a4)
+DEFINE P1_LB_R1_R4_16 83450701 ## lbu a1, 16(a4)
+
+
+## ---- Tranche 4: branches (r7-indirect, no hex2_word needed) ------------
+## RISC-V has B-type conditional branches with a scattered immediate; writing
+## literal byte strings for arbitrary offsets is painful without hex2_word.
+## Sidestep it with the r7-indirect pattern: a fixed-offset branch that skips
+## the unconditional `jalr x0, 0(s2)` when the P1 condition is NOT met.
+##
+## Fixed offset: the conditional branches below all use imm=8 (skip past the
+## 4-byte JALR on the false path). B-type imm=8 encodes as imm[12|10:5]=0
+## (funct7 field = 0x00) and imm[4:1|11]=01000 (rd field = 0x08).
+## Branch instruction byte strings below were assembled and verified by hand.
+##
+## P1_B is just `jalr x0, 0(s2)` — unconditional jump to address in r7.
+## All comparisons use a2 (P1 r2) vs a3 (P1 r3).
+
+DEFINE P1_B 67000900 ## jalr x0, 0(s2)
+DEFINE P1_BEQ_R2_R3_R7 6314D60067000900 ## bne a2,a3,+8 ; jalr x0,0(s2)
+DEFINE P1_BNE_R2_R3_R7 6304D60067000900 ## beq ; jalr
+DEFINE P1_BLT_R2_R3_R7 6354D60067000900 ## bge ; jalr
+DEFINE P1_BGE_R2_R3_R7 6344D60067000900 ## blt ; jalr
+DEFINE P1_BLTU_R2_R3_R7 6374D60067000900 ## bgeu; jalr
+DEFINE P1_BGEU_R2_R3_R7 6364D60067000900 ## bltu; jalr
+
+
+## ---- Tranche 5: CALL / RET / PROLOGUE / EPILOGUE / TAIL -----------------
+## CALL is JALR through s2 (= P1 r7), saving PC+4 into ra. Caller loads
+## &target into r7 beforehand. RET is the canonical `ret` pseudo
+## (JALR x0, 0(ra)).
+##
+## PROLOGUE / EPILOGUE save and restore ra around nested calls; after
+## PROLOGUE, [sp+0] holds the return address (matching P1.md §"return
+## address lives in [sp+0] after prologue"). TAIL = EPILOGUE + B.
+
+DEFINE P1_PROLOGUE 130101FF23301100 ## addi sp,sp,-16 ; sd ra,0(sp)
+DEFINE P1_EPILOGUE 8330010013010101 ## ld ra,0(sp) ; addi sp,sp,16
+DEFINE P1_RET 67800000 ## jalr x0, 0(ra)
+DEFINE P1_CALL E7000900 ## jalr ra, 0(s2)
+DEFINE P1_TAIL 833001001301010167000900 ## epilogue ; jalr x0, 0(s2)