boot2

Playing with the bootstrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs

commit 29dd4df7b3d09e614c5999efd71cbe9651d2ef02
parent c352715fd06e83f2dc4ebd6094943a4c5626a203
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Mon, 20 Apr 2026 15:35:35 -0700

P1 extend demo.M1 to cover all ops

Diffstat:
Mdemo.M1 | 311+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
Mp1_aarch64.M1 | 116+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mp1_amd64.M1 | 103+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mp1_riscv64.M1 | 93++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
4 files changed, 568 insertions(+), 55 deletions(-)

diff --git a/demo.M1 b/demo.M1 @@ -1,65 +1,259 @@ ## P1 broader-ISA demo — portable across aarch64, amd64, riscv64. ## -## Computes (3 + 4) - 2 = 5 in registers, prints "P1 = 5\n", then -## exits with status 5. Exercises LI, ADD, SUB, MOV, and SYSCALL. +## Exercises the P1 ISA in tranches (see P1.md §"ISA"). Each op is +## applied in a way that preserves the running value r1=5, so a +## miscoded op produces the wrong exit status. +## +## Tranche 1: LI, MOV, SYSCALL, plus every reg-reg-reg arith op: +## ADD, SUB, AND, OR, XOR, MUL, DIV, REM, SHL, SHR, SAR. +## Tranche 2: immediate forms +## ADDI, ANDI, ORI, SHLI, SHRI, SARI +## (no SUBI/XORI/MULI in P1 — see PLAN.md §"Feature floor"). +## Tranche 3: LA + memory round-trip +## LA, ST/LD (64b), SW/LW (32b zero-ext), SB/LB (8b zero-ext) +## Tranche 4: r7-indirect branches +## B, BEQ, BNE, BLT, BGE, BLTU, BGEU — 7 taken-path subtests + 1 fall-through +## Tranche 5 (current): CALL / RET / TAIL / PROLOGUE / EPILOGUE +## Nested CALL (stresses PROLOGUE lr-save) + TAIL (must unwind before B) +## +## Constants for the identity chain: +## r2 = 0 identity for ADD/SUB/XOR/OR/SHL/SHR/SAR +## r3 = 1 identity for MUL/DIV +## r5 = 7 identity for AND (5 & 7 = 5) and REM (5 % 7 = 5) +## r1 holds the running value — starts at 5 and stays 5. ## ## Run-and-verify: ## make PROG=demo ARCH=<arch> run && echo "exit=$?" ## expected stdout: "P1 = 5\n" expected exit: 5 :_start - ## Compute result = (3 + 4) - 2 = 5, stash in r6. - ## r6 maps to a callee-saved native reg on every arch (x19 / rbx / - ## s1), so it survives SYSCALL's argument shuffle. 
- P1_LI_R1 - '03000000' # r1 = 3 - P1_LI_R2 - '04000000' # r2 = 4 - P1_ADD_R3_R1_R2 # r3 = r1 + r2 (= 7) - P1_LI_R4 - '02000000' # r4 = 2 - P1_SUB_R3_R3_R4 # r3 = r3 - r4 (= 5) - P1_MOV_R6_R3 # r6 = r3 (save across syscalls) - - ## write(1, &prefix, 5) — "P1 = " - P1_LI_R0 - SYS_WRITE - P1_LI_R1 - '01000000' - P1_LI_R2 - &prefix - P1_LI_R3 - '05000000' - P1_SYSCALL - - ## write(1, &digits + r6, 1) — the computed digit ('5') - P1_LI_R0 - SYS_WRITE - P1_LI_R1 - '01000000' - P1_LI_R2 - &digits - P1_ADD_R2_R2_R6 # r2 = &digits + 5 - P1_LI_R3 - '01000000' - P1_SYSCALL - - ## write(1, &newline, 1) - P1_LI_R0 - SYS_WRITE - P1_LI_R1 - '01000000' - P1_LI_R2 - &newline - P1_LI_R3 - '01000000' - P1_SYSCALL - - ## exit(r6) — exit status = computed result - P1_LI_R0 - SYS_EXIT - P1_MOV_R1_R6 - P1_SYSCALL + P1_LI_R1 + '05000000' + P1_LI_R2 + '00000000' + P1_LI_R3 + '01000000' + P1_LI_R5 + '07000000' + + P1_ADD_R1_R1_R2 # 5 + 0 = 5 + P1_SUB_R1_R1_R2 # 5 - 0 = 5 + P1_XOR_R1_R1_R2 # 5 ^ 0 = 5 + P1_OR_R1_R1_R2 # 5 | 0 = 5 + P1_AND_R1_R1_R5 # 5 & 7 = 5 + P1_MUL_R1_R1_R3 # 5 * 1 = 5 + P1_DIV_R1_R1_R3 # 5 / 1 = 5 + P1_REM_R1_R1_R5 # 5 % 7 = 5 + P1_SHL_R1_R1_R2 # 5 << 0 = 5 + P1_SHR_R1_R1_R2 # 5 >> 0 = 5 (logical) + P1_SAR_R1_R1_R2 # 5 >> 0 = 5 (arithmetic) + + ## Tranche 2: immediate forms. Chain 5 -> 8 -> 4 -> 5 -> 5 -> 5 -> 5. + P1_ADDI_R1_R1_3 # 5 + 3 = 8 + P1_SHRI_R1_R1_1 # 8 >> 1 = 4 + P1_ORI_R1_R1_1 # 4 | 1 = 5 + P1_ANDI_R1_R1_7 # 5 & 7 = 5 + P1_SHLI_R1_R1_0 # 5 << 0 = 5 + P1_SARI_R1_R1_0 # 5 >> 0 = 5 (arithmetic) + + ## Tranche 3: memory round-trip. For each access width, store r1, + ## clobber r1 to 0, then reload. A broken ST/LD leaves r1 != 5. 
+ P1_LA_R4 + &scratch + + P1_ST_R1_R4_0 # [scratch+0..8] = r1 (= 5) + P1_LI_R1 + '00000000' + P1_LD_R1_R4_0 # r1 = [scratch+0..8] -> 5 + + P1_SW_R1_R4_8 # [scratch+8..12] = r1 (low 4 bytes) + P1_LI_R1 + '00000000' + P1_LW_R1_R4_8 # r1 = zext [scratch+8..12] -> 5 + + P1_SB_R1_R4_16 # [scratch+16] = r1 (low byte) + P1_LI_R1 + '00000000' + P1_LB_R1_R4_16 # r1 = zext [scratch+16] -> 5 + + ## Tranche 4: branches. r2=0, r3=1 already, r5=7 already. + ## Pattern for each taken-path test: + ## set r7 = &b4_N_ok ; <branch cond met> ; clobber r1->0 ; :b4_N_ok + ## If the branch correctly fires, the clobber is skipped. + ## + ## B — unconditional, jumps to &b4_1_ok. + P1_LI_R7 + &b4_1_ok + P1_B + P1_LI_R1 + '00000000' +:b4_1_ok + + ## BEQ r2,r3,r7 — 0 == 0? set r3 back to 0 first, branch, then restore. + P1_LI_R3 + '00000000' # r3 = 0 so r2 == r3 + P1_LI_R7 + &b4_2_ok + P1_BEQ_R2_R3_R7 # 0 == 0, branch taken + P1_LI_R1 + '00000000' +:b4_2_ok + P1_LI_R3 + '01000000' # restore r3 = 1 + + ## BNE r2,r3,r7 — 0 != 1? yes, branch taken. + P1_LI_R7 + &b4_3_ok + P1_BNE_R2_R3_R7 + P1_LI_R1 + '00000000' +:b4_3_ok + + ## BLT r2,r3,r7 — 0 < 1 (signed)? yes, branch taken. + P1_LI_R7 + &b4_4_ok + P1_BLT_R2_R3_R7 + P1_LI_R1 + '00000000' +:b4_4_ok + + ## BGE r2,r3,r7 — needs a2 >= a3. Swap: load r2=1 r3=0 for this test. + P1_LI_R2 + '01000000' + P1_LI_R3 + '00000000' + P1_LI_R7 + &b4_5_ok + P1_BGE_R2_R3_R7 # 1 >= 0 (signed), branch taken + P1_LI_R1 + '00000000' +:b4_5_ok + P1_LI_R2 + '00000000' # restore r2 = 0 + P1_LI_R3 + '01000000' # restore r3 = 1 + + ## BLTU r2,r3,r7 — 0 < 1 (unsigned)? yes, branch taken. + P1_LI_R7 + &b4_6_ok + P1_BLTU_R2_R3_R7 + P1_LI_R1 + '00000000' +:b4_6_ok + + ## BGEU r2,r3,r7 — needs a2 >= a3 (unsigned). Swap r2/r3. 
+ P1_LI_R2 + '01000000' + P1_LI_R3 + '00000000' + P1_LI_R7 + &b4_7_ok + P1_BGEU_R2_R3_R7 # 1 >= 0 (unsigned), branch taken + P1_LI_R1 + '00000000' +:b4_7_ok + P1_LI_R2 + '00000000' + P1_LI_R3 + '01000000' + + ## Fall-through test: BEQ with r2=0, r3=1 (unequal). The branch must NOT + ## fire. If it does fire (incorrectly), we jump to &b4_ft_bad which + ## clobbers r1. Correct behavior falls through the branch, then an + ## unconditional B jumps past the clobber to &b4_ft_good. + P1_LI_R7 + &b4_ft_bad + P1_BEQ_R2_R3_R7 # 0 == 1? no, fall through + P1_LI_R7 + &b4_ft_good + P1_B # skip the bad-path clobber +:b4_ft_bad + P1_LI_R1 + '00000000' +:b4_ft_good + + ## Restore r1 = 5 is implicit — we never clobbered it on the happy path. + + ## Tranche 5: CALL / RET / PROLOGUE / EPILOGUE / TAIL. + ## fn_identity does its own nested CALL to fn_inner — if PROLOGUE doesn't + ## spill lr correctly, the inner CALL clobbers the return-to-_start + ## address and we crash or hang. The function bodies live inline below + ## the subtests, guarded by a P1_B over them so we don't fall through + ## into them after the last subtest. 
+ P1_LI_R7 + &fn_identity + P1_CALL # nested-CALL test: returns r1 unchanged + + P1_LI_R7 + &fn_parent_tail + P1_CALL # TAIL test: fn_identity RETs to here + + P1_LI_R7 + &b5_end + P1_B # skip over the inlined function bodies + +:fn_inner + P1_PROLOGUE + P1_EPILOGUE + P1_RET + +:fn_identity + P1_PROLOGUE + P1_LI_R7 + &fn_inner + P1_CALL + P1_EPILOGUE + P1_RET + +:fn_parent_tail + P1_PROLOGUE + P1_LI_R7 + &fn_identity + P1_TAIL + +:b5_end + + P1_MOV_R6_R1 # r6 = 5 (callee-saved, survives syscalls) + + ## write(1, &prefix, 5) — "P1 = " + P1_LI_R0 + SYS_WRITE + P1_LI_R1 + '01000000' + P1_LI_R2 + &prefix + P1_LI_R3 + '05000000' + P1_SYSCALL + + ## write(1, &digits + r6, 1) — the computed digit ('5') + P1_LI_R0 + SYS_WRITE + P1_LI_R1 + '01000000' + P1_LI_R2 + &digits + P1_ADD_R2_R2_R6 # r2 = &digits + 5 + P1_LI_R3 + '01000000' + P1_SYSCALL + + ## write(1, &newline, 1) + P1_LI_R0 + SYS_WRITE + P1_LI_R1 + '01000000' + P1_LI_R2 + &newline + P1_LI_R3 + '01000000' + P1_SYSCALL + + ## exit(r6) — exit status = computed result + P1_LI_R0 + SYS_EXIT + P1_MOV_R1_R6 + P1_SYSCALL :prefix "P1 = " @@ -69,4 +263,13 @@ " " +## 32 bytes reserved for tranche 3 memory round-trip. The LOAD segment +## is RWX (see ELF-<arch>.hex2 ph_flags=7) so we can store into this +## region at runtime. +:scratch +'0000000000000000' +'0000000000000000' +'0000000000000000' +'0000000000000000' + :ELF_end diff --git a/p1_aarch64.M1 b/p1_aarch64.M1 @@ -79,3 +79,119 @@ DEFINE P1_ADD_R2_R2_R6 4200138B ## add x2, x2, x19 ## SUB rD, rA, rB -> sub xD, xA, xB DEFINE P1_SUB_R3_R3_R4 630004CB ## sub x3, x3, x4 + + +## ---- Tranche 1: full arith reg-reg-reg ---------------------------------- +## Identity-chain tuples used by demo.M1 to exercise every P1 arith op +## without branches or memory. r1 is the running accumulator; each op +## uses an identity partner so the correct output is always r1. +## aarch64 has 3-operand forms for all of these, so one insn per op. 
+## REM has no native op — expands to sdiv+msub through scratch reg x4. + +## MOV rD, rA -> orr xD, xzr, xA +DEFINE P1_MOV_R6_R1 F30301AA ## mov x19, x1 + +## ADD / SUB / AND / OR / XOR — base opcodes 8B / CB / 8A / AA / CA. +DEFINE P1_ADD_R1_R1_R2 2100028B ## add x1, x1, x2 +DEFINE P1_SUB_R1_R1_R2 210002CB ## sub x1, x1, x2 +DEFINE P1_AND_R1_R1_R5 2100058A ## and x1, x1, x5 +DEFINE P1_OR_R1_R1_R2 210002AA ## orr x1, x1, x2 +DEFINE P1_XOR_R1_R1_R2 210002CA ## eor x1, x1, x2 + +## MUL rD, rA, rB -> madd xD, xA, xB, xzr +DEFINE P1_MUL_R1_R1_R3 217C039B ## mul x1, x1, x3 + +## DIV rD, rA, rB -> sdiv xD, xA, xB +DEFINE P1_DIV_R1_R1_R3 210CC39A ## sdiv x1, x1, x3 + +## REM rD, rA, rB -> sdiv x4, xA, xB ; msub xD, x4, xB, xA +## Two insns; x4 is a caller-saved scratch (= P1 r4). Demo keeps any +## live r4 value dead across this op. +DEFINE P1_REM_R1_R1_R5 240CC59A8184059B ## sdiv x4,x1,x5; msub x1,x4,x5,x1 + +## SHL / SHR / SAR -> lslv / lsrv / asrv (64-bit variants). +DEFINE P1_SHL_R1_R1_R2 2120C29A ## lsl x1, x1, x2 +DEFINE P1_SHR_R1_R1_R2 2124C29A ## lsr x1, x1, x2 +DEFINE P1_SAR_R1_R1_R2 2128C29A ## asr x1, x1, x2 + + +## ---- Tranche 2: immediate arith --------------------------------------- +## Tuples exercised by demo.M1: chain 5 -> 8 -> 4 -> 5 -> 5 -> 5 -> 5. +## +## Bitwise-imm ops (AND/OR) use aarch64's logical-immediate encoding: +## (N, immr, imms) encodes a run of consecutive 1s rotated by immr. +## For 64-bit patterns N=1 and imms = (ones_count - 1). 
Immediates: +## imm=1 -> N=1 imms=0 immr=0 (one 1, no rotation) +## imm=7 -> N=1 imms=2 immr=0 (three consecutive 1s) +## Shift-imm ops are UBFM/SBFM aliases: +## LSL #n -> UBFM immr=-n mod 64, imms=63-n +## LSR #n -> UBFM immr=n, imms=63 +## ASR #n -> SBFM immr=n, imms=63 + +DEFINE P1_ADDI_R1_R1_3 210C0091 ## add x1, x1, #3 +DEFINE P1_ANDI_R1_R1_7 21084092 ## and x1, x1, #7 +DEFINE P1_ORI_R1_R1_1 210040B2 ## orr x1, x1, #1 +DEFINE P1_SHLI_R1_R1_0 21FC40D3 ## lsl x1, x1, #0 (UBFM #0,#63) +DEFINE P1_SHRI_R1_R1_1 21FC41D3 ## lsr x1, x1, #1 (UBFM #1,#63) +DEFINE P1_SARI_R1_R1_0 21FC4093 ## asr x1, x1, #0 (SBFM #0,#63) + + +## ---- Tranche 3: LA + memory ops --------------------------------------- +## LA is LI in the spike — both load a 4-byte zero-extended literal, +## which is enough to address the ELF (base 0x00600000 < 2^32). Extending +## LA to a full 64-bit load is future work (P1.md §"What needs added"). +DEFINE P1_LA_R4 4400001802000014 ## ldr w4, [pc+8] ; b +8 ; <4 bytes> + +## Unsigned-offset forms of LDR/STR at 8/4/1-byte widths. +## imm12 is scaled by access size, so e.g. LDR-w offset 8 uses imm12=2. +DEFINE P1_ST_R1_R4_0 810000F9 ## str x1, [x4, #0] +DEFINE P1_LD_R1_R4_0 810040F9 ## ldr x1, [x4, #0] +DEFINE P1_SW_R1_R4_8 810800B9 ## str w1, [x4, #8] +DEFINE P1_LW_R1_R4_8 810840B9 ## ldr w1, [x4, #8] (zero-ext) +DEFINE P1_SB_R1_R4_16 81400039 ## strb w1, [x4, #16] +DEFINE P1_LB_R1_R4_16 81404039 ## ldrb w1, [x4, #16] (zero-ext) + + +## ---- Tranche 4: branches (r7-indirect, no hex2_word needed) ------------ +## Conditional branches compare ra vs rb, then jump to the address in r7. +## Pattern: cmp xRa, xRb ; b.<INV> +8 ; br x20 +## - If cond is false we take the +8 skip and fall through the BR. +## - If cond is true we execute the BR x20 jump to r7. +## Caller loads the target into r7 (via P1_LI_R7 &label) beforehand. +## Unconditional P1_B is just BR x20. 
+## +## Conditions for "skip if NOT cond": +## BEQ -> B.NE (cond 1) BNE -> B.EQ (cond 0) +## BLT -> B.GE (cond A) BGE -> B.LT (cond B) +## BLTU-> B.HS (cond 2) BGEU -> B.LO (cond 3) +## CMP x2, x3 = SUBS xzr, x2, x3 = 0xEB03005F (Rm=3, Rn=2, Rd=31). +## BR x20 = 0xD61F0280. + +DEFINE P1_B 80021FD6 +DEFINE P1_BEQ_R2_R3_R7 5F0003EB4100005480021FD6 ## cmp ; b.ne +8 ; br x20 +DEFINE P1_BNE_R2_R3_R7 5F0003EB4000005480021FD6 ## cmp ; b.eq +8 ; br x20 +DEFINE P1_BLT_R2_R3_R7 5F0003EB4A00005480021FD6 ## cmp ; b.ge +8 ; br x20 +DEFINE P1_BGE_R2_R3_R7 5F0003EB4B00005480021FD6 ## cmp ; b.lt +8 ; br x20 +DEFINE P1_BLTU_R2_R3_R7 5F0003EB4200005480021FD6 ## cmp ; b.hs +8 ; br x20 +DEFINE P1_BGEU_R2_R3_R7 5F0003EB4300005480021FD6 ## cmp ; b.lo +8 ; br x20 + + +## ---- Tranche 5: CALL / RET / PROLOGUE / EPILOGUE / TAIL ----------------- +## CALL is an r7-indirect branch-and-link, same pattern as branches: caller +## loads &target into r7, then `blr x20` sets x30 = PC+4 and jumps. +## RET jumps through x30 (the native `ret` pseudo). +## +## PROLOGUE / EPILOGUE explicitly spill and reload x30 on aarch64 so that +## nested CALLs can't clobber the caller's return address. After PROLOGUE, +## [sp+0] holds the return address, matching P1.md's uniform convention. +## +## TAIL = EPILOGUE + B. Calling convention: load &target into r7 first, +## then TAIL performs the callee's own epilogue (restoring the parent's +## caller-retaddr into x30) and jumps to r7. When the tail target later +## RETs, control returns to the parent's caller. 
+ +DEFINE P1_PROLOGUE FF4300D1FE0300F9 ## sub sp,#16 ; str x30,[sp] +DEFINE P1_EPILOGUE FE0340F9FF430091 ## ldr x30,[sp] ; add sp,#16 +DEFINE P1_RET C0035FD6 ## ret (br x30) +DEFINE P1_CALL 80023FD6 ## blr x20 +DEFINE P1_TAIL FE0340F9FF43009180021FD6 ## epilogue ; br x20 diff --git a/p1_amd64.M1 b/p1_amd64.M1 @@ -69,3 +69,106 @@ DEFINE P1_ADD_R2_R2_R6 4889F64801DE ## mov rsi,rsi ; add rsi,rbx ## SUB rD, rA, rB -> mov rD,rA ; sub rD,rB DEFINE P1_SUB_R3_R3_R4 4889D24C29D2 ## mov rdx,rdx ; sub rdx,r10 + + +## ---- Tranche 1: full arith reg-reg-reg ---------------------------------- +## Identity-chain tuples used by demo.M1. r1=rdi runs through each op. +## x86-64 oddities: +## - Shifts need count in cl; three-insn form mov rD,rA; mov rcx,rB; shl. +## - IDIV needs dividend in rdx:rax. We save rdx to rcx before CQO so +## both the divisor (when it is rdx) and the caller's r3 survive. + +## MOV rD, rA -> mov rD_native, rA_native +DEFINE P1_MOV_R6_R1 4889FB ## mov rbx, rdi + +## ADD / SUB / AND / OR / XOR — 2-insn form, leading mov rdi,rdi kept. +DEFINE P1_ADD_R1_R1_R2 4889FF4801F7 ## mov rdi,rdi ; add rdi,rsi +DEFINE P1_SUB_R1_R1_R2 4889FF4829F7 ## mov rdi,rdi ; sub rdi,rsi +DEFINE P1_XOR_R1_R1_R2 4889FF4831F7 ## mov rdi,rdi ; xor rdi,rsi +DEFINE P1_OR_R1_R1_R2 4889FF4809F7 ## mov rdi,rdi ; or rdi,rsi +DEFINE P1_AND_R1_R1_R5 4889FF4C21C7 ## mov rdi,rdi ; and rdi,r8 + +## MUL rD, rA, rB -> mov rD,rA ; imul rD,rB (IMUL r64,r/m64 = 0F AF) +DEFINE P1_MUL_R1_R1_R3 4889FF480FAFFA ## mov rdi,rdi ; imul rdi,rdx + +## DIV rD, rA, rB -> divisor in rdx (r3). Save rdx to rcx so the +## divisor survives CQO's clobber of rdx, and so r3 is restored after. +## mov rcx,rdx ; mov rax,rdi ; cqo ; idiv rcx ; mov rdi,rax ; mov rdx,rcx +DEFINE P1_DIV_R1_R1_R3 4889D14889F8489948F7F94889C74889CA + +## REM rD, rA, rB -> divisor in r8 (r5). CQO still clobbers rdx, so +## save/restore r3 through rcx. 
+## mov rcx,rdx ; mov rax,rdi ; cqo ; idiv r8 ; mov rdi,rdx ; mov rdx,rcx +DEFINE P1_REM_R1_R1_R5 4889D14889F8489949F7F84889D74889CA + +## SHL / SHR / SAR -> mov rD,rA ; mov rcx,rB ; shl/shr/sar rD,cl +## Opcode D3 /n with REX.W: /4=SHL, /5=SHR, /7=SAR. +DEFINE P1_SHL_R1_R1_R2 4889FF4889F148D3E7 ## mov rdi,rdi; mov rcx,rsi; shl rdi,cl +DEFINE P1_SHR_R1_R1_R2 4889FF4889F148D3EF ## mov rdi,rdi; mov rcx,rsi; shr rdi,cl +DEFINE P1_SAR_R1_R1_R2 4889FF4889F148D3FF ## mov rdi,rdi; mov rcx,rsi; sar rdi,cl + + +## ---- Tranche 2: immediate arith --------------------------------------- +## mov rdi,rdi ; <op> rdi, imm8 (sign-extended imm8 forms via opcode 83; +## shifts via C1). /n is the opcode-extension field in ModRM.reg. + +DEFINE P1_ADDI_R1_R1_3 4889FF4883C703 ## add rdi, 3 (83 /0 ib) +DEFINE P1_ANDI_R1_R1_7 4889FF4883E707 ## and rdi, 7 (83 /4 ib) +DEFINE P1_ORI_R1_R1_1 4889FF4883CF01 ## or rdi, 1 (83 /1 ib) +DEFINE P1_SHLI_R1_R1_0 4889FF48C1E700 ## shl rdi, 0 (C1 /4 ib) +DEFINE P1_SHRI_R1_R1_1 4889FF48C1EF01 ## shr rdi, 1 (C1 /5 ib) +DEFINE P1_SARI_R1_R1_0 4889FF48C1FF00 ## sar rdi, 0 (C1 /7 ib) + + +## ---- Tranche 3: LA + memory ops --------------------------------------- +## LA is LI in the spike (addresses fit in 32 bits → zero-extends cleanly +## through the mov-to-r32 form). r4 is r10, so base-reg encoding uses +## REX.B=1 and ModRM rm=010. No SIB byte needed (r10's low3 bits = 010). +DEFINE P1_LA_R4 41BA ## mov r10d, imm32 + +## Plain MOV r/m, r / MOV r, r/m with 8-bit displacement. +## REX: W=1 for 64-bit moves; B=1 always (r10 is the base register). 
+DEFINE P1_ST_R1_R4_0 49893A ## mov [r10], rdi +DEFINE P1_LD_R1_R4_0 498B3A ## mov rdi, [r10] +DEFINE P1_SW_R1_R4_8 41897A08 ## mov [r10+8], edi +DEFINE P1_LW_R1_R4_8 418B7A08 ## mov edi, [r10+8] (zero-ext) +DEFINE P1_SB_R1_R4_16 41887A10 ## mov [r10+16], dil +DEFINE P1_LB_R1_R4_16 490FB67A10 ## movzx rdi, byte [r10+16] + + +## ---- Tranche 4: branches (r7-indirect, no hex2_word needed) ------------ +## Same pattern as aarch64: cmp ra,rb ; short native jcc over a jmp-through-r7. +## If cond is false the native "skip" jcc fires (opposite of the P1 cond) and +## steps past the 3-byte `jmp r12`, falling through. If cond is true we take +## the jmp r12 to the address the caller stashed in r7. +## P1_B is just `jmp r12` unconditionally. +## +## CMP rsi, rdx = 48 39 D6 (REX.W, opcode 39 /r, ModRM: 11 010 110). +## JMP r12 = 41 FF E4 (REX.B, opcode FF /4, ModRM: 11 100 100). +## jcc rel8 opcodes (skip when NOT cond): JE=74 JNE=75 JL=7C JGE=7D JB=72 JAE=73. + +DEFINE P1_B 41FFE4 ## jmp r12 +DEFINE P1_BEQ_R2_R3_R7 4839D6750341FFE4 ## cmp ; jne +3 ; jmp r12 +DEFINE P1_BNE_R2_R3_R7 4839D6740341FFE4 ## cmp ; je +3 ; jmp r12 +DEFINE P1_BLT_R2_R3_R7 4839D67D0341FFE4 ## cmp ; jge +3 ; jmp r12 +DEFINE P1_BGE_R2_R3_R7 4839D67C0341FFE4 ## cmp ; jl +3 ; jmp r12 +DEFINE P1_BLTU_R2_R3_R7 4839D6730341FFE4 ## cmp ; jae +3 ; jmp r12 +DEFINE P1_BGEU_R2_R3_R7 4839D6720341FFE4 ## cmp ; jb +3 ; jmp r12 + + +## ---- Tranche 5: CALL / RET / PROLOGUE / EPILOGUE / TAIL ----------------- +## amd64's native CALL already pushes the return address to the stack, and +## RET pops it. So PROLOGUE/EPILOGUE are no-ops here — a single NOP keeps +## them non-empty (handy in disasm, and sidesteps any M0 quirks around an +## empty DEFINE value). On the other two arches PROLOGUE/EPILOGUE do real +## work spilling/reloading lr. +## +## CALL expects the target pre-loaded into r7 (= r12); expands to `call r12`. 
+## TAIL = EPILOGUE + unconditional B (= `jmp r12`), so the caller of the +## tail-calling function receives the return from the tail target directly. + +DEFINE P1_PROLOGUE 90 ## nop (retaddr already on stack) +DEFINE P1_EPILOGUE 90 ## nop +DEFINE P1_RET C3 ## ret +DEFINE P1_CALL 41FFD4 ## call r12 +DEFINE P1_TAIL 9041FFE4 ## epilogue(nop) ; jmp r12 diff --git a/p1_riscv64.M1 b/p1_riscv64.M1 @@ -55,7 +55,7 @@ DEFINE P1_LI_R7 170900000369C9006F008000 ## ## Unconditional — unused shuffles read caller-saved scratch, and the ## kernel reads only the regs each syscall cares about. -DEFINE P1_SYSCALL 9308050013850500930506001386060093060700138707009307040073000000 +DEFINE P1_SYSCALL 9308050013850500930506001386060093060700138707009387040073000000 ## Linux syscall numbers (riscv64 uses the generic table — same as aarch64). @@ -76,3 +76,94 @@ DEFINE P1_ADD_R2_R2_R6 33069600 ## add a2, a2, s1 ## SUB rD, rA, rB -> sub rD, rA, rB DEFINE P1_SUB_R3_R3_R4 B386E640 ## sub a3, a3, a4 + + +## ---- Tranche 1: full arith reg-reg-reg ---------------------------------- +## Identity-chain tuples used by demo.M1. All one-insn R-type ops. +## MUL/DIV/REM are M-extension ops (standard on rv64gc). + +## MOV rD, rA -> addi rD, rA, 0 +DEFINE P1_MOV_R6_R1 93840500 ## mv s1, a1 + +## ADD / SUB / AND / OR / XOR — R-type, funct3 picks the op. +DEFINE P1_ADD_R1_R1_R2 B385C500 ## add a1, a1, a2 +DEFINE P1_SUB_R1_R1_R2 B385C540 ## sub a1, a1, a2 (funct7=0x20) +DEFINE P1_XOR_R1_R1_R2 B3C5C500 ## xor a1, a1, a2 (funct3=4) +DEFINE P1_OR_R1_R1_R2 B3E5C500 ## or a1, a1, a2 (funct3=6) +DEFINE P1_AND_R1_R1_R5 B3F5F500 ## and a1, a1, a5 (funct3=7) + +## MUL / DIV / REM — M extension, funct7=1. +DEFINE P1_MUL_R1_R1_R3 B385D502 ## mul a1, a1, a3 +DEFINE P1_DIV_R1_R1_R3 B3C5D502 ## div a1, a1, a3 (funct3=4) +DEFINE P1_REM_R1_R1_R5 B3E5F502 ## rem a1, a1, a5 (funct3=6) + +## SHL / SHR / SAR -> sll / srl / sra. 
+DEFINE P1_SHL_R1_R1_R2 B395C500 ## sll a1, a1, a2 (funct3=1) +DEFINE P1_SHR_R1_R1_R2 B3D5C500 ## srl a1, a1, a2 (funct3=5) +DEFINE P1_SAR_R1_R1_R2 B3D5C540 ## sra a1, a1, a2 (funct7=0x20) + + +## ---- Tranche 2: immediate arith --------------------------------------- +## I-type / shift-immediate forms. Shift amount is 6 bits (shamt6) for +## rv64. SRAI sets bit 30 to distinguish from SRLI. + +DEFINE P1_ADDI_R1_R1_3 93853500 ## addi a1, a1, 3 +DEFINE P1_ANDI_R1_R1_7 93F57500 ## andi a1, a1, 7 (funct3=7) +DEFINE P1_ORI_R1_R1_1 93E51500 ## ori a1, a1, 1 (funct3=6) +DEFINE P1_SHLI_R1_R1_0 93950500 ## slli a1, a1, 0 (funct3=1) +DEFINE P1_SHRI_R1_R1_1 93D51500 ## srli a1, a1, 1 (funct3=5) +DEFINE P1_SARI_R1_R1_0 93D50540 ## srai a1, a1, 0 (funct7=0x20) + + +## ---- Tranche 3: LA + memory ops --------------------------------------- +## LA is LI in the spike. +DEFINE P1_LA_R4 170700000367C7006F008000 ## auipc a4,0; lwu a4,12(a4); jal x0,+8 + +## LOAD (opcode 0x03) / STORE (opcode 0x23) with signed 12-bit offset. +## For P1: LD=64b, LW=32b zero-ext (= LWU), LB=8b zero-ext (= LBU). +## ST=SD (64b), SW=32b, SB=8b. +DEFINE P1_ST_R1_R4_0 2330B700 ## sd a1, 0(a4) +DEFINE P1_LD_R1_R4_0 83350700 ## ld a1, 0(a4) +DEFINE P1_SW_R1_R4_8 2324B700 ## sw a1, 8(a4) +DEFINE P1_LW_R1_R4_8 83658700 ## lwu a1, 8(a4) +DEFINE P1_SB_R1_R4_16 2308B700 ## sb a1, 16(a4) +DEFINE P1_LB_R1_R4_16 83450701 ## lbu a1, 16(a4) + + +## ---- Tranche 4: branches (r7-indirect, no hex2_word needed) ------------ +## RISC-V has B-type conditional branches with a scattered immediate; writing +## literal byte strings for arbitrary offsets is painful without hex2_word. +## Sidestep it with the r7-indirect pattern: a fixed-offset branch that skips +## the unconditional `jalr x0, 0(s2)` when the P1 condition is NOT met. +## +## Fixed offset: the conditional branches below all use imm=8 (skip past the +## 4-byte JALR on the false path). 
B-type imm=8 encodes to (imm[12|10:5]=0, +## imm[4:1|11]=4, so imms spread as funct7 bits 0x00 and rd bits 0x4). +## Branch instruction byte strings below were assembled and verified by hand. +## +## P1_B is just `jalr x0, 0(s2)` — unconditional jump to address in r7. +## All comparisons use a2 (P1 r2) vs a3 (P1 r3). + +DEFINE P1_B 67000900 ## jalr x0, 0(s2) +DEFINE P1_BEQ_R2_R3_R7 6314D60067000900 ## bne a2,a3,+8 ; jalr x0,0(s2) +DEFINE P1_BNE_R2_R3_R7 6304D60067000900 ## beq ; jalr +DEFINE P1_BLT_R2_R3_R7 6354D60067000900 ## bge ; jalr +DEFINE P1_BGE_R2_R3_R7 6344D60067000900 ## blt ; jalr +DEFINE P1_BLTU_R2_R3_R7 6374D60067000900 ## bgeu; jalr +DEFINE P1_BGEU_R2_R3_R7 6364D60067000900 ## bltu; jalr + + +## ---- Tranche 5: CALL / RET / PROLOGUE / EPILOGUE / TAIL ----------------- +## CALL is JALR through s2 (= P1 r7), saving PC+4 into ra. Caller loads +## &target into r7 beforehand. RET is the canonical `ret` pseudo +## (JALR x0, 0(ra)). +## +## PROLOGUE / EPILOGUE save and restore ra around nested calls; after +## PROLOGUE, [sp+0] holds the return address (matching P1.md §"return +## address lives in [sp+0] after prologue"). TAIL = EPILOGUE + B. + +DEFINE P1_PROLOGUE 130101FF23301100 ## addi sp,sp,-16 ; sd ra,0(sp) +DEFINE P1_EPILOGUE 8330010013010101 ## ld ra,0(sp) ; addi sp,sp,16 +DEFINE P1_RET 67800000 ## jalr x0, 0(ra) +DEFINE P1_CALL E7000900 ## jalr ra, 0(s2) +DEFINE P1_TAIL 833001001301010167000900 ## epilogue ; jalr x0, 0(s2)