demo.M1 - boot2 - Playing with the boostrap

demo.M1 (11645B)
      1 ## P1 broader-ISA demo — portable across aarch64, amd64, riscv64.
      2 ##
      3 ## Exercises the P1 ISA in tranches (see P1.md §"Instruction set").
      4 ## Each op is applied with non-identity operands so a mis-encoded op
      5 ## produces a detectably-wrong final r1. If every op in every tranche
      6 ## is correct, r1 ends at 5, stdout is "P1 = 5\n", exit is 5.
      7 ##
      8 ## Tranche 1: reg-reg-reg arith (11 ops). Each step's result is
      9 ##   unique vs. neighbor ops on the same operands, e.g.
     10 ##     ADD(5,3)=8  SUB(8,3)=5  XOR(5,3)=6  OR(6,3)=7  AND(7,5)=5
     11 ##     MUL(5,3)=15 DIV(15,3)=5 REM(5,7)=5  SHL(5,3)=40 SHR(40,3)=5.
     12 ##   SAR vs SHR agree on positive values; SAR is separately tested
     13 ##   on a negative value in r4 (SAR(-1,3)=-1, SHR would give huge).
     14 ## Tranche 2: immediate forms. ADDI tested with BOTH positive and
     15 ##   negative imm12 (proves signed-immediate encoding). SHLI/SHRI
     16 ##   with non-zero shift amount. ANDI/ORI with overlapping-bit
     17 ##   masks so the result is distinguishable from ADDI/XORI. SARI
     18 ##   tested on negative value via r4.
     19 ##   (No SUBI/XORI/MULI in P1 — see PLAN.md §"Feature floor".)
     20 ## Tranche 3: LA + memory round-trip. LD/ST (64-bit) and LB/SB
     21 ##   (8-bit zero-extended). 32-bit LW/SW dropped from the ISA —
     22 ##   synthesize from LD + mask/shift if needed. Signed imm12 on
     23 ##   LD/ST is exercised via a NEG8-offset round-trip through
     24 ##   scratch_mid, mirroring the ADDI NEG1/NEG3 signed-imm test.
     25 ## Tranche 4: LI_BR-indirect branches. B, BEQ, BNE, BLT each tested
     26 ##   in BOTH taken and fall-through directions so an inverted
     27 ##   condition encoding doesn't pass. BLT additionally tested with
     28 ##   a negative operand (-1 < 0 signed) to prove signed semantics.
     29 ##   BLTU/BGEU/BGE dropped from the ISA — see P1.md.
     30 ## Tranche 5: CALL / RET / TAIL / PROLOGUE / EPILOGUE. Nested CALL
     31 ##   stresses PROLOGUE lr-save; TAIL unwinds the frame. Stack
     32 ##   balance is additionally verified by snapshotting sp via
     33 ##   MOV_R6_SP before tranche 5 and comparing after — a TAIL that
     34 ##   omits its epilogue (or whose epilogue clobbers the branch
     35 ##   target) leaks fn_parent_tail's 16-byte frame and the delta
     36 ##   folds into the accumulator. All three arches now run real
     37 ##   sp-moving PROLOGUE/EPILOGUE, so the check bites everywhere.
     38 ##
     39 ## Run-and-verify:
     40 ##   make PROG=demo ARCH=<arch> run && echo "exit=$?"
     41 ## expected stdout: "P1 = 5\n"   expected exit: 5
     42 
     43 :_start
     44     ## Setup: r1 is the running accumulator; r2/r5 are arith partners.
     45     ## r1 is seeded via the LI_R6 → MOV_R1_R6 roundtrip so the LI_R6
     46     ## encoding gets exercised — otherwise r6 is only written by
     47     ## MOV_R6_R1 in the exit path. Clobbering r1 to 0 between the two
     48     ## ops means a broken MOV_R1_R6 (wrong source reg) leaves r1=0 and
     49     ## poisons the chain; a broken LI_R6 (wrong dest reg or wrong
     50     ## literal-slot offset) puts the wrong value in r6 and MOV
     51     ## propagates it.
     52     li_r6 '05000000'  # r6 = 5 via LI_R6 (discriminator)
     53     li_r1 '00000000'  # clobber r1 before the MOV
     54     mov_r1,r6                # r1 = r6 = 5
     55     li_r2 '03000000'  # r2 = 3  (arith partner & shift amount)
     56     li_r5 '05000000'  # r5 = 5  (AND partner)
     57 
     58     ## Tranche 1: reg-reg-reg arith.
     59     add_r1,r1,r2             # 5 + 3 = 8
     60     sub_r1,r1,r2             # 8 - 3 = 5
     61     xor_r1,r1,r2             # 5 ^ 3 = 6
     62     or_r1,r1,r2              # 6 | 3 = 7
     63     and_r1,r1,r5             # 7 & 5 = 5
     64     mul_r1,r1,r2             # 5 * 3 = 15
     65     div_r1,r1,r2             # 15 / 3 = 5
     66     li_r5 '07000000'  # r5 = 7 for REM
     67     rem_r1,r1,r5             # 5 % 7 = 5
     68     shl_r1,r1,r2             # 5 << 3 = 40
     69     shr_r1,r1,r2             # 40 >> 3 = 5
     70 
     71     ## SAR discriminator: on positive values SAR and SHR agree, so
     72     ## put a negative value in r4 and check that SAR preserves the
     73     ## sign bits. Then fold r4 into r1 so a misbehaving SAR poisons
     74     ## the accumulator.
     75     li_r4 '00000000'  # r4 = 0
     76     addi_r4,r4,neg1          # r4 = 0 + (-1) = -1  (sign-extended 64-bit)
     77     sar_r4,r4,r2             # r4 = -1 >> 3 = -1   (SHR would be 0x1FFF...FFFF)
     78     add_r1,r1,r4             # r1 = 5 + (-1) = 4
     79     addi_r1,r1,1             # r1 = 4 + 1 = 5
     80 
     81     ## Tranche 2: immediate forms. Chain 5 → 8 → 5 → 10 → 5 → 4 → 5
     82     ## plus a SARI discriminator via r4.
     83     addi_r1,r1,3             # 5 + 3 = 8
     84     addi_r1,r1,neg3          # 8 + (-3) = 5      (signed imm12)
     85     shli_r1,r1,1             # 5 << 1 = 10       (non-zero shift)
     86     shri_r1,r1,1             # 10 >> 1 = 5
     87     andi_r1,r1,6             # 5 & 6 = 4         (0b101 & 0b110 = 0b100)
     88     ori_r1,r1,1              # 4 | 1 = 5         (aarch64 bitmask-immediate
     89                                 #   requires valid-mask imm; 5 is invalid, 1 is
     90                                 #   valid. Tranche 1 OR_R1_R1_R2 (6|3=7) already
     91                                 #   discriminates OR from ADD/XOR on overlapping
     92                                 #   bits; ORI here just tests the imm-ORR encoding
     93                                 #   exists and doesn't crash, plus distinguishes
     94                                 #   from AND (4&1=0 would fail the chain).)
     95 
     96     li_r4 '00000000'
     97     addi_r4,r4,neg1          # r4 = -1
     98     sari_r4,r4,1             # r4 = -1 >> 1 = -1 (SHRI would be 0x7FFF...FFFF)
     99     add_r1,r1,r4             # r1 = 5 + (-1) = 4
    100     addi_r1,r1,1             # r1 = 5
    101 
    102     ## Tranche 3: memory round-trip. For each width, store r1,
    103     ## clobber r1 to 0, then reload. A broken ST/LD leaves r1 != 5.
    104     la_r4 &scratch
    105 
    106     st_r1,r4,0               # [scratch+0..8] = r1 (= 5)
    107     li_r1 '00000000'
    108     ld_r1,r4,0               # r1 = [scratch+0..8]         -> 5
    109 
    110     ## Non-zero imm12 on the 64-bit forms. Distinct value 8 at +8 so
    111     ## an LD_R4_8 that silently aliased to +0 would read 5 and fail the
    112     ## SUB step; a broken ST_R4_8 leaves [+8]=0 (scratch init) which the
    113     ## LD observes. The +0 round-trip above anchors the imm12=0 case.
    114     li_r1 '08000000'  # distinct from 5
    115     st_r1,r4,8               # [scratch+8..16] = 8
    116     li_r1 '00000000'
    117     ld_r1,r4,8               # r1 = [scratch+8..16]        -> 8
    118     sub_r1,r1,r2             # r1 = 8 - 3 = 5   (r2 still 3 from setup)
    119 
    120     sb_r1,r4,16              # [scratch+16] = r1 (low byte)
    121     li_r1 '00000000'
    122     lb_r1,r4,16              # r1 = zext [scratch+16]      -> 5
    123 
    124     ## Negative imm12 on the memory forms. LA to scratch_mid (= scratch+16)
    125     ## then round-trip a distinct sentinel (13) via [r4 + -8] = scratch+8.
    126     ## Proves the signed offset encoding sign-extends on LD/ST the same
    127     ## way ADDI NEG1/NEG3 proves it on arith. A symmetric "both sides
    128     ## miscompiled to the same wrong offset" bug could still false-pass,
    129     ## but the common cases (sign bit dropped, imm zero-extended) blow
    130     ## up via segfault or a mismatched round-trip value.
    131     la_r4 &scratch_mid
    132     li_r1 '0D000000'  # r1 = 13 (distinct sentinel)
    133     st_r1,r4,neg8            # [scratch+8] = 13 (overwrites +8 slot's old 8)
    134     li_r1 '00000000'
    135     ld_r1,r4,neg8            # r1 = [scratch+8]            -> 13
    136     addi_r1,r1,neg3          # r1 = 13 - 3 = 10
    137     shri_r1,r1,1             # r1 = 10 >> 1 = 5
    138 
    139     ## Tranche 4: branches. r2=0, r3=1 to start; each subtest resets
    140     ## as needed. Taken-path subtests clobber r1 on fall-through;
    141     ## fall-through subtests clobber r1 on incorrect branch.
    142     li_r2 '00000000'  # r2 = 0
    143     li_r3 '01000000'  # r3 = 1
    144 
    145     ## B — unconditional. Correct: jump to b4_1_ok, skipping clobber.
    146     li_br &b4_1_ok
    147     b
    148     li_r1 '00000000'
    149 :b4_1_ok
    150 
    151     ## BEQ taken: r3=0 so r2==r3.
    152     li_r3 '00000000'
    153     li_br &b4_2_ok
    154     beq_r2,r3             # 0 == 0, taken
    155     li_r1 '00000000'
    156 :b4_2_ok
    157     li_r3 '01000000'  # restore r3 = 1
    158 
    159     ## BEQ fall-through: 0 != 1, branch must NOT fire. If it
    160     ## (incorrectly) fires we jump to b4_3_bad and clobber r1.
    161     li_br &b4_3_bad
    162     beq_r2,r3             # 0 == 1? no, fall through
    163     li_br &b4_3_ok
    164     b
    165 :b4_3_bad
    166     li_r1 '00000000'
    167 :b4_3_ok
    168 
    169     ## BNE taken: 0 != 1.
    170     li_br &b4_4_ok
    171     bne_r2,r3             # 0 != 1, taken
    172     li_r1 '00000000'
    173 :b4_4_ok
    174 
    175     ## BNE fall-through: r3=0 so r2==r3; branch must NOT fire.
    176     li_r3 '00000000'
    177     li_br &b4_5_bad
    178     bne_r2,r3             # 0 != 0? no, fall through
    179     li_br &b4_5_ok
    180     b
    181 :b4_5_bad
    182     li_r1 '00000000'
    183 :b4_5_ok
    184     li_r3 '01000000'  # restore r3 = 1
    185 
    186     ## BLT taken: 0 < 1 (signed).
    187     li_br &b4_6_ok
    188     blt_r2,r3             # 0 < 1, taken
    189     li_r1 '00000000'
    190 :b4_6_ok
    191 
    192     ## BLT fall-through: 1 < 0 is false.
    193     li_r2 '01000000'  # r2 = 1
    194     li_r3 '00000000'  # r3 = 0
    195     li_br &b4_7_bad
    196     blt_r2,r3             # 1 < 0? no, fall through
    197     li_br &b4_7_ok
    198     b
    199 :b4_7_bad
    200     li_r1 '00000000'
    201 :b4_7_ok
    202 
    203     ## BLT signed discrimination: -1 < 0 must be taken. If BLT were
    204     ## accidentally unsigned, -1 as 0xFFFF...FFFF > 0 and the branch
    205     ## would not fire.
    206     li_r2 '00000000'  # r2 = 0
    207     li_r4 '00000000'
    208     addi_r4,r4,neg1          # r4 = -1 (sign-extended)
    209     li_br &b4_8_ok
    210     blt_r4,r2             # -1 < 0 (signed)? yes, taken
    211     li_r1 '00000000'
    212 :b4_8_ok
    213 
    214     ## Tranche 5: CALL / RET / PROLOGUE / EPILOGUE / TAIL.
    215     ## fn_identity does its own nested CALL to fn_inner — if PROLOGUE
    216     ## doesn't spill lr correctly, the inner CALL clobbers the
    217     ## return-to-_start address and we crash or hang. The function
    218     ## bodies live inline below the subtests, guarded by a b over
    219     ## them so we don't fall through after the last subtest.
    220     ##
    221     ## Stack-balance discriminator for TAIL: snapshot sp into r6
    222     ## (callee-saved) before any CALL. A correctly-paired call tree
    223     ## nets to sp_after == sp_before. A TAIL that skips its epilogue
    224     ## leaks fn_parent_tail's 16-byte frame — the delta is folded
    225     ## into the accumulator below via SUB r1, r1, delta.
    226     mov_r6,sp                # r6 = sp snapshot (pre-tranche)
    227 
    228     li_br &fn_identity
    229     call                     # nested-CALL test: returns r1 unchanged
    230 
    231     li_br &fn_parent_tail
    232     call                     # TAIL test: fn_identity RETs to here
    233 
    234     li_br &b5_end
    235     b                        # skip over the inlined function bodies
    236 
    237 :fn_inner
    238     prologue
    239     epilogue
    240     ret
    241 
    242 :fn_identity
    243     prologue
    244     li_br &fn_inner
    245     call
    246     epilogue
    247     ret
    248 
    249 :fn_parent_tail
    250     prologue
    251     li_br &fn_identity
    252     tail
    253 
    254 :b5_end
    255     mov_r2,sp                # r2 = sp snapshot (post-tranche)
    256     sub_r2,r2,r6             # r2 = sp_after - sp_before (0 if balanced)
    257     sub_r1,r1,r2             # r1 -= delta; unchanged (= 5) iff balanced
    258 
    259     mov_r6,r1                # r6 = 5 (callee-saved, survives syscalls)
    260 
    261     ## write(1, &prefix, 5)      — "P1 = "
    262     li_r0 sys_write
    263     li_r1 '01000000'
    264     li_r2 &prefix
    265     li_r3 '05000000'
    266     syscall
    267 
    268     ## write(1, &digits + r6, 1) — the computed digit ('5')
    269     li_r0 sys_write
    270     li_r1 '01000000'
    271     li_r2 &digits
    272     add_r2,r2,r6             # r2 = &digits + 5
    273     li_r3 '01000000'
    274     syscall
    275 
    276     ## write(1, &newline, 1)
    277     li_r0 sys_write
    278     li_r1 '01000000'
    279     li_r2 &newline
    280     li_r3 '01000000'
    281     syscall
    282 
    283     ## exit(r6)                  — exit status = computed result
    284     li_r0 sys_exit
    285     mov_r1,r6
    286     syscall
    287 
    288 :prefix
    289 "P1 = "
    290 :digits
    291 "0123456789"
    292 :newline
    293 "
    294 "
    295 
    296 ## 32 bytes reserved for tranche 3 memory round-trip. The LOAD segment
    297 ## is RWX (see ELF-<arch>.hex2 ph_flags=7) so we can store into this
    298 ## region at runtime. scratch_mid = scratch+16, the base address for
    299 ## the negative-imm12 LD/ST test ([scratch_mid + -8] = [scratch+8]).
    300 :scratch  '0000000000000000' '0000000000000000'
    301 :scratch_mid  '0000000000000000' '0000000000000000'
    302 
    303 :ELF_end
	boot2 Playing with the boostrap
	git clone https://git.ryansepassi.com/git/boot2.git
	Log \| Files \| Refs \| README