backends: wide-immediate fallbacks across all three arches - boot2

commit 0860de4c2197b1bb58350d0cde82491703ce542b
parent 339180395bd3c74be62412953cfb73b442acf17d
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Sun,  3 May 2026 16:28:37 -0700

backends: wide-immediate fallbacks across all three arches

Every P1 backend's small-imm encoders silently truncated when the
immediate (or memory offset) didn't fit the native instruction's
window. The portable-ISA contract is full one-word immediates, so each
backend now picks the small encoding when it fits and transparently
spills into a materialise-then-R-type (or address-staging) sequence
otherwise.

aarch64 (P1-aarch64.M1pp):
- aa64_materialize_imm_any used by p1_logi_ANDI/ORI: MOVZ/MOVN when
  the value (or its complement) fits 16 bits, otherwise the 4-insn
  MOVZ + 3*MOVK chain that p1_li already uses.
- aa64_add_imm_any / aa64_sub_imm_any: third arm materialises and
  emits the R-type ADD/SUB once magnitude exceeds 24 bits.
- aa64_mem_fallback routes through the now-correct _any variants so
  memory accesses past the unscaled-imm9 / scaled-imm12 windows no
  longer truncate the address.

riscv64 (P1-riscv64.M1pp):
- rv_logi_any for ANDI/ORI: native I-type when imm fits the 12-bit
  signed window, else materialise in scratch (t5/x30) + R-type AND/OR.
- rv_ld_any / rv_sd_any / rv_lbu_any / rv_sb_any: address-staging
  fallback when the offset exceeds the I/S-type 12-bit signed window.
  p1_mem_LD/ST/LB/SB and p1_ldarg routed through the _any variants.

amd64 (P1-amd64.M1pp):
- p1_logi_ANDI / p1_logi_ORI / p1_addi each get a third arm so values
  outside the signed-imm32 window (>0x7FFFFFFF or <-0x80000000)
  materialise via p1_li(scratch, imm) + R-type ADD/AND/OR. Without
  this the imm32 form silently sign-extends — e.g. ANDI with
  0xFFFFFFFF would mask with -1 and yield the input unchanged.

tests/P1/wide-imm.P1pp: single behavioural fixture exercising every
wide path (%andi/%ori/%addi past each arch's small-imm window;
%ld/%st/%lb/%sb at offsets 40000 / 5000 past every arch's small-off
window). A trap value pre-stored at offset 0 catches silent address
truncation. Same source builds and runs identically on aarch64 /
riscv64 / amd64; expected stdout is "ABCDEFGH\n".

docs/P1.md: rewrite the immediate-class and memory-offset paragraphs
to reflect the new portable contract (full one-word immediate,
backends prefer the small window for code size); refresh the
toolchain envelope to describe M1pp + hex2++ + catm.

Diffstat:
M P1/P1-aarch64.M1pp  | 51 +++++++++++++++++++++++++++++++++++++++------------
M P1/P1-amd64.M1pp  | 45 +++++++++++++++++++++++++++++++++------------
M P1/P1-riscv64.M1pp  | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
M docs/P1.md  | 41 +++++++++++++++++++++++++++--------------
A tests/P1/wide-imm.P1pp  | 133 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A tests/P1/wide-imm.expected  | 1 +

6 files changed, 320 insertions(+), 46 deletions(-)
diff --git a/P1/P1-aarch64.M1pp b/P1/P1-aarch64.M1pp
@@ -169,22 +169,34 @@
 %((| 0xD1400000 (<< (& imm12 0xFFF) 10) (<< %aa64_reg(ra) 5) %aa64_reg(rd)))
 %endm
 
+# ADD/SUB immediate with arbitrary unsigned magnitude. The native imm12
+# form covers [0, 4095]; the imm12<<12 form (optionally combined with a
+# second imm12 for the low bits) covers [4096, 0xFFFFFF]. Past 24 bits,
+# materialize the constant in scratch and emit the R-type ADD/SUB.
+# Callers must not pass `scratch` as `ra` (the materialize would
+# clobber it before the R-type read).
 %macro aa64_add_imm_any(rd, ra, imm)
 %select((<= imm 4095),
     %aa64_add_imm(rd, ra, imm),
-    %select((= (& imm 0xFFF) 0),
-        %aa64_add_imm_lsl12(rd, ra, (>> imm 12)),
-        %aa64_add_imm_lsl12(rd, ra, (>> imm 12))
-        %aa64_add_imm(rd, rd, (& imm 0xFFF))))
+    %select((<= imm 0xFFFFFF),
+        %select((= (& imm 0xFFF) 0),
+            %aa64_add_imm_lsl12(rd, ra, (>> imm 12)),
+            %aa64_add_imm_lsl12(rd, ra, (>> imm 12))
+            %aa64_add_imm(rd, rd, (& imm 0xFFF))),
+        %p1_li(scratch, imm)
+        %aa64_rrr(0x8B000000, rd, ra, scratch)))
 %endm
 
 %macro aa64_sub_imm_any(rd, ra, imm)
 %select((<= imm 4095),
     %aa64_sub_imm(rd, ra, imm),
-    %select((= (& imm 0xFFF) 0),
-        %aa64_sub_imm_lsl12(rd, ra, (>> imm 12)),
-        %aa64_sub_imm_lsl12(rd, ra, (>> imm 12))
-        %aa64_sub_imm(rd, rd, (& imm 0xFFF))))
+    %select((<= imm 0xFFFFFF),
+        %select((= (& imm 0xFFF) 0),
+            %aa64_sub_imm_lsl12(rd, ra, (>> imm 12)),
+            %aa64_sub_imm_lsl12(rd, ra, (>> imm 12))
+            %aa64_sub_imm(rd, rd, (& imm 0xFFF))),
+        %p1_li(scratch, imm)
+        %aa64_rrr(0xCB000000, rd, ra, scratch)))
 %endm
 
 %macro aa64_mov_rr(dst, src)
@@ -217,6 +229,21 @@
     %aa64_movn(rd, (& (~ imm) 0xFFFF)))
 %endm
 
+# Materialize an arbitrary 64-bit signed immediate into `rd`. Picks the
+# 1-insn MOVZ / MOVN form when the value (or its complement, for
+# negatives) fits 16 bits; otherwise emits the 4-insn MOVZ + 3*MOVK
+# chain used by %p1_li. Used by ANDI/ORI/ADDI fallbacks below to avoid
+# silently truncating to the small-imm window.
+%macro aa64_materialize_imm_any(rd, imm)
+%select((>= imm 0),
+    %select((<= imm 0xFFFF),
+        %aa64_movz(rd, imm),
+        %p1_li(rd, imm)),
+    %select((>= imm -65536),
+        %aa64_movn(rd, (& (~ imm) 0xFFFF)),
+        %p1_li(rd, imm)))
+%endm
+
 %macro aa64_ldst_uimm12(base, rt, rn, off_bytes, size_log2)
 %((| base (<< (>> off_bytes size_log2) 10) (<< %aa64_reg(rn) 5) %aa64_reg(rt)))
 %endm
@@ -278,9 +305,9 @@
 
 %macro aa64_mem_fallback(op, rt, rn, off)
 %select((>= off 0),
-    %aa64_add_imm(scratch, rn, off)
+    %aa64_add_imm_any(scratch, rn, off)
     %aa64_ldst_uimm12(%aa64_mem_uimm_base(op), rt, scratch, 0, %aa64_mem_size(op)),
-    %aa64_sub_imm(scratch, rn, (- 0 off))
+    %aa64_sub_imm_any(scratch, rn, (- 0 off))
     %aa64_ldst_uimm12(%aa64_mem_uimm_base(op), rt, scratch, 0, %aa64_mem_size(op)))
 %endm
 
@@ -425,11 +452,11 @@
 %endm
 
 %macro p1_logi_ANDI(rd, ra, imm)
-%aa64_materialize_small_imm(scratch, imm)
+%aa64_materialize_imm_any(scratch, imm)
 %aa64_rrr(0x8A000000, rd, ra, scratch)
 %endm
 %macro p1_logi_ORI(rd, ra, imm)
-%aa64_materialize_small_imm(scratch, imm)
+%aa64_materialize_imm_any(scratch, imm)
 %aa64_rrr(0xAA000000, rd, ra, scratch)
 %endm
 %macro p1_logi(op, rd, ra, imm)
diff --git a/P1/P1-amd64.M1pp b/P1/P1-amd64.M1pp
@@ -638,29 +638,50 @@ $(imm)
 %select((>= imm -128),
     %select((<= imm 127),
         %amd_alu_ri8(0, rd, imm),
-        %amd_alu_ri32(0, rd, imm)),
-    %amd_alu_ri32(0, rd, imm))
-%endm
-
-# AND/OR with imm: 83 /ext ib sign-extends imm8 to 64 bits. That works for
-# imm in [-128, 127] (and for -1 as a convenient all-ones mask), but breaks
-# for positive imms >= 128 — ANDI with 255 would become AND with
-# 0xFFFFFFFFFFFFFFFF. Widen to the imm32 form when imm8 would misencode.
+        %select((<= imm 2147483647),
+            %amd_alu_ri32(0, rd, imm),
+            %p1_li(scratch, imm)
+            %amd_rrr_ADD(rd, rd, scratch))),
+    %select((>= imm -2147483648),
+        %amd_alu_ri32(0, rd, imm),
+        %p1_li(scratch, imm)
+        %amd_rrr_ADD(rd, rd, scratch)))
+%endm
+
+# AND/OR with imm. Three windows:
+#   imm in [-128, 127]            -> 83 /ext ib   (imm8 sign-extends)
+#   imm in [INT32_MIN, INT32_MAX] -> 81 /ext id   (imm32 sign-extends)
+#   else                          -> materialise imm in scratch, R-type AND/OR.
+# The third arm covers positive imms above 0x7FFFFFFF (e.g. 0xFFFFFFFF
+# or 0xDEADBEEF) where the imm32 sign-extension would silently flip the
+# upper word to all-ones.
 %macro p1_logi_ANDI(rd, ra, imm)
 %amd_mov_rr(rd, ra)
 %select((>= imm -128),
     %select((<= imm 127),
         %amd_alu_ri8(4, rd, imm),
-        %amd_alu_ri32(4, rd, imm)),
-    %amd_alu_ri32(4, rd, imm))
+        %select((<= imm 2147483647),
+            %amd_alu_ri32(4, rd, imm),
+            %p1_li(scratch, imm)
+            %amd_rrr_AND(rd, rd, scratch))),
+    %select((>= imm -2147483648),
+        %amd_alu_ri32(4, rd, imm),
+        %p1_li(scratch, imm)
+        %amd_rrr_AND(rd, rd, scratch)))
 %endm
 %macro p1_logi_ORI(rd, ra, imm)
 %amd_mov_rr(rd, ra)
 %select((>= imm -128),
     %select((<= imm 127),
         %amd_alu_ri8(1, rd, imm),
-        %amd_alu_ri32(1, rd, imm)),
-    %amd_alu_ri32(1, rd, imm))
+        %select((<= imm 2147483647),
+            %amd_alu_ri32(1, rd, imm),
+            %p1_li(scratch, imm)
+            %amd_rrr_OR(rd, rd, scratch))),
+    %select((>= imm -2147483648),
+        %amd_alu_ri32(1, rd, imm),
+        %p1_li(scratch, imm)
+        %amd_rrr_OR(rd, rd, scratch)))
 %endm
 %macro p1_logi(op, rd, ra, imm)
 %p1_logi_##op(rd, ra, imm)
diff --git a/P1/P1-riscv64.M1pp b/P1/P1-riscv64.M1pp
@@ -237,6 +237,68 @@
 %rv_i_type(0x00006003, rd, ra, imm12)
 %endm
 
+# Load/store with arbitrary signed offset. The native I-type/S-type
+# imm12 covers [-2048, 2047]; past that, materialize the offset in
+# scratch (t5/x30), compute scratch = ra + scratch via R-type ADD, and
+# issue the load/store with offset 0. Callers must not pass scratch as
+# `ra` or `rs` — the materialize would clobber it before the address
+# computation reads it.
+%macro rv_ld_any(rd, ra, off)
+%select((>= off -2048),
+    %select((<= off 2047),
+        %rv_ld(rd, ra, off),
+        %rv_lit64_prefix(scratch)
+        $(off)
+        %rv_r_type(0x00000033, scratch, ra, scratch)
+        %rv_ld(rd, scratch, 0)),
+    %rv_lit64_prefix(scratch)
+    $(off)
+    %rv_r_type(0x00000033, scratch, ra, scratch)
+    %rv_ld(rd, scratch, 0))
+%endm
+
+%macro rv_sd_any(rs, ra, off)
+%select((>= off -2048),
+    %select((<= off 2047),
+        %rv_sd(rs, ra, off),
+        %rv_lit64_prefix(scratch)
+        $(off)
+        %rv_r_type(0x00000033, scratch, ra, scratch)
+        %rv_sd(rs, scratch, 0)),
+    %rv_lit64_prefix(scratch)
+    $(off)
+    %rv_r_type(0x00000033, scratch, ra, scratch)
+    %rv_sd(rs, scratch, 0))
+%endm
+
+%macro rv_lbu_any(rd, ra, off)
+%select((>= off -2048),
+    %select((<= off 2047),
+        %rv_lbu(rd, ra, off),
+        %rv_lit64_prefix(scratch)
+        $(off)
+        %rv_r_type(0x00000033, scratch, ra, scratch)
+        %rv_lbu(rd, scratch, 0)),
+    %rv_lit64_prefix(scratch)
+    $(off)
+    %rv_r_type(0x00000033, scratch, ra, scratch)
+    %rv_lbu(rd, scratch, 0))
+%endm
+
+%macro rv_sb_any(rs, ra, off)
+%select((>= off -2048),
+    %select((<= off 2047),
+        %rv_sb(rs, ra, off),
+        %rv_lit64_prefix(scratch)
+        $(off)
+        %rv_r_type(0x00000033, scratch, ra, scratch)
+        %rv_sb(rs, scratch, 0)),
+    %rv_lit64_prefix(scratch)
+    $(off)
+    %rv_r_type(0x00000033, scratch, ra, scratch)
+    %rv_sb(rs, scratch, 0))
+%endm
+
 %macro rv_mov_rr(dst, src)
 %rv_addi(dst, src, 0)
 %endm
@@ -361,11 +423,28 @@ $(imm)
 %rv_addi_any(rd, ra, imm)
 %endm
 
+# Logical-immediate fallback: when imm fits the I-type's 12-bit signed
+# field, emit the native ANDI/ORI; otherwise materialize the immediate
+# in scratch (t5/x30) and use the R-type AND/OR. funct3=7 (AND) or 6
+# (OR) is shared between the I-type (opcode 0x13) and R-type
+# (opcode 0x33) encodings.
+%macro rv_logi_any(rd, ra, imm, base_i, base_r)
+%select((>= imm -2048),
+    %select((<= imm 2047),
+        %rv_i_type(base_i, rd, ra, imm),
+        %rv_lit64_prefix(scratch)
+        $(imm)
+        %rv_r_type(base_r, rd, ra, scratch)),
+    %rv_lit64_prefix(scratch)
+    $(imm)
+    %rv_r_type(base_r, rd, ra, scratch))
+%endm
+
 %macro p1_logi_ANDI(rd, ra, imm)
-%rv_i_type(0x00007013, rd, ra, imm)
+%rv_logi_any(rd, ra, imm, 0x00007013, 0x00007033)
 %endm
 %macro p1_logi_ORI(rd, ra, imm)
-%rv_i_type(0x00006013, rd, ra, imm)
+%rv_logi_any(rd, ra, imm, 0x00006013, 0x00006033)
 %endm
 %macro p1_logi(op, rd, ra, imm)
 %p1_logi_##op(rd, ra, imm)
@@ -385,16 +464,16 @@ $(imm)
 %endm
 
 %macro p1_mem_LD(rt, rn, off)
-%rv_ld(rt, rn, off)
+%rv_ld_any(rt, rn, off)
 %endm
 %macro p1_mem_ST(rt, rn, off)
-%rv_sd(rt, rn, off)
+%rv_sd_any(rt, rn, off)
 %endm
 %macro p1_mem_LB(rt, rn, off)
-%rv_lbu(rt, rn, off)
+%rv_lbu_any(rt, rn, off)
 %endm
 %macro p1_mem_SB(rt, rn, off)
-%rv_sb(rt, rn, off)
+%rv_sb_any(rt, rn, off)
 %endm
 %macro p1_mem(op, rt, rn, off)
 %select((= %rv_is_sp(rn) 1),
@@ -403,8 +482,8 @@ $(imm)
 %endm
 
 %macro p1_ldarg(rd, slot)
-%rv_ld(scratch, sp, 8)
-%rv_ld(rd, scratch, (+ 16 (* 8 slot)))
+%rv_ld(rd, sp, 8)
+%rv_ld_any(rd, rd, (+ 16 (* 8 slot)))
 %endm
 
 %macro p1_b()
diff --git a/docs/P1.md b/docs/P1.md
@@ -18,15 +18,21 @@ portable indirect-result convention described below.
 
 ## Toolchain envelope
 
-P1 must be assemblable through the existing `M0` + `hex2` path, with
-`catm` as the only composition primitive between source or generated fragments.
-The spec therefore assumes only the following toolchain features:
-
-- `M0`-level `DEFINE name hex_bytes` substitution
-- raw byte emission
-- labels and label references supported by `hex2`
+P1 source is assembled by the `M1pp → hex2++` chain, with `catm` as the
+only composition primitive between source or generated fragments. The
+spec therefore assumes only the following toolchain features:
+
+- `M1pp` macro expansion: function-like macros, compile-time integer
+  expressions, and the `!@%$` little-endian hex-emission forms used by
+  the per-arch backends to pack instruction words at expansion time
+- labels, label references, and `.scope` / `.endscope` / `.align` /
+  `.fill` / `.ptrsize` directives supported by `hex2++`
 - file concatenation via `catm`
 
+`hex2++` sees only contiguous bytes; all target-specific encoding
+(register packing, bit-scattered immediates, native branch
+displacements) lives in the per-arch M1pp backend.
+
 ## Source notation
 
 This document describes instructions using ordinary assembly notation such as
@@ -245,8 +251,11 @@ Leaf functions that need no frame-local storage may omit the frame entirely.
 Immediate operands appear only in instructions that explicitly admit them.
 Portable source has three immediate classes:
 
-- **Inline integer immediate** — a signed 12-bit assembly-time constant in the
-  range `-2048..2047`
+- **Inline integer immediate** — any assembly-time signed integer constant
+  that fits one word. Backends prefer the native instruction's small-imm
+  encoding when the value fits its window (e.g. signed 12-bit on
+  `ADDI`/`ANDI`/`ORI` and on memory offsets); larger values fall back to
+  a materialise-then-R-type sequence transparent to portable source.
 - **Materialized word value** — a full one-word assembly-time constant loaded
   with `LI`
 - **Materialized address** — the address of a label loaded with `LA`
@@ -264,10 +273,11 @@ The backend may realize `LI` and `LA` using native immediates, literal pools,
 multi-instruction sequences, or other backend-private mechanisms.
 
 Backends may assume labels fit in 32 bits when realizing `LA` and `LA_BR`.
-This reflects the stage0 image layout (`hex2-0` base `0x00600000`, programs
-well under 4 GB), not a portable-ISA-level guarantee. Backends that target
-images loaded above the 4 GB boundary must adjust their `LA` / `LA_BR`
-lowering. `LI` makes no such assumption — it materializes any one-word value.
+This reflects the current image layout (`hex2++` base `0x00600000`,
+programs well under 4 GB), not a portable-ISA-level guarantee. Backends
+that target images loaded above the 4 GB boundary must adjust their `LA`
+/ `LA_BR` lowering. `LI` makes no such assumption — it materializes any
+one-word value.
 
 ## Control Flow
 
@@ -438,7 +448,10 @@ P1 defines the following memory-access operations:
 `LB` loads one byte and zero-extends it to a full word. `SB` stores the low
 8 bits of the source value.
 
-Memory offsets use signed 12-bit inline immediates.
+Memory offsets are signed inline integer immediates and follow the same
+backend-fallback policy as arithmetic immediates: backends prefer the
+native instruction's small-offset encoding (typically signed 12-bit) and
+transparently spill into address-staging when the offset is wider.
 
 The base address for a memory access may be any exposed general register or
 `sp`.
diff --git a/tests/P1/wide-imm.P1pp b/tests/P1/wide-imm.P1pp
@@ -0,0 +1,133 @@
+# tests/P1/wide-imm.P1pp -- backend wide-immediate behavioural test.
+#
+# Each subtest exercises one P1 op with an immediate or offset that
+# falls outside its target instruction's small-imm window, so the
+# backend's "_any" fallback (materialise + R-type / address-staging)
+# must run for the result to be correct. Result is checked against
+# the expected value; "X" on any mismatch.
+#
+# Coverage map (small-imm window per arch shown for context):
+#                          aarch64                 riscv64
+#   %andi imm window       0..0xFFFF / -0x10000..  -2048..2047
+#   %ori  imm window       (same)                  (same)
+#   %addi imm window       0..0xFFFFFF             -2048..2047
+#   %ld/%st 8B off window  scaled 0..32760 +       -2048..2047
+#                          unscaled -256..255
+#   %lb/%sb 1B off window  unscaled -256..255 +    -2048..2047
+#                          scaled 0..4095
+# amd64 has native disp32, so no offset fallback runs there; its imm32
+# window is still exceeded by subtests A-D, which take the wide-imm arm.
+#
+# Offsets chosen to land outside every arch's window:
+#   - 8-byte LD/ST at +40000 (past aarch64 scaled imm12)
+#   - 1-byte LB/SB at +5000  (past aarch64 scaled imm12)
+#
+# Buffer storage: `:buf` sits just before `:ELF_end`, so &buf is in
+# the BSS region the loader zero-fills past filesz (ph_memsz = 512 MB
+# in the seed ELF header, so 40008 bytes past &buf is safely mapped).
+#
+# Expected stdout: "ABCDEFGH\n".
+
+%fn(p1_main, 0, {
+    # ---- A: %andi(rd, ra, 0xFFFFFFFF) on -1 -> 0xFFFFFFFF ----------------
+    # Without the wide-andi fix: riscv64 truncates 0xFFFFFFFF to 0xFFF and
+    # aarch64 truncates to 0xFFFF, both giving wrong masks.
+    %li(t0, -1)
+    %andi(t0, t0, 0xFFFFFFFF)
+    %li(t1, 0xFFFFFFFF)
+    %bne(t0, t1, &.fail)
+    %la(a0, &c_a) %li(a1, 1) %call(&print)
+
+    # ---- B: %ori(rd, ra, 0xDEADBEEF) on 0 -> 0xDEADBEEF ------------------
+    %li(t0, 0)
+    %ori(t0, t0, 0xDEADBEEF)
+    %li(t1, 0xDEADBEEF)
+    %bne(t0, t1, &.fail)
+    %la(a0, &c_b) %li(a1, 1) %call(&print)
+
+    # ---- C: %addi(rd, ra, 0xFFFFFFFF) on 0 -> 0xFFFFFFFF -----------------
+    # Past aarch64's 24-bit add-imm window, past riscv64's 12-bit window,
+    # and (critically) past amd64's signed-imm32 range: the imm32 form
+    # would sign-extend 0xFFFFFFFF to -1 and silently subtract.
+    %li(t0, 0)
+    %addi(t0, t0, 0xFFFFFFFF)
+    %li(t1, 0xFFFFFFFF)
+    %bne(t0, t1, &.fail)
+    %la(a0, &c_c) %li(a1, 1) %call(&print)
+
+    # ---- D: %addi(rd, ra, -0xFFFFFFFF) on 0xFFFFFFFF -> 0 ----------------
+    # Negative magnitude past every backend's small-imm window. On amd64
+    # the imm32 form would truncate -0xFFFFFFFF (low 32 bits = 0x1)
+    # and add 1 instead of subtracting 0xFFFFFFFF.
+    %li(t0, 0xFFFFFFFF)
+    %addi(t0, t0, -0xFFFFFFFF)
+    %bnez(t0, &.fail)
+    %la(a0, &c_d) %li(a1, 1) %call(&print)
+
+    # Stage a "trap" value at &buf+0 so a wide-offset store/load that
+    # silently masks its offset down to 0 is detected as the trap value
+    # leaking into the wide slot.
+    %la(s0, &buf)
+    %li(t0, 0xDEAD)
+    %st(t0, s0, 0)
+
+    # ---- E/F: %st + %ld at offset 40000 -> roundtrip 0xCAFEBABE ----------
+    # If the wide store silently truncates to offset 0, it overwrites the
+    # 0xDEAD trap (rather than landing at +40000), and the subsequent
+    # offset-0 ld below would read 0xCAFEBABE instead of 0xDEAD. If the
+    # wide load truncates, it reads the 0xDEAD trap instead of 0xCAFEBABE.
+    %li(t0, 0xCAFEBABE)
+    %st(t0, s0, 40000)
+    %ld(t1, s0, 40000)
+    %li(t2, 0xCAFEBABE)
+    %bne(t1, t2, &.fail)
+    %la(a0, &c_e) %li(a1, 1) %call(&print)
+
+    %ld(t1, s0, 0)
+    %li(t2, 0xDEAD)
+    %bne(t1, t2, &.fail)
+    %la(a0, &c_f) %li(a1, 1) %call(&print)
+
+    # Stage a 1-byte trap at &buf+1 (so it doesn't overlap the 8-byte
+    # value already at &buf+0) before the byte-level subtest.
+    %li(t0, 0x99)
+    %sb(t0, s0, 1)
+
+    # ---- G/H: %sb + %lb at offset 5000 -> roundtrip 0x42 -----------------
+    %li(t0, 0x42)
+    %sb(t0, s0, 5000)
+    %lb(t1, s0, 5000)
+    %li(t2, 0x42)
+    %bne(t1, t2, &.fail)
+    %la(a0, &c_g) %li(a1, 1) %call(&print)
+
+    %lb(t1, s0, 1)
+    %li(t2, 0x99)
+    %bne(t1, t2, &.fail)
+    %la(a0, &c_h) %li(a1, 1) %call(&print)
+
+    %la(a0, &c_nl) %li(a1, 1) %call(&print)
+    %li(a0, 0)
+    %b(&.done)
+
+    :.fail
+    %la(a0, &c_x) %li(a1, 1) %call(&print)
+    %la(a0, &c_nl) %li(a1, 1) %call(&print)
+    %li(a0, 1)
+    :.done
+})
+
+:c_a "A"
+:c_b "B"
+:c_c "C"
+:c_d "D"
+:c_e "E"
+:c_f "F"
+:c_g "G"
+:c_h "H"
+:c_x "X"
+:c_nl "
+"
+
+:buf
+:ELF_end
diff --git a/tests/P1/wide-imm.expected b/tests/P1/wide-imm.expected
@@ -0,0 +1 @@
+ABCDEFGH

	boot2 Playing with the bootstrap
	git clone https://git.ryansepassi.com/git/boot2.git
	Log \| Files \| Refs \| README

M	P1/P1-aarch64.M1pp	\|	51	+++++++++++++++++++++++++++++++++++++++------------
M	P1/P1-amd64.M1pp	\|	45	+++++++++++++++++++++++++++++++++------------
M	P1/P1-riscv64.M1pp	\|	95	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
M	docs/P1.md	\|	41	+++++++++++++++++++++++++++--------------
A	tests/P1/wide-imm.P1pp	\|	133	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	tests/P1/wide-imm.expected	\|	1	+