commit 0860de4c2197b1bb58350d0cde82491703ce542b
parent 339180395bd3c74be62412953cfb73b442acf17d
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Sun, 3 May 2026 16:28:37 -0700
backends: wide-immediate fallbacks across all three arches
Every P1 backend's small-imm encoders silently truncated when the
immediate (or memory offset) didn't fit the native instruction's
window. The portable-ISA contract is full one-word immediates, so each
backend now picks the small encoding when it fits and transparently
spills into a materialise-then-R-type (or address-staging) sequence
otherwise.
aarch64 (P1-aarch64.M1pp):
- aa64_materialize_imm_any used by p1_logi_ANDI/ORI: MOVZ/MOVN when
the value (or its complement) fits 16 bits, otherwise the 4-insn
MOVZ + 3*MOVK chain that p1_li already uses.
- aa64_add_imm_any / aa64_sub_imm_any: third arm materialises and
emits the R-type ADD/SUB once magnitude exceeds 24 bits.
- aa64_mem_fallback routes through the now-correct _any variants so
memory accesses past the unscaled-imm9 / scaled-imm12 windows no
longer truncate the address.
riscv64 (P1-riscv64.M1pp):
- rv_logi_any for ANDI/ORI: native I-type when imm fits the 12-bit
signed window, else materialise in scratch (t5/x30) + R-type AND/OR.
- rv_ld_any / rv_sd_any / rv_lbu_any / rv_sb_any: address-staging
fallback when the offset exceeds the I/S-type 12-bit signed window.
p1_mem_LD/ST/LB/SB and p1_ldarg routed through the _any variants.
amd64 (P1-amd64.M1pp):
- p1_logi_ANDI / p1_logi_ORI / p1_addi each get a third arm so values
outside the signed-imm32 window (>0x7FFFFFFF or <-0x80000000)
materialise via p1_li(scratch, imm) + R-type ADD/AND/OR. Without
this the imm32 form silently sign-extends — e.g. ANDI with
0xFFFFFFFF would mask with -1 and yield the input unchanged.
tests/P1/wide-imm.P1pp: single behavioural fixture exercising every
wide path (%andi/%ori/%addi past each arch's small-imm window;
%ld/%st/%lb/%sb at offsets 40000 / 5000 past every arch's small-off
window). A trap value pre-stored at offset 0 catches silent address
truncation. Same source builds and runs identically on aarch64 /
riscv64 / amd64; expected stdout is "ABCDEFGH\n".
docs/P1.md: rewrite the immediate-class and memory-offset paragraphs
to reflect the new portable contract (full one-word immediate,
backends prefer the small window for code size); refresh the
toolchain envelope to describe M1pp + hex2++ + catm.
Diffstat:
6 files changed, 320 insertions(+), 46 deletions(-)
diff --git a/P1/P1-aarch64.M1pp b/P1/P1-aarch64.M1pp
@@ -169,22 +169,34 @@
%((| 0xD1400000 (<< (& imm12 0xFFF) 10) (<< %aa64_reg(ra) 5) %aa64_reg(rd)))
%endm
+# ADD/SUB immediate with arbitrary unsigned magnitude. imm12 covers
+# [0, 4095]; imm12<<12 (plus a second imm12 for the low bits) covers
+# [4096, 0xFFFFFF]. Past 24 bits, materialize into scratch and emit the
+# extended-register ADD/SUB (UXTX #0): unlike the shifted-register form
+# it reads register 31 in Rd/Rn as sp, not xzr, so an sp base works.
+# Callers must not pass `scratch` as `ra` (materialize clobbers it).
%macro aa64_add_imm_any(rd, ra, imm)
%select((<= imm 4095),
%aa64_add_imm(rd, ra, imm),
- %select((= (& imm 0xFFF) 0),
- %aa64_add_imm_lsl12(rd, ra, (>> imm 12)),
- %aa64_add_imm_lsl12(rd, ra, (>> imm 12))
- %aa64_add_imm(rd, rd, (& imm 0xFFF))))
+ %select((<= imm 0xFFFFFF),
+ %select((= (& imm 0xFFF) 0),
+ %aa64_add_imm_lsl12(rd, ra, (>> imm 12)),
+ %aa64_add_imm_lsl12(rd, ra, (>> imm 12))
+ %aa64_add_imm(rd, rd, (& imm 0xFFF))),
+ %p1_li(scratch, imm)
+ %aa64_rrr(0x8B206000, rd, ra, scratch)))
%endm
%macro aa64_sub_imm_any(rd, ra, imm)
%select((<= imm 4095),
%aa64_sub_imm(rd, ra, imm),
- %select((= (& imm 0xFFF) 0),
- %aa64_sub_imm_lsl12(rd, ra, (>> imm 12)),
- %aa64_sub_imm_lsl12(rd, ra, (>> imm 12))
- %aa64_sub_imm(rd, rd, (& imm 0xFFF))))
+ %select((<= imm 0xFFFFFF),
+ %select((= (& imm 0xFFF) 0),
+ %aa64_sub_imm_lsl12(rd, ra, (>> imm 12)),
+ %aa64_sub_imm_lsl12(rd, ra, (>> imm 12))
+ %aa64_sub_imm(rd, rd, (& imm 0xFFF))),
+ %p1_li(scratch, imm)
+ %aa64_rrr(0xCB206000, rd, ra, scratch)))
%endm
%macro aa64_mov_rr(dst, src)
@@ -217,6 +229,21 @@
%aa64_movn(rd, (& (~ imm) 0xFFFF)))
%endm
+# Materialize an arbitrary 64-bit signed immediate into `rd`. Picks the
+# 1-insn MOVZ / MOVN form when the value (or its complement, for
+# negatives) fits 16 bits; otherwise defers to %p1_li (the 4-insn
+# MOVZ + 3*MOVK chain). Used by the ANDI/ORI lowering below so wide
+# masks are no longer truncated to the small-imm window.
+%macro aa64_materialize_imm_any(rd, imm)
+%select((>= imm 0),
+ %select((<= imm 0xFFFF),
+ %aa64_movz(rd, imm),
+ %p1_li(rd, imm)),
+ %select((>= imm -65536),
+ %aa64_movn(rd, (& (~ imm) 0xFFFF)),
+ %p1_li(rd, imm)))
+%endm
+
%macro aa64_ldst_uimm12(base, rt, rn, off_bytes, size_log2)
%((| base (<< (>> off_bytes size_log2) 10) (<< %aa64_reg(rn) 5) %aa64_reg(rt)))
%endm
@@ -278,9 +305,9 @@
%macro aa64_mem_fallback(op, rt, rn, off)
%select((>= off 0),
- %aa64_add_imm(scratch, rn, off)
+ %aa64_add_imm_any(scratch, rn, off)
%aa64_ldst_uimm12(%aa64_mem_uimm_base(op), rt, scratch, 0, %aa64_mem_size(op)),
- %aa64_sub_imm(scratch, rn, (- 0 off))
+ %aa64_sub_imm_any(scratch, rn, (- 0 off))
%aa64_ldst_uimm12(%aa64_mem_uimm_base(op), rt, scratch, 0, %aa64_mem_size(op)))
%endm
@@ -425,11 +452,11 @@
%endm
%macro p1_logi_ANDI(rd, ra, imm)
-%aa64_materialize_small_imm(scratch, imm)
+%aa64_materialize_imm_any(scratch, imm)
%aa64_rrr(0x8A000000, rd, ra, scratch)
%endm
%macro p1_logi_ORI(rd, ra, imm)
-%aa64_materialize_small_imm(scratch, imm)
+%aa64_materialize_imm_any(scratch, imm)
%aa64_rrr(0xAA000000, rd, ra, scratch)
%endm
%macro p1_logi(op, rd, ra, imm)
diff --git a/P1/P1-amd64.M1pp b/P1/P1-amd64.M1pp
@@ -638,29 +638,50 @@ $(imm)
%select((>= imm -128),
%select((<= imm 127),
%amd_alu_ri8(0, rd, imm),
- %amd_alu_ri32(0, rd, imm)),
- %amd_alu_ri32(0, rd, imm))
-%endm
-
-# AND/OR with imm: 83 /ext ib sign-extends imm8 to 64 bits. That works for
-# imm in [-128, 127] (and for -1 as a convenient all-ones mask), but breaks
-# for positive imms >= 128 — ANDI with 255 would become AND with
-# 0xFFFFFFFFFFFFFFFF. Widen to the imm32 form when imm8 would misencode.
+ %select((<= imm 2147483647),
+ %amd_alu_ri32(0, rd, imm),
+ %p1_li(scratch, imm)
+ %amd_rrr_ADD(rd, rd, scratch))),
+ %select((>= imm -2147483648),
+ %amd_alu_ri32(0, rd, imm),
+ %p1_li(scratch, imm)
+ %amd_rrr_ADD(rd, rd, scratch)))
+%endm
+
+# AND/OR with imm. Three windows:
+# imm in [-128, 127] -> 83 /ext ib (imm8 sign-extends)
+# imm in [INT32_MIN, INT32_MAX] -> 81 /ext id (imm32 sign-extends)
+# else -> materialise imm in scratch, R-type AND/OR.
+# The third arm covers positive imms above 0x7FFFFFFF (e.g. 0xFFFFFFFF
+# or 0xDEADBEEF), where imm32 sign-extension would silently flip the
+# upper word to all-ones, and imms below INT32_MIN (not encodable).
%macro p1_logi_ANDI(rd, ra, imm)
%amd_mov_rr(rd, ra)
%select((>= imm -128),
%select((<= imm 127),
%amd_alu_ri8(4, rd, imm),
- %amd_alu_ri32(4, rd, imm)),
- %amd_alu_ri32(4, rd, imm))
+ %select((<= imm 2147483647),
+ %amd_alu_ri32(4, rd, imm),
+ %p1_li(scratch, imm)
+ %amd_rrr_AND(rd, rd, scratch))),
+ %select((>= imm -2147483648),
+ %amd_alu_ri32(4, rd, imm),
+ %p1_li(scratch, imm)
+ %amd_rrr_AND(rd, rd, scratch)))
%endm
%macro p1_logi_ORI(rd, ra, imm)
%amd_mov_rr(rd, ra)
%select((>= imm -128),
%select((<= imm 127),
%amd_alu_ri8(1, rd, imm),
- %amd_alu_ri32(1, rd, imm)),
- %amd_alu_ri32(1, rd, imm))
+ %select((<= imm 2147483647),
+ %amd_alu_ri32(1, rd, imm),
+ %p1_li(scratch, imm)
+ %amd_rrr_OR(rd, rd, scratch))),
+ %select((>= imm -2147483648),
+ %amd_alu_ri32(1, rd, imm),
+ %p1_li(scratch, imm)
+ %amd_rrr_OR(rd, rd, scratch)))
%endm
%macro p1_logi(op, rd, ra, imm)
%p1_logi_##op(rd, ra, imm)
diff --git a/P1/P1-riscv64.M1pp b/P1/P1-riscv64.M1pp
@@ -237,6 +237,68 @@
%rv_i_type(0x00006003, rd, ra, imm12)
%endm
+# Load/store with arbitrary signed offset. The native I-type/S-type
+# imm12 covers [-2048, 2047]; past that, materialize the offset in
+# scratch (t5/x30), compute scratch = ra + scratch via R-type ADD, and
+# issue the load/store with offset 0. Callers must not pass scratch as
+# `ra` or `rs` — the materialize would clobber it before the ADD reads
+# the base or the store reads its source value.
+%macro rv_ld_any(rd, ra, off)
+%select((>= off -2048),
+ %select((<= off 2047),
+ %rv_ld(rd, ra, off),
+ %rv_lit64_prefix(scratch)
+ $(off)
+ %rv_r_type(0x00000033, scratch, ra, scratch)
+ %rv_ld(rd, scratch, 0)),
+ %rv_lit64_prefix(scratch)
+ $(off)
+ %rv_r_type(0x00000033, scratch, ra, scratch)
+ %rv_ld(rd, scratch, 0))
+%endm
+
+%macro rv_sd_any(rs, ra, off)
+%select((>= off -2048),
+ %select((<= off 2047),
+ %rv_sd(rs, ra, off),
+ %rv_lit64_prefix(scratch)
+ $(off)
+ %rv_r_type(0x00000033, scratch, ra, scratch)
+ %rv_sd(rs, scratch, 0)),
+ %rv_lit64_prefix(scratch)
+ $(off)
+ %rv_r_type(0x00000033, scratch, ra, scratch)
+ %rv_sd(rs, scratch, 0))
+%endm
+
+%macro rv_lbu_any(rd, ra, off)
+%select((>= off -2048),
+ %select((<= off 2047),
+ %rv_lbu(rd, ra, off),
+ %rv_lit64_prefix(scratch)
+ $(off)
+ %rv_r_type(0x00000033, scratch, ra, scratch)
+ %rv_lbu(rd, scratch, 0)),
+ %rv_lit64_prefix(scratch)
+ $(off)
+ %rv_r_type(0x00000033, scratch, ra, scratch)
+ %rv_lbu(rd, scratch, 0))
+%endm
+
+%macro rv_sb_any(rs, ra, off)
+%select((>= off -2048),
+ %select((<= off 2047),
+ %rv_sb(rs, ra, off),
+ %rv_lit64_prefix(scratch)
+ $(off)
+ %rv_r_type(0x00000033, scratch, ra, scratch)
+ %rv_sb(rs, scratch, 0)),
+ %rv_lit64_prefix(scratch)
+ $(off)
+ %rv_r_type(0x00000033, scratch, ra, scratch)
+ %rv_sb(rs, scratch, 0))
+%endm
+
%macro rv_mov_rr(dst, src)
%rv_addi(dst, src, 0)
%endm
@@ -361,11 +423,28 @@ $(imm)
%rv_addi_any(rd, ra, imm)
%endm
+# Logical-immediate fallback: when imm fits the I-type's 12-bit signed
+# field, emit the native ANDI/ORI; otherwise materialize the immediate
+# in scratch (t5/x30) and use the R-type AND/OR. funct3=7 (AND) or 6
+# (OR) is shared between the I-type (opcode 0x13) and R-type
+# (opcode 0x33) encodings.
+%macro rv_logi_any(rd, ra, imm, base_i, base_r)
+%select((>= imm -2048),
+ %select((<= imm 2047),
+ %rv_i_type(base_i, rd, ra, imm),
+ %rv_lit64_prefix(scratch)
+ $(imm)
+ %rv_r_type(base_r, rd, ra, scratch)),
+ %rv_lit64_prefix(scratch)
+ $(imm)
+ %rv_r_type(base_r, rd, ra, scratch))
+%endm
+
%macro p1_logi_ANDI(rd, ra, imm)
-%rv_i_type(0x00007013, rd, ra, imm)
+%rv_logi_any(rd, ra, imm, 0x00007013, 0x00007033)
%endm
%macro p1_logi_ORI(rd, ra, imm)
-%rv_i_type(0x00006013, rd, ra, imm)
+%rv_logi_any(rd, ra, imm, 0x00006013, 0x00006033)
%endm
%macro p1_logi(op, rd, ra, imm)
%p1_logi_##op(rd, ra, imm)
@@ -385,16 +464,16 @@ $(imm)
%endm
%macro p1_mem_LD(rt, rn, off)
-%rv_ld(rt, rn, off)
+%rv_ld_any(rt, rn, off)
%endm
%macro p1_mem_ST(rt, rn, off)
-%rv_sd(rt, rn, off)
+%rv_sd_any(rt, rn, off)
%endm
%macro p1_mem_LB(rt, rn, off)
-%rv_lbu(rt, rn, off)
+%rv_lbu_any(rt, rn, off)
%endm
%macro p1_mem_SB(rt, rn, off)
-%rv_sb(rt, rn, off)
+%rv_sb_any(rt, rn, off)
%endm
%macro p1_mem(op, rt, rn, off)
%select((= %rv_is_sp(rn) 1),
@@ -403,8 +482,8 @@ $(imm)
%endm
%macro p1_ldarg(rd, slot)
-%rv_ld(scratch, sp, 8)
-%rv_ld(rd, scratch, (+ 16 (* 8 slot)))
+%rv_ld(rd, sp, 8)
+%rv_ld_any(rd, rd, (+ 16 (* 8 slot)))
%endm
%macro p1_b()
diff --git a/docs/P1.md b/docs/P1.md
@@ -18,15 +18,21 @@ portable indirect-result convention described below.
## Toolchain envelope
-P1 must be assemblable through the existing `M0` + `hex2` path, with
-`catm` as the only composition primitive between source or generated fragments.
-The spec therefore assumes only the following toolchain features:
-
-- `M0`-level `DEFINE name hex_bytes` substitution
-- raw byte emission
-- labels and label references supported by `hex2`
+P1 source is assembled by the `M1pp → hex2++` chain, with `catm` as the
+only composition primitive between source or generated fragments. The
+spec therefore assumes only the following toolchain features:
+
+- `M1pp` macro expansion: function-like macros, compile-time integer
+ expressions, and the `!@%$` little-endian hex-emission forms used by
+ the per-arch backends to pack instruction words at expansion time
+- labels, label references, and `.scope` / `.endscope` / `.align` /
+ `.fill` / `.ptrsize` directives supported by `hex2++`
- file concatenation via `catm`
+`hex2++` sees only contiguous bytes; all target-specific encoding
+(register packing, bit-scattered immediates, native branch
+displacements) lives in the per-arch M1pp backend.
+
## Source notation
This document describes instructions using ordinary assembly notation such as
@@ -245,8 +251,11 @@ Leaf functions that need no frame-local storage may omit the frame entirely.
Immediate operands appear only in instructions that explicitly admit them.
Portable source has three immediate classes:
-- **Inline integer immediate** — a signed 12-bit assembly-time constant in the
- range `-2048..2047`
+- **Inline integer immediate** — any assembly-time signed integer constant
+ that fits one word. Backends prefer the native instruction's small-imm
+ encoding when the value fits its window (e.g. signed 12-bit on
+ `ADDI`/`ANDI`/`ORI` and on memory offsets); larger values fall back to
+ a materialise-then-R-type sequence transparent to portable source.
- **Materialized word value** — a full one-word assembly-time constant loaded
with `LI`
- **Materialized address** — the address of a label loaded with `LA`
@@ -264,10 +273,11 @@ The backend may realize `LI` and `LA` using native immediates, literal pools,
multi-instruction sequences, or other backend-private mechanisms.
Backends may assume labels fit in 32 bits when realizing `LA` and `LA_BR`.
-This reflects the stage0 image layout (`hex2-0` base `0x00600000`, programs
-well under 4 GB), not a portable-ISA-level guarantee. Backends that target
-images loaded above the 4 GB boundary must adjust their `LA` / `LA_BR`
-lowering. `LI` makes no such assumption — it materializes any one-word value.
+This reflects the current image layout (`hex2++` base `0x00600000`,
+programs well under 4 GB), not a portable-ISA-level guarantee. Backends
+that target images loaded above the 4 GB boundary must adjust their `LA`
+/ `LA_BR` lowering. `LI` makes no such assumption — it materializes any
+one-word value.
## Control Flow
@@ -438,7 +448,10 @@ P1 defines the following memory-access operations:
`LB` loads one byte and zero-extends it to a full word. `SB` stores the low
8 bits of the source value.
-Memory offsets use signed 12-bit inline immediates.
+Memory offsets are signed inline integer immediates and follow the same
+backend-fallback policy as arithmetic immediates: backends prefer the
+native instruction's small-offset encoding (typically signed 12-bit) and
+transparently spill into address-staging when the offset is wider.
The base address for a memory access may be any exposed general register or
`sp`.
diff --git a/tests/P1/wide-imm.P1pp b/tests/P1/wide-imm.P1pp
@@ -0,0 +1,133 @@
+# tests/P1/wide-imm.P1pp -- backend wide-immediate behavioural test.
+#
+# Each subtest exercises one P1 op with an immediate or offset that
+# falls outside its target instruction's small-imm window, so the
+# backend's "_any" fallback (materialise + R-type / address-staging)
+# must run for the result to be correct. Result is checked against
+# the expected value; "X" on any mismatch.
+#
+# Coverage map (small-imm window per arch shown for context):
+# aarch64 riscv64
+# %andi imm window 0..0xFFFF / -0x10000.. -2048..2047
+# %ori imm window (same) (same)
+# %addi imm window 0..0xFFFFFF -2048..2047
+# %ld/%st 8B off window scaled 0..32760 + -2048..2047
+# unscaled -256..255
+# %lb/%sb 1B off window unscaled -256..255 + -2048..2047
+# scaled 0..4095
+# amd64: native disp32 covers E-H with no fallback; A-D use values past
+# signed imm32 and so take its materialise-then-R-type arm as well.
+#
+# Offsets chosen to land outside every arch's window:
+# - 8-byte LD/ST at +40000 (past aarch64 scaled imm12)
+# - 1-byte LB/SB at +5000 (past aarch64 scaled imm12, 0..4095)
+#
+# Buffer storage: `:buf` sits just before `:ELF_end`, so &buf is in
+# the BSS region the loader zero-fills past filesz (ph_memsz = 512 MB
+# in the seed ELF header, so 40008 bytes past &buf is safely mapped).
+#
+# Expected stdout: "ABCDEFGH\n".
+
+%fn(p1_main, 0, {
+ # ---- A: %andi(rd, ra, 0xFFFFFFFF) on -1 -> 0xFFFFFFFF ----------------
+ # Without the wide-andi fix: riscv64 truncates 0xFFFFFFFF to 0xFFF and
+ # aarch64 truncates to 0xFFFF, both giving wrong masks.
+ %li(t0, -1)
+ %andi(t0, t0, 0xFFFFFFFF)
+ %li(t1, 0xFFFFFFFF)
+ %bne(t0, t1, &.fail)
+ %la(a0, &c_a) %li(a1, 1) %call(&print)
+
+ # ---- B: %ori(rd, ra, 0xDEADBEEF) on 0 -> 0xDEADBEEF ------------------
+ %li(t0, 0)
+ %ori(t0, t0, 0xDEADBEEF)
+ %li(t1, 0xDEADBEEF)
+ %bne(t0, t1, &.fail)
+ %la(a0, &c_b) %li(a1, 1) %call(&print)
+
+ # ---- C: %addi(rd, ra, 0xFFFFFFFF) on 0 -> 0xFFFFFFFF -----------------
+ # Past aarch64's 24-bit add-imm window, past riscv64's 12-bit window,
+ # and (critically) past amd64's signed-imm32 range: the imm32 form
+ # would sign-extend 0xFFFFFFFF to -1 and silently subtract.
+ %li(t0, 0)
+ %addi(t0, t0, 0xFFFFFFFF)
+ %li(t1, 0xFFFFFFFF)
+ %bne(t0, t1, &.fail)
+ %la(a0, &c_c) %li(a1, 1) %call(&print)
+
+ # ---- D: %addi(rd, ra, -0xFFFFFFFF) on 0xFFFFFFFF -> 0 ----------------
+ # Negative magnitude past every backend's small-imm window. On amd64
+ # the imm32 form would truncate -0xFFFFFFFF (low 32 bits = 0x1)
+ # and add 1 instead of subtracting 0xFFFFFFFF.
+ %li(t0, 0xFFFFFFFF)
+ %addi(t0, t0, -0xFFFFFFFF)
+ %bnez(t0, &.fail)
+ %la(a0, &c_d) %li(a1, 1) %call(&print)
+
+ # Stage a "trap" value at &buf+0 so a wide-offset store/load that
+ # silently masks its offset down to 0 is detected as the trap value
+ # leaking into the wide slot.
+ %la(s0, &buf)
+ %li(t0, 0xDEAD)
+ %st(t0, s0, 0)
+
+ # ---- E/F: %st + %ld at offset 40000 -> roundtrip 0xCAFEBABE ----------
+ # If the wide store silently truncates to offset 0, it overwrites the
+ # 0xDEAD trap (rather than landing at +40000), and the subsequent
+ # offset-0 ld below would read 0xCAFEBABE instead of 0xDEAD. If the
+ # wide load truncates, it reads the 0xDEAD trap instead of 0xCAFEBABE.
+ %li(t0, 0xCAFEBABE)
+ %st(t0, s0, 40000)
+ %ld(t1, s0, 40000)
+ %li(t2, 0xCAFEBABE)
+ %bne(t1, t2, &.fail)
+ %la(a0, &c_e) %li(a1, 1) %call(&print)
+
+ %ld(t1, s0, 0)
+ %li(t2, 0xDEAD)
+ %bne(t1, t2, &.fail)
+ %la(a0, &c_f) %li(a1, 1) %call(&print)
+
+ # Stage a 1-byte trap at &buf+1 (so it doesn't overlap the 8-byte
+ # value already at &buf+0) before the byte-level subtest.
+ %li(t0, 0x99)
+ %sb(t0, s0, 1)
+
+ # ---- G/H: %sb + %lb at offset 5000 -> roundtrip 0x42 -----------------
+ %li(t0, 0x42)
+ %sb(t0, s0, 5000)
+ %lb(t1, s0, 5000)
+ %li(t2, 0x42)
+ %bne(t1, t2, &.fail)
+ %la(a0, &c_g) %li(a1, 1) %call(&print)
+
+ %lb(t1, s0, 1)
+ %li(t2, 0x99)
+ %bne(t1, t2, &.fail)
+ %la(a0, &c_h) %li(a1, 1) %call(&print)
+
+ %la(a0, &c_nl) %li(a1, 1) %call(&print)
+ %li(a0, 0)
+ %b(&.done)
+
+ :.fail
+ %la(a0, &c_x) %li(a1, 1) %call(&print)
+ %la(a0, &c_nl) %li(a1, 1) %call(&print)
+ %li(a0, 1)
+ :.done
+})
+
+:c_a "A"
+:c_b "B"
+:c_c "C"
+:c_d "D"
+:c_e "E"
+:c_f "F"
+:c_g "G"
+:c_h "H"
+:c_x "X"
+:c_nl "
+"
+
+:buf
+:ELF_end
diff --git a/tests/P1/wide-imm.expected b/tests/P1/wide-imm.expected
@@ -0,0 +1 @@
+ABCDEFGH