cg+aa64: use immediate operand forms for shifts/and/cmp at -O0 - kit

commit dc1854de22635fd2f86aabe8947012f22b496d58
parent 27f088297163bc712f339f7ee942cceb56b0ae57
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Tue,  2 Jun 2026 16:14:50 -0700

cg+aa64: use immediate operand forms for shifts/and/cmp at -O0

Piece A — shared NativeDirectTarget layer. nd_binop/nd_cmp/nd_cmp_branch
materialized every RHS into a register, so the backend never saw the
target-legal immediates its binop/cmp hooks already accept. Add
nd_rhs_imm_or_reg, mirroring the optimizer's operand_imm_or_reg: keep a
constant RHS as an immediate when native->imm_legal accepts it for the op,
else materialize. Gated on imm_legal != NULL, so a recording mock (or a
backend without the hook) keeps the register path.

This closes the -O0/-O1 gap on x64 (shl/shr/and $imm, add/sub/or/xor imm,
cmp $imm) and rv64 (slli/srli/andi, addi, ...) with no backend changes:
those backends already encode the immediates (exercised by the optimizer).

Piece B — aa64 backend. aa_imm_legal now accepts SHL/SHR shift counts and
AND/ORR/EOR bitmask immediates; aa_binop encodes them via the existing
aa_lsl_imm/aa_lsr_imm/aa_asr_imm (UBFM/SBFM) and aa64_and_imm/orr_imm/eor_imm
helpers. So `x*8`/`x u/8`/`x u%8` become `lsl/lsr/and #imm` instead of
materializing the count/mask into a scratch register.

Disassembler. The aa64 disassembler couldn't render what the above now emits:
the logical-immediate family (AND/ORR/EOR/ANDS #bitmask) printed `.inst`, and
SBFM/UBFM printed raw instead of their shift aliases. Add the logical-immediate
format + decoder (aa64_logimm_decode, the ARM DecodeBitMasks inverse) and the
lsl/lsr/asr aliases via a shared aa64_bitfield_shift_alias helper used by both
the mnemonic writer and the operand printer. Non-aliasing SBFM/UBFM and the
other bitfield aliases (ubfx/sxtb/...) still print raw.

strength_reduce_test now also asserts the immediate operand form per arch
(x64 "$imm", aa64 "#imm", rv64 i-suffixed mnemonic), not just that the
multiply/divide became a shift. 39/39 across x86_64/aarch64/riscv64.

Diffstat:
M src/arch/aa64/disasm.c  | 10 ++++++++++
M src/arch/aa64/isa.c  | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/arch/aa64/isa.h  | 7 +++++++
M src/arch/aa64/native.c  | 38 ++++++++++++++++++++++++++++++++++++++
M src/cg/native_direct_target.c  | 19 ++++++++++++++++---
M test/cg/strength_reduce_test.c  | 185 +++++++++++++++++++++++++++++++++++++++++++++++--------------------------------

6 files changed, 273 insertions(+), 77 deletions(-)
diff --git a/src/arch/aa64/disasm.c b/src/arch/aa64/disasm.c
@@ -51,6 +51,16 @@ static void aa64_write_mnemonic(AA64Disasm* d, const AA64InsnDesc* desc,
     strbuf_puts(&d->mnem, aa64_cond_names[cond]);
     return;
   }
+  if (desc->fmt == AA64_FMT_BITFIELD) {
+    /* SBFM/UBFM disassemble to their lsl/lsr/asr shift aliases when the
+     * fields match; aa64_print_operands renders the matching operands. */
+    u32 shift;
+    const char* alias = aa64_bitfield_shift_alias(word, &shift);
+    if (alias) {
+      strbuf_puts(&d->mnem, alias);
+      return;
+    }
+  }
   strbuf_put_slice(&d->mnem, desc->mnemonic);
 }
 
diff --git a/src/arch/aa64/isa.c b/src/arch/aa64/isa.c
@@ -448,6 +448,14 @@ const AA64InsnDesc aa64_insn_table[] = {
     {MN("sbfm"), 0x13000000u, 0x7F800000u, AA64_FMT_BITFIELD, 0, {0, 0}},
     {MN("ubfm"), 0x53000000u, 0x7F800000u, AA64_FMT_BITFIELD, 0, {0, 0}},
 
+    /* ----- Logical, immediate (AND / ORR / EOR / ANDS, bitmask form) -----
+     * sf (bit31) and the N:immr:imms bitmask fields free; opc (bits30:29)
+     * selects the operation. Family bits[28:23]=100100. */
+    {MN("and"), 0x12000000u, 0x7F800000u, AA64_FMT_LOG_IMM, 0, {0, 0}},
+    {MN("orr"), 0x32000000u, 0x7F800000u, AA64_FMT_LOG_IMM, 0, {0, 0}},
+    {MN("eor"), 0x52000000u, 0x7F800000u, AA64_FMT_LOG_IMM, 0, {0, 0}},
+    {MN("ands"), 0x72000000u, 0x7F800000u, AA64_FMT_LOG_IMM, 0, {0, 0}},
+
     /* ----- Load/store, register offset [Xn, Xm{, LSL #s}] (V=0) -----
      * Family bits[29:24]=111000, bit21=1, bits[11:10]=10; per-size opc rows. */
     {MN("strb"), 0x38200800u, 0xFFE00C00u, AA64_FMT_LDST_REGOFF, 0, {0, 0}},
@@ -1038,8 +1046,47 @@ static void print_dp1(StrBuf* sb, u32 w) {
 }
 
 /* SBFM/UBFM: Rd, Rn, #immr, #imms. */
+/* SBFM/UBFM shift aliases. UBFM is LSR when imms is the top bit index and LSL
+ * when immr == imms+1 (the encoder's `lsl #s` form); SBFM is ASR when imms is
+ * the top bit index. The other bitfield aliases (UBFX/SBFX/UXTB/SBFIZ/...) are
+ * left as the raw mnemonic. */
+const char* aa64_bitfield_shift_alias(u32 word, u32* shift) {
+  u32 sf = (word >> 31) & 1u;
+  u32 opc = (word >> 29) & 3u; /* 0 = SBFM, 2 = UBFM */
+  u32 immr = (word >> 16) & 0x3fu;
+  u32 imms = (word >> 10) & 0x3fu;
+  u32 top = sf ? 63u : 31u;
+  if (opc == 2u) { /* UBFM */
+    if (imms == top) {
+      *shift = immr;
+      return "lsr";
+    }
+    if (immr == imms + 1u) {
+      *shift = top - imms;
+      return "lsl";
+    }
+  } else if (opc == 0u) { /* SBFM */
+    if (imms == top) {
+      *shift = immr;
+      return "asr";
+    }
+  }
+  return NULL;
+}
+
 static void print_bitfield(StrBuf* sb, u32 w) {
   int sf = (int)((w >> 31) & 1u);
+  u32 shift;
+  if (aa64_bitfield_shift_alias(w, &shift)) {
+    /* lsl/lsr/asr Rd, Rn, #shift (mnemonic chosen by the disasm mnemonic
+     * writer via the same helper). */
+    emit_reg(sb, w & 0x1fu, sf, 0);
+    strbuf_puts(sb, ", ");
+    emit_reg(sb, (w >> 5) & 0x1fu, sf, 0);
+    strbuf_puts(sb, ", #");
+    strbuf_put_u64(sb, (u64)shift);
+    return;
+  }
   emit_reg(sb, w & 0x1fu, sf, 0);
   strbuf_puts(sb, ", ");
   emit_reg(sb, (w >> 5) & 0x1fu, sf, 0);
@@ -1049,6 +1096,47 @@ static void print_bitfield(StrBuf* sb, u32 w) {
   strbuf_put_u64(sb, (u64)((w >> 10) & 0x3fu));
 }
 
+/* Decode an AArch64 logical-immediate (N:immr:imms) bitmask to its value, per
+ * the ARM ARM DecodeBitMasks. Inverse of aa64_logimm_encode (isa.h). */
+static u64 aa64_logimm_decode(u32 N, u32 immr, u32 imms, int sf) {
+  u32 combined = (N << 6) | ((~imms) & 0x3fu);
+  int len = -1;
+  u32 esize, levels, S, R, i;
+  u64 welem, elem, elt_mask, result;
+  for (int b = 6; b >= 0; --b) {
+    if (combined & (1u << b)) {
+      len = b;
+      break;
+    }
+  }
+  if (len < 1) return 0; /* reserved encoding */
+  esize = 1u << (u32)len;
+  levels = esize - 1u;
+  S = imms & levels;
+  R = immr & levels;
+  welem = (S + 1u >= 64u) ? ~(u64)0 : (((u64)1 << (S + 1u)) - 1u);
+  elt_mask = (esize >= 64u) ? ~(u64)0 : (((u64)1 << esize) - 1u);
+  elem =
+      (R == 0u) ? welem : (((welem >> R) | (welem << (esize - R))) & elt_mask);
+  result = 0;
+  for (i = 0; i < 64u; i += esize) result |= elem << i;
+  return sf ? result : (result & 0xFFFFFFFFu);
+}
+
+static void print_logimm(StrBuf* sb, u32 w) {
+  int sf = (int)((w >> 31) & 1u);
+  u32 opc = (w >> 29) & 3u; /* 0=AND 1=ORR 2=EOR 3=ANDS */
+  u32 N = (w >> 22) & 1u;
+  u32 immr = (w >> 16) & 0x3fu;
+  u32 imms = (w >> 10) & 0x3fu;
+  /* AND/ORR/EOR use the SP-or-GP destination; ANDS uses ZR. Rn is always GP. */
+  emit_reg(sb, w & 0x1fu, sf, opc != 3u);
+  strbuf_puts(sb, ", ");
+  emit_reg(sb, (w >> 5) & 0x1fu, sf, 0);
+  strbuf_puts(sb, ", #");
+  strbuf_put_hex_u64(sb, aa64_logimm_decode(N, immr, imms, sf));
+}
+
 /* Register-offset load/store: Rt, [Xn, Xm{, LSL #s}]. Rt width follows the
  * size field (X for 64-bit accesses, W otherwise); the index extend is LSL
  * for option=011 (UXTX), with shift amount S ? size : 0. */
@@ -1154,6 +1242,9 @@ void aa64_print_operands(StrBuf* sb, const AA64InsnDesc* desc, u32 word,
     case AA64_FMT_BITFIELD:
       print_bitfield(sb, word);
       break;
+    case AA64_FMT_LOG_IMM:
+      print_logimm(sb, word);
+      break;
     case AA64_FMT_LDST_REGOFF:
       print_ldst_regoff(sb, word);
       break;
diff --git a/src/arch/aa64/isa.h b/src/arch/aa64/isa.h
@@ -68,6 +68,7 @@ typedef enum AA64Format {
                          * (SCVTF/UCVTF/FCVTZS/FCVTZU/FMOV) */
   AA64_FMT_LDST_EXCL,   /* load/store exclusive + acquire/release ordered
                          * (LDXR/LDAXR/STXR/STLXR/LDAR/STLR + b/h) */
+  AA64_FMT_LOG_IMM,     /* logical, immediate (AND/ORR/EOR/ANDS #bitmask) */
 } AA64Format;
 
 /* ---- AsmFlags column on AA64InsnDesc ----
@@ -1543,6 +1544,12 @@ struct AA64AsmTok; /* opaque, defined by the phase-3 asm parser */
 void aa64_print_operands(StrBuf* sb, const AA64InsnDesc* desc, u32 word,
                          u64 vaddr);
 
+/* If `word` is an SBFM/UBFM that has a preferred shift-alias disassembly,
+ * return its mnemonic ("lsl"/"lsr"/"asr") and write the shift amount to
+ * *shift; return NULL otherwise. Shared by the disassembler's mnemonic and
+ * operand printers so the alias decision lives in one place. */
+const char* aa64_bitfield_shift_alias(u32 word, u32* shift);
+
 /* Returns 1 on success, 0 on parse error.  Phase 2 stub returns 0 for
  * every format; phase 3 fills in the bodies. */
 int aa64_parse_operands(struct AA64AsmTok* tok, const AA64InsnDesc* desc,
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -903,6 +903,18 @@ static int aa_imm_legal(NativeTarget* t, NativeImmUse use, u32 op,
         u32 sf = type_size32(t, type) == 8u ? 1u : 0u;
         return aa64_imul_strength_reducible(sf, imm);
       }
+      /* LSL/LSR/ASR #imm via the UBFM/SBFM aliases: shift count in range. */
+      if ((BinOp)op == BO_SHL || (BinOp)op == BO_SHR_S ||
+          (BinOp)op == BO_SHR_U) {
+        u32 bits = type_size32(t, type) == 8u ? 64u : 32u;
+        return imm >= 0 && (u64)imm < (u64)bits;
+      }
+      /* AND/ORR/EOR #bitmask: encodable as an AArch64 logical immediate. */
+      if ((BinOp)op == BO_AND || (BinOp)op == BO_OR || (BinOp)op == BO_XOR) {
+        u32 sf = type_size32(t, type) == 8u ? 1u : 0u;
+        u32 N, immr, imms;
+        return aa64_logimm_encode((u64)imm, sf, &N, &immr, &imms);
+      }
       return 0;
     case NATIVE_IMM_CMP:
       /* cmp lowers to subs #imm12; cmn (negative) is not wired, so require a
@@ -2023,6 +2035,8 @@ static void aa_set_bytes(NativeTarget* t, NativeAddr dst, NativeLoc byte_value,
 }
 
 static void aa_lsl_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh);
+static void aa_lsr_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh);
+static void aa_asr_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh);
 
 /* Strength-reduce `mul rd, rn, #imm` for the constants accepted by
  * aa64_imul_strength_reducible into a single non-mul instruction. Callers
@@ -2142,6 +2156,30 @@ static void aa_binop(NativeTarget* t, BinOp op, NativeLoc dst, NativeLoc lhs,
     aa_emit_mul_const_imm(t, sf, rd, rn, rhs.v.imm);
     return;
   }
+  if (rhs.kind == NATIVE_LOC_IMM &&
+      (op == BO_SHL || op == BO_SHR_U || op == BO_SHR_S)) {
+    u32 shamt = (u32)rhs.v.imm; /* imm_legal guarantees 0 <= imm < datasize */
+    if (op == BO_SHL)
+      aa_lsl_imm(t, sf, rd, rn, shamt);
+    else if (op == BO_SHR_U)
+      aa_lsr_imm(t, sf, rd, rn, shamt);
+    else
+      aa_asr_imm(t, sf, rd, rn, shamt);
+    return;
+  }
+  if (rhs.kind == NATIVE_LOC_IMM &&
+      (op == BO_AND || op == BO_OR || op == BO_XOR)) {
+    u32 N, immr, imms;
+    if (!aa64_logimm_encode((u64)rhs.v.imm, sf, &N, &immr, &imms))
+      aa_panic(aa_of(t), "logical immediate not encodable");
+    if (op == BO_AND)
+      aa_emit32(t->mc, aa64_and_imm(sf, rd, rn, N, immr, imms));
+    else if (op == BO_OR)
+      aa_emit32(t->mc, aa64_orr_imm(sf, rd, rn, N, immr, imms));
+    else
+      aa_emit32(t->mc, aa64_eor_imm(sf, rd, rn, N, immr, imms));
+    return;
+  }
   switch (op) {
     case BO_IADD:
       aa_emit32(t->mc, aa64_add(sf, rd, rn, rm));
diff --git a/src/cg/native_direct_target.c b/src/cg/native_direct_target.c
@@ -832,6 +832,19 @@ static NativeLoc nd_dst_scratch(NativeDirectTarget* d, Operand dst) {
   return nd_reg_loc(r, cls, dst.type);
 }
 
+/* Arithmetic/compare RHS: keep a constant operand as an immediate when the
+ * target can encode it for `use` (so no scratch register is spent
+ * materializing it), mirroring the optimizer's operand_imm_or_reg. Falls back
+ * to a register when there is no imm_legal hook (e.g. a recording mock target)
+ * or the constant is not target-legal for this op. */
+static NativeLoc nd_rhs_imm_or_reg(NativeDirectTarget* d, NativeImmUse use,
+                                   u32 sub, Operand b) {
+  if (b.kind == OPK_IMM && d->native->imm_legal &&
+      d->native->imm_legal(d->native, use, sub, b.type, b.v.imm))
+    return nd_loc_imm(b.v.imm, b.type);
+  return nd_materialize_operand(d, b);
+}
+
 /* Register a pure-compute op writes its result into. For a cacheable local that
  * is the local's cache register (reused or freshly allocated), pinned for the
  * instruction; nd_dst_writeback then marks it dirty without storing. Otherwise
@@ -1012,7 +1025,7 @@ static void nd_cmp_branch(CgTarget* t, CmpOp op, Operand a, Operand b,
   NativeLoc ar, br;
   nd_flush_all(d);
   ar = nd_materialize_operand(d, a);
-  br = nd_materialize_operand(d, b);
+  br = nd_rhs_imm_or_reg(d, NATIVE_IMM_CMP, (u32)op, b);
   ND_REQUIRE_NATIVE(d, cmp_branch, "target does not emit compare branches");
   d->native->cmp_branch(d->native, op, ar, br, nd_mc_label(d, label));
   nd_release_materialized(d, br);
@@ -1431,7 +1444,7 @@ static void nd_bitfield_store(CgTarget* t, Operand record_addr, Operand src,
 static void nd_binop(CgTarget* t, BinOp op, Operand dst, Operand a, Operand b) {
   NativeDirectTarget* d = nd_of(t);
   NativeLoc ar = nd_materialize_operand(d, a);
-  NativeLoc br = nd_materialize_operand(d, b);
+  NativeLoc br = nd_rhs_imm_or_reg(d, NATIVE_IMM_BINOP, (u32)op, b);
   NativeLoc dr = nd_dst_reg(d, dst);
   ND_REQUIRE_NATIVE(d, binop, "target does not emit binary ops");
   d->native->binop(d->native, op, dr, ar, br);
@@ -1453,7 +1466,7 @@ static void nd_unop(CgTarget* t, UnOp op, Operand dst, Operand a) {
 static void nd_cmp(CgTarget* t, CmpOp op, Operand dst, Operand a, Operand b) {
   NativeDirectTarget* d = nd_of(t);
   NativeLoc ar = nd_materialize_operand(d, a);
-  NativeLoc br = nd_materialize_operand(d, b);
+  NativeLoc br = nd_rhs_imm_or_reg(d, NATIVE_IMM_CMP, (u32)op, b);
   NativeLoc dr = nd_dst_reg(d, dst);
   ND_REQUIRE_NATIVE(d, cmp, "target does not emit compares");
   d->native->cmp(d->native, op, dr, ar, br);
diff --git a/test/cg/strength_reduce_test.c b/test/cg/strength_reduce_test.c
@@ -1,15 +1,21 @@
 /* strength_reduce_test — asserts the -O0 semantic peephole (src/cg/fold.c)
  * turns multiply / unsigned-divide / unsigned-remainder by a power-of-two
- * constant into a shift / and.
+ * constant into a shift / and, AND that the shift/and is emitted in its
+ * immediate-operand form (no scratch register spent materializing the count).
  *
  * Each case builds a tiny
  *     i64 f(i64 x) { return x <op> <imm>; }
- * through the public CG API, emits an ELF object, disassembles its .text, and
- * inspects the instruction mnemonics. The assertions are deliberately on
- * substrings ("mul" / "div" / "rem") rather than exact shift mnemonics, so the
- * test is robust to per-arch shift aliasing (lsl/ubfm, slli, shl, ...): a
- * power-of-two operand must make the multiply/divide instruction disappear, and
- * a non-power-of-two operand (and signed division, left to -O1+) must keep it.
+ * through the public CG API, emits an ELF object, and disassembles its .text.
+ *
+ * Two layers of assertion:
+ *   1. The multiply/divide/remainder instruction disappears for power-of-two
+ *      operands and survives for non-power-of-two (and for signed division,
+ *      which is deliberately left to -O1+). Checked on mnemonic substrings
+ *      ("mul"/"div"/"rem"), robust to per-arch aliasing.
+ *   2. The resulting shift/and carries an immediate operand (x64 "$imm",
+ *      aa64 "#imm", rv64 i-suffixed mnemonic) rather than a materialized
+ *      register — the NativeDirectTarget imm_legal pass-through (Piece A) plus
+ *      the aa64 immediate shift/bitmask encodings (Piece B).
  *
  * Run by: make test-cg-api
  */
@@ -27,13 +33,9 @@
 static KitUnit g_u;
 #define EXPECT(cond, ...) CU_EXPECT(&g_u, cond, __VA_ARGS__)
 
-typedef struct BinFnSpec {
+typedef struct EmitCtx {
   KitCgIntBinOp op;
   int64_t imm;
-} BinFnSpec;
-
-typedef struct EmitCtx {
-  BinFnSpec spec;
   KitObjBuilder* ob;
 } EmitCtx;
 
@@ -92,8 +94,8 @@ static KitStatus emit_binop_fn(KitCompiler* c, void* user) {
   kit_cg_push_local(cg, param);
   kit_cg_load(cg, (KitCgMemAccess){.type = i64_ty,
                                    .align = kit_cg_type_align(c, i64_ty)});
-  kit_cg_push_int(cg, (uint64_t)ctx->spec.imm, i64_ty);
-  kit_cg_int_binop(cg, ctx->spec.op, KIT_CG_INTOP_NONE);
+  kit_cg_push_int(cg, (uint64_t)ctx->imm, i64_ty);
+  kit_cg_int_binop(cg, ctx->op, KIT_CG_INTOP_NONE);
   kit_cg_ret(cg);
   kit_cg_func_end(cg);
 
@@ -102,10 +104,21 @@ static KitStatus emit_binop_fn(KitCompiler* c, void* user) {
   return KIT_OK;
 }
 
-/* Emit the op, disassemble .text, and report whether any mnemonic contains
- * `needle`. Returns 1 if present, 0 if absent, -1 on harness failure. */
-static int op_text_has_mnem(KitArchKind arch, KitCgIntBinOp op, int64_t imm,
-                            const char* needle) {
+typedef struct DisInsn {
+  char mnem[16];
+  char ops[64];
+} DisInsn;
+
+static void slice_to_buf(KitSlice s, char* buf, size_t cap) {
+  size_t n = s.len < cap - 1 ? s.len : cap - 1;
+  if (s.s && n) memcpy(buf, s.s, n);
+  buf[n] = '\0';
+}
+
+/* Build the op and disassemble its .text into `out` (up to `cap` insns).
+ * Returns the instruction count, or -1 on harness failure. */
+static int op_disasm(KitArchKind arch, KitCgIntBinOp op, int64_t imm,
+                     DisInsn* out, int cap) {
   KitTarget target = kit_unit_target(arch, KIT_OS_LINUX, KIT_OBJ_ELF);
   KitCompiler* c = NULL;
   EmitCtx ctx;
@@ -119,12 +132,11 @@ static int op_text_has_mnem(KitArchKind arch, KitCgIntBinOp op, int64_t imm,
   KitDisasmIter* it = NULL;
   KitInsn insn;
   int result = -1;
-  int found = 0;
-  size_t nlen = strlen(needle);
+  int n = 0;
 
   memset(&ctx, 0, sizeof ctx);
-  ctx.spec.op = op;
-  ctx.spec.imm = imm;
+  ctx.op = op;
+  ctx.imm = imm;
 
   if (kit_compiler_new(target, &g_u.ctx, &c) != KIT_OK || !c) return -1;
   if (kit_frontend_run(c, emit_binop_fn, &ctx) != KIT_OK) goto done;
@@ -132,11 +144,9 @@ static int op_text_has_mnem(KitArchKind arch, KitCgIntBinOp op, int64_t imm,
   if (kit_obj_builder_emit(ctx.ob, writer) != KIT_OK) goto done;
   bytes.data = kit_writer_mem_bytes(writer, &len);
   bytes.len = len;
-  if (kit_obj_open(&g_u.ctx, KIT_SLICE_LIT("<sr-test>"), &bytes, &file) !=
-      KIT_OK)
+  if (kit_obj_open(&g_u.ctx, KIT_SLICE_LIT("<sr-test>"), &bytes, &file) != KIT_OK)
     goto done;
-  if (kit_obj_section_by_name(file, KIT_SLICE_LIT(".text"), &text_sec) !=
-      KIT_OK)
+  if (kit_obj_section_by_name(file, KIT_SLICE_LIT(".text"), &text_sec) != KIT_OK)
     goto done;
   if (kit_obj_section_data(file, text_sec, &data, &len) != KIT_OK) goto done;
 
@@ -145,20 +155,12 @@ static int op_text_has_mnem(KitArchKind arch, KitCgIntBinOp op, int64_t imm,
   dc.context = g_u.ctx;
   if (kit_disasm_iter_new(&dc, data, len, 0, file, &it) != KIT_OK || !it)
     goto done;
-  while (kit_disasm_iter_next(it, &insn) == KIT_ITER_ITEM) {
-    const char* m = insn.mnemonic.s;
-    size_t mlen = insn.mnemonic.len;
-    if (m && mlen >= nlen) {
-      for (size_t i = 0; i + nlen <= mlen; ++i) {
-        if (memcmp(m + i, needle, nlen) == 0) {
-          found = 1;
-          break;
-        }
-      }
-    }
-    if (found) break;
+  while (n < cap && kit_disasm_iter_next(it, &insn) == KIT_ITER_ITEM) {
+    slice_to_buf(insn.mnemonic, out[n].mnem, sizeof out[n].mnem);
+    slice_to_buf(insn.operands, out[n].ops, sizeof out[n].ops);
+    ++n;
   }
-  result = found;
+  result = n;
 
 done:
   if (it) kit_disasm_iter_free(it);
@@ -169,57 +171,92 @@ done:
   return result;
 }
 
-static const char* arch_name(KitArchKind arch) {
-  switch (arch) {
-    case KIT_ARCH_X86_64:
-      return "x86_64";
-    case KIT_ARCH_ARM_64:
-      return "aarch64";
-    case KIT_ARCH_RV64:
-      return "riscv64";
-    default:
-      return "arch";
-  }
+/* 1 if any instruction's mnemonic contains `mnem`. */
+static int have_mnem(const DisInsn* in, int n, const char* mnem) {
+  for (int i = 0; i < n; ++i)
+    if (strstr(in[i].mnem, mnem)) return 1;
+  return 0;
+}
+
+/* 1 if the first instruction whose mnemonic contains `mnem` has `marker` in its
+ * operands (an empty marker matches any). This is how we tell an immediate
+ * operand form (x64 "$", aa64 "#") from a register form, and — via the
+ * i-suffixed RISC-V mnemonics — picks out slli/srli/andi over sll/srl/and. */
+static int imm_form(const DisInsn* in, int n, const char* mnem,
+                    const char* marker) {
+  for (int i = 0; i < n; ++i)
+    if (strstr(in[i].mnem, mnem)) return strstr(in[i].ops, marker) != NULL;
+  return 0;
 }
 
-static void check_arch(KitArchKind arch) {
-  const char* an = arch_name(arch);
+typedef struct ArchExpect {
+  KitArchKind arch;
+  const char* name;
+  const char* shl_mnem; /* immediate shift-left mnemonic (x*2^k) */
+  const char* shr_mnem; /* immediate logical shift-right (x u/ 2^k) */
+  const char* and_mnem; /* immediate bitwise-and (x u% 2^k) */
+  const char* mark;     /* operand marker for an immediate ("" if in mnemonic) */
+} ArchExpect;
+
+static void check_arch(const ArchExpect* ex) {
+  const char* an = ex->name;
+  DisInsn buf[128];
+  int n;
 
-  /* x * 8  -> shift: no multiply instruction survives. */
-  EXPECT(op_text_has_mnem(arch, KIT_CG_INT_MUL, 8, "mul") == 0,
+  /* --- multiply --- */
+  n = op_disasm(ex->arch, KIT_CG_INT_MUL, 8, buf, 128);
+  EXPECT(n > 0, "[%s] mul8 disasm failed", an);
+  EXPECT(!have_mnem(buf, n, "mul"),
          "[%s] x * 8 should strength-reduce to a shift (no 'mul')", an);
-  /* x * 7  -> still a multiply (not a power of two). */
-  EXPECT(op_text_has_mnem(arch, KIT_CG_INT_MUL, 7, "mul") == 1,
-         "[%s] x * 7 must stay a multiply", an);
+  EXPECT(imm_form(buf, n, ex->shl_mnem, ex->mark),
+         "[%s] x * 8 should use the immediate shift form", an);
 
-  /* x u/ 8 -> logical shift right: no divide. */
-  EXPECT(op_text_has_mnem(arch, KIT_CG_INT_UDIV, 8, "div") == 0,
+  n = op_disasm(ex->arch, KIT_CG_INT_MUL, 7, buf, 128);
+  EXPECT(n > 0 && have_mnem(buf, n, "mul"), "[%s] x * 7 must stay a multiply",
+         an);
+
+  /* --- unsigned divide --- */
+  n = op_disasm(ex->arch, KIT_CG_INT_UDIV, 8, buf, 128);
+  EXPECT(n > 0, "[%s] udiv8 disasm failed", an);
+  EXPECT(!have_mnem(buf, n, "div"),
          "[%s] x u/ 8 should strength-reduce to a shift (no 'div')", an);
-  /* x u/ 7 -> still a divide. */
-  EXPECT(op_text_has_mnem(arch, KIT_CG_INT_UDIV, 7, "div") == 1,
-         "[%s] x u/ 7 must stay a divide", an);
+  EXPECT(imm_form(buf, n, ex->shr_mnem, ex->mark),
+         "[%s] x u/ 8 should use the immediate shift form", an);
+
+  n = op_disasm(ex->arch, KIT_CG_INT_UDIV, 7, buf, 128);
+  EXPECT(n > 0 && have_mnem(buf, n, "div"), "[%s] x u/ 7 must stay a divide",
+         an);
 
-  /* x u% 8 -> and: no divide and no remainder instruction. */
-  EXPECT(op_text_has_mnem(arch, KIT_CG_INT_UREM, 8, "div") == 0 &&
-             op_text_has_mnem(arch, KIT_CG_INT_UREM, 8, "rem") == 0,
+  /* --- unsigned remainder --- */
+  n = op_disasm(ex->arch, KIT_CG_INT_UREM, 8, buf, 128);
+  EXPECT(n > 0, "[%s] urem8 disasm failed", an);
+  EXPECT(!have_mnem(buf, n, "div") && !have_mnem(buf, n, "rem"),
          "[%s] x u%% 8 should strength-reduce to an and (no 'div'/'rem')", an);
-  /* x u% 7 -> still a divide or remainder. */
-  EXPECT(op_text_has_mnem(arch, KIT_CG_INT_UREM, 7, "div") == 1 ||
-             op_text_has_mnem(arch, KIT_CG_INT_UREM, 7, "rem") == 1,
+  EXPECT(imm_form(buf, n, ex->and_mnem, ex->mark),
+         "[%s] x u%% 8 should use the immediate and form", an);
+
+  n = op_disasm(ex->arch, KIT_CG_INT_UREM, 7, buf, 128);
+  EXPECT(n > 0 && (have_mnem(buf, n, "div") || have_mnem(buf, n, "rem")),
          "[%s] x u%% 7 must stay a divide/remainder", an);
 
-  /* Signed division by a power of two needs a sign-bias sequence; the -O0
-   * peephole deliberately leaves it for the optimizer, so the divide stays. */
-  EXPECT(op_text_has_mnem(arch, KIT_CG_INT_SDIV, 8, "div") == 1,
+  /* --- signed divide: deliberately left for the optimizer --- */
+  n = op_disasm(ex->arch, KIT_CG_INT_SDIV, 8, buf, 128);
+  EXPECT(n > 0 && have_mnem(buf, n, "div"),
          "[%s] signed x / 8 is left as a divide at -O0", an);
 }
 
 int main(void) {
+  /* x64 AT&T immediates read "$N"; aa64 reads "#N"; rv64 folds the immediate
+   * into the mnemonic (slli/srli/andi), so its operand marker is empty. */
+  static const ArchExpect arches[] = {
+      {KIT_ARCH_X86_64, "x86_64", "shl", "shr", "and", "$"},
+      {KIT_ARCH_ARM_64, "aarch64", "lsl", "lsr", "and", "#"},
+      {KIT_ARCH_RV64, "riscv64", "slli", "srli", "andi", ""},
+  };
   kit_unit_init(&g_u);
   g_u.ctx.now = -1;
-  check_arch(KIT_ARCH_X86_64);
-  check_arch(KIT_ARCH_ARM_64);
-  check_arch(KIT_ARCH_RV64);
+  for (size_t i = 0; i < sizeof arches / sizeof arches[0]; ++i)
+    check_arch(&arches[i]);
   kit_unit_summary(&g_u, "strength_reduce_test");
   return kit_unit_status(&g_u);
 }

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	src/arch/aa64/disasm.c	\|	10	++++++++++
M	src/arch/aa64/isa.c	\|	91	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	src/arch/aa64/isa.h	\|	7	+++++++
M	src/arch/aa64/native.c	\|	38	++++++++++++++++++++++++++++++++++++++
M	src/cg/native_direct_target.c	\|	19	++++++++++++++++---
M	test/cg/strength_reduce_test.c	\|	185	+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------