commit dc1854de22635fd2f86aabe8947012f22b496d58
parent 27f088297163bc712f339f7ee942cceb56b0ae57
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Tue, 2 Jun 2026 16:14:50 -0700
cg+aa64: use immediate operand forms for shifts/and/cmp at -O0
Piece A — shared NativeDirectTarget layer. nd_binop/nd_cmp/nd_cmp_branch
materialized every RHS into a register, so the backend never saw the
target-legal immediates its binop/cmp hooks already accept. Add
nd_rhs_imm_or_reg, mirroring the optimizer's operand_imm_or_reg: keep a
constant RHS as an immediate when native->imm_legal accepts it for the op,
else materialize. Gated on imm_legal != NULL, so a recording mock (or a
backend without the hook) keeps the register path.
This closes the -O0/-O1 gap on x64 (shl/shr/and $imm, add/sub/or/xor imm,
cmp $imm) and rv64 (slli/srli/andi, addi, ...) with no backend changes:
those backends already encode the immediates (exercised by the optimizer).
Piece B — aa64 backend. aa_imm_legal now accepts SHL/SHR shift counts and
AND/ORR/EOR bitmask immediates; aa_binop encodes them via the existing
aa_lsl_imm/aa_lsr_imm/aa_asr_imm (UBFM/SBFM) and aa64_and_imm/orr_imm/eor_imm
helpers. So `x*8`/`x u/8`/`x u%8` become `lsl/lsr/and #imm` instead of
materializing the count/mask into a scratch register.
Disassembler. The aa64 disassembler couldn't render what the above now emits:
the logical-immediate family (AND/ORR/EOR/ANDS #bitmask) printed `.inst`, and
SBFM/UBFM printed raw instead of their shift aliases. Add the logical-immediate
format + decoder (aa64_logimm_decode, the ARM DecodeBitMasks inverse) and the
lsl/lsr/asr aliases via a shared aa64_bitfield_shift_alias helper used by both
the mnemonic writer and the operand printer. Non-aliasing SBFM/UBFM and the
other bitfield aliases (ubfx/sxtb/...) still print raw.
strength_reduce_test now also asserts the immediate operand form per arch
(x64 "$imm", aa64 "#imm", rv64 i-suffixed mnemonic), not just that the
multiply/divide became a shift. 39/39 across x86_64/aarch64/riscv64.
Diffstat:
6 files changed, 273 insertions(+), 77 deletions(-)
diff --git a/src/arch/aa64/disasm.c b/src/arch/aa64/disasm.c
@@ -51,6 +51,16 @@ static void aa64_write_mnemonic(AA64Disasm* d, const AA64InsnDesc* desc,
strbuf_puts(&d->mnem, aa64_cond_names[cond]);
return;
}
+ if (desc->fmt == AA64_FMT_BITFIELD) {
+ /* SBFM/UBFM disassemble to their lsl/lsr/asr shift aliases when the
+ * fields match; aa64_print_operands renders the matching operands. */
+ u32 shift;
+ const char* alias = aa64_bitfield_shift_alias(word, &shift);
+ if (alias) {
+ strbuf_puts(&d->mnem, alias);
+ return;
+ }
+ }
strbuf_put_slice(&d->mnem, desc->mnemonic);
}
diff --git a/src/arch/aa64/isa.c b/src/arch/aa64/isa.c
@@ -448,6 +448,14 @@ const AA64InsnDesc aa64_insn_table[] = {
{MN("sbfm"), 0x13000000u, 0x7F800000u, AA64_FMT_BITFIELD, 0, {0, 0}},
{MN("ubfm"), 0x53000000u, 0x7F800000u, AA64_FMT_BITFIELD, 0, {0, 0}},
+ /* ----- Logical, immediate (AND / ORR / EOR / ANDS, bitmask form) -----
+ * sf (bit31) and the N:immr:imms bitmask fields free; opc (bits30:29)
+ * selects the operation. Family bits[28:23]=100100. */
+ {MN("and"), 0x12000000u, 0x7F800000u, AA64_FMT_LOG_IMM, 0, {0, 0}},
+ {MN("orr"), 0x32000000u, 0x7F800000u, AA64_FMT_LOG_IMM, 0, {0, 0}},
+ {MN("eor"), 0x52000000u, 0x7F800000u, AA64_FMT_LOG_IMM, 0, {0, 0}},
+ {MN("ands"), 0x72000000u, 0x7F800000u, AA64_FMT_LOG_IMM, 0, {0, 0}},
+
/* ----- Load/store, register offset [Xn, Xm{, LSL #s}] (V=0) -----
* Family bits[29:24]=111000, bit21=1, bits[11:10]=10; per-size opc rows. */
{MN("strb"), 0x38200800u, 0xFFE00C00u, AA64_FMT_LDST_REGOFF, 0, {0, 0}},
@@ -1038,8 +1046,47 @@ static void print_dp1(StrBuf* sb, u32 w) {
}
/* SBFM/UBFM: Rd, Rn, #immr, #imms. */
+/* SBFM/UBFM shift aliases. UBFM is LSR when imms is the top bit index and LSL
+ * when immr == imms+1 (the encoder's `lsl #s` form); SBFM is ASR when imms is
+ * the top bit index. The other bitfield aliases (UBFX/SBFX/UXTB/SBFIZ/...) are
+ * left as the raw mnemonic. */
+const char* aa64_bitfield_shift_alias(u32 word, u32* shift) {
+ u32 sf = (word >> 31) & 1u;
+ u32 opc = (word >> 29) & 3u; /* 0 = SBFM, 2 = UBFM */
+ u32 immr = (word >> 16) & 0x3fu;
+ u32 imms = (word >> 10) & 0x3fu;
+ u32 top = sf ? 63u : 31u;
+ if (opc == 2u) { /* UBFM */
+ if (imms == top) {
+ *shift = immr;
+ return "lsr";
+ }
+ if (immr == imms + 1u) {
+ *shift = top - imms;
+ return "lsl";
+ }
+ } else if (opc == 0u) { /* SBFM */
+ if (imms == top) {
+ *shift = immr;
+ return "asr";
+ }
+ }
+ return NULL;
+}
+
static void print_bitfield(StrBuf* sb, u32 w) {
int sf = (int)((w >> 31) & 1u);
+ u32 shift;
+ if (aa64_bitfield_shift_alias(w, &shift)) {
+ /* lsl/lsr/asr Rd, Rn, #shift (mnemonic chosen by the disasm mnemonic
+ * writer via the same helper). */
+ emit_reg(sb, w & 0x1fu, sf, 0);
+ strbuf_puts(sb, ", ");
+ emit_reg(sb, (w >> 5) & 0x1fu, sf, 0);
+ strbuf_puts(sb, ", #");
+ strbuf_put_u64(sb, (u64)shift);
+ return;
+ }
emit_reg(sb, w & 0x1fu, sf, 0);
strbuf_puts(sb, ", ");
emit_reg(sb, (w >> 5) & 0x1fu, sf, 0);
@@ -1049,6 +1096,47 @@ static void print_bitfield(StrBuf* sb, u32 w) {
strbuf_put_u64(sb, (u64)((w >> 10) & 0x3fu));
}
+/* Decode an AArch64 logical-immediate (N:immr:imms) bitmask to its value, per
+ * the ARM ARM DecodeBitMasks. Inverse of aa64_logimm_encode (isa.h). */
+static u64 aa64_logimm_decode(u32 N, u32 immr, u32 imms, int sf) {
+ u32 combined = (N << 6) | ((~imms) & 0x3fu);
+ int len = -1;
+ u32 esize, levels, S, R, i;
+ u64 welem, elem, elt_mask, result;
+ for (int b = 6; b >= 0; --b) {
+ if (combined & (1u << b)) {
+ len = b;
+ break;
+ }
+ }
+ if (len < 1) return 0; /* reserved encoding */
+ esize = 1u << (u32)len;
+ levels = esize - 1u;
+ S = imms & levels;
+ R = immr & levels;
+ welem = (S + 1u >= 64u) ? ~(u64)0 : (((u64)1 << (S + 1u)) - 1u);
+ elt_mask = (esize >= 64u) ? ~(u64)0 : (((u64)1 << esize) - 1u);
+ elem =
+ (R == 0u) ? welem : (((welem >> R) | (welem << (esize - R))) & elt_mask);
+ result = 0;
+ for (i = 0; i < 64u; i += esize) result |= elem << i;
+ return sf ? result : (result & 0xFFFFFFFFu);
+}
+
+static void print_logimm(StrBuf* sb, u32 w) {
+ int sf = (int)((w >> 31) & 1u);
+ u32 opc = (w >> 29) & 3u; /* 0=AND 1=ORR 2=EOR 3=ANDS */
+ u32 N = (w >> 22) & 1u;
+ u32 immr = (w >> 16) & 0x3fu;
+ u32 imms = (w >> 10) & 0x3fu;
+ /* AND/ORR/EOR use the SP-or-GP destination; ANDS uses ZR. Rn is always GP. */
+ emit_reg(sb, w & 0x1fu, sf, opc != 3u);
+ strbuf_puts(sb, ", ");
+ emit_reg(sb, (w >> 5) & 0x1fu, sf, 0);
+ strbuf_puts(sb, ", #");
+ strbuf_put_hex_u64(sb, aa64_logimm_decode(N, immr, imms, sf));
+}
+
/* Register-offset load/store: Rt, [Xn, Xm{, LSL #s}]. Rt width follows the
* size field (X for 64-bit accesses, W otherwise); the index extend is LSL
* for option=011 (UXTX), with shift amount S ? size : 0. */
@@ -1154,6 +1242,9 @@ void aa64_print_operands(StrBuf* sb, const AA64InsnDesc* desc, u32 word,
case AA64_FMT_BITFIELD:
print_bitfield(sb, word);
break;
+ case AA64_FMT_LOG_IMM:
+ print_logimm(sb, word);
+ break;
case AA64_FMT_LDST_REGOFF:
print_ldst_regoff(sb, word);
break;
diff --git a/src/arch/aa64/isa.h b/src/arch/aa64/isa.h
@@ -68,6 +68,7 @@ typedef enum AA64Format {
* (SCVTF/UCVTF/FCVTZS/FCVTZU/FMOV) */
AA64_FMT_LDST_EXCL, /* load/store exclusive + acquire/release ordered
* (LDXR/LDAXR/STXR/STLXR/LDAR/STLR + b/h) */
+ AA64_FMT_LOG_IMM, /* logical, immediate (AND/ORR/EOR/ANDS #bitmask) */
} AA64Format;
/* ---- AsmFlags column on AA64InsnDesc ----
@@ -1543,6 +1544,12 @@ struct AA64AsmTok; /* opaque, defined by the phase-3 asm parser */
void aa64_print_operands(StrBuf* sb, const AA64InsnDesc* desc, u32 word,
u64 vaddr);
+/* If `word` is an SBFM/UBFM that has a preferred shift-alias disassembly,
+ * return its mnemonic ("lsl"/"lsr"/"asr") and write the shift amount to
+ * *shift; return NULL otherwise. Shared by the disassembler's mnemonic and
+ * operand printers so the alias decision lives in one place. */
+const char* aa64_bitfield_shift_alias(u32 word, u32* shift);
+
/* Returns 1 on success, 0 on parse error. Phase 2 stub returns 0 for
* every format; phase 3 fills in the bodies. */
int aa64_parse_operands(struct AA64AsmTok* tok, const AA64InsnDesc* desc,
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -903,6 +903,18 @@ static int aa_imm_legal(NativeTarget* t, NativeImmUse use, u32 op,
u32 sf = type_size32(t, type) == 8u ? 1u : 0u;
return aa64_imul_strength_reducible(sf, imm);
}
+ /* LSL/LSR/ASR #imm via the UBFM/SBFM aliases: shift count in range. */
+ if ((BinOp)op == BO_SHL || (BinOp)op == BO_SHR_S ||
+ (BinOp)op == BO_SHR_U) {
+ u32 bits = type_size32(t, type) == 8u ? 64u : 32u;
+ return imm >= 0 && (u64)imm < (u64)bits;
+ }
+ /* AND/ORR/EOR #bitmask: encodable as an AArch64 logical immediate. */
+ if ((BinOp)op == BO_AND || (BinOp)op == BO_OR || (BinOp)op == BO_XOR) {
+ u32 sf = type_size32(t, type) == 8u ? 1u : 0u;
+ u32 N, immr, imms;
+ return aa64_logimm_encode((u64)imm, sf, &N, &immr, &imms);
+ }
return 0;
case NATIVE_IMM_CMP:
/* cmp lowers to subs #imm12; cmn (negative) is not wired, so require a
@@ -2023,6 +2035,8 @@ static void aa_set_bytes(NativeTarget* t, NativeAddr dst, NativeLoc byte_value,
}
static void aa_lsl_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh);
+static void aa_lsr_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh);
+static void aa_asr_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh);
/* Strength-reduce `mul rd, rn, #imm` for the constants accepted by
* aa64_imul_strength_reducible into a single non-mul instruction. Callers
@@ -2142,6 +2156,30 @@ static void aa_binop(NativeTarget* t, BinOp op, NativeLoc dst, NativeLoc lhs,
aa_emit_mul_const_imm(t, sf, rd, rn, rhs.v.imm);
return;
}
+ if (rhs.kind == NATIVE_LOC_IMM &&
+ (op == BO_SHL || op == BO_SHR_U || op == BO_SHR_S)) {
+ u32 shamt = (u32)rhs.v.imm; /* imm_legal guarantees 0 <= imm < datasize */
+ if (op == BO_SHL)
+ aa_lsl_imm(t, sf, rd, rn, shamt);
+ else if (op == BO_SHR_U)
+ aa_lsr_imm(t, sf, rd, rn, shamt);
+ else
+ aa_asr_imm(t, sf, rd, rn, shamt);
+ return;
+ }
+ if (rhs.kind == NATIVE_LOC_IMM &&
+ (op == BO_AND || op == BO_OR || op == BO_XOR)) {
+ u32 N, immr, imms;
+ if (!aa64_logimm_encode((u64)rhs.v.imm, sf, &N, &immr, &imms))
+ aa_panic(aa_of(t), "logical immediate not encodable");
+ if (op == BO_AND)
+ aa_emit32(t->mc, aa64_and_imm(sf, rd, rn, N, immr, imms));
+ else if (op == BO_OR)
+ aa_emit32(t->mc, aa64_orr_imm(sf, rd, rn, N, immr, imms));
+ else
+ aa_emit32(t->mc, aa64_eor_imm(sf, rd, rn, N, immr, imms));
+ return;
+ }
switch (op) {
case BO_IADD:
aa_emit32(t->mc, aa64_add(sf, rd, rn, rm));
diff --git a/src/cg/native_direct_target.c b/src/cg/native_direct_target.c
@@ -832,6 +832,19 @@ static NativeLoc nd_dst_scratch(NativeDirectTarget* d, Operand dst) {
return nd_reg_loc(r, cls, dst.type);
}
+/* Arithmetic/compare RHS: keep a constant operand as an immediate when the
+ * target can encode it for `use` (so no scratch register is spent
+ * materializing it), mirroring the optimizer's operand_imm_or_reg. Falls back
+ * to a register when there is no imm_legal hook (e.g. a recording mock target)
+ * or the constant is not target-legal for this op. */
+static NativeLoc nd_rhs_imm_or_reg(NativeDirectTarget* d, NativeImmUse use,
+ u32 sub, Operand b) {
+ if (b.kind == OPK_IMM && d->native->imm_legal &&
+ d->native->imm_legal(d->native, use, sub, b.type, b.v.imm))
+ return nd_loc_imm(b.v.imm, b.type);
+ return nd_materialize_operand(d, b);
+}
+
/* Register a pure-compute op writes its result into. For a cacheable local that
* is the local's cache register (reused or freshly allocated), pinned for the
* instruction; nd_dst_writeback then marks it dirty without storing. Otherwise
@@ -1012,7 +1025,7 @@ static void nd_cmp_branch(CgTarget* t, CmpOp op, Operand a, Operand b,
NativeLoc ar, br;
nd_flush_all(d);
ar = nd_materialize_operand(d, a);
- br = nd_materialize_operand(d, b);
+ br = nd_rhs_imm_or_reg(d, NATIVE_IMM_CMP, (u32)op, b);
ND_REQUIRE_NATIVE(d, cmp_branch, "target does not emit compare branches");
d->native->cmp_branch(d->native, op, ar, br, nd_mc_label(d, label));
nd_release_materialized(d, br);
@@ -1431,7 +1444,7 @@ static void nd_bitfield_store(CgTarget* t, Operand record_addr, Operand src,
static void nd_binop(CgTarget* t, BinOp op, Operand dst, Operand a, Operand b) {
NativeDirectTarget* d = nd_of(t);
NativeLoc ar = nd_materialize_operand(d, a);
- NativeLoc br = nd_materialize_operand(d, b);
+ NativeLoc br = nd_rhs_imm_or_reg(d, NATIVE_IMM_BINOP, (u32)op, b);
NativeLoc dr = nd_dst_reg(d, dst);
ND_REQUIRE_NATIVE(d, binop, "target does not emit binary ops");
d->native->binop(d->native, op, dr, ar, br);
@@ -1453,7 +1466,7 @@ static void nd_unop(CgTarget* t, UnOp op, Operand dst, Operand a) {
static void nd_cmp(CgTarget* t, CmpOp op, Operand dst, Operand a, Operand b) {
NativeDirectTarget* d = nd_of(t);
NativeLoc ar = nd_materialize_operand(d, a);
- NativeLoc br = nd_materialize_operand(d, b);
+ NativeLoc br = nd_rhs_imm_or_reg(d, NATIVE_IMM_CMP, (u32)op, b);
NativeLoc dr = nd_dst_reg(d, dst);
ND_REQUIRE_NATIVE(d, cmp, "target does not emit compares");
d->native->cmp(d->native, op, dr, ar, br);
diff --git a/test/cg/strength_reduce_test.c b/test/cg/strength_reduce_test.c
@@ -1,15 +1,21 @@
/* strength_reduce_test — asserts the -O0 semantic peephole (src/cg/fold.c)
* turns multiply / unsigned-divide / unsigned-remainder by a power-of-two
- * constant into a shift / and.
+ * constant into a shift / and, AND that the shift/and is emitted in its
+ * immediate-operand form (no scratch register spent materializing the count).
*
* Each case builds a tiny
* i64 f(i64 x) { return x <op> <imm>; }
- * through the public CG API, emits an ELF object, disassembles its .text, and
- * inspects the instruction mnemonics. The assertions are deliberately on
- * substrings ("mul" / "div" / "rem") rather than exact shift mnemonics, so the
- * test is robust to per-arch shift aliasing (lsl/ubfm, slli, shl, ...): a
- * power-of-two operand must make the multiply/divide instruction disappear, and
- * a non-power-of-two operand (and signed division, left to -O1+) must keep it.
+ * through the public CG API, emits an ELF object, and disassembles its .text.
+ *
+ * Two layers of assertion:
+ * 1. The multiply/divide/remainder instruction disappears for power-of-two
+ * operands and survives for non-power-of-two (and for signed division,
+ * which is deliberately left to -O1+). Checked on mnemonic substrings
+ * ("mul"/"div"/"rem"), robust to per-arch aliasing.
+ * 2. The resulting shift/and carries an immediate operand (x64 "$imm",
+ * aa64 "#imm", rv64 i-suffixed mnemonic) rather than a materialized
+ * register — the NativeDirectTarget imm_legal pass-through (Piece A) plus
+ * the aa64 immediate shift/bitmask encodings (Piece B).
*
* Run by: make test-cg-api
*/
@@ -27,13 +33,9 @@
static KitUnit g_u;
#define EXPECT(cond, ...) CU_EXPECT(&g_u, cond, __VA_ARGS__)
-typedef struct BinFnSpec {
+typedef struct EmitCtx {
KitCgIntBinOp op;
int64_t imm;
-} BinFnSpec;
-
-typedef struct EmitCtx {
- BinFnSpec spec;
KitObjBuilder* ob;
} EmitCtx;
@@ -92,8 +94,8 @@ static KitStatus emit_binop_fn(KitCompiler* c, void* user) {
kit_cg_push_local(cg, param);
kit_cg_load(cg, (KitCgMemAccess){.type = i64_ty,
.align = kit_cg_type_align(c, i64_ty)});
- kit_cg_push_int(cg, (uint64_t)ctx->spec.imm, i64_ty);
- kit_cg_int_binop(cg, ctx->spec.op, KIT_CG_INTOP_NONE);
+ kit_cg_push_int(cg, (uint64_t)ctx->imm, i64_ty);
+ kit_cg_int_binop(cg, ctx->op, KIT_CG_INTOP_NONE);
kit_cg_ret(cg);
kit_cg_func_end(cg);
@@ -102,10 +104,21 @@ static KitStatus emit_binop_fn(KitCompiler* c, void* user) {
return KIT_OK;
}
-/* Emit the op, disassemble .text, and report whether any mnemonic contains
- * `needle`. Returns 1 if present, 0 if absent, -1 on harness failure. */
-static int op_text_has_mnem(KitArchKind arch, KitCgIntBinOp op, int64_t imm,
- const char* needle) {
+typedef struct DisInsn {
+ char mnem[16];
+ char ops[64];
+} DisInsn;
+
+static void slice_to_buf(KitSlice s, char* buf, size_t cap) {
+ size_t n = s.len < cap - 1 ? s.len : cap - 1;
+ if (s.s && n) memcpy(buf, s.s, n);
+ buf[n] = '\0';
+}
+
+/* Build the op and disassemble its .text into `out` (up to `cap` insns).
+ * Returns the instruction count, or -1 on harness failure. */
+static int op_disasm(KitArchKind arch, KitCgIntBinOp op, int64_t imm,
+ DisInsn* out, int cap) {
KitTarget target = kit_unit_target(arch, KIT_OS_LINUX, KIT_OBJ_ELF);
KitCompiler* c = NULL;
EmitCtx ctx;
@@ -119,12 +132,11 @@ static int op_text_has_mnem(KitArchKind arch, KitCgIntBinOp op, int64_t imm,
KitDisasmIter* it = NULL;
KitInsn insn;
int result = -1;
- int found = 0;
- size_t nlen = strlen(needle);
+ int n = 0;
memset(&ctx, 0, sizeof ctx);
- ctx.spec.op = op;
- ctx.spec.imm = imm;
+ ctx.op = op;
+ ctx.imm = imm;
if (kit_compiler_new(target, &g_u.ctx, &c) != KIT_OK || !c) return -1;
if (kit_frontend_run(c, emit_binop_fn, &ctx) != KIT_OK) goto done;
@@ -132,11 +144,9 @@ static int op_text_has_mnem(KitArchKind arch, KitCgIntBinOp op, int64_t imm,
if (kit_obj_builder_emit(ctx.ob, writer) != KIT_OK) goto done;
bytes.data = kit_writer_mem_bytes(writer, &len);
bytes.len = len;
- if (kit_obj_open(&g_u.ctx, KIT_SLICE_LIT("<sr-test>"), &bytes, &file) !=
- KIT_OK)
+ if (kit_obj_open(&g_u.ctx, KIT_SLICE_LIT("<sr-test>"), &bytes, &file) != KIT_OK)
goto done;
- if (kit_obj_section_by_name(file, KIT_SLICE_LIT(".text"), &text_sec) !=
- KIT_OK)
+ if (kit_obj_section_by_name(file, KIT_SLICE_LIT(".text"), &text_sec) != KIT_OK)
goto done;
if (kit_obj_section_data(file, text_sec, &data, &len) != KIT_OK) goto done;
@@ -145,20 +155,12 @@ static int op_text_has_mnem(KitArchKind arch, KitCgIntBinOp op, int64_t imm,
dc.context = g_u.ctx;
if (kit_disasm_iter_new(&dc, data, len, 0, file, &it) != KIT_OK || !it)
goto done;
- while (kit_disasm_iter_next(it, &insn) == KIT_ITER_ITEM) {
- const char* m = insn.mnemonic.s;
- size_t mlen = insn.mnemonic.len;
- if (m && mlen >= nlen) {
- for (size_t i = 0; i + nlen <= mlen; ++i) {
- if (memcmp(m + i, needle, nlen) == 0) {
- found = 1;
- break;
- }
- }
- }
- if (found) break;
+ while (n < cap && kit_disasm_iter_next(it, &insn) == KIT_ITER_ITEM) {
+ slice_to_buf(insn.mnemonic, out[n].mnem, sizeof out[n].mnem);
+ slice_to_buf(insn.operands, out[n].ops, sizeof out[n].ops);
+ ++n;
}
- result = found;
+ result = n;
done:
if (it) kit_disasm_iter_free(it);
@@ -169,57 +171,92 @@ done:
return result;
}
-static const char* arch_name(KitArchKind arch) {
- switch (arch) {
- case KIT_ARCH_X86_64:
- return "x86_64";
- case KIT_ARCH_ARM_64:
- return "aarch64";
- case KIT_ARCH_RV64:
- return "riscv64";
- default:
- return "arch";
- }
+/* 1 if any instruction's mnemonic contains `mnem`. */
+static int have_mnem(const DisInsn* in, int n, const char* mnem) {
+ for (int i = 0; i < n; ++i)
+ if (strstr(in[i].mnem, mnem)) return 1;
+ return 0;
+}
+
+/* 1 if the first instruction whose mnemonic contains `mnem` has `marker` in its
+ * operands (an empty marker matches any). This is how we tell an immediate
+ * operand form (x64 "$", aa64 "#") from a register form, and — via the
+ * i-suffixed RISC-V mnemonics — picks out slli/srli/andi over sll/srl/and. */
+static int imm_form(const DisInsn* in, int n, const char* mnem,
+ const char* marker) {
+ for (int i = 0; i < n; ++i)
+ if (strstr(in[i].mnem, mnem)) return strstr(in[i].ops, marker) != NULL;
+ return 0;
}
-static void check_arch(KitArchKind arch) {
- const char* an = arch_name(arch);
+typedef struct ArchExpect {
+ KitArchKind arch;
+ const char* name;
+ const char* shl_mnem; /* immediate shift-left mnemonic (x*2^k) */
+ const char* shr_mnem; /* immediate logical shift-right (x u/ 2^k) */
+ const char* and_mnem; /* immediate bitwise-and (x u% 2^k) */
+ const char* mark; /* operand marker for an immediate ("" if in mnemonic) */
+} ArchExpect;
+
+static void check_arch(const ArchExpect* ex) {
+ const char* an = ex->name;
+ DisInsn buf[128];
+ int n;
- /* x * 8 -> shift: no multiply instruction survives. */
- EXPECT(op_text_has_mnem(arch, KIT_CG_INT_MUL, 8, "mul") == 0,
+ /* --- multiply --- */
+ n = op_disasm(ex->arch, KIT_CG_INT_MUL, 8, buf, 128);
+ EXPECT(n > 0, "[%s] mul8 disasm failed", an);
+ EXPECT(!have_mnem(buf, n, "mul"),
"[%s] x * 8 should strength-reduce to a shift (no 'mul')", an);
- /* x * 7 -> still a multiply (not a power of two). */
- EXPECT(op_text_has_mnem(arch, KIT_CG_INT_MUL, 7, "mul") == 1,
- "[%s] x * 7 must stay a multiply", an);
+ EXPECT(imm_form(buf, n, ex->shl_mnem, ex->mark),
+ "[%s] x * 8 should use the immediate shift form", an);
- /* x u/ 8 -> logical shift right: no divide. */
- EXPECT(op_text_has_mnem(arch, KIT_CG_INT_UDIV, 8, "div") == 0,
+ n = op_disasm(ex->arch, KIT_CG_INT_MUL, 7, buf, 128);
+ EXPECT(n > 0 && have_mnem(buf, n, "mul"), "[%s] x * 7 must stay a multiply",
+ an);
+
+ /* --- unsigned divide --- */
+ n = op_disasm(ex->arch, KIT_CG_INT_UDIV, 8, buf, 128);
+ EXPECT(n > 0, "[%s] udiv8 disasm failed", an);
+ EXPECT(!have_mnem(buf, n, "div"),
"[%s] x u/ 8 should strength-reduce to a shift (no 'div')", an);
- /* x u/ 7 -> still a divide. */
- EXPECT(op_text_has_mnem(arch, KIT_CG_INT_UDIV, 7, "div") == 1,
- "[%s] x u/ 7 must stay a divide", an);
+ EXPECT(imm_form(buf, n, ex->shr_mnem, ex->mark),
+ "[%s] x u/ 8 should use the immediate shift form", an);
+
+ n = op_disasm(ex->arch, KIT_CG_INT_UDIV, 7, buf, 128);
+ EXPECT(n > 0 && have_mnem(buf, n, "div"), "[%s] x u/ 7 must stay a divide",
+ an);
- /* x u% 8 -> and: no divide and no remainder instruction. */
- EXPECT(op_text_has_mnem(arch, KIT_CG_INT_UREM, 8, "div") == 0 &&
- op_text_has_mnem(arch, KIT_CG_INT_UREM, 8, "rem") == 0,
+ /* --- unsigned remainder --- */
+ n = op_disasm(ex->arch, KIT_CG_INT_UREM, 8, buf, 128);
+ EXPECT(n > 0, "[%s] urem8 disasm failed", an);
+ EXPECT(!have_mnem(buf, n, "div") && !have_mnem(buf, n, "rem"),
"[%s] x u%% 8 should strength-reduce to an and (no 'div'/'rem')", an);
- /* x u% 7 -> still a divide or remainder. */
- EXPECT(op_text_has_mnem(arch, KIT_CG_INT_UREM, 7, "div") == 1 ||
- op_text_has_mnem(arch, KIT_CG_INT_UREM, 7, "rem") == 1,
+ EXPECT(imm_form(buf, n, ex->and_mnem, ex->mark),
+ "[%s] x u%% 8 should use the immediate and form", an);
+
+ n = op_disasm(ex->arch, KIT_CG_INT_UREM, 7, buf, 128);
+ EXPECT(n > 0 && (have_mnem(buf, n, "div") || have_mnem(buf, n, "rem")),
"[%s] x u%% 7 must stay a divide/remainder", an);
- /* Signed division by a power of two needs a sign-bias sequence; the -O0
- * peephole deliberately leaves it for the optimizer, so the divide stays. */
- EXPECT(op_text_has_mnem(arch, KIT_CG_INT_SDIV, 8, "div") == 1,
+ /* --- signed divide: deliberately left for the optimizer --- */
+ n = op_disasm(ex->arch, KIT_CG_INT_SDIV, 8, buf, 128);
+ EXPECT(n > 0 && have_mnem(buf, n, "div"),
"[%s] signed x / 8 is left as a divide at -O0", an);
}
int main(void) {
+ /* x64 AT&T immediates read "$N"; aa64 reads "#N"; rv64 folds the immediate
+ * into the mnemonic (slli/srli/andi), so its operand marker is empty. */
+ static const ArchExpect arches[] = {
+ {KIT_ARCH_X86_64, "x86_64", "shl", "shr", "and", "$"},
+ {KIT_ARCH_ARM_64, "aarch64", "lsl", "lsr", "and", "#"},
+ {KIT_ARCH_RV64, "riscv64", "slli", "srli", "andi", ""},
+ };
kit_unit_init(&g_u);
g_u.ctx.now = -1;
- check_arch(KIT_ARCH_X86_64);
- check_arch(KIT_ARCH_ARM_64);
- check_arch(KIT_ARCH_RV64);
+ for (size_t i = 0; i < sizeof arches / sizeof arches[0]; ++i)
+ check_arch(&arches[i]);
kit_unit_summary(&g_u, "strength_reduce_test");
return kit_unit_status(&g_u);
}