kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit dc1854de22635fd2f86aabe8947012f22b496d58
parent 27f088297163bc712f339f7ee942cceb56b0ae57
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Tue,  2 Jun 2026 16:14:50 -0700

cg+aa64: use immediate operand forms for shifts/and/cmp at -O0

Piece A — shared NativeDirectTarget layer. nd_binop/nd_cmp/nd_cmp_branch
materialized every RHS into a register, so the backend never saw the
target-legal immediates its binop/cmp hooks already accept. Add
nd_rhs_imm_or_reg, mirroring the optimizer's operand_imm_or_reg: keep a
constant RHS as an immediate when native->imm_legal accepts it for the op,
else materialize. Gated on imm_legal != NULL, so a recording mock (or a
backend without the hook) keeps the register path.

This closes the -O0/-O1 gap on x64 (shl/shr/and $imm, add/sub/or/xor imm,
cmp $imm) and rv64 (slli/srli/andi, addi, ...) with no backend changes:
those backends already encode the immediates (exercised by the optimizer).

Piece B — aa64 backend. aa_imm_legal now accepts SHL/SHR shift counts and
AND/ORR/EOR bitmask immediates; aa_binop encodes them via the existing
aa_lsl_imm/aa_lsr_imm/aa_asr_imm (UBFM/SBFM) and aa64_and_imm/orr_imm/eor_imm
helpers. So `x*8`/`x u/8`/`x u%8` become `lsl/lsr/and #imm` instead of
materializing the count/mask into a scratch register.

Disassembler. The aa64 disassembler couldn't render what the above now emits:
the logical-immediate family (AND/ORR/EOR/ANDS #bitmask) printed `.inst`, and
SBFM/UBFM printed raw instead of their shift aliases. Add the logical-immediate
format + decoder (aa64_logimm_decode, the ARM DecodeBitMasks inverse) and the
lsl/lsr/asr aliases via a shared aa64_bitfield_shift_alias helper used by both
the mnemonic writer and the operand printer. Non-aliasing SBFM/UBFM and the
other bitfield aliases (ubfx/sxtb/...) still print raw.

strength_reduce_test now also asserts the immediate operand form per arch
(x64 "$imm", aa64 "#imm", rv64 i-suffixed mnemonic), not just that the
multiply/divide became a shift. 39/39 across x86_64/aarch64/riscv64.

Diffstat:
Msrc/arch/aa64/disasm.c | 10++++++++++
Msrc/arch/aa64/isa.c | 91+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/arch/aa64/isa.h | 7+++++++
Msrc/arch/aa64/native.c | 38++++++++++++++++++++++++++++++++++++++
Msrc/cg/native_direct_target.c | 19++++++++++++++++---
Mtest/cg/strength_reduce_test.c | 185+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
6 files changed, 273 insertions(+), 77 deletions(-)

diff --git a/src/arch/aa64/disasm.c b/src/arch/aa64/disasm.c @@ -51,6 +51,16 @@ static void aa64_write_mnemonic(AA64Disasm* d, const AA64InsnDesc* desc, strbuf_puts(&d->mnem, aa64_cond_names[cond]); return; } + if (desc->fmt == AA64_FMT_BITFIELD) { + /* SBFM/UBFM disassemble to their lsl/lsr/asr shift aliases when the + * fields match; aa64_print_operands renders the matching operands. */ + u32 shift; + const char* alias = aa64_bitfield_shift_alias(word, &shift); + if (alias) { + strbuf_puts(&d->mnem, alias); + return; + } + } strbuf_put_slice(&d->mnem, desc->mnemonic); } diff --git a/src/arch/aa64/isa.c b/src/arch/aa64/isa.c @@ -448,6 +448,14 @@ const AA64InsnDesc aa64_insn_table[] = { {MN("sbfm"), 0x13000000u, 0x7F800000u, AA64_FMT_BITFIELD, 0, {0, 0}}, {MN("ubfm"), 0x53000000u, 0x7F800000u, AA64_FMT_BITFIELD, 0, {0, 0}}, + /* ----- Logical, immediate (AND / ORR / EOR / ANDS, bitmask form) ----- + * sf (bit31) and the N:immr:imms bitmask fields free; opc (bits30:29) + * selects the operation. Family bits[28:23]=100100. */ + {MN("and"), 0x12000000u, 0x7F800000u, AA64_FMT_LOG_IMM, 0, {0, 0}}, + {MN("orr"), 0x32000000u, 0x7F800000u, AA64_FMT_LOG_IMM, 0, {0, 0}}, + {MN("eor"), 0x52000000u, 0x7F800000u, AA64_FMT_LOG_IMM, 0, {0, 0}}, + {MN("ands"), 0x72000000u, 0x7F800000u, AA64_FMT_LOG_IMM, 0, {0, 0}}, + /* ----- Load/store, register offset [Xn, Xm{, LSL #s}] (V=0) ----- * Family bits[29:24]=111000, bit21=1, bits[11:10]=10; per-size opc rows. */ {MN("strb"), 0x38200800u, 0xFFE00C00u, AA64_FMT_LDST_REGOFF, 0, {0, 0}}, @@ -1038,8 +1046,47 @@ static void print_dp1(StrBuf* sb, u32 w) { } /* SBFM/UBFM: Rd, Rn, #immr, #imms. */ +/* SBFM/UBFM shift aliases. UBFM is LSR when imms is the top bit index and LSL + * when immr == imms+1 (the encoder's `lsl #s` form); SBFM is ASR when imms is + * the top bit index. The other bitfield aliases (UBFX/SBFX/UXTB/SBFIZ/...) are + * left as the raw mnemonic. */ +const char* aa64_bitfield_shift_alias(u32 word, u32* shift) { + u32 sf = (word >> 31) & 1u; + u32 opc = (word >> 29) & 3u; /* 0 = SBFM, 2 = UBFM */ + u32 immr = (word >> 16) & 0x3fu; + u32 imms = (word >> 10) & 0x3fu; + u32 top = sf ? 63u : 31u; + if (opc == 2u) { /* UBFM */ + if (imms == top) { + *shift = immr; + return "lsr"; + } + if (immr == imms + 1u) { + *shift = top - imms; + return "lsl"; + } + } else if (opc == 0u) { /* SBFM */ + if (imms == top) { + *shift = immr; + return "asr"; + } + } + return NULL; +} + static void print_bitfield(StrBuf* sb, u32 w) { int sf = (int)((w >> 31) & 1u); + u32 shift; + if (aa64_bitfield_shift_alias(w, &shift)) { + /* lsl/lsr/asr Rd, Rn, #shift (mnemonic chosen by the disasm mnemonic + * writer via the same helper). */ + emit_reg(sb, w & 0x1fu, sf, 0); + strbuf_puts(sb, ", "); + emit_reg(sb, (w >> 5) & 0x1fu, sf, 0); + strbuf_puts(sb, ", #"); + strbuf_put_u64(sb, (u64)shift); + return; + } emit_reg(sb, w & 0x1fu, sf, 0); strbuf_puts(sb, ", "); emit_reg(sb, (w >> 5) & 0x1fu, sf, 0); @@ -1049,6 +1096,47 @@ static void print_bitfield(StrBuf* sb, u32 w) { strbuf_put_u64(sb, (u64)((w >> 10) & 0x3fu)); } +/* Decode an AArch64 logical-immediate (N:immr:imms) bitmask to its value, per + * the ARM ARM DecodeBitMasks. Inverse of aa64_logimm_encode (isa.h). */ +static u64 aa64_logimm_decode(u32 N, u32 immr, u32 imms, int sf) { + u32 combined = (N << 6) | ((~imms) & 0x3fu); + int len = -1; + u32 esize, levels, S, R, i; + u64 welem, elem, elt_mask, result; + for (int b = 6; b >= 0; --b) { + if (combined & (1u << b)) { + len = b; + break; + } + } + if (len < 1) return 0; /* reserved encoding */ + esize = 1u << (u32)len; + levels = esize - 1u; + S = imms & levels; + R = immr & levels; + welem = (S + 1u >= 64u) ? ~(u64)0 : (((u64)1 << (S + 1u)) - 1u); + elt_mask = (esize >= 64u) ? ~(u64)0 : (((u64)1 << esize) - 1u); + elem = + (R == 0u) ? welem : (((welem >> R) | (welem << (esize - R))) & elt_mask); + result = 0; + for (i = 0; i < 64u; i += esize) result |= elem << i; + return sf ? result : (result & 0xFFFFFFFFu); +} + +static void print_logimm(StrBuf* sb, u32 w) { + int sf = (int)((w >> 31) & 1u); + u32 opc = (w >> 29) & 3u; /* 0=AND 1=ORR 2=EOR 3=ANDS */ + u32 N = (w >> 22) & 1u; + u32 immr = (w >> 16) & 0x3fu; + u32 imms = (w >> 10) & 0x3fu; + /* AND/ORR/EOR use the SP-or-GP destination; ANDS uses ZR. Rn is always GP. */ + emit_reg(sb, w & 0x1fu, sf, opc != 3u); + strbuf_puts(sb, ", "); + emit_reg(sb, (w >> 5) & 0x1fu, sf, 0); + strbuf_puts(sb, ", #"); + strbuf_put_hex_u64(sb, aa64_logimm_decode(N, immr, imms, sf)); +} + /* Register-offset load/store: Rt, [Xn, Xm{, LSL #s}]. Rt width follows the * size field (X for 64-bit accesses, W otherwise); the index extend is LSL * for option=011 (UXTX), with shift amount S ? size : 0. */ @@ -1154,6 +1242,9 @@ void aa64_print_operands(StrBuf* sb, const AA64InsnDesc* desc, u32 word, case AA64_FMT_BITFIELD: print_bitfield(sb, word); break; + case AA64_FMT_LOG_IMM: + print_logimm(sb, word); + break; case AA64_FMT_LDST_REGOFF: print_ldst_regoff(sb, word); break; diff --git a/src/arch/aa64/isa.h b/src/arch/aa64/isa.h @@ -68,6 +68,7 @@ typedef enum AA64Format { * (SCVTF/UCVTF/FCVTZS/FCVTZU/FMOV) */ AA64_FMT_LDST_EXCL, /* load/store exclusive + acquire/release ordered * (LDXR/LDAXR/STXR/STLXR/LDAR/STLR + b/h) */ + AA64_FMT_LOG_IMM, /* logical, immediate (AND/ORR/EOR/ANDS #bitmask) */ } AA64Format; /* ---- AsmFlags column on AA64InsnDesc ---- @@ -1543,6 +1544,12 @@ struct AA64AsmTok; /* opaque, defined by the phase-3 asm parser */ void aa64_print_operands(StrBuf* sb, const AA64InsnDesc* desc, u32 word, u64 vaddr); +/* If `word` is an SBFM/UBFM that has a preferred shift-alias disassembly, + * return its mnemonic ("lsl"/"lsr"/"asr") and write the shift amount to + * *shift; return NULL otherwise. Shared by the disassembler's mnemonic and + * operand printers so the alias decision lives in one place. */ +const char* aa64_bitfield_shift_alias(u32 word, u32* shift); + /* Returns 1 on success, 0 on parse error. Phase 2 stub returns 0 for * every format; phase 3 fills in the bodies. */ int aa64_parse_operands(struct AA64AsmTok* tok, const AA64InsnDesc* desc, diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c @@ -903,6 +903,18 @@ static int aa_imm_legal(NativeTarget* t, NativeImmUse use, u32 op, u32 sf = type_size32(t, type) == 8u ? 1u : 0u; return aa64_imul_strength_reducible(sf, imm); } + /* LSL/LSR/ASR #imm via the UBFM/SBFM aliases: shift count in range. */ + if ((BinOp)op == BO_SHL || (BinOp)op == BO_SHR_S || + (BinOp)op == BO_SHR_U) { + u32 bits = type_size32(t, type) == 8u ? 64u : 32u; + return imm >= 0 && (u64)imm < (u64)bits; + } + /* AND/ORR/EOR #bitmask: encodable as an AArch64 logical immediate. */ + if ((BinOp)op == BO_AND || (BinOp)op == BO_OR || (BinOp)op == BO_XOR) { + u32 sf = type_size32(t, type) == 8u ? 1u : 0u; + u32 N, immr, imms; + return aa64_logimm_encode((u64)imm, sf, &N, &immr, &imms); + } return 0; case NATIVE_IMM_CMP: /* cmp lowers to subs #imm12; cmn (negative) is not wired, so require a @@ -2023,6 +2035,8 @@ static void aa_set_bytes(NativeTarget* t, NativeAddr dst, NativeLoc byte_value, } static void aa_lsl_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh); +static void aa_lsr_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh); +static void aa_asr_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh); /* Strength-reduce `mul rd, rn, #imm` for the constants accepted by * aa64_imul_strength_reducible into a single non-mul instruction. Callers @@ -2142,6 +2156,30 @@ static void aa_binop(NativeTarget* t, BinOp op, NativeLoc dst, NativeLoc lhs, aa_emit_mul_const_imm(t, sf, rd, rn, rhs.v.imm); return; } + if (rhs.kind == NATIVE_LOC_IMM && + (op == BO_SHL || op == BO_SHR_U || op == BO_SHR_S)) { + u32 shamt = (u32)rhs.v.imm; /* imm_legal guarantees 0 <= imm < datasize */ + if (op == BO_SHL) + aa_lsl_imm(t, sf, rd, rn, shamt); + else if (op == BO_SHR_U) + aa_lsr_imm(t, sf, rd, rn, shamt); + else + aa_asr_imm(t, sf, rd, rn, shamt); + return; + } + if (rhs.kind == NATIVE_LOC_IMM && + (op == BO_AND || op == BO_OR || op == BO_XOR)) { + u32 N, immr, imms; + if (!aa64_logimm_encode((u64)rhs.v.imm, sf, &N, &immr, &imms)) + aa_panic(aa_of(t), "logical immediate not encodable"); + if (op == BO_AND) + aa_emit32(t->mc, aa64_and_imm(sf, rd, rn, N, immr, imms)); + else if (op == BO_OR) + aa_emit32(t->mc, aa64_orr_imm(sf, rd, rn, N, immr, imms)); + else + aa_emit32(t->mc, aa64_eor_imm(sf, rd, rn, N, immr, imms)); + return; + } switch (op) { case BO_IADD: aa_emit32(t->mc, aa64_add(sf, rd, rn, rm)); diff --git a/src/cg/native_direct_target.c b/src/cg/native_direct_target.c @@ -832,6 +832,19 @@ static NativeLoc nd_dst_scratch(NativeDirectTarget* d, Operand dst) { return nd_reg_loc(r, cls, dst.type); } +/* Arithmetic/compare RHS: keep a constant operand as an immediate when the + * target can encode it for `use` (so no scratch register is spent + * materializing it), mirroring the optimizer's operand_imm_or_reg. Falls back + * to a register when there is no imm_legal hook (e.g. a recording mock target) + * or the constant is not target-legal for this op. */ +static NativeLoc nd_rhs_imm_or_reg(NativeDirectTarget* d, NativeImmUse use, + u32 sub, Operand b) { + if (b.kind == OPK_IMM && d->native->imm_legal && + d->native->imm_legal(d->native, use, sub, b.type, b.v.imm)) + return nd_loc_imm(b.v.imm, b.type); + return nd_materialize_operand(d, b); +} + /* Register a pure-compute op writes its result into. For a cacheable local that * is the local's cache register (reused or freshly allocated), pinned for the * instruction; nd_dst_writeback then marks it dirty without storing. Otherwise @@ -1012,7 +1025,7 @@ static void nd_cmp_branch(CgTarget* t, CmpOp op, Operand a, Operand b, NativeLoc ar, br; nd_flush_all(d); ar = nd_materialize_operand(d, a); - br = nd_materialize_operand(d, b); + br = nd_rhs_imm_or_reg(d, NATIVE_IMM_CMP, (u32)op, b); ND_REQUIRE_NATIVE(d, cmp_branch, "target does not emit compare branches"); d->native->cmp_branch(d->native, op, ar, br, nd_mc_label(d, label)); nd_release_materialized(d, br); @@ -1431,7 +1444,7 @@ static void nd_bitfield_store(CgTarget* t, Operand record_addr, Operand src, static void nd_binop(CgTarget* t, BinOp op, Operand dst, Operand a, Operand b) { NativeDirectTarget* d = nd_of(t); NativeLoc ar = nd_materialize_operand(d, a); - NativeLoc br = nd_materialize_operand(d, b); + NativeLoc br = nd_rhs_imm_or_reg(d, NATIVE_IMM_BINOP, (u32)op, b); NativeLoc dr = nd_dst_reg(d, dst); ND_REQUIRE_NATIVE(d, binop, "target does not emit binary ops"); d->native->binop(d->native, op, dr, ar, br); @@ -1453,7 +1466,7 @@ static void nd_unop(CgTarget* t, UnOp op, Operand dst, Operand a) { static void nd_cmp(CgTarget* t, CmpOp op, Operand dst, Operand a, Operand b) { NativeDirectTarget* d = nd_of(t); NativeLoc ar = nd_materialize_operand(d, a); - NativeLoc br = nd_materialize_operand(d, b); + NativeLoc br = nd_rhs_imm_or_reg(d, NATIVE_IMM_CMP, (u32)op, b); NativeLoc dr = nd_dst_reg(d, dst); ND_REQUIRE_NATIVE(d, cmp, "target does not emit compares"); d->native->cmp(d->native, op, dr, ar, br); diff --git a/test/cg/strength_reduce_test.c b/test/cg/strength_reduce_test.c @@ -1,15 +1,21 @@ /* strength_reduce_test — asserts the -O0 semantic peephole (src/cg/fold.c) * turns multiply / unsigned-divide / unsigned-remainder by a power-of-two - * constant into a shift / and. + * constant into a shift / and, AND that the shift/and is emitted in its + * immediate-operand form (no scratch register spent materializing the count). * * Each case builds a tiny * i64 f(i64 x) { return x <op> <imm>; } - * through the public CG API, emits an ELF object, disassembles its .text, and - * inspects the instruction mnemonics. The assertions are deliberately on - * substrings ("mul" / "div" / "rem") rather than exact shift mnemonics, so the - * test is robust to per-arch shift aliasing (lsl/ubfm, slli, shl, ...): a - * power-of-two operand must make the multiply/divide instruction disappear, and - * a non-power-of-two operand (and signed division, left to -O1+) must keep it. + * through the public CG API, emits an ELF object, and disassembles its .text. + * + * Two layers of assertion: + * 1. The multiply/divide/remainder instruction disappears for power-of-two + * operands and survives for non-power-of-two (and for signed division, + * which is deliberately left to -O1+). Checked on mnemonic substrings + * ("mul"/"div"/"rem"), robust to per-arch aliasing. + * 2. The resulting shift/and carries an immediate operand (x64 "$imm", + * aa64 "#imm", rv64 i-suffixed mnemonic) rather than a materialized + * register — the NativeDirectTarget imm_legal pass-through (Piece A) plus + * the aa64 immediate shift/bitmask encodings (Piece B). * * Run by: make test-cg-api */ @@ -27,13 +33,9 @@ static KitUnit g_u; #define EXPECT(cond, ...) CU_EXPECT(&g_u, cond, __VA_ARGS__) -typedef struct BinFnSpec { +typedef struct EmitCtx { KitCgIntBinOp op; int64_t imm; -} BinFnSpec; - -typedef struct EmitCtx { - BinFnSpec spec; KitObjBuilder* ob; } EmitCtx; @@ -92,8 +94,8 @@ static KitStatus emit_binop_fn(KitCompiler* c, void* user) { kit_cg_push_local(cg, param); kit_cg_load(cg, (KitCgMemAccess){.type = i64_ty, .align = kit_cg_type_align(c, i64_ty)}); - kit_cg_push_int(cg, (uint64_t)ctx->spec.imm, i64_ty); - kit_cg_int_binop(cg, ctx->spec.op, KIT_CG_INTOP_NONE); + kit_cg_push_int(cg, (uint64_t)ctx->imm, i64_ty); + kit_cg_int_binop(cg, ctx->op, KIT_CG_INTOP_NONE); kit_cg_ret(cg); kit_cg_func_end(cg); @@ -102,10 +104,21 @@ static KitStatus emit_binop_fn(KitCompiler* c, void* user) { return KIT_OK; } -/* Emit the op, disassemble .text, and report whether any mnemonic contains - * `needle`. Returns 1 if present, 0 if absent, -1 on harness failure. */ -static int op_text_has_mnem(KitArchKind arch, KitCgIntBinOp op, int64_t imm, - const char* needle) { +typedef struct DisInsn { + char mnem[16]; + char ops[64]; +} DisInsn; + +static void slice_to_buf(KitSlice s, char* buf, size_t cap) { + size_t n = s.len < cap - 1 ? s.len : cap - 1; + if (s.s && n) memcpy(buf, s.s, n); + buf[n] = '\0'; +} + +/* Build the op and disassemble its .text into `out` (up to `cap` insns). + * Returns the instruction count, or -1 on harness failure. */ +static int op_disasm(KitArchKind arch, KitCgIntBinOp op, int64_t imm, + DisInsn* out, int cap) { KitTarget target = kit_unit_target(arch, KIT_OS_LINUX, KIT_OBJ_ELF); KitCompiler* c = NULL; EmitCtx ctx; @@ -119,12 +132,11 @@ static int op_text_has_mnem(KitArchKind arch, KitCgIntBinOp op, int64_t imm, KitDisasmIter* it = NULL; KitInsn insn; int result = -1; - int found = 0; - size_t nlen = strlen(needle); + int n = 0; memset(&ctx, 0, sizeof ctx); - ctx.spec.op = op; - ctx.spec.imm = imm; + ctx.op = op; + ctx.imm = imm; if (kit_compiler_new(target, &g_u.ctx, &c) != KIT_OK || !c) return -1; if (kit_frontend_run(c, emit_binop_fn, &ctx) != KIT_OK) goto done; @@ -132,11 +144,9 @@ static int op_text_has_mnem(KitArchKind arch, KitCgIntBinOp op, int64_t imm, if (kit_obj_builder_emit(ctx.ob, writer) != KIT_OK) goto done; bytes.data = kit_writer_mem_bytes(writer, &len); bytes.len = len; - if (kit_obj_open(&g_u.ctx, KIT_SLICE_LIT("<sr-test>"), &bytes, &file) != - KIT_OK) + if (kit_obj_open(&g_u.ctx, KIT_SLICE_LIT("<sr-test>"), &bytes, &file) != KIT_OK) goto done; - if (kit_obj_section_by_name(file, KIT_SLICE_LIT(".text"), &text_sec) != - KIT_OK) + if (kit_obj_section_by_name(file, KIT_SLICE_LIT(".text"), &text_sec) != KIT_OK) goto done; if (kit_obj_section_data(file, text_sec, &data, &len) != KIT_OK) goto done; @@ -145,20 +155,12 @@ static int op_text_has_mnem(KitArchKind arch, KitCgIntBinOp op, int64_t imm, dc.context = g_u.ctx; if (kit_disasm_iter_new(&dc, data, len, 0, file, &it) != KIT_OK || !it) goto done; - while (kit_disasm_iter_next(it, &insn) == KIT_ITER_ITEM) { - const char* m = insn.mnemonic.s; - size_t mlen = insn.mnemonic.len; - if (m && mlen >= nlen) { - for (size_t i = 0; i + nlen <= mlen; ++i) { - if (memcmp(m + i, needle, nlen) == 0) { - found = 1; - break; - } - } - } - if (found) break; + while (n < cap && kit_disasm_iter_next(it, &insn) == KIT_ITER_ITEM) { + slice_to_buf(insn.mnemonic, out[n].mnem, sizeof out[n].mnem); + slice_to_buf(insn.operands, out[n].ops, sizeof out[n].ops); + ++n; } - result = found; + result = n; done: if (it) kit_disasm_iter_free(it); @@ -169,57 +171,92 @@ done: return result; } -static const char* arch_name(KitArchKind arch) { - switch (arch) { - case KIT_ARCH_X86_64: - return "x86_64"; - case KIT_ARCH_ARM_64: - return "aarch64"; - case KIT_ARCH_RV64: - return "riscv64"; - default: - return "arch"; - } +/* 1 if any instruction's mnemonic contains `mnem`. */ +static int have_mnem(const DisInsn* in, int n, const char* mnem) { + for (int i = 0; i < n; ++i) + if (strstr(in[i].mnem, mnem)) return 1; + return 0; +} + +/* 1 if the first instruction whose mnemonic contains `mnem` has `marker` in its + * operands (an empty marker matches any). This is how we tell an immediate + * operand form (x64 "$", aa64 "#") from a register form, and — via the + * i-suffixed RISC-V mnemonics — picks out slli/srli/andi over sll/srl/and. */ +static int imm_form(const DisInsn* in, int n, const char* mnem, + const char* marker) { + for (int i = 0; i < n; ++i) + if (strstr(in[i].mnem, mnem)) return strstr(in[i].ops, marker) != NULL; + return 0; } -static void check_arch(KitArchKind arch) { - const char* an = arch_name(arch); +typedef struct ArchExpect { + KitArchKind arch; + const char* name; + const char* shl_mnem; /* immediate shift-left mnemonic (x*2^k) */ + const char* shr_mnem; /* immediate logical shift-right (x u/ 2^k) */ + const char* and_mnem; /* immediate bitwise-and (x u% 2^k) */ + const char* mark; /* operand marker for an immediate ("" if in mnemonic) */ +} ArchExpect; + +static void check_arch(const ArchExpect* ex) { + const char* an = ex->name; + DisInsn buf[128]; + int n; - /* x * 8 -> shift: no multiply instruction survives. */ - EXPECT(op_text_has_mnem(arch, KIT_CG_INT_MUL, 8, "mul") == 0, + /* --- multiply --- */ + n = op_disasm(ex->arch, KIT_CG_INT_MUL, 8, buf, 128); + EXPECT(n > 0, "[%s] mul8 disasm failed", an); + EXPECT(!have_mnem(buf, n, "mul"), "[%s] x * 8 should strength-reduce to a shift (no 'mul')", an); - /* x * 7 -> still a multiply (not a power of two). */ - EXPECT(op_text_has_mnem(arch, KIT_CG_INT_MUL, 7, "mul") == 1, - "[%s] x * 7 must stay a multiply", an); + EXPECT(imm_form(buf, n, ex->shl_mnem, ex->mark), + "[%s] x * 8 should use the immediate shift form", an); - /* x u/ 8 -> logical shift right: no divide. */ - EXPECT(op_text_has_mnem(arch, KIT_CG_INT_UDIV, 8, "div") == 0, + n = op_disasm(ex->arch, KIT_CG_INT_MUL, 7, buf, 128); + EXPECT(n > 0 && have_mnem(buf, n, "mul"), "[%s] x * 7 must stay a multiply", + an); + + /* --- unsigned divide --- */ + n = op_disasm(ex->arch, KIT_CG_INT_UDIV, 8, buf, 128); + EXPECT(n > 0, "[%s] udiv8 disasm failed", an); + EXPECT(!have_mnem(buf, n, "div"), "[%s] x u/ 8 should strength-reduce to a shift (no 'div')", an); - /* x u/ 7 -> still a divide. */ - EXPECT(op_text_has_mnem(arch, KIT_CG_INT_UDIV, 7, "div") == 1, - "[%s] x u/ 7 must stay a divide", an); + EXPECT(imm_form(buf, n, ex->shr_mnem, ex->mark), + "[%s] x u/ 8 should use the immediate shift form", an); + + n = op_disasm(ex->arch, KIT_CG_INT_UDIV, 7, buf, 128); + EXPECT(n > 0 && have_mnem(buf, n, "div"), "[%s] x u/ 7 must stay a divide", + an); - /* x u% 8 -> and: no divide and no remainder instruction. */ - EXPECT(op_text_has_mnem(arch, KIT_CG_INT_UREM, 8, "div") == 0 && - op_text_has_mnem(arch, KIT_CG_INT_UREM, 8, "rem") == 0, + /* --- unsigned remainder --- */ + n = op_disasm(ex->arch, KIT_CG_INT_UREM, 8, buf, 128); + EXPECT(n > 0, "[%s] urem8 disasm failed", an); + EXPECT(!have_mnem(buf, n, "div") && !have_mnem(buf, n, "rem"), "[%s] x u%% 8 should strength-reduce to an and (no 'div'/'rem')", an); - /* x u% 7 -> still a divide or remainder. */ - EXPECT(op_text_has_mnem(arch, KIT_CG_INT_UREM, 7, "div") == 1 || - op_text_has_mnem(arch, KIT_CG_INT_UREM, 7, "rem") == 1, + EXPECT(imm_form(buf, n, ex->and_mnem, ex->mark), + "[%s] x u%% 8 should use the immediate and form", an); + + n = op_disasm(ex->arch, KIT_CG_INT_UREM, 7, buf, 128); + EXPECT(n > 0 && (have_mnem(buf, n, "div") || have_mnem(buf, n, "rem")), "[%s] x u%% 7 must stay a divide/remainder", an); - /* Signed division by a power of two needs a sign-bias sequence; the -O0 - * peephole deliberately leaves it for the optimizer, so the divide stays. */ - EXPECT(op_text_has_mnem(arch, KIT_CG_INT_SDIV, 8, "div") == 1, + /* --- signed divide: deliberately left for the optimizer --- */ + n = op_disasm(ex->arch, KIT_CG_INT_SDIV, 8, buf, 128); + EXPECT(n > 0 && have_mnem(buf, n, "div"), "[%s] signed x / 8 is left as a divide at -O0", an); } int main(void) { + /* x64 AT&T immediates read "$N"; aa64 reads "#N"; rv64 folds the immediate + * into the mnemonic (slli/srli/andi), so its operand marker is empty. */ + static const ArchExpect arches[] = { + {KIT_ARCH_X86_64, "x86_64", "shl", "shr", "and", "$"}, + {KIT_ARCH_ARM_64, "aarch64", "lsl", "lsr", "and", "#"}, + {KIT_ARCH_RV64, "riscv64", "slli", "srli", "andi", ""}, + }; kit_unit_init(&g_u); g_u.ctx.now = -1; - check_arch(KIT_ARCH_X86_64); - check_arch(KIT_ARCH_ARM_64); - check_arch(KIT_ARCH_RV64); + for (size_t i = 0; i < sizeof arches / sizeof arches[0]; ++i) + check_arch(&arches[i]); kit_unit_summary(&g_u, "strength_reduce_test"); return kit_unit_status(&g_u); }