commit 27f088297163bc712f339f7ee942cceb56b0ae57
parent 21aa437c9addefcc191d61f2b9c39f2bccc6efcc
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Tue, 2 Jun 2026 15:38:04 -0700
cg/fold: strength-reduce mul/udiv/urem by powers of two at -O0
The -O0 semantic peephole (src/cg/fold.c) already folds constants and
arithmetic identities but left multiply/divide/remainder by a power-of-two
constant as full mul/div instructions. Add api_try_strength_reduce, called
from api_cg_binop after constant folding, to rewrite:
x * 2^k -> x << k (commutative: immediate on either side; the
variable is canonicalized to the shift's LHS)
x u/ 2^k -> x u>> k
x u% 2^k -> x & (2^k - 1)
The rewrite happens before the delay/identity/fallback machinery, so the
resulting shift/and flows through the same paths as any other shift. Gated
on flags==0, so trap/saturate/exact ops keep their semantics. Signed
division/remainder needs a sign-bias sequence and is left to the optimizer.
Also reduces array-index scaling (i * sizeof(T)) to a shift when the element
size is a power of two.
test/cg/strength_reduce_test.c builds tiny `i64 f(i64 x){ return x op imm; }`
functions through the public CG API, disassembles .text, and asserts the
mul/div instruction disappears for power-of-two operands (and survives for
non-power-of-two and for signed division) across x86_64, aarch64, riscv64.
Diffstat:
6 files changed, 320 insertions(+), 2 deletions(-)
diff --git a/src/cg/arith.c b/src/cg/arith.c
@@ -49,6 +49,11 @@ void api_cg_binop(KitCg* g, BinOp iop, u32 flags) {
return;
}
+ /* Strength-reduce mul/udiv/urem by a power of two into shift/and. Rewrites
+ * iop and the operands in place; the result flows through the same delay /
+ * identity / fallback machinery as any other shift or and. */
+ if (!flags) api_try_strength_reduce(g, &iop, ty, &a, &b);
+
if (api_can_delay_int_arith(g, ty, flags) &&
api_try_fold_arith_chain(g, iop, ty, &a, &b, &folded_sv)) {
api_release(g, &a);
diff --git a/src/cg/fold.c b/src/cg/fold.c
@@ -405,6 +405,88 @@ int api_can_delay_int_arith(KitCg* g, KitCgTypeId ty, u32 flags) {
return g && !flags && api_foldable_int_type(g->c, ty, &width);
}
+/* Strength reduction: rewrite a multiply / unsigned-divide / unsigned-remainder
+ * by a power-of-two immediate into a shift / and. Operates on the freshly
+ * popped operands (`a` = LHS, `b` = RHS) and the op; on a match it rewrites
+ * *op, *a and *b in place and returns 1.
+ *
+ * Only the cases whose plain wrapping equivalence is exact live here:
+ * x * 2^k -> x << k (multiply is commutative: the immediate may be on
+ * either side; the result is canonicalized so the
+ * variable is the shift's LHS)
+ * x u/ 2^k -> x u>> k
+ * x u% 2^k -> x & (2^k - 1)
+ * Signed division/remainder by a power of two needs a sign-bias sequence
+ * (round toward zero on negatives), so it is left to the optimizer. The caller
+ * gates this on flags==0, so trap/saturate/exact semantics never reach here. */
+static int api_imm_is_pow2(u64 v, u32* log2_out) {
+ u32 k;
+ if (v == 0 || (v & (v - 1u)) != 0) return 0;
+ for (k = 0; k < 64u; ++k) {
+ if (v == (1ull << k)) {
+ *log2_out = k;
+ return 1;
+ }
+ }
+ return 0;
+}
+
+int api_try_strength_reduce(KitCg* g, BinOp* op, KitCgTypeId ty, ApiSValue* a,
+ ApiSValue* b) {
+ u32 width;
+ u32 k = 0;
+ u64 v;
+ int a_imm, b_imm;
+ if (!g || !op || !a || !b) return 0;
+ if (!api_foldable_int_type(g->c, ty, &width)) return 0;
+ a_imm = a->kind == SV_OPERAND && a->op.kind == OPK_IMM;
+ b_imm = b->kind == SV_OPERAND && b->op.kind == OPK_IMM;
+ switch (*op) {
+ case BO_IMUL: {
+ /* Both-imm is constant-folded before we get here; need exactly one, and
+ * the variable operand is canonicalized to the shift's LHS. */
+ ApiSValue* imm_sv;
+ int imm_on_lhs;
+ if (b_imm && !a_imm) {
+ imm_sv = b;
+ imm_on_lhs = 0;
+ } else if (a_imm && !b_imm) {
+ imm_sv = a;
+ imm_on_lhs = 1;
+ } else {
+ return 0;
+ }
+ v = api_mask_width((u64)imm_sv->op.v.imm, width);
+ if (!api_imm_is_pow2(v, &k) || k == 0) return 0;
+ if (imm_on_lhs) {
+ ApiSValue tmp = *a;
+ *a = *b;
+ *b = tmp;
+ }
+ b->op.v.imm = (i64)k;
+ b->op.type = ty;
+ *op = BO_SHL;
+ return 1;
+ }
+ case BO_UDIV:
+ case BO_UREM:
+ if (!b_imm || a_imm) return 0; /* RHS imm, LHS a real value */
+ v = api_mask_width((u64)b->op.v.imm, width);
+ if (!api_imm_is_pow2(v, &k) || k == 0) return 0;
+ if (*op == BO_UDIV) {
+ b->op.v.imm = (i64)k;
+ *op = BO_SHR_U;
+ } else {
+ b->op.v.imm = (i64)api_mask_width(v - 1u, width);
+ *op = BO_AND;
+ }
+ b->op.type = ty;
+ return 1;
+ default:
+ return 0;
+ }
+}
+
int api_op_is_int_identity(KitCg* g, BinOp op, KitCgTypeId ty, i64 imm) {
u32 width;
u64 v;
diff --git a/src/cg/fold.h b/src/cg/fold.h
@@ -74,6 +74,8 @@ void api_release_arith(KitCg* g, ApiSValue* sv);
void api_materialize_arith_to(KitCg* g, ApiSValue* sv, Operand dst);
int api_arith_rhs_reusable(const ApiSValue* sv);
int api_can_delay_int_arith(KitCg* g, KitCgTypeId ty, u32 flags);
+int api_try_strength_reduce(KitCg* g, BinOp* op, KitCgTypeId ty, ApiSValue* a,
+ ApiSValue* b);
int api_op_is_int_identity(KitCg* g, BinOp op, KitCgTypeId ty, i64 imm);
int api_try_collapse_binop_identity(KitCg* g, BinOp op, KitCgTypeId ty,
ApiSValue* a, ApiSValue* b, ApiSValue* out);
diff --git a/test/cg/strength_reduce_test.c b/test/cg/strength_reduce_test.c
@@ -0,0 +1,225 @@
+/* strength_reduce_test — asserts the -O0 semantic peephole (src/cg/fold.c)
+ * turns multiply / unsigned-divide / unsigned-remainder by a power-of-two
+ * constant into a shift / and.
+ *
+ * Each case builds a tiny
+ * i64 f(i64 x) { return x <op> <imm>; }
+ * through the public CG API, emits an ELF object, disassembles its .text, and
+ * inspects the instruction mnemonics. The assertions are deliberately on
+ * substrings ("mul" / "div" / "rem") rather than exact shift mnemonics, so the
+ * test is robust to per-arch shift aliasing (lsl/ubfm, slli, shl, ...): a
+ * power-of-two operand must make the multiply/divide instruction disappear, and
+ * a non-power-of-two operand (and signed division, left to -O1+) must keep it.
+ *
+ * Run by: make test-cg-api
+ */
+
+#include <kit/cg.h>
+#include <kit/disasm.h>
+#include <kit/frontend.h>
+#include <kit/object.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "lib/kit_unit.h"
+
+static KitUnit g_u;
+#define EXPECT(cond, ...) CU_EXPECT(&g_u, cond, __VA_ARGS__)
+
+typedef struct BinFnSpec {
+ KitCgIntBinOp op;
+ int64_t imm;
+} BinFnSpec;
+
+typedef struct EmitCtx {
+ BinFnSpec spec;
+ KitObjBuilder* ob;
+} EmitCtx;
+
+/* Frontend callback: build `i64 f(i64 x) { return x op imm; }` into ctx->ob. */
+static KitStatus emit_binop_fn(KitCompiler* c, void* user) {
+ EmitCtx* ctx = (EmitCtx*)user;
+ KitCg* cg = NULL;
+ KitCgBuiltinTypes bi;
+ KitCgTypeId i64_ty;
+ KitCgFuncParam param_desc;
+ KitCgFuncResult sig_result;
+ KitCgFuncSig sig;
+ KitCgDecl decl;
+ KitCgSym sym;
+ KitCgLocalAttrs attrs;
+ KitCgLocal param;
+ KitCodeOptions opts;
+
+ if (kit_obj_builder_new(c, &ctx->ob) != KIT_OK) return KIT_ERR;
+ if (kit_cg_new(c, &cg) != KIT_OK || !cg) return KIT_ERR;
+ memset(&opts, 0, sizeof opts);
+ opts.opt_level = 0; /* the -O0 peephole is the subject under test */
+ if (kit_cg_begin_obj(cg, ctx->ob, &opts) != KIT_OK) return KIT_ERR;
+
+ bi = kit_cg_builtin_types(c);
+ i64_ty = bi.id[KIT_CG_BUILTIN_I64];
+
+ memset(¶m_desc, 0, sizeof param_desc);
+ param_desc.type = i64_ty;
+ memset(&sig_result, 0, sizeof sig_result);
+ sig_result.type = i64_ty;
+ memset(&sig, 0, sizeof sig);
+ sig.params = ¶m_desc;
+ sig.nparams = 1;
+ sig.results = &sig_result;
+ sig.nresults = 1;
+ sig.call_conv = KIT_CG_CC_TARGET_C;
+
+ memset(&decl, 0, sizeof decl);
+ decl.kind = KIT_CG_DECL_FUNC;
+ decl.linkage_name = kit_sym_intern(c, kit_slice_cstr("f"));
+ decl.display_name = decl.linkage_name;
+ decl.type = kit_cg_type_func(c, sig);
+ decl.sym.bind = KIT_SB_GLOBAL;
+ decl.sym.visibility = KIT_CG_VIS_DEFAULT;
+ sym = kit_cg_decl(cg, decl);
+ if (sym == KIT_CG_SYM_NONE) return KIT_ERR;
+
+ kit_cg_func_begin(cg, sym);
+ memset(&attrs, 0, sizeof attrs);
+ attrs.name = kit_sym_intern(c, KIT_SLICE_LIT("x"));
+ param = kit_cg_param(cg, 0, i64_ty, attrs);
+ if (param == KIT_CG_LOCAL_NONE) return KIT_ERR;
+
+ /* return x <op> imm; */
+ kit_cg_push_local(cg, param);
+ kit_cg_load(cg, (KitCgMemAccess){.type = i64_ty,
+ .align = kit_cg_type_align(c, i64_ty)});
+ kit_cg_push_int(cg, (uint64_t)ctx->spec.imm, i64_ty);
+ kit_cg_int_binop(cg, ctx->spec.op, KIT_CG_INTOP_NONE);
+ kit_cg_ret(cg);
+ kit_cg_func_end(cg);
+
+ if (kit_cg_end_obj(cg) != KIT_OK) return KIT_ERR;
+ kit_cg_free(cg);
+ return KIT_OK;
+}
+
+/* Emit the op, disassemble .text, and report whether any mnemonic contains
+ * `needle`. Returns 1 if present, 0 if absent, -1 on harness failure. */
+static int op_text_has_mnem(KitArchKind arch, KitCgIntBinOp op, int64_t imm,
+ const char* needle) {
+ KitTarget target = kit_unit_target(arch, KIT_OS_LINUX, KIT_OBJ_ELF);
+ KitCompiler* c = NULL;
+ EmitCtx ctx;
+ KitWriter* writer = NULL;
+ KitObjFile* file = NULL;
+ KitSlice bytes;
+ KitObjSection text_sec;
+ const uint8_t* data = NULL;
+ size_t len = 0;
+ KitDisasmContext dc;
+ KitDisasmIter* it = NULL;
+ KitInsn insn;
+ int result = -1;
+ int found = 0;
+ size_t nlen = strlen(needle);
+
+ memset(&ctx, 0, sizeof ctx);
+ ctx.spec.op = op;
+ ctx.spec.imm = imm;
+
+ if (kit_compiler_new(target, &g_u.ctx, &c) != KIT_OK || !c) return -1;
+ if (kit_frontend_run(c, emit_binop_fn, &ctx) != KIT_OK) goto done;
+ if (kit_writer_mem(&g_u.heap, &writer) != KIT_OK || !writer) goto done;
+ if (kit_obj_builder_emit(ctx.ob, writer) != KIT_OK) goto done;
+ bytes.data = kit_writer_mem_bytes(writer, &len);
+ bytes.len = len;
+ if (kit_obj_open(&g_u.ctx, KIT_SLICE_LIT("<sr-test>"), &bytes, &file) !=
+ KIT_OK)
+ goto done;
+ if (kit_obj_section_by_name(file, KIT_SLICE_LIT(".text"), &text_sec) !=
+ KIT_OK)
+ goto done;
+ if (kit_obj_section_data(file, text_sec, &data, &len) != KIT_OK) goto done;
+
+ memset(&dc, 0, sizeof dc);
+ dc.target = target;
+ dc.context = g_u.ctx;
+ if (kit_disasm_iter_new(&dc, data, len, 0, file, &it) != KIT_OK || !it)
+ goto done;
+ while (kit_disasm_iter_next(it, &insn) == KIT_ITER_ITEM) {
+ const char* m = insn.mnemonic.s;
+ size_t mlen = insn.mnemonic.len;
+ if (m && mlen >= nlen) {
+ for (size_t i = 0; i + nlen <= mlen; ++i) {
+ if (memcmp(m + i, needle, nlen) == 0) {
+ found = 1;
+ break;
+ }
+ }
+ }
+ if (found) break;
+ }
+ result = found;
+
+done:
+ if (it) kit_disasm_iter_free(it);
+ if (file) kit_obj_free(file);
+ if (writer) kit_writer_close(writer);
+ if (ctx.ob) kit_obj_builder_free(ctx.ob);
+ kit_compiler_free(c);
+ return result;
+}
+
+static const char* arch_name(KitArchKind arch) {
+ switch (arch) {
+ case KIT_ARCH_X86_64:
+ return "x86_64";
+ case KIT_ARCH_ARM_64:
+ return "aarch64";
+ case KIT_ARCH_RV64:
+ return "riscv64";
+ default:
+ return "arch";
+ }
+}
+
+static void check_arch(KitArchKind arch) {
+ const char* an = arch_name(arch);
+
+ /* x * 8 -> shift: no multiply instruction survives. */
+ EXPECT(op_text_has_mnem(arch, KIT_CG_INT_MUL, 8, "mul") == 0,
+ "[%s] x * 8 should strength-reduce to a shift (no 'mul')", an);
+ /* x * 7 -> still a multiply (not a power of two). */
+ EXPECT(op_text_has_mnem(arch, KIT_CG_INT_MUL, 7, "mul") == 1,
+ "[%s] x * 7 must stay a multiply", an);
+
+ /* x u/ 8 -> logical shift right: no divide. */
+ EXPECT(op_text_has_mnem(arch, KIT_CG_INT_UDIV, 8, "div") == 0,
+ "[%s] x u/ 8 should strength-reduce to a shift (no 'div')", an);
+ /* x u/ 7 -> still a divide. */
+ EXPECT(op_text_has_mnem(arch, KIT_CG_INT_UDIV, 7, "div") == 1,
+ "[%s] x u/ 7 must stay a divide", an);
+
+ /* x u% 8 -> and: no divide and no remainder instruction. */
+ EXPECT(op_text_has_mnem(arch, KIT_CG_INT_UREM, 8, "div") == 0 &&
+ op_text_has_mnem(arch, KIT_CG_INT_UREM, 8, "rem") == 0,
+ "[%s] x u%% 8 should strength-reduce to an and (no 'div'/'rem')", an);
+ /* x u% 7 -> still a divide or remainder. */
+ EXPECT(op_text_has_mnem(arch, KIT_CG_INT_UREM, 7, "div") == 1 ||
+ op_text_has_mnem(arch, KIT_CG_INT_UREM, 7, "rem") == 1,
+ "[%s] x u%% 7 must stay a divide/remainder", an);
+
+ /* Signed division by a power of two needs a sign-bias sequence; the -O0
+ * peephole deliberately leaves it for the optimizer, so the divide stays. */
+ EXPECT(op_text_has_mnem(arch, KIT_CG_INT_SDIV, 8, "div") == 1,
+ "[%s] signed x / 8 is left as a divide at -O0", an);
+}
+
+int main(void) {
+ kit_unit_init(&g_u);
+ g_u.ctx.now = -1;
+ check_arch(KIT_ARCH_X86_64);
+ check_arch(KIT_ARCH_ARM_64);
+ check_arch(KIT_ARCH_RV64);
+ kit_unit_summary(&g_u, "strength_reduce_test");
+ return kit_unit_status(&g_u);
+}
diff --git a/test/lib/unit.mk b/test/lib/unit.mk
@@ -31,12 +31,13 @@ UNIT_CFLAGS_INTERNAL = $(HOST_CFLAGS) -Iinclude -Isrc -Itest
UNIT_TESTS_PUBLIC := \
ar_test cg_api_test cg_switch_test cg_fp_cmp_test hash_test rv64_jit_test \
- aa64_inline_test rv64_inline_test x64_inline_test
+ aa64_inline_test rv64_inline_test x64_inline_test strength_reduce_test
ar_test_SRC := test/ar/ar_test.c
hash_test_SRC := test/api/hash_test.c
cg_api_test_SRC := test/api/cg_type_test.c
cg_switch_test_SRC := test/api/cg_switch_test.c
cg_fp_cmp_test_SRC := test/api/cg_fp_cmp_test.c
+strength_reduce_test_SRC := test/cg/strength_reduce_test.c
rv64_jit_test_SRC := test/link/rv64_jit_test.c
aa64_inline_test_SRC := test/arch/aa64_inline_test.c
rv64_inline_test_SRC := test/arch/rv64_inline_test.c
diff --git a/test/test.mk b/test/test.mk
@@ -379,15 +379,18 @@ test-interp-toy: bin
CG_API_TEST_BIN = build/test/cg_api_test
CG_SWITCH_TEST_BIN = build/test/cg_switch_test
CG_FP_CMP_TEST_BIN = build/test/cg_fp_cmp_test
+STRENGTH_REDUCE_TEST_BIN = build/test/strength_reduce_test
HASH_TEST_BIN = build/test/hash_test
ABI_CLASSIFY_TEST_BIN = build/test/abi_classify_test
IR_RECORDER_TEST_BIN = build/test/ir_recorder_test
NATIVE_DIRECT_TARGET_TEST_BIN = build/test/native_direct_target_test
-test-cg-api: $(CG_API_TEST_BIN) $(CG_SWITCH_TEST_BIN) $(CG_FP_CMP_TEST_BIN)
+test-cg-api: $(CG_API_TEST_BIN) $(CG_SWITCH_TEST_BIN) $(CG_FP_CMP_TEST_BIN) \
+ $(STRENGTH_REDUCE_TEST_BIN)
$(CG_API_TEST_BIN)
$(CG_SWITCH_TEST_BIN)
$(CG_FP_CMP_TEST_BIN)
+ $(STRENGTH_REDUCE_TEST_BIN)
test-hash: $(HASH_TEST_BIN)
$(HASH_TEST_BIN)