cg/fold: strength-reduce mul/udiv/urem by powers of two at -O0 - kit

commit 27f088297163bc712f339f7ee942cceb56b0ae57
parent 21aa437c9addefcc191d61f2b9c39f2bccc6efcc
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Tue,  2 Jun 2026 15:38:04 -0700

cg/fold: strength-reduce mul/udiv/urem by powers of two at -O0

The -O0 semantic peephole (src/cg/fold.c) already folds constants and
arithmetic identities but left multiply/divide/remainder by a power-of-two
constant as full mul/div instructions. Add api_try_strength_reduce, called
from api_cg_binop after constant folding, to rewrite:

  x * 2^k   -> x << k     (commutative: immediate on either side; the
                           variable is canonicalized to the shift's LHS)
  x u/ 2^k  -> x u>> k
  x u% 2^k  -> x & (2^k - 1)

The rewrite happens before the delay/identity/fallback machinery, so the
resulting shift/and flows through the same paths as any other shift. Gated
on flags==0, so trap/saturate/exact ops keep their semantics. Signed
division/remainder needs a sign-bias sequence and is left to the optimizer.

Also reduces array-index scaling (i * sizeof(T)) to a shift when the element
size is a power of two.

test/cg/strength_reduce_test.c builds tiny `i64 f(i64 x){ return x op imm; }`
functions through the public CG API, disassembles .text, and asserts the
mul/div instruction disappears for power-of-two operands (and survives for
non-power-of-two and for signed division) across x86_64, aarch64, riscv64.

Diffstat:
M src/cg/arith.c  | 5 +++++
M src/cg/fold.c  | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/cg/fold.h  | 2 ++
A test/cg/strength_reduce_test.c  | 225 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M test/lib/unit.mk  | 3 ++-
M test/test.mk  | 5 ++++-

6 files changed, 320 insertions(+), 2 deletions(-)
diff --git a/src/cg/arith.c b/src/cg/arith.c
@@ -49,6 +49,11 @@ void api_cg_binop(KitCg* g, BinOp iop, u32 flags) {
     return;
   }
 
+  /* Strength-reduce mul/udiv/urem by a power of two into shift/and. Rewrites
+   * iop and the operands in place; the result flows through the same delay /
+   * identity / fallback machinery as any other shift or and. */
+  if (!flags) api_try_strength_reduce(g, &iop, ty, &a, &b);
+
   if (api_can_delay_int_arith(g, ty, flags) &&
       api_try_fold_arith_chain(g, iop, ty, &a, &b, &folded_sv)) {
     api_release(g, &a);
diff --git a/src/cg/fold.c b/src/cg/fold.c
@@ -405,6 +405,88 @@ int api_can_delay_int_arith(KitCg* g, KitCgTypeId ty, u32 flags) {
   return g && !flags && api_foldable_int_type(g->c, ty, &width);
 }
 
+/* Strength reduction: rewrite a multiply / unsigned-divide / unsigned-remainder
+ * by a power-of-two immediate into a shift / and. Operates on the freshly
+ * popped operands (`a` = LHS, `b` = RHS) and the op; on a match it rewrites
+ * *op, *a and *b in place and returns 1.
+ *
+ * Only the cases whose plain wrapping equivalence is exact live here:
+ *   x * 2^k   -> x << k      (multiply is commutative: the immediate may be on
+ *                             either side; the result is canonicalized so the
+ *                             variable is the shift's LHS)
+ *   x u/ 2^k  -> x u>> k
+ *   x u% 2^k  -> x & (2^k - 1)
+ * Signed division/remainder by a power of two needs a sign-bias sequence
+ * (round toward zero on negatives), so it is left to the optimizer. The caller
+ * gates this on flags==0, so trap/saturate/exact semantics never reach here. */
+static int api_imm_is_pow2(u64 v, u32* log2_out) {
+  u32 k;
+  if (v == 0 || (v & (v - 1u)) != 0) return 0;
+  for (k = 0; k < 64u; ++k) {
+    if (v == (1ull << k)) {
+      *log2_out = k;
+      return 1;
+    }
+  }
+  return 0;
+}
+
+int api_try_strength_reduce(KitCg* g, BinOp* op, KitCgTypeId ty, ApiSValue* a,
+                            ApiSValue* b) {
+  u32 width;
+  u32 k = 0;
+  u64 v;
+  int a_imm, b_imm;
+  if (!g || !op || !a || !b) return 0;
+  if (!api_foldable_int_type(g->c, ty, &width)) return 0;
+  a_imm = a->kind == SV_OPERAND && a->op.kind == OPK_IMM;
+  b_imm = b->kind == SV_OPERAND && b->op.kind == OPK_IMM;
+  switch (*op) {
+    case BO_IMUL: {
+      /* Both-imm is constant-folded before we get here; need exactly one, and
+       * the variable operand is canonicalized to the shift's LHS. */
+      ApiSValue* imm_sv;
+      int imm_on_lhs;
+      if (b_imm && !a_imm) {
+        imm_sv = b;
+        imm_on_lhs = 0;
+      } else if (a_imm && !b_imm) {
+        imm_sv = a;
+        imm_on_lhs = 1;
+      } else {
+        return 0;
+      }
+      v = api_mask_width((u64)imm_sv->op.v.imm, width);
+      if (!api_imm_is_pow2(v, &k) || k == 0) return 0;
+      if (imm_on_lhs) {
+        ApiSValue tmp = *a;
+        *a = *b;
+        *b = tmp;
+      }
+      b->op.v.imm = (i64)k;
+      b->op.type = ty;
+      *op = BO_SHL;
+      return 1;
+    }
+    case BO_UDIV:
+    case BO_UREM:
+      if (!b_imm || a_imm) return 0; /* RHS imm, LHS a real value */
+      v = api_mask_width((u64)b->op.v.imm, width);
+      if (!api_imm_is_pow2(v, &k) || k == 0) return 0;
+      if (*op == BO_UDIV) {
+        b->op.v.imm = (i64)k;
+        *op = BO_SHR_U;
+      } else {
+        b->op.v.imm = (i64)api_mask_width(v - 1u, width);
+        *op = BO_AND;
+      }
+      b->op.type = ty;
+      return 1;
+    default:
+      return 0;
+  }
+}
+
 int api_op_is_int_identity(KitCg* g, BinOp op, KitCgTypeId ty, i64 imm) {
   u32 width;
   u64 v;
diff --git a/src/cg/fold.h b/src/cg/fold.h
@@ -74,6 +74,8 @@ void api_release_arith(KitCg* g, ApiSValue* sv);
 void api_materialize_arith_to(KitCg* g, ApiSValue* sv, Operand dst);
 int api_arith_rhs_reusable(const ApiSValue* sv);
 int api_can_delay_int_arith(KitCg* g, KitCgTypeId ty, u32 flags);
+int api_try_strength_reduce(KitCg* g, BinOp* op, KitCgTypeId ty, ApiSValue* a,
+                            ApiSValue* b);
 int api_op_is_int_identity(KitCg* g, BinOp op, KitCgTypeId ty, i64 imm);
 int api_try_collapse_binop_identity(KitCg* g, BinOp op, KitCgTypeId ty,
                                     ApiSValue* a, ApiSValue* b, ApiSValue* out);
diff --git a/test/cg/strength_reduce_test.c b/test/cg/strength_reduce_test.c
@@ -0,0 +1,225 @@
+/* strength_reduce_test — asserts the -O0 semantic peephole (src/cg/fold.c)
+ * turns multiply / unsigned-divide / unsigned-remainder by a power-of-two
+ * constant into a shift / and.
+ *
+ * Each case builds a tiny
+ *     i64 f(i64 x) { return x <op> <imm>; }
+ * through the public CG API, emits an ELF object, disassembles its .text, and
+ * inspects the instruction mnemonics. The assertions are deliberately on
+ * substrings ("mul" / "div" / "rem") rather than exact shift mnemonics, so the
+ * test is robust to per-arch shift aliasing (lsl/ubfm, slli, shl, ...): a
+ * power-of-two operand must make the multiply/divide instruction disappear, and
+ * a non-power-of-two operand (and signed division, left to -O1+) must keep it.
+ *
+ * Run by: make test-cg-api
+ */
+
+#include <kit/cg.h>
+#include <kit/disasm.h>
+#include <kit/frontend.h>
+#include <kit/object.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "lib/kit_unit.h"
+
+static KitUnit g_u;
+#define EXPECT(cond, ...) CU_EXPECT(&g_u, cond, __VA_ARGS__)
+
+typedef struct BinFnSpec {
+  KitCgIntBinOp op;
+  int64_t imm;
+} BinFnSpec;
+
+typedef struct EmitCtx {
+  BinFnSpec spec;
+  KitObjBuilder* ob;
+} EmitCtx;
+
+/* Frontend callback: build `i64 f(i64 x) { return x op imm; }` into ctx->ob. */
+static KitStatus emit_binop_fn(KitCompiler* c, void* user) {
+  EmitCtx* ctx = (EmitCtx*)user;
+  KitCg* cg = NULL;
+  KitCgBuiltinTypes bi;
+  KitCgTypeId i64_ty;
+  KitCgFuncParam param_desc;
+  KitCgFuncResult sig_result;
+  KitCgFuncSig sig;
+  KitCgDecl decl;
+  KitCgSym sym;
+  KitCgLocalAttrs attrs;
+  KitCgLocal param;
+  KitCodeOptions opts;
+
+  if (kit_obj_builder_new(c, &ctx->ob) != KIT_OK) return KIT_ERR;
+  if (kit_cg_new(c, &cg) != KIT_OK || !cg) return KIT_ERR;
+  memset(&opts, 0, sizeof opts);
+  opts.opt_level = 0; /* the -O0 peephole is the subject under test */
+  if (kit_cg_begin_obj(cg, ctx->ob, &opts) != KIT_OK) return KIT_ERR;
+
+  bi = kit_cg_builtin_types(c);
+  i64_ty = bi.id[KIT_CG_BUILTIN_I64];
+
+  memset(&param_desc, 0, sizeof param_desc);
+  param_desc.type = i64_ty;
+  memset(&sig_result, 0, sizeof sig_result);
+  sig_result.type = i64_ty;
+  memset(&sig, 0, sizeof sig);
+  sig.params = &param_desc;
+  sig.nparams = 1;
+  sig.results = &sig_result;
+  sig.nresults = 1;
+  sig.call_conv = KIT_CG_CC_TARGET_C;
+
+  memset(&decl, 0, sizeof decl);
+  decl.kind = KIT_CG_DECL_FUNC;
+  decl.linkage_name = kit_sym_intern(c, kit_slice_cstr("f"));
+  decl.display_name = decl.linkage_name;
+  decl.type = kit_cg_type_func(c, sig);
+  decl.sym.bind = KIT_SB_GLOBAL;
+  decl.sym.visibility = KIT_CG_VIS_DEFAULT;
+  sym = kit_cg_decl(cg, decl);
+  if (sym == KIT_CG_SYM_NONE) return KIT_ERR;
+
+  kit_cg_func_begin(cg, sym);
+  memset(&attrs, 0, sizeof attrs);
+  attrs.name = kit_sym_intern(c, KIT_SLICE_LIT("x"));
+  param = kit_cg_param(cg, 0, i64_ty, attrs);
+  if (param == KIT_CG_LOCAL_NONE) return KIT_ERR;
+
+  /* return x <op> imm; */
+  kit_cg_push_local(cg, param);
+  kit_cg_load(cg, (KitCgMemAccess){.type = i64_ty,
+                                   .align = kit_cg_type_align(c, i64_ty)});
+  kit_cg_push_int(cg, (uint64_t)ctx->spec.imm, i64_ty);
+  kit_cg_int_binop(cg, ctx->spec.op, KIT_CG_INTOP_NONE);
+  kit_cg_ret(cg);
+  kit_cg_func_end(cg);
+
+  if (kit_cg_end_obj(cg) != KIT_OK) return KIT_ERR;
+  kit_cg_free(cg);
+  return KIT_OK;
+}
+
+/* Emit the op, disassemble .text, and report whether any mnemonic contains
+ * `needle`. Returns 1 if present, 0 if absent, -1 on harness failure. */
+static int op_text_has_mnem(KitArchKind arch, KitCgIntBinOp op, int64_t imm,
+                            const char* needle) {
+  KitTarget target = kit_unit_target(arch, KIT_OS_LINUX, KIT_OBJ_ELF);
+  KitCompiler* c = NULL;
+  EmitCtx ctx;
+  KitWriter* writer = NULL;
+  KitObjFile* file = NULL;
+  KitSlice bytes;
+  KitObjSection text_sec;
+  const uint8_t* data = NULL;
+  size_t len = 0;
+  KitDisasmContext dc;
+  KitDisasmIter* it = NULL;
+  KitInsn insn;
+  int result = -1;
+  int found = 0;
+  size_t nlen = strlen(needle);
+
+  memset(&ctx, 0, sizeof ctx);
+  ctx.spec.op = op;
+  ctx.spec.imm = imm;
+
+  if (kit_compiler_new(target, &g_u.ctx, &c) != KIT_OK || !c) return -1;
+  if (kit_frontend_run(c, emit_binop_fn, &ctx) != KIT_OK) goto done;
+  if (kit_writer_mem(&g_u.heap, &writer) != KIT_OK || !writer) goto done;
+  if (kit_obj_builder_emit(ctx.ob, writer) != KIT_OK) goto done;
+  bytes.data = kit_writer_mem_bytes(writer, &len);
+  bytes.len = len;
+  if (kit_obj_open(&g_u.ctx, KIT_SLICE_LIT("<sr-test>"), &bytes, &file) !=
+      KIT_OK)
+    goto done;
+  if (kit_obj_section_by_name(file, KIT_SLICE_LIT(".text"), &text_sec) !=
+      KIT_OK)
+    goto done;
+  if (kit_obj_section_data(file, text_sec, &data, &len) != KIT_OK) goto done;
+
+  memset(&dc, 0, sizeof dc);
+  dc.target = target;
+  dc.context = g_u.ctx;
+  if (kit_disasm_iter_new(&dc, data, len, 0, file, &it) != KIT_OK || !it)
+    goto done;
+  while (kit_disasm_iter_next(it, &insn) == KIT_ITER_ITEM) {
+    const char* m = insn.mnemonic.s;
+    size_t mlen = insn.mnemonic.len;
+    if (m && mlen >= nlen) {
+      for (size_t i = 0; i + nlen <= mlen; ++i) {
+        if (memcmp(m + i, needle, nlen) == 0) {
+          found = 1;
+          break;
+        }
+      }
+    }
+    if (found) break;
+  }
+  result = found;
+
+done:
+  if (it) kit_disasm_iter_free(it);
+  if (file) kit_obj_free(file);
+  if (writer) kit_writer_close(writer);
+  if (ctx.ob) kit_obj_builder_free(ctx.ob);
+  kit_compiler_free(c);
+  return result;
+}
+
+static const char* arch_name(KitArchKind arch) {
+  switch (arch) {
+    case KIT_ARCH_X86_64:
+      return "x86_64";
+    case KIT_ARCH_ARM_64:
+      return "aarch64";
+    case KIT_ARCH_RV64:
+      return "riscv64";
+    default:
+      return "arch";
+  }
+}
+
+static void check_arch(KitArchKind arch) {
+  const char* an = arch_name(arch);
+
+  /* x * 8  -> shift: no multiply instruction survives. */
+  EXPECT(op_text_has_mnem(arch, KIT_CG_INT_MUL, 8, "mul") == 0,
+         "[%s] x * 8 should strength-reduce to a shift (no 'mul')", an);
+  /* x * 7  -> still a multiply (not a power of two). */
+  EXPECT(op_text_has_mnem(arch, KIT_CG_INT_MUL, 7, "mul") == 1,
+         "[%s] x * 7 must stay a multiply", an);
+
+  /* x u/ 8 -> logical shift right: no divide. */
+  EXPECT(op_text_has_mnem(arch, KIT_CG_INT_UDIV, 8, "div") == 0,
+         "[%s] x u/ 8 should strength-reduce to a shift (no 'div')", an);
+  /* x u/ 7 -> still a divide. */
+  EXPECT(op_text_has_mnem(arch, KIT_CG_INT_UDIV, 7, "div") == 1,
+         "[%s] x u/ 7 must stay a divide", an);
+
+  /* x u% 8 -> and: no divide and no remainder instruction. */
+  EXPECT(op_text_has_mnem(arch, KIT_CG_INT_UREM, 8, "div") == 0 &&
+             op_text_has_mnem(arch, KIT_CG_INT_UREM, 8, "rem") == 0,
+         "[%s] x u%% 8 should strength-reduce to an and (no 'div'/'rem')", an);
+  /* x u% 7 -> still a divide or remainder. */
+  EXPECT(op_text_has_mnem(arch, KIT_CG_INT_UREM, 7, "div") == 1 ||
+             op_text_has_mnem(arch, KIT_CG_INT_UREM, 7, "rem") == 1,
+         "[%s] x u%% 7 must stay a divide/remainder", an);
+
+  /* Signed division by a power of two needs a sign-bias sequence; the -O0
+   * peephole deliberately leaves it for the optimizer, so the divide stays. */
+  EXPECT(op_text_has_mnem(arch, KIT_CG_INT_SDIV, 8, "div") == 1,
+         "[%s] signed x / 8 is left as a divide at -O0", an);
+}
+
+int main(void) {
+  kit_unit_init(&g_u);
+  g_u.ctx.now = -1;
+  check_arch(KIT_ARCH_X86_64);
+  check_arch(KIT_ARCH_ARM_64);
+  check_arch(KIT_ARCH_RV64);
+  kit_unit_summary(&g_u, "strength_reduce_test");
+  return kit_unit_status(&g_u);
+}
diff --git a/test/lib/unit.mk b/test/lib/unit.mk
@@ -31,12 +31,13 @@ UNIT_CFLAGS_INTERNAL = $(HOST_CFLAGS) -Iinclude -Isrc -Itest
 
 UNIT_TESTS_PUBLIC := \
     ar_test cg_api_test cg_switch_test cg_fp_cmp_test hash_test rv64_jit_test \
-    aa64_inline_test rv64_inline_test x64_inline_test
+    aa64_inline_test rv64_inline_test x64_inline_test strength_reduce_test
 ar_test_SRC          := test/ar/ar_test.c
 hash_test_SRC        := test/api/hash_test.c
 cg_api_test_SRC      := test/api/cg_type_test.c
 cg_switch_test_SRC   := test/api/cg_switch_test.c
 cg_fp_cmp_test_SRC   := test/api/cg_fp_cmp_test.c
+strength_reduce_test_SRC := test/cg/strength_reduce_test.c
 rv64_jit_test_SRC    := test/link/rv64_jit_test.c
 aa64_inline_test_SRC := test/arch/aa64_inline_test.c
 rv64_inline_test_SRC := test/arch/rv64_inline_test.c
diff --git a/test/test.mk b/test/test.mk
@@ -379,15 +379,18 @@ test-interp-toy: bin
 CG_API_TEST_BIN = build/test/cg_api_test
 CG_SWITCH_TEST_BIN = build/test/cg_switch_test
 CG_FP_CMP_TEST_BIN = build/test/cg_fp_cmp_test
+STRENGTH_REDUCE_TEST_BIN = build/test/strength_reduce_test
 HASH_TEST_BIN = build/test/hash_test
 ABI_CLASSIFY_TEST_BIN = build/test/abi_classify_test
 IR_RECORDER_TEST_BIN = build/test/ir_recorder_test
 NATIVE_DIRECT_TARGET_TEST_BIN = build/test/native_direct_target_test
 
-test-cg-api: $(CG_API_TEST_BIN) $(CG_SWITCH_TEST_BIN) $(CG_FP_CMP_TEST_BIN)
+test-cg-api: $(CG_API_TEST_BIN) $(CG_SWITCH_TEST_BIN) $(CG_FP_CMP_TEST_BIN) \
+             $(STRENGTH_REDUCE_TEST_BIN)
 	$(CG_API_TEST_BIN)
 	$(CG_SWITCH_TEST_BIN)
 	$(CG_FP_CMP_TEST_BIN)
+	$(STRENGTH_REDUCE_TEST_BIN)
 
 test-hash: $(HASH_TEST_BIN)
 	$(HASH_TEST_BIN)

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	src/cg/arith.c	\|	5	+++++
M	src/cg/fold.c	\|	82	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	src/cg/fold.h	\|	2	++
A	test/cg/strength_reduce_test.c	\|	225	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	test/lib/unit.mk	\|	3	++-
M	test/test.mk	\|	5	++++-