kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 27f088297163bc712f339f7ee942cceb56b0ae57
parent 21aa437c9addefcc191d61f2b9c39f2bccc6efcc
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Tue,  2 Jun 2026 15:38:04 -0700

cg/fold: strength-reduce mul/udiv/urem by powers of two at -O0

The -O0 semantic peephole (src/cg/fold.c) already folds constants and
arithmetic identities but left multiply/divide/remainder by a power-of-two
constant as full mul/div instructions. Add api_try_strength_reduce, called
from api_cg_binop after constant folding, to rewrite:

  x * 2^k   -> x << k     (commutative: immediate on either side; the
                           variable is canonicalized to the shift's LHS)
  x u/ 2^k  -> x u>> k
  x u% 2^k  -> x & (2^k - 1)

The rewrite happens before the delay/identity/fallback machinery, so the
resulting shift/and flows through the same paths as any other shift. Gated
on flags==0, so trap/saturate/exact ops keep their semantics. Signed
division/remainder needs a sign-bias sequence and is left to the optimizer.

Also reduces array-index scaling (i * sizeof(T)) to a shift when the element
size is a power of two.

test/cg/strength_reduce_test.c builds tiny `i64 f(i64 x){ return x op imm; }`
functions through the public CG API, disassembles .text, and asserts the
mul/div instruction disappears for power-of-two operands (and survives for
non-power-of-two and for signed division) across x86_64, aarch64, riscv64.

Diffstat:
Msrc/cg/arith.c | 5+++++
Msrc/cg/fold.c | 82+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/cg/fold.h | 2++
Atest/cg/strength_reduce_test.c | 225+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtest/lib/unit.mk | 3++-
Mtest/test.mk | 5++++-
6 files changed, 320 insertions(+), 2 deletions(-)

diff --git a/src/cg/arith.c b/src/cg/arith.c @@ -49,6 +49,11 @@ void api_cg_binop(KitCg* g, BinOp iop, u32 flags) { return; } + /* Strength-reduce mul/udiv/urem by a power of two into shift/and. Rewrites + * iop and the operands in place; the result flows through the same delay / + * identity / fallback machinery as any other shift or and. */ + if (!flags) api_try_strength_reduce(g, &iop, ty, &a, &b); + if (api_can_delay_int_arith(g, ty, flags) && api_try_fold_arith_chain(g, iop, ty, &a, &b, &folded_sv)) { api_release(g, &a); diff --git a/src/cg/fold.c b/src/cg/fold.c @@ -405,6 +405,88 @@ int api_can_delay_int_arith(KitCg* g, KitCgTypeId ty, u32 flags) { return g && !flags && api_foldable_int_type(g->c, ty, &width); } +/* Strength reduction: rewrite a multiply / unsigned-divide / unsigned-remainder + * by a power-of-two immediate into a shift / and. Operates on the freshly + * popped operands (`a` = LHS, `b` = RHS) and the op; on a match it rewrites + * *op, *a and *b in place and returns 1. + * + * Only the cases whose plain wrapping equivalence is exact live here: + * x * 2^k -> x << k (multiply is commutative: the immediate may be on + * either side; the result is canonicalized so the + * variable is the shift's LHS) + * x u/ 2^k -> x u>> k + * x u% 2^k -> x & (2^k - 1) + * Signed division/remainder by a power of two needs a sign-bias sequence + * (round toward zero on negatives), so it is left to the optimizer. The caller + * gates this on flags==0, so trap/saturate/exact semantics never reach here. */ +static int api_imm_is_pow2(u64 v, u32* log2_out) { + u32 k; + if (v == 0 || (v & (v - 1u)) != 0) return 0; + for (k = 0; k < 64u; ++k) { + if (v == (1ull << k)) { + *log2_out = k; + return 1; + } + } + return 0; +} + +int api_try_strength_reduce(KitCg* g, BinOp* op, KitCgTypeId ty, ApiSValue* a, + ApiSValue* b) { + u32 width; + u32 k = 0; + u64 v; + int a_imm, b_imm; + if (!g || !op || !a || !b) return 0; + if (!api_foldable_int_type(g->c, ty, &width)) return 0; + a_imm = a->kind == SV_OPERAND && a->op.kind == OPK_IMM; + b_imm = b->kind == SV_OPERAND && b->op.kind == OPK_IMM; + switch (*op) { + case BO_IMUL: { + /* Both-imm is constant-folded before we get here; need exactly one, and + * the variable operand is canonicalized to the shift's LHS. */ + ApiSValue* imm_sv; + int imm_on_lhs; + if (b_imm && !a_imm) { + imm_sv = b; + imm_on_lhs = 0; + } else if (a_imm && !b_imm) { + imm_sv = a; + imm_on_lhs = 1; + } else { + return 0; + } + v = api_mask_width((u64)imm_sv->op.v.imm, width); + if (!api_imm_is_pow2(v, &k) || k == 0) return 0; + if (imm_on_lhs) { + ApiSValue tmp = *a; + *a = *b; + *b = tmp; + } + b->op.v.imm = (i64)k; + b->op.type = ty; + *op = BO_SHL; + return 1; + } + case BO_UDIV: + case BO_UREM: + if (!b_imm || a_imm) return 0; /* RHS imm, LHS a real value */ + v = api_mask_width((u64)b->op.v.imm, width); + if (!api_imm_is_pow2(v, &k) || k == 0) return 0; + if (*op == BO_UDIV) { + b->op.v.imm = (i64)k; + *op = BO_SHR_U; + } else { + b->op.v.imm = (i64)api_mask_width(v - 1u, width); + *op = BO_AND; + } + b->op.type = ty; + return 1; + default: + return 0; + } +} + int api_op_is_int_identity(KitCg* g, BinOp op, KitCgTypeId ty, i64 imm) { u32 width; u64 v; diff --git a/src/cg/fold.h b/src/cg/fold.h @@ -74,6 +74,8 @@ void api_release_arith(KitCg* g, ApiSValue* sv); void api_materialize_arith_to(KitCg* g, ApiSValue* sv, Operand dst); int api_arith_rhs_reusable(const ApiSValue* sv); int api_can_delay_int_arith(KitCg* g, KitCgTypeId ty, u32 flags); +int api_try_strength_reduce(KitCg* g, BinOp* op, KitCgTypeId ty, ApiSValue* a, + ApiSValue* b); int api_op_is_int_identity(KitCg* g, BinOp op, KitCgTypeId ty, i64 imm); int api_try_collapse_binop_identity(KitCg* g, BinOp op, KitCgTypeId ty, ApiSValue* a, ApiSValue* b, ApiSValue* out); diff --git a/test/cg/strength_reduce_test.c b/test/cg/strength_reduce_test.c @@ -0,0 +1,225 @@ +/* strength_reduce_test — asserts the -O0 semantic peephole (src/cg/fold.c) + * turns multiply / unsigned-divide / unsigned-remainder by a power-of-two + * constant into a shift / and. + * + * Each case builds a tiny + * i64 f(i64 x) { return x <op> <imm>; } + * through the public CG API, emits an ELF object, disassembles its .text, and + * inspects the instruction mnemonics. The assertions are deliberately on + * substrings ("mul" / "div" / "rem") rather than exact shift mnemonics, so the + * test is robust to per-arch shift aliasing (lsl/ubfm, slli, shl, ...): a + * power-of-two operand must make the multiply/divide instruction disappear, and + * a non-power-of-two operand (and signed division, left to -O1+) must keep it. + * + * Run by: make test-cg-api + */ + +#include <kit/cg.h> +#include <kit/disasm.h> +#include <kit/frontend.h> +#include <kit/object.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> + +#include "lib/kit_unit.h" + +static KitUnit g_u; +#define EXPECT(cond, ...) CU_EXPECT(&g_u, cond, __VA_ARGS__) + +typedef struct BinFnSpec { + KitCgIntBinOp op; + int64_t imm; +} BinFnSpec; + +typedef struct EmitCtx { + BinFnSpec spec; + KitObjBuilder* ob; +} EmitCtx; + +/* Frontend callback: build `i64 f(i64 x) { return x op imm; }` into ctx->ob. */ +static KitStatus emit_binop_fn(KitCompiler* c, void* user) { + EmitCtx* ctx = (EmitCtx*)user; + KitCg* cg = NULL; + KitCgBuiltinTypes bi; + KitCgTypeId i64_ty; + KitCgFuncParam param_desc; + KitCgFuncResult sig_result; + KitCgFuncSig sig; + KitCgDecl decl; + KitCgSym sym; + KitCgLocalAttrs attrs; + KitCgLocal param; + KitCodeOptions opts; + + if (kit_obj_builder_new(c, &ctx->ob) != KIT_OK) return KIT_ERR; + if (kit_cg_new(c, &cg) != KIT_OK || !cg) return KIT_ERR; + memset(&opts, 0, sizeof opts); + opts.opt_level = 0; /* the -O0 peephole is the subject under test */ + if (kit_cg_begin_obj(cg, ctx->ob, &opts) != KIT_OK) return KIT_ERR; + + bi = kit_cg_builtin_types(c); + i64_ty = bi.id[KIT_CG_BUILTIN_I64]; + + memset(&param_desc, 0, sizeof param_desc); + param_desc.type = i64_ty; + memset(&sig_result, 0, sizeof sig_result); + sig_result.type = i64_ty; + memset(&sig, 0, sizeof sig); + sig.params = &param_desc; + sig.nparams = 1; + sig.results = &sig_result; + sig.nresults = 1; + sig.call_conv = KIT_CG_CC_TARGET_C; + + memset(&decl, 0, sizeof decl); + decl.kind = KIT_CG_DECL_FUNC; + decl.linkage_name = kit_sym_intern(c, kit_slice_cstr("f")); + decl.display_name = decl.linkage_name; + decl.type = kit_cg_type_func(c, sig); + decl.sym.bind = KIT_SB_GLOBAL; + decl.sym.visibility = KIT_CG_VIS_DEFAULT; + sym = kit_cg_decl(cg, decl); + if (sym == KIT_CG_SYM_NONE) return KIT_ERR; + + kit_cg_func_begin(cg, sym); + memset(&attrs, 0, sizeof attrs); + attrs.name = kit_sym_intern(c, KIT_SLICE_LIT("x")); + param = kit_cg_param(cg, 0, i64_ty, attrs); + if (param == KIT_CG_LOCAL_NONE) return KIT_ERR; + + /* return x <op> imm; */ + kit_cg_push_local(cg, param); + kit_cg_load(cg, (KitCgMemAccess){.type = i64_ty, + .align = kit_cg_type_align(c, i64_ty)}); + kit_cg_push_int(cg, (uint64_t)ctx->spec.imm, i64_ty); + kit_cg_int_binop(cg, ctx->spec.op, KIT_CG_INTOP_NONE); + kit_cg_ret(cg); + kit_cg_func_end(cg); + + if (kit_cg_end_obj(cg) != KIT_OK) return KIT_ERR; + kit_cg_free(cg); + return KIT_OK; +} + +/* Emit the op, disassemble .text, and report whether any mnemonic contains + * `needle`. Returns 1 if present, 0 if absent, -1 on harness failure. */ +static int op_text_has_mnem(KitArchKind arch, KitCgIntBinOp op, int64_t imm, + const char* needle) { + KitTarget target = kit_unit_target(arch, KIT_OS_LINUX, KIT_OBJ_ELF); + KitCompiler* c = NULL; + EmitCtx ctx; + KitWriter* writer = NULL; + KitObjFile* file = NULL; + KitSlice bytes; + KitObjSection text_sec; + const uint8_t* data = NULL; + size_t len = 0; + KitDisasmContext dc; + KitDisasmIter* it = NULL; + KitInsn insn; + int result = -1; + int found = 0; + size_t nlen = strlen(needle); + + memset(&ctx, 0, sizeof ctx); + ctx.spec.op = op; + ctx.spec.imm = imm; + + if (kit_compiler_new(target, &g_u.ctx, &c) != KIT_OK || !c) return -1; + if (kit_frontend_run(c, emit_binop_fn, &ctx) != KIT_OK) goto done; + if (kit_writer_mem(&g_u.heap, &writer) != KIT_OK || !writer) goto done; + if (kit_obj_builder_emit(ctx.ob, writer) != KIT_OK) goto done; + bytes.data = kit_writer_mem_bytes(writer, &len); + bytes.len = len; + if (kit_obj_open(&g_u.ctx, KIT_SLICE_LIT("<sr-test>"), &bytes, &file) != + KIT_OK) + goto done; + if (kit_obj_section_by_name(file, KIT_SLICE_LIT(".text"), &text_sec) != + KIT_OK) + goto done; + if (kit_obj_section_data(file, text_sec, &data, &len) != KIT_OK) goto done; + + memset(&dc, 0, sizeof dc); + dc.target = target; + dc.context = g_u.ctx; + if (kit_disasm_iter_new(&dc, data, len, 0, file, &it) != KIT_OK || !it) + goto done; + while (kit_disasm_iter_next(it, &insn) == KIT_ITER_ITEM) { + const char* m = insn.mnemonic.s; + size_t mlen = insn.mnemonic.len; + if (m && mlen >= nlen) { + for (size_t i = 0; i + nlen <= mlen; ++i) { + if (memcmp(m + i, needle, nlen) == 0) { + found = 1; + break; + } + } + } + if (found) break; + } + result = found; + +done: + if (it) kit_disasm_iter_free(it); + if (file) kit_obj_free(file); + if (writer) kit_writer_close(writer); + if (ctx.ob) kit_obj_builder_free(ctx.ob); + kit_compiler_free(c); + return result; +} + +static const char* arch_name(KitArchKind arch) { + switch (arch) { + case KIT_ARCH_X86_64: + return "x86_64"; + case KIT_ARCH_ARM_64: + return "aarch64"; + case KIT_ARCH_RV64: + return "riscv64"; + default: + return "arch"; + } +} + +static void check_arch(KitArchKind arch) { + const char* an = arch_name(arch); + + /* x * 8 -> shift: no multiply instruction survives. */ + EXPECT(op_text_has_mnem(arch, KIT_CG_INT_MUL, 8, "mul") == 0, + "[%s] x * 8 should strength-reduce to a shift (no 'mul')", an); + /* x * 7 -> still a multiply (not a power of two). */ + EXPECT(op_text_has_mnem(arch, KIT_CG_INT_MUL, 7, "mul") == 1, + "[%s] x * 7 must stay a multiply", an); + + /* x u/ 8 -> logical shift right: no divide. */ + EXPECT(op_text_has_mnem(arch, KIT_CG_INT_UDIV, 8, "div") == 0, + "[%s] x u/ 8 should strength-reduce to a shift (no 'div')", an); + /* x u/ 7 -> still a divide. */ + EXPECT(op_text_has_mnem(arch, KIT_CG_INT_UDIV, 7, "div") == 1, + "[%s] x u/ 7 must stay a divide", an); + + /* x u% 8 -> and: no divide and no remainder instruction. */ + EXPECT(op_text_has_mnem(arch, KIT_CG_INT_UREM, 8, "div") == 0 && + op_text_has_mnem(arch, KIT_CG_INT_UREM, 8, "rem") == 0, + "[%s] x u%% 8 should strength-reduce to an and (no 'div'/'rem')", an); + /* x u% 7 -> still a divide or remainder. */ + EXPECT(op_text_has_mnem(arch, KIT_CG_INT_UREM, 7, "div") == 1 || + op_text_has_mnem(arch, KIT_CG_INT_UREM, 7, "rem") == 1, + "[%s] x u%% 7 must stay a divide/remainder", an); + + /* Signed division by a power of two needs a sign-bias sequence; the -O0 + * peephole deliberately leaves it for the optimizer, so the divide stays. */ + EXPECT(op_text_has_mnem(arch, KIT_CG_INT_SDIV, 8, "div") == 1, + "[%s] signed x / 8 is left as a divide at -O0", an); +} + +int main(void) { + kit_unit_init(&g_u); + g_u.ctx.now = -1; + check_arch(KIT_ARCH_X86_64); + check_arch(KIT_ARCH_ARM_64); + check_arch(KIT_ARCH_RV64); + kit_unit_summary(&g_u, "strength_reduce_test"); + return kit_unit_status(&g_u); +} diff --git a/test/lib/unit.mk b/test/lib/unit.mk @@ -31,12 +31,13 @@ UNIT_CFLAGS_INTERNAL = $(HOST_CFLAGS) -Iinclude -Isrc -Itest UNIT_TESTS_PUBLIC := \ ar_test cg_api_test cg_switch_test cg_fp_cmp_test hash_test rv64_jit_test \ - aa64_inline_test rv64_inline_test x64_inline_test + aa64_inline_test rv64_inline_test x64_inline_test strength_reduce_test ar_test_SRC := test/ar/ar_test.c hash_test_SRC := test/api/hash_test.c cg_api_test_SRC := test/api/cg_type_test.c cg_switch_test_SRC := test/api/cg_switch_test.c cg_fp_cmp_test_SRC := test/api/cg_fp_cmp_test.c +strength_reduce_test_SRC := test/cg/strength_reduce_test.c rv64_jit_test_SRC := test/link/rv64_jit_test.c aa64_inline_test_SRC := test/arch/aa64_inline_test.c rv64_inline_test_SRC := test/arch/rv64_inline_test.c diff --git a/test/test.mk b/test/test.mk @@ -379,15 +379,18 @@ test-interp-toy: bin CG_API_TEST_BIN = build/test/cg_api_test CG_SWITCH_TEST_BIN = build/test/cg_switch_test CG_FP_CMP_TEST_BIN = build/test/cg_fp_cmp_test +STRENGTH_REDUCE_TEST_BIN = build/test/strength_reduce_test HASH_TEST_BIN = build/test/hash_test ABI_CLASSIFY_TEST_BIN = build/test/abi_classify_test IR_RECORDER_TEST_BIN = build/test/ir_recorder_test NATIVE_DIRECT_TARGET_TEST_BIN = build/test/native_direct_target_test -test-cg-api: $(CG_API_TEST_BIN) $(CG_SWITCH_TEST_BIN) $(CG_FP_CMP_TEST_BIN) +test-cg-api: $(CG_API_TEST_BIN) $(CG_SWITCH_TEST_BIN) $(CG_FP_CMP_TEST_BIN) \ + $(STRENGTH_REDUCE_TEST_BIN) $(CG_API_TEST_BIN) $(CG_SWITCH_TEST_BIN) $(CG_FP_CMP_TEST_BIN) + $(STRENGTH_REDUCE_TEST_BIN) test-hash: $(HASH_TEST_BIN) $(HASH_TEST_BIN)