kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 1bb03d8f2ccde5f3d4f12ddc5391b4674b4000f5
parent 57579de1edd9188b89bd36ff9a223373b31339b0
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Wed, 27 May 2026 14:31:08 -0700

opt: fix two O1 aa64 codegen-correctness bugs

Both reproduced as crashes only at -O1 on the pass_native_emit /
opt_mir_combine path (the -O0 native_direct_target path was unaffected),
and both predate the recent immediate-fold / local-cache work.

- pass_native_emit (collapse_addr_to_reg): materialize a collapsed
  address into a reserved scratch register instead of reusing the base
  register as the load_addr destination. The allocator can keep the base
  value live past the memory op (e.g. a pointer stored into several of its
  own fields and then returned); the in-place `add base, base, #off`
  corrupted it, so later field stores and the return read garbage.
  (lists benchmark: SIGSEGV)

- pass_combine (count_uses_in_live_range): decide whether a producer's
  def is killed in-block from the instruction's precise def/clobber set
  (opt_hard_inst_use_def) rather than a blanket clobber-barrier test. A
  call clobbers only its caller-saved set, so a value in a callee-saved
  register survives it; treating every call as killing every register made
  `killed` spuriously true, letting combine skip the cross-block live-out
  check and delete a still-live def. (strcat benchmark: SIGABRT from
  freeing an uninitialized pointer)

opt_bench_compare.py now includes mir-c2m alongside cfree in the
vs-baseline comparison tables.

Verified on aa64: test-opt, test-toy (1022/0), test-aa64-inline green;
full O1 benchmark sweep clean (no timeouts/mismatches); minimal repros
pass at O0 and O1.

Diffstat:
Mscripts/opt_bench_compare.py | 4++--
Msrc/opt/pass_combine.c | 29++++++++++++++++++++++-------
Msrc/opt/pass_native_emit.c | 15++++++++++-----
3 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/scripts/opt_bench_compare.py b/scripts/opt_bench_compare.py @@ -105,14 +105,14 @@ def main(): # Show order: base row first, then cfree/cfree-run at each opt level show = [(base_tool, base_opt)] seen = {(base_tool, base_opt)} - for t in ("cfree", "cfree-run"): + for t in ("cfree", "cfree-run", "mir-c2m"): for o in all_opts: key = (t, o) if key not in seen and any((t, o, b) in idx for b in all_benches): show.append(key) seen.add(key) - print(f"cfree vs {base_tool} -O{base_opt} [{os.path.basename(csv_path)}]") + print(f"cfree/mir vs {base_tool} -O{base_opt} [{os.path.basename(csv_path)}]") print() # Per-benchmark table diff --git a/src/opt/pass_combine.c b/src/opt/pass_combine.c @@ -512,7 +512,22 @@ static void ctx_restore_removed_def(CombineCtx* ctx, const Operand* reg, * Without this live-range scoping, reuse of a scratch physreg later in the * block (`sxtw x12, ...; mov x13, x12; mov x12, ...`) makes every fold look * multi-use and combine rejects almost everything. */ -static int count_uses_in_live_range(const Block* bl, i32 prod_idx, +/* True if `in` redefines or clobbers `def`'s physical register. Consults the + * instruction's precise def/clobber set (the same one liveness uses) rather than + * a blanket clobber-barrier test: a call clobbers only its caller-saved set, so + * a value held in a callee-saved register survives it. Treating every call as + * killing every register made `killed` spuriously true and let combine skip the + * cross-block live-out check, deleting a still-live def. */ +static int inst_kills_phys_reg(Func* f, const Inst* in, const Operand* def) { + OptHardRegSet use, d; + if (!def || def->kind != OPK_REG || def->cls >= OPT_REG_CLASSES || + def->v.reg >= 32) + return 0; + opt_hard_inst_use_def(f, in, &use, &d); + return (d.cls[def->cls] & (1u << def->v.reg)) != 0; +} + +static int count_uses_in_live_range(Func* f, const Block* bl, i32 prod_idx, const Operand* def, int* killed_in_block_out) { int n = 0; @@ -520,7 +535,7 @@ static int count_uses_in_live_range(const Block* bl, i32 prod_idx, for (i32 i = prod_idx + 1; i < (i32)bl->ninsts; ++i) { const Inst* in = &bl->insts[i]; n += inst_uses_phys_reg(in, def); - if (inst_is_clobber_barrier(in) || inst_defines_phys_reg(in, def)) { + if (inst_kills_phys_reg(f, in, def)) { killed = 1; break; } @@ -769,7 +784,7 @@ static int try_substitute_for_reg(CombineCtx* ctx, Inst* in, i32 i, u8 cls, * this block, the producer's def is killed before any successor sees it, * so the cross-block live-out check can be skipped. */ int killed = 0; - int uses_total = count_uses_in_live_range(ctx->bl, prod_idx, &def, &killed); + int uses_total = count_uses_in_live_range(ctx->f, ctx->bl, prod_idx, &def, &killed); if (uses_total != 1) return 0; if (!killed && opt_block_live_out_has_phys_reg(ctx->f, ctx->hard_live, ctx->bl->id, &def)) @@ -845,7 +860,7 @@ static int try_addr_synth_one_op(CombineCtx* ctx, Inst* in, i32 i, * before next redef/barrier). */ int killed = 0; int uses_after = - count_uses_in_live_range(ctx->bl, prod_idx, &prod_def, &killed); + count_uses_in_live_range(ctx->f, ctx->bl, prod_idx, &prod_def, &killed); if (uses_after == 1 && (killed || !opt_block_live_out_has_phys_reg( ctx->f, ctx->hard_live, ctx->bl->id, &prod_def))) { @@ -907,7 +922,7 @@ static int try_addr_synth_one_op(CombineCtx* ctx, Inst* in, i32 i, Operand prod_def = prod->opnds[0]; int killed = 0; int uses_after = - count_uses_in_live_range(ctx->bl, prod_idx, &prod_def, &killed); + count_uses_in_live_range(ctx->f, ctx->bl, prod_idx, &prod_def, &killed); /* `<= 2` allows the degenerate [base == index] case where the SHL * dst appears twice in the same indirect; rewriting only the index * still yields equivalent arithmetic when base also held the SHL @@ -985,7 +1000,7 @@ static int try_sink(CombineCtx* ctx, Inst* in, i32 i) { * at i). If the live range terminates inside the block, the cross-block * live-out check is moot. */ int killed = 0; - int uses_total = count_uses_in_live_range(ctx->bl, prod_idx, &src, &killed); + int uses_total = count_uses_in_live_range(ctx->f, ctx->bl, prod_idx, &src, &killed); if (uses_total != 1) return 0; if (killed && use_after_clobber_before_redef(ctx->bl, prod_idx, &src)) return 0; @@ -1047,7 +1062,7 @@ static int try_combine_exts(CombineCtx* ctx, Inst* in, i32 i) { Operand prod_def = prod->opnds[0]; int killed = 0; int uses_total = - count_uses_in_live_range(ctx->bl, prod_idx, &prod_def, &killed); + count_uses_in_live_range(ctx->f, ctx->bl, prod_idx, &prod_def, &killed); if (uses_total != 1) return 0; if (!killed && opt_block_live_out_has_phys_reg(ctx->f, ctx->hard_live, ctx->bl->id, &prod_def)) diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c @@ -362,11 +362,16 @@ static Reg addr_index_reg(const NativeAddr* addr) { static void collapse_addr_to_reg(NativeEmitCtx* e, NativeAddr* addr, SrcLoc loc) { - Reg r = addr_base_reg(addr); - NativeLoc dst; - if (r == (Reg)REG_NONE) - r = scratch_reg(e, NATIVE_REG_INT, REG_NONE, REG_NONE, loc); - dst = loc_reg(addr->base_type, NATIVE_REG_INT, r); + /* Materialize the full address into a reserved scratch register. We must not + * reuse the base register as the destination: the register allocator may keep + * that value live past this memory op (e.g. a pointer stored into several of + * its own fields and then returned), so an in-place `add base, base, #off` + * would corrupt it. Avoid both base and index so load_addr can still read + * them. */ + Reg r = + scratch_reg(e, NATIVE_REG_INT, addr_base_reg(addr), addr_index_reg(addr), + loc); + NativeLoc dst = loc_reg(addr->base_type, NATIVE_REG_INT, r); e->target->load_addr(e->target, dst, *addr); memset(addr, 0, sizeof *addr); addr->base_kind = NATIVE_ADDR_BASE_REG;