commit 1bb03d8f2ccde5f3d4f12ddc5391b4674b4000f5
parent 57579de1edd9188b89bd36ff9a223373b31339b0
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Wed, 27 May 2026 14:31:08 -0700
opt: fix two O1 aa64 codegen-correctness bugs
Both reproduced as crashes only at -O1 on the pass_native_emit /
opt_mir_combine path (the -O0 native_direct_target path was unaffected),
and both predate the recent immediate-fold / local-cache work.
- pass_native_emit (collapse_addr_to_reg): materialize a collapsed
address into a reserved scratch register instead of reusing the base
register as the load_addr destination. The allocator can keep the base
value live past the memory op (e.g. a pointer stored into several of its
own fields and then returned); the in-place `add base, base, #off`
corrupted it, so later field stores and the return read garbage.
(lists benchmark: SIGSEGV)
- pass_combine (count_uses_in_live_range): decide whether a producer's
def is killed in-block from the instruction's precise def/clobber set
(opt_hard_inst_use_def) rather than a blanket clobber-barrier test. A
call clobbers only its caller-saved set, so a value in a callee-saved
register survives it; treating every call as killing every register made
`killed` spuriously true, letting combine skip the cross-block live-out
check and delete a still-live def. (strcat benchmark: SIGABRT from
freeing an uninitialized pointer)
opt_bench_compare.py now includes mir-c2m alongside cfree in the
vs-baseline comparison tables.
Verified on aa64: test-opt, test-toy (1022/0), test-aa64-inline green;
full O1 benchmark sweep clean (no timeouts/mismatches); minimal repros
pass at O0 and O1.
Diffstat:
3 files changed, 34 insertions(+), 14 deletions(-)
diff --git a/scripts/opt_bench_compare.py b/scripts/opt_bench_compare.py
@@ -105,14 +105,14 @@ def main():
# Show order: base row first, then cfree/cfree-run at each opt level
show = [(base_tool, base_opt)]
seen = {(base_tool, base_opt)}
- for t in ("cfree", "cfree-run"):
+ for t in ("cfree", "cfree-run", "mir-c2m"):
for o in all_opts:
key = (t, o)
if key not in seen and any((t, o, b) in idx for b in all_benches):
show.append(key)
seen.add(key)
- print(f"cfree vs {base_tool} -O{base_opt} [{os.path.basename(csv_path)}]")
+ print(f"cfree/mir vs {base_tool} -O{base_opt} [{os.path.basename(csv_path)}]")
print()
# Per-benchmark table
diff --git a/src/opt/pass_combine.c b/src/opt/pass_combine.c
@@ -512,7 +512,22 @@ static void ctx_restore_removed_def(CombineCtx* ctx, const Operand* reg,
* Without this live-range scoping, reuse of a scratch physreg later in the
* block (`sxtw x12, ...; mov x13, x12; mov x12, ...`) makes every fold look
* multi-use and combine rejects almost everything. */
-static int count_uses_in_live_range(const Block* bl, i32 prod_idx,
+/* True if `in` redefines or clobbers `def`'s physical register. Consults the
+ * instruction's precise def/clobber set (the same one liveness uses) rather than
+ * a blanket clobber-barrier test: a call clobbers only its caller-saved set, so
+ * a value held in a callee-saved register survives it. Treating every call as
+ * killing every register made `killed` spuriously true and let combine skip the
+ * cross-block live-out check, deleting a still-live def. */
+static int inst_kills_phys_reg(Func* f, const Inst* in, const Operand* def) {
+ OptHardRegSet use, d;
+ if (!def || def->kind != OPK_REG || def->cls >= OPT_REG_CLASSES ||
+ def->v.reg >= 32)
+ return 0;
+ opt_hard_inst_use_def(f, in, &use, &d);
+ return (d.cls[def->cls] & (1u << def->v.reg)) != 0;
+}
+
+static int count_uses_in_live_range(Func* f, const Block* bl, i32 prod_idx,
const Operand* def,
int* killed_in_block_out) {
int n = 0;
@@ -520,7 +535,7 @@ static int count_uses_in_live_range(const Block* bl, i32 prod_idx,
for (i32 i = prod_idx + 1; i < (i32)bl->ninsts; ++i) {
const Inst* in = &bl->insts[i];
n += inst_uses_phys_reg(in, def);
- if (inst_is_clobber_barrier(in) || inst_defines_phys_reg(in, def)) {
+ if (inst_kills_phys_reg(f, in, def)) {
killed = 1;
break;
}
@@ -769,7 +784,7 @@ static int try_substitute_for_reg(CombineCtx* ctx, Inst* in, i32 i, u8 cls,
* this block, the producer's def is killed before any successor sees it,
* so the cross-block live-out check can be skipped. */
int killed = 0;
- int uses_total = count_uses_in_live_range(ctx->bl, prod_idx, &def, &killed);
+ int uses_total = count_uses_in_live_range(ctx->f, ctx->bl, prod_idx, &def, &killed);
if (uses_total != 1) return 0;
if (!killed && opt_block_live_out_has_phys_reg(ctx->f, ctx->hard_live,
ctx->bl->id, &def))
@@ -845,7 +860,7 @@ static int try_addr_synth_one_op(CombineCtx* ctx, Inst* in, i32 i,
* before next redef/barrier). */
int killed = 0;
int uses_after =
- count_uses_in_live_range(ctx->bl, prod_idx, &prod_def, &killed);
+ count_uses_in_live_range(ctx->f, ctx->bl, prod_idx, &prod_def, &killed);
if (uses_after == 1 &&
(killed || !opt_block_live_out_has_phys_reg(
ctx->f, ctx->hard_live, ctx->bl->id, &prod_def))) {
@@ -907,7 +922,7 @@ static int try_addr_synth_one_op(CombineCtx* ctx, Inst* in, i32 i,
Operand prod_def = prod->opnds[0];
int killed = 0;
int uses_after =
- count_uses_in_live_range(ctx->bl, prod_idx, &prod_def, &killed);
+ count_uses_in_live_range(ctx->f, ctx->bl, prod_idx, &prod_def, &killed);
/* `<= 2` allows the degenerate [base == index] case where the SHL
* dst appears twice in the same indirect; rewriting only the index
* still yields equivalent arithmetic when base also held the SHL
@@ -985,7 +1000,7 @@ static int try_sink(CombineCtx* ctx, Inst* in, i32 i) {
* at i). If the live range terminates inside the block, the cross-block
* live-out check is moot. */
int killed = 0;
- int uses_total = count_uses_in_live_range(ctx->bl, prod_idx, &src, &killed);
+ int uses_total = count_uses_in_live_range(ctx->f, ctx->bl, prod_idx, &src, &killed);
if (uses_total != 1) return 0;
if (killed && use_after_clobber_before_redef(ctx->bl, prod_idx, &src))
return 0;
@@ -1047,7 +1062,7 @@ static int try_combine_exts(CombineCtx* ctx, Inst* in, i32 i) {
Operand prod_def = prod->opnds[0];
int killed = 0;
int uses_total =
- count_uses_in_live_range(ctx->bl, prod_idx, &prod_def, &killed);
+ count_uses_in_live_range(ctx->f, ctx->bl, prod_idx, &prod_def, &killed);
if (uses_total != 1) return 0;
if (!killed && opt_block_live_out_has_phys_reg(ctx->f, ctx->hard_live,
ctx->bl->id, &prod_def))
diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c
@@ -362,11 +362,16 @@ static Reg addr_index_reg(const NativeAddr* addr) {
static void collapse_addr_to_reg(NativeEmitCtx* e, NativeAddr* addr,
SrcLoc loc) {
- Reg r = addr_base_reg(addr);
- NativeLoc dst;
- if (r == (Reg)REG_NONE)
- r = scratch_reg(e, NATIVE_REG_INT, REG_NONE, REG_NONE, loc);
- dst = loc_reg(addr->base_type, NATIVE_REG_INT, r);
+ /* Materialize the full address into a reserved scratch register. We must not
+ * reuse the base register as the destination: the register allocator may keep
+ * that value live past this memory op (e.g. a pointer stored into several of
+ * its own fields and then returned), so an in-place `add base, base, #off`
+ * would corrupt it. Avoid both base and index so load_addr can still read
+ * them. */
+ Reg r =
+ scratch_reg(e, NATIVE_REG_INT, addr_base_reg(addr), addr_index_reg(addr),
+ loc);
+ NativeLoc dst = loc_reg(addr->base_type, NATIVE_REG_INT, r);
e->target->load_addr(e->target, dst, *addr);
memset(addr, 0, sizeof *addr);
addr->base_kind = NATIVE_ADDR_BASE_REG;