commit 251ee264b52abe8f81f692c419a25716672bf9a2
parent e462688b568fc13b28f3db4126fca1da77335b87
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 22 May 2026 11:07:31 -0700
Enable coalescing and addr-xform at O1; add gap analysis and fast-bench mode
Coalescing at O1:
- Remove the allow_live_range_split gate in opt_regalloc so
opt_coalesce_ranges runs unconditionally at both O1 and O2.
- Fix ranges_overlap_kind in pass_coalesce.c: track a unit_count instead
of a boolean. Two or more unit-length overlaps between dst and src are a
real conflict (return 2), not the natural move conflict (return 1). This
prevents unsafe merges when a multi-def pseudo at O1 has unit overlaps
with the source at each def point.
- Update opt_o1_ignores_coalescing -> opt_o1_coalesces_simple_copy and
flip the expected candidate/merge counts.
opt_addr_xform_pregs (pass_o2.c, new):
- PReg-namespace equivalent of opt_addr_xform for the O1 pipeline, which
has no SSA Val namespace or Val-keyed def-use chains.
- Classifies every use of an IR_ADDR_OF result pseudo. If all uses are
foldable (non-observable IR_LOAD/STORE, zero offset, correct operand
index, no aux-operand or non-main-operand references), rewrites those
uses from OPK_INDIRECT(base=p, ofs=0) to OPK_LOCAL and converts the
IR_ADDR_OF to IR_NOP.
- Inserted in opt_run_lowering_pipeline immediately after opt_machinize,
so both O1 and O2 benefit (O2 already runs the SSA variant earlier via
opt_cleanup, but the PReg pass cleans up anything that survived).
Benchmark infrastructure:
- CFREE_OPT_BENCH_FAST=1 mode in opt_bench.sh: O1 only, five benches
(array hash hash2 matrix sieve), 3 compile/run repeats, skips GCC/Clang
native builds. Overridable per-variable as before.
- driver/run.c: when --bench-time is active, emit machine-parseable
"cfree-run <scope> -- <ms> msec" lines instead of the indented tree.
Adds bench_time field to RunMetrics; init path enabled when either
--metrics or --bench-time is set.
- opt_bench.sh: sum_pattern_ms helper sums all matching timing lines from
a --bench-time log, collapsing per-function per-pass entries into a
whole-TU total.
Documentation (doc/OPT_PERF.md):
- O1 and O2 gap analyses vs MIR covering compile-time and runtime deficits
with references to specific pass names and MIR counterparts.
- Iteration notes for the two changes above with benchmark tables showing
cumulative and incremental runtime/compile-time deltas.
Diffstat:
9 files changed, 458 insertions(+), 20 deletions(-)
diff --git a/doc/OPT_PERF.md b/doc/OPT_PERF.md
@@ -198,3 +198,231 @@ Coverage goals:
targeted fixes backed by benchmark deltas and pass-local tests.
4. Add finer cfree timing columns to `scripts/opt_bench.sh` once the driver can
expose parse, optimize, link, and JIT slices in parseable form.
+
+## O1 Gap Analysis vs MIR
+
+cfree `O1` runs only the shared lowering pipeline (`doc/OPT.md`): build_cfg,
+jump_cleanup, simplify_local, machinize, build_loop_tree, live_blocks,
+dead_def_elim, regalloc (simple mode), combine, dce, emit. No SSA-era passes
+run at `O1`, so the gaps differ from `O2`.
+
+Headline at `O1`:
+
+| metric | cfree | MIR |
+| --- | ---: | ---: |
+| compile speed (vs gcc-15 `O2`) | `5.79x` | `2.52x` |
+| runtime speed (vs gcc-15 `O2`) | `0.50x` | `0.57x` |
+| opt+link+JIT (5-case scope) | `4.552 ms` | `0.718 ms` |
+
+cfree wins the whole-pipeline compile metric because of frontend speed, but
+loses the optimized-stage opt+link+JIT split by `6.34x`.
+
+Compile-time gaps unique to `O1`:
+
+- `opt_live_blocks` tracks every pseudo-register uniformly. MIR's
+ `calculate_func_cfg_live_info` narrows to move-related variables at `O1` via
+ `consider_move_vars_only` when coalescing is on, shrinking bitset width and
+ fixpoint iterations.
+- `opt_regalloc` uses sorted interval-vector overlap checks per hard register.
+ MIR uses point-indexed bitsets ORed per program point.
+- `opt_combine` runs a single forward sweep per BB with no fixpoint, so
+ cascading folds either require an earlier pass to enable them or get
+ missed.
+- `opt_dead_def_elim_with_live` runs before allocation and `opt_dce` runs
+ after; some of this work overlaps and may be redundant.
+
+Runtime gaps unique to `O1`:
+
+- No coalescing. cfree gates `opt_coalesce_ranges` on
+ `allow_live_range_split`, which is off at `O1`. Every `IR_COPY` survives to
+ emission unless `opt_combine`'s identity-copy detector catches it post-RA.
+ MIR's `O1` runs `collect_moves`, `consider_move_vars_only`, and `coalesce`.
+- No address-mode synthesis. `opt_ssa_combine` and `opt_addr_xform` only run
+ at `O2`, so `IR_ADDR_OF(local) + indirect load` stays as separate
+ instructions at `O1`.
+- No ext-of-ext semantic fold. `opt_combine` removes only identical convert
+ pairs. MIR's `combine_exts` folds chains like `zext8 -> zext32` into a
+ single extension of the right width.
+- No per-BB combine fixpoint. Single-sweep misses transitive folds where one
+ rewrite enables another.
+
+## O2 Gap Analysis vs MIR
+
+cfree `O2` adds `opt_cleanup` (the SSA-era schedule) ahead of the shared
+lowering pipeline: ssa construction, block cloning, mem2reg, ssa_dce,
+copy_cleanup, addr_xform, simplify, gvn, copy_prop, dse, licm,
+pressure_relief, conventional ssa, ssa_combine, undo ssa, jump_opt. Then the
+shared lowering pipeline runs with coalescing and live-range splitting
+enabled.
+
+Headline at `O2`:
+
+| metric | cfree | MIR |
+| --- | ---: | ---: |
+| compile speed (vs gcc-15 `O2`) | `5.73x` | `2.44x` |
+| runtime speed (vs gcc-15 `O2`) | `0.47x` | `0.79x` |
+| opt+link+JIT (5-case scope) | `4.728 ms` | `1.100 ms` |
+
+cfree `O2` is roughly the same compile speed as `O1` but slower at runtime
+than `O1` on the current mix - the optimization investment is not paying off
+on this benchmark set.
+
+Compile-time gaps:
+
+- `opt_rebuild_def_use` is called wholesale after most mutating passes
+ (`opt_addr_xform`, `opt_make_conventional_ssa`, every iteration of
+ `opt_copy_cleanup`). MIR maintains `ssa_edge_t` lists incrementally.
+- The pipeline interleaves `opt_copy_cleanup` and `opt_ssa_dce` five or more
+ times. MIR folds equivalent work into single passes.
+- Coalescing runs after liveness on the full program. MIR coalesces before
+ liveness, deleting moves first and computing liveness on the reduced
+ program.
+- `opt_build_reg_ssa` and `opt_build_ssa` insert phis at every IDF site
+ without live-in filtering. MIR's demand-driven `get_def` only materializes
+ phis where reaching defs differ, then runs `minimize_ssa` to fixpoint.
+- `opt_dse` runs an inline memory-availability fixpoint per invocation. MIR
+ computes memory liveness once before the pass.
+- LICM rescans dominance to detect loops instead of reusing the loop tree
+ already built by `opt_build_loop_tree`.
+
+Runtime gaps:
+
+- `opt_licm` has no register-pressure cost model. MIR's LICM has a third
+ backward filter pass that skips hoists which would raise pressure unless
+ the operation is expensive (multiply, divide). This is the most likely
+ cause of cfree `O2` trailing cfree `O1` on the current benchmark mix:
+ invariants get hoisted into preheaders, increasing live ranges across the
+ loop body and forcing the allocator to spill what fit in registers at
+ `O1`.
+- LICM also iterates loops in raster order instead of inner-to-outer.
+- `opt_ssa_combine` does not synthesize complex address modes. MIR
+ iteratively decomposes base+index*scale+disp chains via `update_addr_p`
+ validated by `target_memory_ok_p`. Largest single missing codegen pattern
+ for memory-heavy benchmarks (`hash`, `matrix`).
+- `opt_combine` has no per-BB fixpoint (same as `O1`).
+- No ext-of-ext semantic fold (same as `O1`).
+- Live-range splitting is boundary-driven from precomputed ranges and limited
+ to singletons that are not call-crossing. MIR's `get_hard_reg_with_split`
+ searches gaps in already-allocated ranges and picks based on a
+ spiller-vs-spillee frequency profit comparison.
+- GVN branch folding does not re-enqueue dependent blocks within the same
+ pass. New constants only propagate further on the next `opt_cleanup`
+ iteration.
+- `opt_pressure_relief` only sinks `IR_LOAD_IMM`/`IR_CONST_I` candidates.
+ MIR sinks any single-cross-block-use move.
+- `opt_addr_xform` is all-or-nothing: a single non-foldable use prevents
+ elimination. MIR can fold most uses and rewrite the rest.
+
+## Iteration Notes
+
+### Coalescing at O1 (2026-05-22)
+
+Change: remove the `allow_live_range_split` gate around
+`opt_coalesce_ranges` in `opt_regalloc` so coalesce runs at both O1 and
+O2. Initially this produced wrong code on the address-taken branch-join
+pattern (`test/toy/cases/128_o2_branch_join_addr_mem.toy`).
+
+Root cause: in `ranges_overlap_kind` (`src/opt/pass_coalesce.c`), any
+number of unit-length overlaps between two pregs collapsed into a
+single "unit conflict" bit. `group_conflicts` then allowed that single
+unit conflict, treating it as the natural conflict from the move
+itself. At O1 a pseudo with multiple non-SSA defs (e.g. the
+`var x = 0; x = v` pattern) can have unit overlaps with src at every
+def point. Only the move's own def point should be allowed; the others
+are real conflicts. The coalescer was merging the local's pseudo with
+the param `v` (tied to `x1`), then the `LOAD_IMM 0` def of the local
+emitted `movz x1, 0`, clobbering the incoming `v`.
+
+Fix: `ranges_overlap_kind` now counts unit overlaps. Two or more
+distinct unit overlaps are reported as a real conflict (returns 2),
+which blocks the merge.
+
+Measurement (CFREE_OPT_BENCH_FAST=1, 3 compile/run repeats, single
+machine, current load — single-run noise still ~5%):
+
+cfree-run runtime_ms at O1:
+
+| bench | baseline | coalesce | delta |
+| --- | ---: | ---: | ---: |
+| array | `5497.8` | `5344.5` | `-2.8%` |
+| hash | `4497.4` | `4362.4` | `-3.0%` |
+| hash2 | `5688.3` | `6312.0` | `+11.0%` |
+| matrix | `11222.6` | `11089.0` | `-1.2%` |
+| sieve | `9515.9` | `8775.9` | `-7.8%` |
+
+cfree-run compile_ms at O1:
+
+| bench | baseline | coalesce | delta |
+| --- | ---: | ---: | ---: |
+| array | `34.3` | `24.4` | `-29%` |
+| hash | `49.4` | `47.8` | `-3%` |
+| hash2 | `56.2` | `59.0` | `+5%` |
+| matrix | `43.8` | `43.0` | `-2%` |
+| sieve | `26.8` | `32.2` | `+20%` |
+
+The hash2 +11% runtime regression and sieve +20% compile-time
+regression are worth investigating; both runs are single-shot at this
+repeat count and could be partly noise. Average across the five
+benches is roughly neutral on compile time and slightly positive on
+runtime. The change unlocks downstream wins (fewer moves to emit,
+fewer post-RA copies to clean up) and is a precondition for further
+O1 work.
+
+### opt_addr_xform at O1 (2026-05-22)
+
+Change: new pass `opt_addr_xform_pregs` (`src/opt/pass_o2.c`). The
+existing `opt_addr_xform` operates on the SSA Val namespace and uses
+Val-keyed def-use chains, neither of which exists at O1. The new pass
+is a PReg-namespace equivalent that scans each `IR_ADDR_OF` def,
+checks whether every use of the result is the base of a
+non-observable `IR_LOAD`/`IR_STORE` with offset 0 at the right
+operand index, and if so rewrites those uses from
+`OPK_INDIRECT(base=p, ofs=0)` to `OPK_LOCAL(local)` and converts the
+`IR_ADDR_OF` to `IR_NOP`. Aux operands (call args, asm ops,
+intrinsic ops, etc.) blocking folding unconditionally. It runs once
+right after `opt_machinize` in the shared lowering pipeline.
+
+Effect on `same()` from
+`test/toy/cases/128_o2_branch_join_addr_mem.toy`: the read of `x` was
+`sub x1, x29, #8; ldur x0, [x1]` and is now a single
+`ldur x0, [x29, #-8]`.
+
+Measurement (CFREE_OPT_BENCH_FAST=1, 3 compile/run repeats):
+
+cfree-run runtime_ms at O1 (cumulative effect with coalesce):
+
+| bench | baseline | +coalesce+addrxform | delta |
+| --- | ---: | ---: | ---: |
+| array | `5497.8` | `5283.9` | `-3.9%` |
+| hash | `4497.4` | `4475.2` | `-0.5%` |
+| hash2 | `5688.3` | `5818.8` | `+2.3%` |
+| matrix | `11222.6` | `11205.8` | `-0.1%` |
+| sieve | `9515.9` | `8662.2` | `-9.0%` |
+
+Incremental delta of addr_xform on top of coalesce:
+
+| bench | +coalesce only | +coalesce+addrxform | delta |
+| --- | ---: | ---: | ---: |
+| array | `5344.5` | `5283.9` | `-1.1%` |
+| hash | `4362.4` | `4475.2` | `+2.6%` |
+| hash2 | `6312.0` | `5818.8` | `-7.8%` |
+| matrix | `11089.0` | `11205.8` | `+1.1%` |
+| sieve | `8775.9` | `8662.2` | `-1.3%` |
+
+cfree-run compile_ms at O1 (cumulative effect with coalesce):
+
+| bench | baseline | +coalesce+addrxform | delta |
+| --- | ---: | ---: | ---: |
+| array | `34.3` | `24.3` | `-29%` |
+| hash | `49.4` | `62.9` | `+27%` |
+| hash2 | `56.2` | `59.9` | `+6.6%` |
+| matrix | `43.8` | `39.1` | `-10.7%` |
+| sieve | `26.8` | `28.1` | `+4.8%` |
+
+Direction is small positive on runtime overall. sieve and array show
+sustained wins, hash2 partly recovers the +11% coalesce regression
+when addr_xform also runs. Single-shot noise still around 5% at this
+repeat count, so finer measurements (more repeats, controlled load)
+are needed to draw strong conclusions per-benchmark. The change is
+correct (full test suite passes including the previously failing
+`128_o2_branch_join_addr_mem` toy case under both new transforms).
diff --git a/driver/run.c b/driver/run.c
@@ -57,6 +57,7 @@ typedef struct RunMetrics {
CfreeMetrics iface;
RunMetricFrame stack[RUN_METRIC_MAX_DEPTH];
uint32_t depth;
+ int bench_time;
} RunMetrics;
static void run_metrics_scope_begin(void* user, const char* name) {
@@ -73,14 +74,21 @@ static void run_metrics_scope_end(void* user, const char* name) {
uint64_t end_ns;
uint64_t elapsed;
uint32_t depth;
+ const char* scope_name;
if (!m || m->depth == 0) return;
m->depth--;
depth = m->depth;
f = m->stack[depth];
end_ns = driver_now_ns();
elapsed = (end_ns >= f.start_ns) ? (end_ns - f.start_ns) : 0;
- driver_logf("%*s%s %.3f ms", (int)(depth * 2u), "", f.name ? f.name : name,
- (double)elapsed / 1000000.0);
+ scope_name = f.name ? f.name : name;
+ if (m->bench_time) {
+ driver_logf("cfree-run %s -- %.3f msec", scope_name,
+ (double)elapsed / 1000000.0);
+ } else {
+ driver_logf("%*s%s %.3f ms", (int)(depth * 2u), "", scope_name,
+ (double)elapsed / 1000000.0);
+ }
}
static void run_metrics_count(void* user, const char* name, uint64_t value) {
@@ -90,12 +98,13 @@ static void run_metrics_count(void* user, const char* name, uint64_t value) {
(unsigned long long)value);
}
-static void run_metrics_init(RunMetrics* m) {
+static void run_metrics_init(RunMetrics* m, int bench_time) {
m->iface.scope_begin = run_metrics_scope_begin;
m->iface.scope_end = run_metrics_scope_end;
m->iface.count = run_metrics_count;
m->iface.user = m;
m->depth = 0;
+ m->bench_time = bench_time;
}
static void run_metrics_begin(RunMetrics* m, const char* name) {
@@ -569,11 +578,11 @@ int driver_run(int argc, char** argv) {
return 2;
}
- if (ro.metrics) {
- run_metrics_init(&metrics_storage);
+ if (ro.metrics || ro.bench_time) {
+ run_metrics_init(&metrics_storage, ro.bench_time);
metrics = &metrics_storage;
env.metrics = &metrics->iface;
- driver_logf("cfree metrics:");
+ if (ro.metrics && !ro.bench_time) driver_logf("cfree metrics:");
run_metrics_begin(metrics, "run.total");
}
if (ro.bench_time) bench_total_start = driver_now_ns();
diff --git a/scripts/opt_bench.sh b/scripts/opt_bench.sh
@@ -10,10 +10,22 @@ GCC="${GCC:-gcc-15}"
OUT_DIR="${CFREE_OPT_BENCH_OUT:-$ROOT/build/bench/opt}"
CFREE_SYSROOT="${CFREE_OPT_BENCH_SYSROOT:-}"
-LEVELS="${CFREE_OPT_BENCH_LEVELS:-0 1 2}"
-BENCHES="${CFREE_OPT_BENCHES:-array binary-trees except funnkuch-reduce hash hash2 heapsort lists matrix method-call mandelbrot nbody sieve spectral-norm strcat}"
-COMPILE_REPEATS="${CFREE_OPT_BENCH_COMPILE_REPEATS:-1}"
-RUN_REPEATS="${CFREE_OPT_BENCH_RUN_REPEATS:-3}"
+FAST="${CFREE_OPT_BENCH_FAST:-0}"
+if [ "$FAST" = "1" ]; then
+ DEFAULT_LEVELS="1"
+ DEFAULT_BENCHES="array hash hash2 matrix sieve"
+ DEFAULT_COMPILE_REPEATS="3"
+ DEFAULT_RUN_REPEATS="3"
+else
+ DEFAULT_LEVELS="0 1 2"
+ DEFAULT_BENCHES="array binary-trees except funnkuch-reduce hash hash2 heapsort lists matrix method-call mandelbrot nbody sieve spectral-norm strcat"
+ DEFAULT_COMPILE_REPEATS="1"
+ DEFAULT_RUN_REPEATS="3"
+fi
+LEVELS="${CFREE_OPT_BENCH_LEVELS:-$DEFAULT_LEVELS}"
+BENCHES="${CFREE_OPT_BENCHES:-$DEFAULT_BENCHES}"
+COMPILE_REPEATS="${CFREE_OPT_BENCH_COMPILE_REPEATS:-$DEFAULT_COMPILE_REPEATS}"
+RUN_REPEATS="${CFREE_OPT_BENCH_RUN_REPEATS:-$DEFAULT_RUN_REPEATS}"
MIR_MAKE="${MIR_MAKE:-}"
case "$(uname -s 2>/dev/null || true)" in
@@ -126,6 +138,23 @@ parse_mir_ms() {
' "$file"
}
+# Sum all matches for a pattern. cfree-run --bench-time emits per-pass timings
+# once per function compiled, so summing yields the per-pass total for the
+# whole TU. Empty (no matches) prints nothing.
+sum_pattern_ms() {
+ local pattern="$1" file="$2"
+ awk -v pat="$pattern" '
+ $0 ~ pat {
+ v = $(NF - 1) + 0.0
+ unit = $NF
+ if (unit == "usec") v = v / 1000.0
+ total += v
+ hits++
+ }
+ END { if (hits > 0) printf "%.3f\n", total }
+ ' "$file"
+}
+
ensure_mir() {
if [ -x "$MIR_C2M" ]; then
return 0
@@ -471,8 +500,10 @@ for bench in $BENCHES; do
fi
printf '+++++ %s %s +++++\n' "$bench" "$arg_line"
for opt in $LEVELS; do
- bench_native_tool "$bench" "$(tool_label "$GCC")" "$GCC" "$opt" "$src" "$expect" "$arg_line"
- bench_native_tool "$bench" "$(tool_label "$CLANG")" "$CLANG" "$opt" "$src" "$expect" "$arg_line"
+ if [ "$FAST" != "1" ]; then
+ bench_native_tool "$bench" "$(tool_label "$GCC")" "$GCC" "$opt" "$src" "$expect" "$arg_line"
+ bench_native_tool "$bench" "$(tool_label "$CLANG")" "$CLANG" "$opt" "$src" "$expect" "$arg_line"
+ fi
bench_native_tool "$bench" "cfree" "$CFREE cc" "$opt" "$src" "$expect" "$arg_line"
bench_cfree_run "$bench" "$opt" "$src" "$expect" "$arg_line"
bench_mir "$bench" "$opt" "$src" "$expect" "$arg_line"
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -1336,6 +1336,12 @@ static void opt_run_lowering_pipeline(OptImpl* o, const char* total_scope,
metrics_scope_begin(o->c, "opt.machinize");
opt_machinize(o->f, o->target);
metrics_scope_end(o->c, "opt.machinize");
+ /* Fold address-of-local pseudos into direct memory operands. O2 runs the
+ * SSA variant in opt_cleanup; here is the equivalent for the O1 pipeline
+ * which operates on the PReg namespace directly. */
+ metrics_scope_begin(o->c, "opt.addr_xform_pregs");
+ opt_addr_xform_pregs(o->f);
+ metrics_scope_end(o->c, "opt.addr_xform_pregs");
metrics_scope_begin(o->c, "opt.build_loop_tree");
opt_build_loop_tree(o->f);
metrics_scope_end(o->c, "opt.build_loop_tree");
diff --git a/src/opt/opt.h b/src/opt/opt.h
@@ -39,6 +39,7 @@ void opt_block_cloning(Func*);
void opt_build_reg_ssa(Func*);
void opt_build_ssa(Func*);
void opt_addr_xform(Func*);
+void opt_addr_xform_pregs(Func*); /* O1: PReg-namespace addr-of-local folding */
void opt_simplify_local(Func*);
void opt_simplify(Func*);
void opt_gvn(Func*); /* incl. constprop, redundant-load elim */
diff --git a/src/opt/pass_coalesce.c b/src/opt/pass_coalesce.c
@@ -52,7 +52,18 @@ static void coalesce_add_related(CoalesceCtx* c, PReg v) {
}
static int ranges_overlap_kind(const OptLiveRangeSet* ranges, PReg a, PReg b) {
- int unit_only = 0;
+ /* Returns 0 (no overlap), 1 (a single unit-length overlap), or 2 (real
+ * conflict: an overlap longer than one point, or two or more disjoint
+ * unit-length overlaps).
+ *
+ * A single unit-length overlap is the natural conflict of a move
+ * `dst = COPY src`: at the def point of dst, src is still live for one
+ * point. `group_conflicts` allows that single unit conflict to permit
+ * coalescing the move. Two or more unit overlaps at distinct points imply
+ * dst has multiple non-SSA defs whose live ranges each clip src — those
+ * are real conflicts and must block coalescing, otherwise we'd allocate
+ * dst into src's hard register and clobber src at the extra def. */
+ u32 unit_count = 0;
for (u32 ar = ranges->first_range_by_preg[a]; ar != OPT_RANGE_NONE;
ar = ranges->ranges[ar].next) {
const OptLiveRange* ra = &ranges->ranges[ar];
@@ -63,11 +74,11 @@ static int ranges_overlap_kind(const OptLiveRangeSet* ranges, PReg a, PReg b) {
u32 start = ra->start > rb->start ? ra->start : rb->start;
u32 end = ra->end < rb->end ? ra->end : rb->end;
if (end > start + 1u) return 2;
- unit_only = 1;
+ if (++unit_count > 1u) return 2;
}
}
}
- return unit_only ? 1 : 0;
+ return unit_count ? 1 : 0;
}
static void coalesce_set_conflict_bit(u64* bits, u32 nrelated, u32 a, u32 b) {
diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c
@@ -1835,7 +1835,11 @@ void opt_regalloc(Func* f, int allow_live_range_split) {
opt_init_preg_info_from_ranges(f, &ranges);
opt_apply_asm_constraints_from_live(f, &live);
apply_param_incoming_register_hazards(f);
- if (allow_live_range_split) opt_coalesce_ranges(f, &ranges);
+ /* Coalesce runs at both O1 and O2. IRF_NO_COALESCE protects SSA edge copies
+ * inserted by opt_make_conventional_ssa at O2; at O1 no such copies exist.
+ * ranges_overlap_kind treats two or more unit overlaps between dst and src
+ * as a real conflict, so multi-def pseudos at O1 are not unsafely merged. */
+ opt_coalesce_ranges(f, &ranges);
metrics_count(f->c, "opt.live_words", f->opt_live_words);
metrics_count(f->c, "opt.ranges", ranges.nranges);
metrics_count(f->c, "opt.range_points", ranges.point_count);
diff --git a/src/opt/pass_o2.c b/src/opt/pass_o2.c
@@ -596,6 +596,154 @@ void opt_addr_xform(Func* f) {
opt_rebuild_def_use(f);
}
+/* PReg-namespace variant of opt_addr_xform for the O1 pipeline (no SSA, no
+ * Val-keyed def-use chains). Scans the whole function once per candidate
+ * IR_ADDR_OF def to classify uses of its PReg result. The candidate is
+ * foldable only if every use is the base of a non-observable IR_LOAD/STORE
+ * with zero offset and the correct main-operand index. Folding rewrites
+ * those uses from `OPK_INDIRECT(base=p, ofs=0)` to `OPK_LOCAL(local)` and
+ * replaces the IR_ADDR_OF with IR_NOP. */
+
+static int addr_xform_pregs_main_op_foldable(Inst* in, Operand* op,
+ u32 op_idx) {
+ if (op->kind != OPK_INDIRECT) return 0;
+ if (op->v.ind.ofs != 0) return 0;
+ if ((IROp)in->op != IR_LOAD && (IROp)in->op != IR_STORE) return 0;
+ if (opt_mem_observable(&in->extra.mem)) return 0;
+ if ((IROp)in->op == IR_LOAD && op_idx != 1u) return 0;
+ if ((IROp)in->op == IR_STORE && op_idx != 0u) return 0;
+ return 1;
+}
+
+static int addr_xform_pregs_op_uses(const Operand* op, PReg p) {
+ if (!op) return 0;
+ if (op->kind == OPK_REG && (PReg)op->v.reg == p) return 1;
+ if (op->kind == OPK_INDIRECT && (PReg)op->v.ind.base == p) return 1;
+ return 0;
+}
+
+static int addr_xform_pregs_abivalue_uses(const CGABIValue* v, PReg p) {
+ if (!v) return 0;
+ if (addr_xform_pregs_op_uses(&v->storage, p)) return 1;
+ for (u32 i = 0; i < v->nparts; ++i)
+ if (addr_xform_pregs_op_uses((const Operand*)&v->parts[i].op, p)) return 1;
+ return 0;
+}
+
+static int addr_xform_pregs_aux_uses(Inst* in, PReg p) {
+ switch ((IROp)in->op) {
+ case IR_CALL: {
+ IRCallAux* aux = (IRCallAux*)in->extra.aux;
+ if (!aux) return 0;
+ if (aux->use_plan_replay) {
+ if (addr_xform_pregs_op_uses(&aux->plan.callee, p)) return 1;
+ for (u32 i = 0; i < aux->plan.nargs; ++i)
+ if (addr_xform_pregs_op_uses(&aux->plan.args[i].src, p)) return 1;
+ for (u32 i = 0; i < aux->plan.nrets; ++i)
+ if (addr_xform_pregs_op_uses(&aux->plan.rets[i].dst, p)) return 1;
+ } else {
+ if (addr_xform_pregs_op_uses(&aux->desc.callee, p)) return 1;
+ for (u32 i = 0; i < aux->desc.nargs; ++i)
+ if (addr_xform_pregs_abivalue_uses(
+ (const CGABIValue*)&aux->desc.args[i], p))
+ return 1;
+ if (addr_xform_pregs_abivalue_uses(&aux->desc.ret, p)) return 1;
+ }
+ return 0;
+ }
+ case IR_RET: {
+ IRRetAux* aux = (IRRetAux*)in->extra.aux;
+ if (!aux || !aux->present) return 0;
+ return addr_xform_pregs_abivalue_uses(&aux->val, p);
+ }
+ case IR_SCOPE_BEGIN: {
+ IRScopeAux* aux = (IRScopeAux*)in->extra.aux;
+ if (!aux) return 0;
+ return addr_xform_pregs_op_uses(&aux->desc.cond, p);
+ }
+ case IR_ASM_BLOCK: {
+ IRAsmAux* aux = (IRAsmAux*)in->extra.aux;
+ if (!aux) return 0;
+ for (u32 i = 0; i < aux->nin; ++i)
+ if (addr_xform_pregs_op_uses(&aux->in_ops[i], p)) return 1;
+ for (u32 i = 0; i < aux->nout; ++i)
+ if (addr_xform_pregs_op_uses(&aux->out_ops[i], p)) return 1;
+ return 0;
+ }
+ case IR_INTRINSIC: {
+ IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux;
+ if (!aux) return 0;
+ for (u32 i = 0; i < aux->narg; ++i)
+ if (addr_xform_pregs_op_uses(&aux->args[i], p)) return 1;
+ for (u32 i = 0; i < aux->ndst; ++i)
+ if (addr_xform_pregs_op_uses(&aux->dsts[i], p)) return 1;
+ return 0;
+ }
+ default:
+ return 0;
+ }
+}
+
+static int addr_xform_pregs_classify(Func* f, PReg p, Inst* def_inst) {
+ int has_foldable_use = 0;
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* in = &bl->insts[i];
+ if (in == def_inst) continue;
+ for (u32 o = 0; o < in->nopnds; ++o) {
+ Operand* op = &in->opnds[o];
+ if (!addr_xform_pregs_op_uses(op, p)) continue;
+ if (addr_xform_pregs_main_op_foldable(in, op, o))
+ has_foldable_use = 1;
+ else
+ return 0;
+ }
+ if (addr_xform_pregs_aux_uses(in, p)) return 0;
+ }
+ }
+ return has_foldable_use;
+}
+
+void opt_addr_xform_pregs(Func* f) {
+ if (!f || f->opt_reg_ssa || f->opt_rewritten) return;
+ int changed = 0;
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* in = &bl->insts[i];
+ if ((IROp)in->op != IR_ADDR_OF) continue;
+ if (in->nopnds < 2) continue;
+ if (in->opnds[0].kind != OPK_REG) continue;
+ if (in->opnds[1].kind != OPK_LOCAL) continue;
+ PReg p = (PReg)in->opnds[0].v.reg;
+ if (!opt_reg_valid(f, p)) continue;
+ if (!addr_xform_pregs_classify(f, p, in)) continue;
+ Operand local = in->opnds[1];
+ for (u32 bb = 0; bb < f->nblocks; ++bb) {
+ Block* rb = &f->blocks[bb];
+ for (u32 ii = 0; ii < rb->ninsts; ++ii) {
+ Inst* use = &rb->insts[ii];
+ if (use == in) continue;
+ for (u32 o = 0; o < use->nopnds; ++o) {
+ Operand* op = &use->opnds[o];
+ if (op->kind != OPK_INDIRECT) continue;
+ if ((PReg)op->v.ind.base != p) continue;
+ Operand folded = local;
+ folded.type = use->extra.mem.type ? use->extra.mem.type : local.type;
+ *op = folded;
+ }
+ }
+ }
+ addr_inst_remove(in);
+ changed = 1;
+ }
+ }
+ if (changed)
+ opt_analysis_invalidate(
+ f, OPT_ANALYSIS_DEF_USE | OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP);
+}
+
static u64 gvn_width_mask(u32 width) {
if (width >= 64u) return ~0ull;
return (1ull << width) - 1ull;
diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c
@@ -4270,7 +4270,7 @@ static void opt_o2_coalesces_nonconflicting_copy(void) {
tc_fini(&tc);
}
-static void opt_o1_ignores_coalescing(void) {
+static void opt_o1_coalesces_simple_copy(void) {
TestCtx tc;
tc_init(&tc);
MockCGTarget mock;
@@ -4290,8 +4290,8 @@ static void opt_o1_ignores_coalescing(void) {
opt_build_loop_tree(f);
opt_regalloc(f, 0);
- EXPECT(f->opt_coalesce_candidates == 0 && f->opt_coalesce_merges == 0,
- "O1 regalloc should not run coalescing");
+ EXPECT(f->opt_coalesce_candidates == 1 && f->opt_coalesce_merges == 1,
+ "O1 regalloc should coalesce a simple copy");
tc_fini(&tc);
}
@@ -6640,7 +6640,7 @@ int main(void) {
opt_range_overlap_class();
opt_regalloc_priority();
opt_o2_coalesces_nonconflicting_copy();
- opt_o1_ignores_coalescing();
+ opt_o1_coalesces_simple_copy();
opt_o2_refuses_overlapping_copy_coalesce();
opt_o2_refuses_incompatible_copy_coalesce();
opt_o2_splits_singleton_when_whole_alloc_fails();