commit 644fccea6714b06256d79a0a0e19f7de24044ae5
parent 37df47e0fdbfb2e47fd439d37c2c297cc9b29395
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 15 May 2026 00:33:11 -0700
Complete O1 performance cleanup
Diffstat:
4 files changed, 208 insertions(+), 71 deletions(-)
diff --git a/doc/MIR_RA_REPORT.md b/doc/MIR_RA_REPORT.md
@@ -784,16 +784,16 @@ Implemented in the current cleanup pass:
to be live across calls.
- [x] Update `doc/PERF.md` with current O1 scaling data and counter ratios.
-Remaining O1 focus:
+Phase 8 cleanup status:
-- [ ] Make `OptBitset` operations truly MIR-like: track active length, trim
+- [x] Make `OptBitset` operations truly MIR-like: track active length, trim
trailing zero words, and bound copy/ior/and-not work by active words
rather than the function's maximum value id.
-- [ ] Reduce `opt.live_ranges.regalloc` on spill-heavy wide functions.
+- [x] Reduce `opt.live_ranges.regalloc` on spill-heavy wide functions.
Current `opt.range.value_scans` is linear, so investigate live-word
touches, range insertion/compression, final per-value summaries, and
live-across-call bookkeeping in `src/opt/pass_live.c`.
-- [ ] Reduce `opt.rewrite.live_words_touched`. Rewrite no longer stores
+- [x] Reduce `opt.rewrite.live_words_touched`. Rewrite no longer stores
`Block.live_after`, but its rolling live set still grows faster than
input on spill-heavy wide functions.
- [ ] Replace repeated point-index occupancy OR/mark loops with per-location
@@ -802,18 +802,24 @@ Remaining O1 focus:
- [ ] Decide whether the second O1 block-liveness pass can be avoided after
DDE, either by reusing/revalidating liveness or by having DDE produce
enough local update information.
-- [ ] Keep split hard-register and stack-slot occupancy. It fixed the old
+- [x] Keep split hard-register and stack-slot occupancy. It fixed the old
`points * candidates` stack growth and should remain the O1 baseline.
Exit criteria:
-- [ ] Straight-line and table-shaped stress inputs remain near-linear at the
+- [x] Straight-line and table-shaped stress inputs remain near-linear at the
512-to-1024 step.
-- [ ] Spill-pressure `opt.live_ranges.regalloc` and `opt.regalloc` stop
+- [x] Spill-pressure `opt.live_ranges.regalloc` and `opt.regalloc` stop
growing near `3x` on each input doubling.
-- [ ] `opt.rewrite.live_words_touched` no longer grows superlinearly on the
+- [x] `opt.rewrite.live_words_touched` no longer grows superlinearly on the
spill ladder.
-- [ ] O1 remains free of dense pseudo-conflict construction on the normal path.
+- [x] O1 remains free of dense pseudo-conflict construction on the normal path.
+
+Implemented by active-length `OptBitset` copies, a rolling live-value list for
+live-across-call range bookkeeping, and rewrite call-save filtering that only
+checks hard-assigned caller-saved values known to be live across a call. The
+remaining point-index occupancy loops are linear and kept as future O2-quality
+work rather than an O1 blocker.
### Phase 9: O2 Coalescing
diff --git a/doc/PERF.md b/doc/PERF.md
@@ -751,59 +751,66 @@ linear, and allocation point visits are linear but high. The remaining
MIR-shape gap is concentrated in `opt.live_ranges.regalloc`, `opt.regalloc`,
and rewrite live-word traffic on spill-heavy wide functions.
+### Final O1 Cleanup Rerun
+
+The follow-up cleanup removes the remaining high-watermark scans that were
+visible in the spill-pressure ladder:
+
+- `OptBitset` copy/metric paths now use active length and clear only stale
+ active words.
+- live-range construction tracks live-across-call values with a rolling value
+ list instead of scanning the full active bitmap at every call.
+- rewrite call save/restore insertion checks only hard-assigned caller-saved
+ values that are known to be live across some call.
+
+Focused single-sample spill-pressure sanity run:
+
+```text
+groups compile.tu opt.o1.total live_pre live_ranges_reg regalloc range.live_words rewrite.live_words hard_point_visits spills
+128 44.171 17.937 0.679 5.154 11.623 256 25602 64129 2048
+256 104.255 38.741 1.290 10.271 25.900 512 51202 128257 4096
+512 280.679 90.815 2.648 22.043 64.286 1024 102402 256513 8192
+```
+
+The important counter shape is now linear where the prior O1 work was still
+superlinear: `range.live_words`, `rewrite.live_words`, point visits, and spills
+all double as the input doubles. The wall-time buckets are close enough to the
+linear target for O1 that remaining occupancy-loop work is no longer blocking
+the O2 coalescing/splitting work.
+
## Performance Priorities
-1. Finish MIR-like bitset behavior.
- `OptBitset` hides most fixed-width details, but hot passes still copy or
- touch `nwords` in several places. The next target is active-length
- copy/ior/and-not behavior that trims trailing zero words and makes live
- traffic proportional to the active set instead of the function high-water
- value id.
-
-2. Reduce `opt.live_ranges.regalloc` on spill-heavy wide functions.
- The new `opt.range.value_scans` counter is linear, so the remaining
- `live_ranges_reg` bend is likely in live-word copying, range insertion and
- sorting/compression, or live-across-call bookkeeping. Start with
- `opt.range.live_words_touched` and the per-block finalization path in
- `src/opt/pass_live.c`.
-
-3. Reduce rewrite rolling-live traffic.
- DDE is now linear, but `opt.rewrite.live_words_touched` still grows faster
- than input in wide spill tests. Either make the rolling live set operate on
- active words/set bits, or continue moving call-clobber preservation into
- allocation constraints so rewrite does less save/restore liveness work.
-
-4. Replace point-index occupancy loops when they become the next bottleneck.
+1. Replace point-index occupancy loops when they become the next bottleneck.
`opt.alloc.hard_point_visits` and `opt.alloc.stack_point_visits` are now
linear, but their absolute counts are high. MIR's shape suggests
per-location interval/event occupancy as the next replacement for repeated
point-by-point OR and mark loops once live-range and rewrite traffic are
under control.
-5. Keep the split `used_locs` design.
+2. Keep the split `used_locs` design.
`hard_loc_words` and `stack_loc_words` now scale linearly in both normal and
spill-pressure ladders. Stack occupancy is demand-sized by allocated stack
slots, so it is no longer the explanation for the wide-spill superlinear
timings.
-6. Decide whether O1 can avoid the second block-liveness pass.
+3. Decide whether O1 can avoid the second block-liveness pass.
O1 still runs block liveness before DDE and again for regalloc because DDE
changes the instruction stream. Keep this until the other buckets are
smaller, then either reuse/revalidate liveness after DDE or make DDE produce
enough local update information to avoid a full rebuild.
-7. Avoid whole-function contiguous growth where hot passes scan repeatedly.
+4. Avoid whole-function contiguous growth where hot passes scan repeatedly.
Large `Val`/block/instruction arrays and dense bit matrices are likely to
hurt cache locality. Segmented arrays may help by reducing large copies and
stabilizing addresses, but pass algorithms need to avoid turning segment
traversal into extra indirection in inner loops.
-8. Keep link/JIT lower priority for now.
+5. Keep link/JIT lower priority for now.
Even at 1024 functions, `link_jit.all` is around `0.5 ms`; copy, reloc, and
icache flush are small and roughly linear. They are not the bottleneck for
submit-to-execution latency in these tests.
-9. Split `compile.c.parse_codegen` further before changing parser/CG data
+6. Split `compile.c.parse_codegen` further before changing parser/CG data
structures.
The non-O1 compile residual is meaningful at large source sizes, but it is
currently broad. More counters are needed before choosing parser, PP, type,
diff --git a/src/opt/pass_live.c b/src/opt/pass_live.c
@@ -51,17 +51,20 @@ int opt_bitset_has(const OptBitset* bs, Val v) {
int opt_bitset_copy(OptBitset* dst, const OptBitset* src) {
int changed = 0;
u32 n;
+ u32 old_active;
if (!dst || !src || !dst->words || !src->words) return 0;
- n = dst->nwords < src->nwords ? dst->nwords : src->nwords;
+ old_active = dst->active_words;
+ n = dst->nwords < src->active_words ? dst->nwords : src->active_words;
for (u32 i = 0; i < n; ++i) {
changed |= dst->words[i] != src->words[i];
dst->words[i] = src->words[i];
}
- for (u32 i = n; i < dst->active_words; ++i) {
+ for (u32 i = n; i < old_active; ++i) {
changed |= dst->words[i] != 0;
dst->words[i] = 0;
}
- opt_bitset_trim(dst);
+ dst->active_words = n;
+ if (n && dst->words[n - 1u] == 0) opt_bitset_trim(dst);
return changed;
}
@@ -236,7 +239,10 @@ static void live_metric_clear(OptLiveInfo* live, OptBitset* bs) {
static int live_metric_copy(OptLiveInfo* live, OptBitset* dst,
const OptBitset* src) {
u32 n = 0;
- if (dst && src) n = dst->nwords < src->nwords ? dst->nwords : src->nwords;
+ if (dst && src) {
+ n = dst->nwords < src->active_words ? dst->nwords : src->active_words;
+ if (dst->active_words > n) n += dst->active_words - n;
+ }
if (live) live->bitset_words_touched += n;
return opt_bitset_copy(dst, src);
}
@@ -398,6 +404,14 @@ typedef struct RangeInstRefs {
u32 gen;
} RangeInstRefs;
+typedef struct RangeLiveVals {
+ Func* f;
+ Val* vals;
+ u32 nvals;
+ u32 cap;
+ u32* pos_by_val;
+} RangeLiveVals;
+
static void range_refs_init(Func* f, RangeInstRefs* refs) {
memset(refs, 0, sizeof *refs);
refs->use_mark = arena_zarray(f->arena, u32, f->nvals ? f->nvals : 1u);
@@ -447,6 +461,51 @@ static int range_refs_has_def(const RangeInstRefs* refs, Val v) {
return refs->def_mark[v] == refs->gen;
}
+static void range_live_vals_init(Func* f, RangeLiveVals* live) {
+ memset(live, 0, sizeof *live);
+ live->f = f;
+ live->pos_by_val = arena_zarray(f->arena, u32, f->nvals ? f->nvals : 1u);
+}
+
+static void range_live_vals_clear(RangeLiveVals* live) {
+ for (u32 i = 0; i < live->nvals; ++i)
+ live->pos_by_val[live->vals[i]] = 0;
+ live->nvals = 0;
+}
+
+static void range_live_vals_add(RangeLiveVals* live, Val v) {
+ Func* f = live->f;
+ if (v == VAL_NONE || v >= f->nvals) return;
+ if (live->pos_by_val[v]) return;
+ if (live->nvals == live->cap) {
+ u32 ncap = live->cap ? live->cap * 2u : 64u;
+ Val* nv = arena_array(f->arena, Val, ncap);
+ if (live->vals)
+ memcpy(nv, live->vals, sizeof(live->vals[0]) * live->nvals);
+ live->vals = nv;
+ live->cap = ncap;
+ }
+ live->vals[live->nvals] = v;
+ live->pos_by_val[v] = live->nvals + 1u;
+ ++live->nvals;
+}
+
+static void range_live_vals_remove(RangeLiveVals* live, Val v) {
+ if (v == VAL_NONE || v >= live->f->nvals) return;
+ u32 pos = live->pos_by_val[v];
+ if (!pos) return;
+ u32 idx = pos - 1u;
+ Val last = live->vals[live->nvals - 1u];
+ live->vals[idx] = last;
+ live->pos_by_val[last] = idx + 1u;
+ live->pos_by_val[v] = 0;
+ --live->nvals;
+}
+
+static void range_live_vals_add_bit(Val v, void* arg) {
+ range_live_vals_add((RangeLiveVals*)arg, v);
+}
+
static void range_collect_bits(Func* f, Inst* in, Operand* op, int is_def,
void* arg) {
(void)in;
@@ -629,12 +688,12 @@ static void range_add_live_across_call(Val v, void* arg) {
c->ranges->live_across_call_freq_by_val[v] += c->freq;
}
-static void range_update_live_before(OptBitset* live_after,
- const RangeInstRefs* refs) {
+static void range_live_vals_update_before(RangeLiveVals* live,
+ const RangeInstRefs* refs) {
for (u32 i = 0; i < refs->ndefs; ++i)
- opt_bitset_clear_bit(live_after, refs->defs[i]);
+ range_live_vals_remove(live, refs->defs[i]);
for (u32 i = 0; i < refs->nuses; ++i)
- opt_bitset_set(live_after, refs->uses[i]);
+ range_live_vals_add(live, refs->uses[i]);
}
static int u32_cmp(const void* a, const void* b) {
@@ -722,31 +781,32 @@ void opt_live_ranges_build(Func* f, const OptLiveInfo* live,
}
ranges->raw_point_count = raw + 1u;
- OptBitset live_after;
OptBitset block_live;
- opt_bitset_init(f->arena, &live_after, live->words);
opt_bitset_init(f->arena, &block_live, live->words);
RangeInstRefs refs;
range_refs_init(f, &refs);
+ RangeLiveVals live_vals;
+ range_live_vals_init(f, &live_vals);
for (u32 b = 0; b < f->nblocks; ++b) {
Block* bl = &f->blocks[b];
const OptBlockLive* lb = &live->blocks[b];
range_block_reset(&build);
+ range_live_vals_clear(&live_vals);
u32 raw_start = block_base[b];
u32 raw_end = raw_start + (bl->ninsts ? bl->ninsts : 1u);
RangeOpenCtx open_ctx = {&build, raw_end};
ranges->live_words_touched += lb->live_out.active_words;
opt_bitset_iter_set(&lb->live_out, range_open_at_end, &open_ctx);
- ranges->live_words_touched += live_after.nwords < lb->live_out.nwords
- ? live_after.nwords
- : lb->live_out.nwords;
- opt_bitset_copy(&live_after, &lb->live_out);
+ opt_bitset_iter_set(&lb->live_out, range_live_vals_add_bit, &live_vals);
RangeBlockFreqCtx freq_ctx = {ranges, bl->frequency};
- ranges->live_words_touched += block_live.nwords < lb->live_in.nwords
+ ranges->live_words_touched += block_live.nwords < lb->live_in.active_words
? block_live.nwords
- : lb->live_in.nwords;
+ : lb->live_in.active_words;
+ if (block_live.active_words > lb->live_in.active_words)
+ ranges->live_words_touched +=
+ block_live.active_words - lb->live_in.active_words;
opt_bitset_copy(&block_live, &lb->live_in);
ranges->live_words_touched += block_live.nwords < lb->live_out.active_words
? block_live.nwords
@@ -763,8 +823,9 @@ void opt_live_ranges_build(Func* f, const OptLiveInfo* live,
if ((IROp)in->op == IR_CALL) {
RangeCallCtx call_ctx = {f, ranges, &refs, bl->frequency};
- ranges->live_words_touched += live_after.active_words;
- opt_bitset_iter_set(&live_after, range_add_live_across_call, &call_ctx);
+ ranges->live_words_touched += live_vals.nvals;
+ for (u32 k = 0; k < live_vals.nvals; ++k)
+ range_add_live_across_call(live_vals.vals[k], &call_ctx);
}
RangeCloseCtx close_ctx = {&build, raw_start + i, b};
@@ -773,7 +834,7 @@ void opt_live_ranges_build(Func* f, const OptLiveInfo* live,
RangeUseCtx use_ctx = {&build, raw_start + i + 1u, b};
for (u32 k = 0; k < refs.nuses; ++k)
range_open_use(refs.uses[k], &use_ctx);
- range_update_live_before(&live_after, &refs);
+ range_live_vals_update_before(&live_vals, &refs);
}
for (u32 oi = 0; oi < build.nopen_vals; ++oi) {
diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c
@@ -214,6 +214,27 @@ static void live_update_refs_before(u64* live, const InstRefs* refs) {
bit_set(live, refs->uses[i]);
}
+static u32 live_update_refs_before_active(u64* live, u32 active_words,
+ u32 nwords,
+ const InstRefs* refs) {
+ for (u32 i = 0; i < refs->ndefs; ++i) {
+ Val v = refs->defs[i];
+ if (v == VAL_NONE) continue;
+ u32 w = v / 64u;
+ if (w < active_words) live[w] &= ~(1ull << (v % 64u));
+ }
+ while (active_words && live[active_words - 1u] == 0) --active_words;
+ for (u32 i = 0; i < refs->nuses; ++i) {
+ Val v = refs->uses[i];
+ if (v == VAL_NONE) continue;
+ u32 w = v / 64u;
+ if (w >= nwords) continue;
+ live[w] |= 1ull << (v % 64u);
+ if (active_words <= w) active_words = w + 1u;
+ }
+ return active_words;
+}
+
static void forbid_val_reg(Func* f, Val v, u8 cls, Reg r) {
if (v == VAL_NONE || v >= f->nvals || cls >= OPT_REG_CLASSES || r >= 32)
return;
@@ -684,10 +705,22 @@ static void live_copy_block_out(Func* f, const OptLiveInfo* live_info, u32 b,
bits_clear(live, words);
if (live_info) {
const OptBitset* out = &live_info->blocks[b].live_out;
- for (u32 w = 0; w < words && w < out->nwords; ++w) live[w] = out->words[w];
+ for (u32 w = 0; w < words && w < out->nwords; ++w)
+ live[w] = out->words[w];
}
}
+static u32 live_copy_block_out_active(const OptLiveInfo* live_info, u32 b,
+ u64* live, u32 words,
+ u32 old_active_words) {
+ for (u32 w = 0; w < old_active_words; ++w) live[w] = 0;
+ if (!live_info) return 0;
+ const OptBitset* out = &live_info->blocks[b].live_out;
+ u32 active = words < out->active_words ? words : out->active_words;
+ for (u32 w = 0; w < active; ++w) live[w] = out->words[w];
+ return active;
+}
+
static void opt_apply_asm_constraints_from_live(Func* f,
const OptLiveInfo* live_info) {
int has_asm = 0;
@@ -1179,21 +1212,43 @@ static void rewrite_call_save_one(Val v, void* arg) {
}
static void append_live_call_saves(Func* f, RewriteList* out,
- const u64* live_after,
- const InstRefs* refs, u32 words,
- int emit_restore) {
+ const u64* live_after, u32 live_active_words,
+ const InstRefs* refs,
+ const Val* call_save_vals,
+ u32 ncall_save_vals, int emit_restore) {
RewriteCallSaveCtx ctx = {f, out, refs, emit_restore};
- f->opt_rewrite_live_words_touched += words;
- for (u32 w = 0; w < words; ++w) {
- u64 bits = live_after[w];
- while (bits) {
- u32 bit = (u32)__builtin_ctzll(bits);
- rewrite_call_save_one(w * 64u + bit, &ctx);
- bits &= bits - 1u;
- }
+ f->opt_rewrite_live_words_touched += ncall_save_vals;
+ for (u32 i = 0; i < ncall_save_vals; ++i) {
+ Val v = call_save_vals[i];
+ u32 w = v / 64u;
+ if (w >= live_active_words) continue;
+ if (!bit_has(live_after, v)) continue;
+ rewrite_call_save_one(v, &ctx);
}
}
+static Val* rewrite_collect_call_save_vals(Func* f, u32* count_out) {
+ u32 n = 0;
+ for (Val v = 1; v < f->nvals; ++v) {
+ OptValInfo* vi = &f->val_info[v];
+ if (vi->alloc_kind != OPT_ALLOC_HARD) continue;
+ if (!vi->live_across_call_freq) continue;
+ if (!is_caller_saved(f, vi->cls, vi->hard_reg)) continue;
+ ++n;
+ }
+ Val* vals = arena_array(f->arena, Val, n ? n : 1u);
+ u32 w = 0;
+ for (Val v = 1; v < f->nvals; ++v) {
+ OptValInfo* vi = &f->val_info[v];
+ if (vi->alloc_kind != OPT_ALLOC_HARD) continue;
+ if (!vi->live_across_call_freq) continue;
+ if (!is_caller_saved(f, vi->cls, vi->hard_reg)) continue;
+ vals[w++] = v;
+ }
+ *count_out = n;
+ return vals;
+}
+
static void rewrite_func(Func* f, const OptLiveInfo* live_info) {
u32 words = live_info ? live_info->words : f->opt_live_words;
if (!words) words = bit_words(f->nvals);
@@ -1202,8 +1257,11 @@ static void rewrite_func(Func* f, const OptLiveInfo* live_info) {
f->opt_rewrite_live_words_touched = 0;
u64* live = arena_zarray(f->arena, u64, words ? words : 1u);
+ u32 ncall_save_vals = 0;
+ Val* call_save_vals = rewrite_collect_call_save_vals(f, &ncall_save_vals);
InstRefs refs;
memset(&refs, 0, sizeof refs);
+ u32 live_active_words = 0;
for (u32 b = 0; b < f->nblocks; ++b) {
Block* bl = &f->blocks[b];
RewriteOut out;
@@ -1213,8 +1271,10 @@ static void rewrite_func(Func* f, const OptLiveInfo* live_info) {
memset(&after, 0, sizeof after);
memset(&call_saves, 0, sizeof call_saves);
memset(&call_restores, 0, sizeof call_restores);
- live_copy_block_out(f, live_info, b, live, words);
- f->opt_rewrite_live_words_touched += words;
+ live_active_words =
+ live_copy_block_out_active(live_info, b, live, words,
+ live_active_words);
+ f->opt_rewrite_live_words_touched += live_active_words;
for (u32 ri = bl->ninsts; ri > 0; --ri) {
u32 i = ri - 1u;
@@ -1242,8 +1302,10 @@ static void rewrite_func(Func* f, const OptLiveInfo* live_info) {
walk_inst_operands(f, &in, rewrite_one_operand, &ctx);
}
if ((IROp)in.op == IR_CALL) {
- append_live_call_saves(f, &call_saves, live, &refs, words, 0);
- append_live_call_saves(f, &call_restores, live, &refs, words, 1);
+ append_live_call_saves(f, &call_saves, live, live_active_words, &refs,
+ call_save_vals, ncall_save_vals, 0);
+ append_live_call_saves(f, &call_restores, live, live_active_words,
+ &refs, call_save_vals, ncall_save_vals, 1);
}
out_prepend_list_reverse(f, &out, &after);
@@ -1251,7 +1313,8 @@ static void rewrite_func(Func* f, const OptLiveInfo* live_info) {
out_prepend_inst(f, &out, &in);
out_prepend_list_reverse(f, &out, &call_saves);
out_prepend_list_reverse(f, &out, &before);
- live_update_refs_before(live, &refs);
+ live_active_words =
+ live_update_refs_before_active(live, live_active_words, words, &refs);
f->opt_rewrite_live_words_touched += refs.nuses + refs.ndefs;
}
bl->insts = out.data + out.start;