commit 177f4e8b9580ec1b9e1ce75700ad2305f3c47336
parent 08951b102ee218c381a2b1fc69cb955808ce8c23
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 21 May 2026 12:18:49 -0700
Keep O1 stack spill assignment linear
Diffstat:
| M | doc/PERF.md | | | 101 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------- |
| M | src/opt/pass_lower.c | | | 209 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------- |
| M | test/opt/opt_test.c | | | 39 | +++++++++++++++++++++++++++++++++++++++ |
3 files changed, 296 insertions(+), 53 deletions(-)
diff --git a/doc/PERF.md b/doc/PERF.md
@@ -7,6 +7,7 @@ older allocator phase history is preserved in git and in `doc/MIR_RA_REPORT.md`.
## Measurement Setup
Fresh snapshot: May 17, 2026.
+Follow-up stack-assignment probe: May 21, 2026.
- Host: `arm64` macOS.
- Compiler under test: `build/cfree`, rebuilt as a Mach-O arm64 executable with
@@ -19,6 +20,8 @@ Fresh snapshot: May 17, 2026.
- `build/o1-probes/spill_argc.tsv`
- `build/o1-current/params-hostO2.tsv`
- `build/o1-current/toy-hostO2.tsv`
+ - `build/o1-investigate/spill_argc_final.tsv`
+ - `build/o1-investigate/params_argc_final.tsv`
The build log should show `clang -O2 ...` for driver, lib, C frontend, and Toy
frontend objects before these timings are compared to future runs.
@@ -127,12 +130,45 @@ spills 2.04x
used_words 3.60x
```
-This is the main remaining mild super-linear signal. Wall time is only slightly
-above 2x, but `used_words` grows 3.60x. That points at occupancy/assignment
-working-set growth under stack pressure, even though the old conflict matrix is
-gone. The next investigation should look at why per-location interval occupancy
-and candidate checking expand faster than spills/ranges for this generated
-shape.
+This was the main remaining mild super-linear signal in the May 17 snapshot.
+The May 21 follow-up found the direct cause: stack spill assignment used
+first-fit probing across existing spill slots. When many spilled ranges all
+overlapped, every new spill checked almost every previous stack slot before
+allocating a new one, so `opt.alloc.stack_point_visits` grew quadratically.
+
+The fix keeps hard-register assignment priority ordered, then assigns spilled
+values to stack slots in lifetime order using an active min-heap keyed by slot
+end position plus a free-slot list. Stack occupancy is still marked into the
+per-slot interval vectors for replay/debug metrics, but assignment no longer
+probes existing stack slots in the all-live case.
+
+May 21 follow-up, same `spill_argc` shape, single runs:
+
+```text
+spill_argc final
+N compile.tu opt.o1.total live_pre regalloc used_words hard_words stack_words stack_slots hard_visits stack_visits hard_marks stack_marks spills points
+128 3.037 0.853 0.114 0.396 1686 616 1070 107 11732 0 405 107 107 514
+256 3.989 1.214 0.082 0.634 3222 872 2350 235 23508 0 789 235 235 1026
+512 6.776 1.887 0.075 1.050 6294 1384 4910 491 47060 0 1557 491 491 2050
+1024 13.605 3.561 0.102 2.041 12438 2408 10030 1003 94164 0 3093 1003 1003 4098
+```
+
+512 to 1024 ratios:
+
+```text
+compile.tu 2.01x
+opt.o1.total 1.89x
+regalloc 1.94x
+spills 2.04x
+used_words 1.98x
+stack_marks 2.04x
+stack_visits 0
+```
+
+`used_words` now reports the allocator's actual interval-vector storage rather
+than the size of the old dense point-by-location bitmap model. It is expected
+to track interval/vector capacity and can move in allocation-size steps, but it
+no longer reflects a point-times-slot working set.
## Parameter-Heavy Ladder
@@ -165,11 +201,36 @@ used_words 3.33x
```
The timing buckets are sub-linear to mildly linear here because the fixed
-frontend/O1 setup costs are still visible. The counter to watch is again
-`used_words`: it grows faster than spills and ranges when many parameters force
-stack traffic. This is not yet a wall-time bottleneck, but it is the clearest
-counter-level sign that parameter/stack occupancy needs attention before much
-larger parameter-heavy inputs are treated as normal workloads.
+frontend/O1 setup costs are still visible. In the May 17 snapshot, `used_words`
+again grew faster than spills and ranges when many parameters forced stack
+traffic.
+
+May 21 follow-up used `argc`-dependent caller arguments to keep the call-side
+values live and prevent constant folding from collapsing the probe:
+
+```text
+params_argc final
+N compile.tu opt.o1.total live_pre regalloc used_words hard_words stack_words stack_slots hard_visits stack_visits hard_marks stack_marks spills points
+128 2.296 0.417 0.051 0.205 1436 376 1060 106 2969 0 25 106 106 133
+256 3.717 0.635 0.053 0.292 2716 376 2340 234 5913 0 25 234 234 261
+512 6.343 0.926 0.052 0.365 5276 376 4900 490 11801 0 25 490 490 517
+1024 13.602 2.428 0.080 0.834 10396 376 10020 1002 23577 0 25 1002 1002 1029
+```
+
+512 to 1024 ratios:
+
+```text
+compile.tu 2.14x
+opt.o1.total 2.62x
+regalloc 2.28x
+spills 2.04x
+used_words 1.97x
+stack_marks 2.04x
+stack_visits 0
+```
+
+The parameter probe still has noisy wall-time at these small absolute O1
+durations, but the allocator-side counters now have the desired linear shape.
## Toy Samples
@@ -196,20 +257,20 @@ generated-code quality:
- Normal many-function and large-caller C inputs are approximately linear.
- The old dense liveness/conflict path is gone from normal O1
(`opt.conflict_bytes=0`).
+- Stack spill assignment no longer does first-fit interval probing across every
+ existing spill slot; `opt.alloc.stack_point_visits` stays zero on the
+ stack-heavy May 21 ladders.
- Link/JIT is not the bottleneck in these runs; the visible time is frontend
parse/codegen plus O1.
-- Spill-heavy and parameter-heavy shapes still have counter-level
- super-linear growth in `used_words`. Wall time is only mildly affected today,
- but future work should verify interval occupancy and candidate checking do not
- drift back into point/candidate-product behavior.
+- Spill-heavy and parameter-heavy allocator counters now scale by ranges,
+ stack slots, and interval marks rather than by point/candidate products.
## Investigation Priorities
-1. Track `used_words` decomposition on stack-heavy inputs.
- Add or inspect `hard_loc_words`, `stack_loc_words`, stack slot count, and
- interval probe/mark counters for `spill_argc` and parameter-heavy ladders.
- The immediate question is why `used_words` grows 3.3-3.6x while ranges and
- spills only double.
+1. Keep the stack-heavy May 21 shape as a regression guard.
+ `stack_point_visits` should remain zero on the normal O1 stack assignment
+ path, while `stack_marks`, `stack_slots`, spills, and actual interval
+ storage should remain approximately linear.
2. Keep `opt.conflict_bytes=0` as a regression guard.
Reintroducing dense pseudo conflict storage would be a clear O1 regression.
diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c
@@ -312,8 +312,6 @@ static FrameSlot spill_slot_for(Func* f, PReg v) {
return f->preg_info[v].spill_slot;
}
-static u32 loc_bit_words(u32 nbits) { return (nbits + 63u) / 64u; }
-
static u32 hard_loc_bit(u8 cls, Reg r) { return ((u32)cls * 32u) + (u32)r; }
typedef struct OptAllocCandidate {
@@ -324,6 +322,12 @@ typedef struct OptAllocCandidate {
u8 pad[3];
} OptAllocCandidate;
+typedef struct OptSpillCandidate {
+ PReg v;
+ u32 first;
+ u32 last;
+} OptSpillCandidate;
+
typedef struct AllocInterval {
u32 start;
u32 end;
@@ -339,6 +343,13 @@ typedef struct OptAllocator {
OptLoc* locs;
AllocIntervalVec* hard_used_locs;
AllocIntervalVec* stack_used_locs;
+ u32* stack_slot_end;
+ u32* active_stack_slots;
+ u32 nactive_stack_slots;
+ u32 active_stack_cap;
+ u32* free_stack_slots;
+ u32 nfree_stack_slots;
+ u32 free_stack_cap;
u32 point_count;
u32 hard_loc_words;
u32 stack_loc_words;
@@ -391,6 +402,19 @@ static void alloc_sort_candidates(OptAllocCandidate* cands, u32 n) {
if (n > 1) qsort(cands, n, sizeof(cands[0]), alloc_candidate_cmp);
}
+static int spill_candidate_cmp(const void* va, const void* vb) {
+ const OptSpillCandidate* a = (const OptSpillCandidate*)va;
+ const OptSpillCandidate* b = (const OptSpillCandidate*)vb;
+ if (a->first != b->first)
+ return (a->first > b->first) - (a->first < b->first);
+ if (a->last != b->last) return (a->last > b->last) - (a->last < b->last);
+ return (a->v > b->v) - (a->v < b->v);
+}
+
+static void alloc_sort_spills(OptSpillCandidate* spills, u32 n) {
+ if (n > 1) qsort(spills, n, sizeof(spills[0]), spill_candidate_cmp);
+}
+
static u32 alloc_interval_lower_bound(const AllocIntervalVec* v, u32 start) {
u32 lo = 0;
u32 hi = v ? v->n : 0;
@@ -482,6 +506,15 @@ static void alloc_mark_vec(Func* f, OptAllocator* a,
}
}
+static u32 alloc_interval_storage_words(const AllocIntervalVec* vecs,
+ u32 nvecs) {
+ u64 bytes = (u64)nvecs * (u64)sizeof(vecs[0]);
+ for (u32 i = 0; i < nvecs; ++i)
+ bytes += (u64)vecs[i].cap * (u64)sizeof(vecs[i].data[0]);
+ bytes = (bytes + 7u) / 8u;
+ return bytes > (u64)(u32)~0u ? (u32)~0u : (u32)bytes;
+}
+
static void opt_init_preg_info_from_ranges(Func* f,
const OptLiveRangeSet* ranges) {
OptPRegInfo* old = f->preg_info;
@@ -610,13 +643,6 @@ static int alloc_hard_conflicts(OptAllocator* a, const OptLiveRangeSet* ranges,
&a->hard_point_visits);
}
-static int alloc_stack_conflicts(OptAllocator* a, const OptLiveRangeSet* ranges,
- PReg v, u32 stack_idx) {
- if (stack_idx >= a->stack_slot_count) return 1;
- return alloc_ranges_overlap_vec(a, ranges, v, &a->stack_used_locs[stack_idx],
- &a->stack_point_visits);
-}
-
static void alloc_mark_hard_loc(OptAllocator* a, const OptLiveRangeSet* ranges,
PReg v, u32 bit, Func* f) {
if (bit >= a->hard_loc_bits) return;
@@ -630,13 +656,17 @@ static void alloc_grow_stack_locs(Func* f, OptAllocator* a, u32 need_slots) {
while (ncap < need_slots) ncap *= 2u;
FrameSlot* ns = arena_array(f->arena, FrameSlot, ncap);
AllocIntervalVec* ni = arena_zarray(f->arena, AllocIntervalVec, ncap);
+ u32* ne = arena_zarray(f->arena, u32, ncap);
if (a->stack_slots) {
memcpy(ns, a->stack_slots, sizeof(a->stack_slots[0]) * a->stack_slot_count);
memcpy(ni, a->stack_used_locs,
sizeof(a->stack_used_locs[0]) * a->stack_slot_count);
+ memcpy(ne, a->stack_slot_end,
+ sizeof(a->stack_slot_end[0]) * a->stack_slot_count);
}
a->stack_slots = ns;
a->stack_used_locs = ni;
+ a->stack_slot_end = ne;
a->stack_slot_cap = ncap;
}
@@ -647,6 +677,105 @@ static void alloc_mark_stack_loc(OptAllocator* a, const OptLiveRangeSet* ranges,
&a->stack_mark_points);
}
+static int stack_slot_heap_less(const OptAllocator* a, u32 lhs, u32 rhs) {
+ u32 le = lhs < a->stack_slot_count ? a->stack_slot_end[lhs] : 0;
+ u32 re = rhs < a->stack_slot_count ? a->stack_slot_end[rhs] : 0;
+ if (le != re) return le < re;
+ return lhs < rhs;
+}
+
+static void alloc_grow_active_stack(Func* f, OptAllocator* a) {
+ if (a->nactive_stack_slots < a->active_stack_cap) return;
+ u32 ncap = a->active_stack_cap ? a->active_stack_cap * 2u : 16u;
+ u32* ns = arena_array(f->arena, u32, ncap);
+ if (a->active_stack_slots)
+ memcpy(ns, a->active_stack_slots,
+ sizeof(a->active_stack_slots[0]) * a->nactive_stack_slots);
+ a->active_stack_slots = ns;
+ a->active_stack_cap = ncap;
+}
+
+static void stack_active_push(Func* f, OptAllocator* a, u32 slot) {
+ alloc_grow_active_stack(f, a);
+ u32 i = a->nactive_stack_slots++;
+ a->active_stack_slots[i] = slot;
+ while (i) {
+ u32 p = (i - 1u) / 2u;
+ if (stack_slot_heap_less(a, a->active_stack_slots[p],
+ a->active_stack_slots[i]))
+ break;
+ u32 tmp = a->active_stack_slots[p];
+ a->active_stack_slots[p] = a->active_stack_slots[i];
+ a->active_stack_slots[i] = tmp;
+ i = p;
+ }
+}
+
+static u32 stack_active_pop(OptAllocator* a) {
+ u32 out = a->active_stack_slots[0];
+ u32 last = a->active_stack_slots[--a->nactive_stack_slots];
+ if (a->nactive_stack_slots) {
+ u32 i = 0;
+ a->active_stack_slots[0] = last;
+ for (;;) {
+ u32 l = i * 2u + 1u;
+ u32 r = l + 1u;
+ u32 best = i;
+ if (l < a->nactive_stack_slots &&
+ !stack_slot_heap_less(a, a->active_stack_slots[best],
+ a->active_stack_slots[l]))
+ best = l;
+ if (r < a->nactive_stack_slots &&
+ !stack_slot_heap_less(a, a->active_stack_slots[best],
+ a->active_stack_slots[r]))
+ best = r;
+ if (best == i) break;
+ u32 tmp = a->active_stack_slots[i];
+ a->active_stack_slots[i] = a->active_stack_slots[best];
+ a->active_stack_slots[best] = tmp;
+ i = best;
+ }
+ }
+ return out;
+}
+
+static void alloc_grow_free_stack(Func* f, OptAllocator* a) {
+ if (a->nfree_stack_slots < a->free_stack_cap) return;
+ u32 ncap = a->free_stack_cap ? a->free_stack_cap * 2u : 16u;
+ u32* ns = arena_array(f->arena, u32, ncap);
+ if (a->free_stack_slots)
+ memcpy(ns, a->free_stack_slots,
+ sizeof(a->free_stack_slots[0]) * a->nfree_stack_slots);
+ a->free_stack_slots = ns;
+ a->free_stack_cap = ncap;
+}
+
+static void stack_free_push(Func* f, OptAllocator* a, u32 slot) {
+ alloc_grow_free_stack(f, a);
+ a->free_stack_slots[a->nfree_stack_slots++] = slot;
+}
+
+static void stack_expire_slots(Func* f, OptAllocator* a, u32 first) {
+ while (a->nactive_stack_slots) {
+ u32 slot = a->active_stack_slots[0];
+ if (slot >= a->stack_slot_count || a->stack_slot_end[slot] > first) break;
+ (void)stack_active_pop(a);
+ stack_free_push(f, a, slot);
+ }
+}
+
+static int stack_take_free_compatible(Func* f, OptAllocator* a, PReg v,
+ u32* stack_idx_out) {
+ for (u32 i = 0; i < a->nfree_stack_slots; ++i) {
+ u32 slot = a->free_stack_slots[i];
+ if (!spill_slot_compatible(f, a->stack_slots[slot], v)) continue;
+ a->free_stack_slots[i] = a->free_stack_slots[--a->nfree_stack_slots];
+ *stack_idx_out = slot;
+ return 1;
+ }
+ return 0;
+}
+
static void alloc_assign_hard(Func* f, OptAllocator* a,
const OptLiveRangeSet* ranges, PReg v, Reg r) {
OptPRegInfo* vi = &f->preg_info[v];
@@ -659,33 +788,35 @@ static void alloc_assign_hard(Func* f, OptAllocator* a,
alloc_mark_hard_loc(a, ranges, v, hard_loc_bit(vi->cls, r), f);
}
-static void alloc_assign_stack(Func* f, OptAllocator* a,
- const OptLiveRangeSet* ranges, PReg v) {
+static void alloc_assign_stack_slot(Func* f, OptAllocator* a,
+ const OptLiveRangeSet* ranges, PReg v,
+ u32 stack_idx) {
OptPRegInfo* vi = &f->preg_info[v];
- u32 stack_idx = a->stack_slot_count;
- FrameSlot slot = FRAME_SLOT_NONE;
- for (u32 i = 0; i < a->stack_slot_count; ++i) {
- if (alloc_stack_conflicts(a, ranges, v, i)) continue;
- if (!spill_slot_compatible(f, a->stack_slots[i], v)) continue;
- stack_idx = i;
- slot = a->stack_slots[i];
- break;
- }
- if (slot == FRAME_SLOT_NONE) {
- slot = spill_slot_for(f, v);
- alloc_grow_stack_locs(f, a, a->stack_slot_count + 1u);
- a->stack_slots[a->stack_slot_count++] = slot;
- stack_idx = a->stack_slot_count - 1u;
- a->stack_loc_words = loc_bit_words(a->stack_slot_count);
- } else {
- vi->spill_slot = slot;
- }
+ FrameSlot slot = a->stack_slots[stack_idx];
+ vi->spill_slot = slot;
vi->alloc_kind = OPT_ALLOC_SPILL;
a->locs[v].kind = OPT_LOC_STACK;
a->locs[v].cls = vi->cls;
a->locs[v].hard_reg = REG_NONE;
a->locs[v].spill_slot = slot;
alloc_mark_stack_loc(a, ranges, v, stack_idx, f);
+ a->stack_slot_end[stack_idx] = vi->last_pos;
+ stack_active_push(f, a, stack_idx);
+}
+
+static void alloc_assign_stack(Func* f, OptAllocator* a,
+ const OptLiveRangeSet* ranges, PReg v) {
+ OptPRegInfo* vi = &f->preg_info[v];
+ u32 first = vi->first_pos ? vi->first_pos - 1u : 0;
+ u32 stack_idx = 0;
+ stack_expire_slots(f, a, first);
+ if (!stack_take_free_compatible(f, a, v, &stack_idx)) {
+ FrameSlot slot = spill_slot_for(f, v);
+ alloc_grow_stack_locs(f, a, a->stack_slot_count + 1u);
+ stack_idx = a->stack_slot_count++;
+ a->stack_slots[stack_idx] = slot;
+ }
+ alloc_assign_stack_slot(f, a, ranges, v, stack_idx);
}
static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges,
@@ -696,7 +827,6 @@ static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges,
u32 ncands = 0;
for (PReg v = 1; v < opt_reg_count(f); ++v)
if (ranges->first_range_by_preg[v] != OPT_RANGE_NONE) ++ncands;
- a->hard_loc_words = loc_bit_words(a->hard_loc_bits);
a->locs =
arena_zarray(f->arena, OptLoc, opt_reg_count(f) ? opt_reg_count(f) : 1u);
a->hard_used_locs = arena_zarray(f->arena, AllocIntervalVec,
@@ -707,6 +837,8 @@ static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges,
OptAllocCandidate* cands =
arena_array(f->arena, OptAllocCandidate, ncands ? ncands : 1u);
+ OptSpillCandidate* spills =
+ arena_array(f->arena, OptSpillCandidate, ncands ? ncands : 1u);
u32 n = 0;
for (PReg v = 1; v < opt_reg_count(f); ++v) {
if (ranges->first_range_by_preg[v] == OPT_RANGE_NONE) continue;
@@ -720,6 +852,7 @@ static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges,
}
alloc_sort_candidates(cands, n);
+ u32 nspills = 0;
for (u32 i = 0; i < n; ++i) {
PReg v = cands[i].v;
OptPRegInfo* vi = &f->preg_info[v];
@@ -768,11 +901,21 @@ static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges,
alloc_assign_hard(f, a, ranges, v, best);
}
if (!found) {
- alloc_assign_stack(f, a, ranges, v);
+ spills[nspills].v = v;
+ spills[nspills].first = vi->first_pos ? vi->first_pos - 1u : 0;
+ spills[nspills].last = vi->last_pos;
+ ++nspills;
}
}
- f->opt_alloc_hard_loc_words = a->point_count * a->hard_loc_words;
- f->opt_alloc_stack_loc_words = a->point_count * a->stack_loc_words;
+ alloc_sort_spills(spills, nspills);
+ for (u32 i = 0; i < nspills; ++i)
+ alloc_assign_stack(f, a, ranges, spills[i].v);
+ a->hard_loc_words =
+ alloc_interval_storage_words(a->hard_used_locs, a->hard_loc_bits);
+ a->stack_loc_words =
+ alloc_interval_storage_words(a->stack_used_locs, a->stack_slot_count);
+ f->opt_alloc_hard_loc_words = a->hard_loc_words;
+ f->opt_alloc_stack_loc_words = a->stack_loc_words;
f->opt_alloc_stack_slots = a->stack_slot_count;
f->opt_used_loc_words =
f->opt_alloc_hard_loc_words + f->opt_alloc_stack_loc_words;
diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c
@@ -2717,6 +2717,44 @@ static void opt_range_regalloc_no_conflicts_and_stack_reuse(void) {
tc_fini(&tc);
}
+static void opt_stack_spill_assignment_avoids_quadratic_probe(void) {
+ enum { NVALS = 64 };
+ TestCtx tc;
+ tc_init(&tc);
+ MockCGTarget mock;
+ mock_init(&mock, tc.c);
+ static const Reg scratch[] = {9, 10};
+ mock_set_pool(&mock, RC_INT, NULL, 0, scratch, 2, 0);
+
+ Func* f = new_func(&tc);
+ opt_machinize(f, &mock.base);
+ Val vals[NVALS];
+ for (u32 i = 0; i < NVALS; ++i) {
+ vals[i] = add_val(f, tc.i32);
+ emit_load_imm(f, f->entry, vals[i], tc.i32, (i64)i + 1);
+ }
+ Val acc = vals[0];
+ for (u32 i = 1; i < NVALS; ++i) {
+ Val next = add_val(f, tc.i32);
+ emit_binop(f, f->entry, next, acc, vals[i], tc.i32);
+ acc = next;
+ }
+ emit_ret_val(f, f->entry, acc, tc.i32);
+ opt_build_cfg(f);
+ opt_build_loop_tree(f);
+ opt_regalloc(f, 0);
+
+ EXPECT(f->opt_alloc_stack_slots >= NVALS - 1u,
+ "overlapping pressure should allocate many stack slots, got %u",
+ (unsigned)f->opt_alloc_stack_slots);
+ EXPECT(f->opt_alloc_stack_point_visits <= f->opt_alloc_stack_mark_points,
+ "stack assignment should not probe existing slots quadratically: "
+ "visits=%llu marks=%llu",
+ (unsigned long long)f->opt_alloc_stack_point_visits,
+ (unsigned long long)f->opt_alloc_stack_mark_points);
+ tc_fini(&tc);
+}
+
static void opt_rewrite_spill_use_def(void) {
TestCtx tc;
tc_init(&tc);
@@ -4438,6 +4476,7 @@ int main(void) {
opt_range_overlap_class();
opt_regalloc_priority();
opt_range_regalloc_no_conflicts_and_stack_reuse();
+ opt_stack_spill_assignment_avoids_quadratic_probe();
opt_rewrite_spill_use_def();
opt_call_clobber_preservation();
opt_call_clobber_caller_saved();