kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit a11e9f6d86f9aa5c4d8d543fee63730bfa8b3991
parent 0bfde91143b108d562e7bc2754f9335c212745d5
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 14 May 2026 19:42:22 -0700

Implement range rewrite without live_after

Diffstat:
Mdoc/MIR_RA_REPORT.md | 148++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
Mdoc/PERF.md | 83+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Msrc/opt/opt.h | 1+
Msrc/opt/pass_lower.c | 181++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
Mtest/opt/opt_test.c | 22++++++++++++++++------
5 files changed, 339 insertions(+), 96 deletions(-)

diff --git a/doc/MIR_RA_REPORT.md b/doc/MIR_RA_REPORT.md @@ -486,17 +486,20 @@ Deliverables: - noop move deletion - no required per-instruction full live-after storage -### Phase 5: Coalescing and Splitting +### Phase 5: O1 Cleanup and Scaling -After O1 has the right shape, add O2 allocator features: +After O1 has the right shape, remove transition scaffolding and fix the +remaining O1 scaling buckets before adding O2 allocator features: -- move collection -- move-only liveness -- capped conflict matrix for coalescing -- live-range splitting -- edge/block spill placement +- rewrite without extra per-instruction temporary list churn +- delete the old dense liveness/conflict path instead of preserving + compatibility for old optimizer-internal tests +- narrow or compress `used_locs` so stack occupancy does not scale as + `points * candidates` -This mirrors MIR's division: the matrix is an optional coalescing tool, not the +Only after that should O2 add move collection, move-only liveness, capped +coalescing matrices, live-range splitting, and edge/block spill placement. This +mirrors MIR's division: the matrix is an optional coalescing tool, not the allocator's main model. ## Implementation Checklist @@ -504,6 +507,12 @@ allocator's main model. Use this as the working tracker for the structural rewrite. Each phase should land with focused tests and metrics before moving to the next phase. +Compatibility policy: compatibility with the old O1 optimizer internals is not +a goal. The public compiler/API behavior must stay correct, but internal helper +APIs, dumps, tests, metrics, and persistent IR fields may be broken, renamed, or +deleted when they preserve the dense-conflict shape or add runtime cost. Prefer +a clean fast implementation over shims for old pass boundaries. + ### Phase 0: Baseline and Guardrails - [x] Preserve current O0 behavior as the reference path. @@ -548,12 +557,13 @@ coverage belongs in a later pass once the range allocator owns rewrite. - [x] Compute `live_in`/`live_out` without allocating conflicts. - [x] Make pre-DDE use only block liveness. - [x] Keep old full `opt_live_info` available for regalloc during transition. + This is now transition debt, not a compatibility requirement. - [x] Add dump output for block liveness. - [x] Add tests for branch, loop, and call liveness. Exit criteria: -- [x] Existing O1 behavior is unchanged. +- [x] Existing public O1 behavior is unchanged. - [x] Pre-DDE no longer builds `val_conflicts`. - [x] Metrics show separate block-liveness timing and no pre-DDE conflict bytes. @@ -608,43 +618,102 @@ Exit criteria: ### Phase 4: Rewrite From Assignment Map -- [ ] Rewrite hard-assigned values to hard registers. -- [ ] Insert reloads for stack-assigned uses. -- [ ] Insert stores for stack-assigned defs. -- [ ] Reserve target scratch registers per class. -- [ ] Handle operands nested in: - - [ ] normal operands - - [ ] indirect operands - - [ ] call descriptors - - [ ] return descriptors - - [ ] intrinsic descriptors - - [ ] inline asm descriptors -- [ ] Preserve call-clobber saves/restores with a rolling live set. -- [ ] Delete noop moves after rewrite. -- [ ] Avoid persistent per-instruction full `live_after` storage. -- [ ] Add rewrite dump output. +- [x] Rewrite hard-assigned values to hard registers. +- [x] Insert reloads for stack-assigned uses. +- [x] Insert stores for stack-assigned defs. +- [x] Reserve target scratch registers per class. +- [x] Handle operands nested in: + - [x] normal operands + - [x] indirect operands + - [x] call descriptors + - [x] return descriptors + - [x] intrinsic descriptors + - [x] inline asm descriptors +- [x] Preserve call-clobber saves/restores with a rolling live set. +- [x] Delete noop moves after rewrite. +- [x] Avoid persistent per-instruction full `live_after` storage. +- [x] Add rewrite dump output. + +Exit criteria: + +- [x] O1 emits correct code without `Block.live_after`. +- [x] Reload/store metrics are stable and understandable. +- [x] Public O1 behavior and smoke JIT tests pass. + +### Phase 5: Reduce Phase-4 Rewrite Overhead + +Phase-4 timings show the no-`live_after` rewrite improves the large +straight-line case but regresses many-small-function/function-table inputs. Keep +the rolling-live design, but remove the temporary-list and reverse-copy cost +introduced by the first implementation. + +- [ ] Rewrite blocks with a single block-local output buffer or pre-sized edit + buffer. +- [ ] Avoid per-instruction `RewriteList` allocation/copy churn. +- [ ] Preserve rolling-live call-clobber behavior without persistent + `Block.live_after`. +- [ ] Keep scratch-register selection deterministic after the rewrite cleanup. +- [ ] Update tests that asserted old internal storage or exact intermediate + dump details. +- [ ] Re-run the same-input phase-3 vs phase-5 timing comparison from + `doc/PERF.md`. Exit criteria: -- [ ] O1 emits correct code without `Block.live_after`. -- [ ] Reload/store metrics are stable and understandable. -- [ ] Existing O1 tests and smoke JIT tests pass. +- [ ] Function-table/main-many-small `opt.regalloc` returns to phase-3 parity + or better. +- [ ] Large straight-line phase-4 gains are retained. +- [ ] No persistent per-instruction full live-after sets are rebuilt in normal + O1. -### Phase 5: Remove Old Dense Path +### Phase 6: Delete Old Dense Path - [ ] Remove normal-path allocation of `Func.val_conflicts`. -- [ ] Remove or demote `opt_conflict_words` from persistent `Func` state. -- [ ] Remove old all-values conflict construction from O1. -- [ ] Keep only compatibility helpers still required by unported passes. +- [ ] Delete old all-values conflict construction from O1, including + compatibility-only helpers and tests. +- [ ] Remove or demote `opt_live_words`, `opt_conflict_words`, + `Block.live_in`, `Block.live_out`, `Block.live_use`, `Block.live_def`, + and `Block.live_after` from persistent IR state where no public behavior + requires them. +- [ ] Replace `opt_live_info` callers with pass-local liveness/ranges, then + delete or narrow `opt_live_info`. +- [ ] Make tests assert public behavior and new pass dumps/metrics, not dense + conflict matrix details. - [ ] Update docs and metrics names to reflect range-based RA. Exit criteria: +- [ ] Normal O1 has no reachable dense pseudo-conflict construction. +- [ ] The source tree no longer contains tests whose purpose is preserving the + old dense interference model. +- [ ] `opt.conflict_bytes` is absent or always zero on normal O1 paths. + +### Phase 7: Narrow `used_locs` + +Phase-3 and phase-4 timings show the remaining large-function bend tracks +`opt.alloc.used_loc_words`. The first range allocator sizes occupied-location +bitmaps as hard-register bits plus one possible stack bit per candidate; that +preserves correctness but reintroduces superlinear memory/time for one large +function. + +- [ ] Split hard-register occupancy from stack-slot occupancy. +- [ ] Make stack occupancy demand-sized by allocated stack slots, not candidate + count. +- [ ] Consider sparse or interval-indexed stack occupancy for long straight-line + functions. +- [ ] Keep hard-register occupancy compact and class-aware. +- [ ] Add metrics separating hard occupancy words, stack occupancy words, and + allocated stack slot count. +- [ ] Re-run straight-line and table-main ladders after each shape change. + +Exit criteria: + - [ ] Large straight-line benchmark scales by range/point count, not - `nvals * live_words`. + `points * candidates`, `nvals * live_words`, or dense conflict bytes. - [ ] `doc/PERF.md` can be updated with before/after O1 scaling numbers. +- [ ] Table-main remains at phase-5 parity or better. -### Phase 6: O2 Coalescing +### Phase 8: O2 Coalescing - [ ] Collect move candidates after machinize. - [ ] Add move-only liveness using `scan_vars`-style restricted variables. @@ -662,7 +731,7 @@ Exit criteria: - [ ] Coalescing is optional and does not affect O1 compile-time shape. - [ ] Matrix memory is bounded and reported. -### Phase 7: O2 Splitting and Spill Placement +### Phase 9: O2 Splitting and Spill Placement - [ ] Add split-capable occupancy state similar to MIR's `busy_used_locs`. - [ ] Identify profitable live-range gaps. @@ -676,7 +745,7 @@ Exit criteria: - [ ] O2 can reduce spills without changing O1 assignment behavior. - [ ] Edge/block placement tests cover diamonds, loops, and critical edges. -### Phase 8: Cleanup and Documentation +### Phase 10: Cleanup and Documentation - [ ] Update `doc/OPT.md` with the final implemented module names and pass order. @@ -693,6 +762,7 @@ Avoid spending early effort on: - micro-optimizing the existing full conflict matrix - adding sparse rows to `val_conflicts` as a long-term solution - tuning candidate priority before range-based allocation exists +- preserving optimizer-internal compatibility with the dense allocator era - link/JIT performance - parser/CG data-structure changes @@ -740,8 +810,10 @@ knowledge behind explicit helpers: ## Bottom Line MIR's RA performance shape comes from avoiding a full pseudo interference graph -in the normal allocation path. cfree should mirror that. The next serious O1 -work should be a structural rewrite around pass-local liveness, compressed live -ranges, occupied-location assignment, and rewrite from an allocation map. +in the normal allocation path. cfree should mirror that, even when doing so +breaks old optimizer-internal APIs, fields, dumps, or tests. The next serious +O1 work should keep the pass-local liveness, compressed live ranges, +occupied-location assignment, and assignment-map rewrite shape, then remove the +transition scaffolding and remaining scaling buckets. Once that shape exists, incremental tuning will have a much better foundation. diff --git a/doc/PERF.md b/doc/PERF.md @@ -414,12 +414,87 @@ Next useful compile instrumentation: - Add pool/arena allocation counters or high-water marks for frontend and CG arenas. +### Phase-4 Rewrite Rerun + +After `doc/MIR_RA_REPORT.md` phase 4, O1 rewrite no longer materializes +`Block.live_after`. Inline-asm clobber constraints and call-clobber +save/restore insertion now use rolling backward live sets. The old full +per-instruction live-after storage is still available through `opt_live_info` +for compatibility tests, but the normal O1 `opt_regalloc` path no longer +requires it. + +Samples: 5 runs per point. The table uses p50 milliseconds. O1 scopes and O1 +counters are summed across functions. The generator is the same direct-call vs +function-table family used for the phase-3 rerun, but these are fresh generated +inputs and not byte-for-byte identical to the earlier tables. + +Straight-line main: + +```text +funcs input_bytes insts vals pre_block_bytes ranges range_points used_loc_words conflict_bytes run.total compile.tu parse_codegen opt.o1.total live_blocks_pre live_ranges_reg regalloc link_jit +1 218 61 38 576 37 52 104 0 0.584 0.475 0.319 0.208 0.059 0.033 0.067 0.062 +4 731 226 140 2016 142 196 392 0 1.050 0.943 0.800 0.530 0.146 0.090 0.193 0.058 +16 2810 886 548 7872 562 772 1776 0 3.044 2.912 2.754 1.855 0.507 0.345 0.726 0.065 +64 11191 3526 2180 31392 2242 3076 8864 0 10.963 10.756 10.599 7.140 1.924 1.375 2.906 0.089 +128 22450 7046 4356 62688 4482 6148 23096 0 21.189 20.676 20.524 13.718 3.682 2.690 5.706 0.110 +256 45191 14086 8708 125280 8962 12292 67688 0 41.840 41.173 41.019 27.476 7.238 5.464 11.777 0.157 +512 90668 28166 17412 250464 17922 24580 221384 0 87.461 86.405 86.252 58.701 14.896 11.760 26.346 0.243 +1024 181691 56326 34820 500832 35842 49156 786824 0 190.569 188.638 188.479 129.835 32.046 26.404 61.906 0.425 +``` + +Function-table main: + +```text +funcs input_bytes insts vals pre_block_bytes ranges range_points used_loc_words conflict_bytes run.total compile.tu parse_codegen opt.o1.total live_blocks_pre live_ranges_reg regalloc link_jit +1 311 82 49 832 48 67 134 0 0.665 0.545 0.381 0.230 0.064 0.036 0.077 0.066 +4 788 226 133 2272 135 190 380 0 1.122 0.989 0.839 0.550 0.149 0.094 0.203 0.068 +16 2718 802 469 8032 483 682 1364 0 2.886 2.744 2.592 1.737 0.481 0.316 0.661 0.076 +64 10475 3106 1813 31072 1875 2650 5300 0 10.054 9.828 9.670 6.569 1.827 1.207 2.491 0.097 +128 20875 6178 3605 61792 3731 5274 10548 0 18.978 18.667 18.515 12.271 3.328 2.275 4.800 0.117 +256 41824 12322 7189 123232 7443 10522 21044 0 37.247 36.757 36.593 24.043 6.552 4.505 9.416 0.158 +512 83717 24610 14357 246112 14867 21018 42036 0 74.068 73.190 73.035 47.519 12.995 8.945 18.699 0.236 +1024 167549 49186 28693 491872 29715 42010 84020 0 153.281 151.622 151.471 96.303 26.390 17.895 37.547 0.382 +``` + +To isolate the phase-4 change, the same generated inputs were also run against +a temporary clean worktree at the phase-3 baseline (`HEAD` before the phase-4 +working-tree edits). The table below shows phase-3 p50 to phase-4 p50 on those +identical inputs. + +```text +straight_main: +funcs run.total opt.o1.total live_ranges_reg regalloc +512 88.544 -> 87.461 (-1.2%) 60.910 -> 58.701 (-3.6%) 12.268 -> 11.760 (-4.1%) 29.589 -> 26.346 (-11.0%) +1024 206.333 -> 190.569 (-7.6%) 147.862 -> 129.835 (-12.2%) 27.774 -> 26.404 (-4.9%) 82.750 -> 61.906 (-25.2%) + +table_main: +funcs run.total opt.o1.total live_ranges_reg regalloc +512 69.043 -> 74.068 (+7.3%) 43.713 -> 47.519 (+8.7%) 9.240 -> 8.945 (-3.2%) 15.729 -> 18.699 (+18.9%) +1024 143.333 -> 153.281 (+6.9%) 88.574 -> 96.303 (+8.7%) 18.549 -> 17.895 (-3.5%) 31.645 -> 37.547 (+18.7%) +``` + +An extra 7-sample repeat at 512 and 1024 confirmed the shape: large +straight-line `regalloc` improves by about 13-27%, while the table-shaped case +regresses by about 16-17% in `regalloc`. The live-range analysis bucket itself +improves modestly in both shapes. The table-main regression is therefore in +rewrite mechanics, not range construction. + +The most likely cause is the first phase-4 implementation strategy: it rewrites +each block while walking backward, builds per-instruction temporary rewrite +lists, appends them in reverse, then reverses the block. That removes the large +persistent `live_after` allocation and helps the large straight-line case, but +it adds list-copy overhead that is visible when many compact functions dominate +the workload. A forward rewrite with a precomputed rolling-live cursor or a +block-local edit buffer should keep the no-`live_after` shape without the +table-main tax. + ## Performance Priorities -1. Finish phase-4 rewrite without persistent `live_after`. - Phase 3 removed the dense conflict matrix from O1 assignment. Rewrite still - needs per-instruction full live-after sets for call preservation and spill - insertion, so that is the next correctness-preserving structural target. +1. Reduce phase-4 rewrite overhead. + Phase 4 removed persistent `Block.live_after` from the normal O1 path and + improved the large straight-line case, but the current backward rewrite + regresses many-small-function/table-shaped inputs. Keep the rolling-live + design, but remove the extra temporary-list/reverse-copy cost. 2. Compress or narrow `used_locs`. The remaining straight-line superlinear bucket now tracks diff --git a/src/opt/opt.h b/src/opt/opt.h @@ -151,6 +151,7 @@ void opt_live_blocks(Func*, OptLiveInfo*); void opt_live_dump_blocks(Func*, const OptLiveInfo*, Writer*); void opt_live_ranges_build(Func*, const OptLiveInfo*, OptLiveRangeSet*); void opt_live_dump_ranges(Func*, const OptLiveRangeSet*, Writer*); +void opt_rewrite_dump(Func*, Writer*); void opt_live_info(Func*); void opt_coalesce(Func*); void opt_regalloc(Func*, int allow_live_range_split); diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c @@ -1,3 +1,4 @@ +#include <stdio.h> #include <string.h> #include "core/arena.h" @@ -825,34 +826,48 @@ static void opt_init_val_info_from_ranges(Func* f, f->opt_conflict_words = 0; } -static void opt_build_live_after_only(Func* f, const OptLiveInfo* live_info) { +static void bits_clear(u64* bits, u32 words) { + for (u32 i = 0; i < words; ++i) bits[i] = 0; +} + +static void live_update_before(u64* live, const u64* use, const u64* def, + u32 words) { + for (u32 w = 0; w < words; ++w) live[w] = (live[w] & ~def[w]) | use[w]; +} + +static void live_copy_block_out(Func* f, const OptLiveInfo* live_info, u32 b, + u64* live, u32 words) { + bits_clear(live, words); + if (live_info) { + const OptBitset* out = &live_info->blocks[b].live_out; + for (u32 w = 0; w < words && w < out->nwords; ++w) live[w] = out->words[w]; + } else if (b < f->nblocks && f->blocks[b].live_out) { + copy_bits(live, f->blocks[b].live_out, words); + } +} + +static void opt_apply_asm_constraints_from_live(Func* f, + const OptLiveInfo* live_info) { u32 words = live_info ? live_info->words : f->opt_live_words; if (!words) words = bit_words(f->nvals); f->opt_live_words = (u16)words; + + u64* live = arena_zarray(f->arena, u64, words ? words : 1u); + u64* use = arena_zarray(f->arena, u64, words ? words : 1u); + u64* def = arena_zarray(f->arena, u64, words ? words : 1u); for (u32 b = 0; b < f->nblocks; ++b) { Block* bl = &f->blocks[b]; - bl->live_after = arena_array(f->arena, u64*, bl->ninsts ? bl->ninsts : 1u); - u64* live = arena_zarray(f->arena, u64, words); - if (live_info) { - const OptBitset* out = &live_info->blocks[b].live_out; - for (u32 w = 0; w < words && w < out->nwords; ++w) live[w] = out->words[w]; - } else if (bl->live_out) { - copy_bits(live, bl->live_out, words); - } + live_copy_block_out(f, live_info, b, live, words); for (u32 ri = bl->ninsts; ri > 0; --ri) { u32 i = ri - 1u; Inst* in = &bl->insts[i]; - u64* after = arena_zarray(f->arena, u64, words); - copy_bits(after, live, words); - bl->live_after[i] = after; - - u64* use = arena_zarray(f->arena, u64, words); - u64* def = arena_zarray(f->arena, u64, words); + bits_clear(use, words); + bits_clear(def, words); BitsCtx bc = {use, def}; walk_inst_operands(f, in, collect_bits, &bc); if ((IROp)in->op == IR_ASM_BLOCK) - apply_asm_register_constraints(f, in, use, def, after); - for (u32 w = 0; w < words; ++w) live[w] = (live[w] & ~def[w]) | use[w]; + apply_asm_register_constraints(f, in, use, def, live); + live_update_before(live, use, def, words); } } } @@ -1163,17 +1178,60 @@ static void rewrite_call_arg_value(Func* f, Inst* owner, CGABIValue* v, } } -static void rewrite_func(Func* f) { +typedef struct RewriteCallSaveCtx { + Func* f; + RewriteList* out; + const u64* def; + int emit_restore; +} RewriteCallSaveCtx; + +static void rewrite_call_save_one(Val v, void* arg) { + RewriteCallSaveCtx* c = (RewriteCallSaveCtx*)arg; + Func* f = c->f; + if (v == VAL_NONE || v >= f->nvals) return; + if (bit_has(c->def, v)) return; + if (f->val_info[v].alloc_kind != OPT_ALLOC_HARD) return; + if (!is_caller_saved(f, f->val_info[v].cls, f->val_info[v].hard_reg)) return; + if (c->emit_restore) + append_load_val(f, c->out, v); + else + append_store_val(f, c->out, v); +} + +static void append_live_call_saves(Func* f, RewriteList* out, + const u64* live_after, const u64* def, + u32 words, int emit_restore) { + RewriteCallSaveCtx ctx = {f, out, def, emit_restore}; + for (u32 w = 0; w < words; ++w) { + u64 bits = live_after[w]; + while (bits) { + u32 bit = (u32)__builtin_ctzll(bits); + rewrite_call_save_one(w * 64u + bit, &ctx); + bits &= bits - 1u; + } + } +} + +static void rewrite_func(Func* f, const OptLiveInfo* live_info) { + u32 words = live_info ? live_info->words : f->opt_live_words; + if (!words) words = bit_words(f->nvals); + f->opt_live_words = (u16)words; + + u64* live = arena_zarray(f->arena, u64, words ? words : 1u); + u64* use = arena_zarray(f->arena, u64, words ? words : 1u); + u64* def = arena_zarray(f->arena, u64, words ? words : 1u); for (u32 b = 0; b < f->nblocks; ++b) { Block* bl = &f->blocks[b]; - u64** live_after = bl->live_after; + RewriteList rev; + memset(&rev, 0, sizeof rev); + live_copy_block_out(f, live_info, b, live, words); - RewriteList out; - memset(&out, 0, sizeof out); - for (u32 i = 0; i < bl->ninsts; ++i) { + for (u32 ri = bl->ninsts; ri > 0; --ri) { + u32 i = ri - 1u; Inst in = bl->insts[i]; - u64* def = arena_zarray(f->arena, u64, f->opt_live_words); - BitsCtx db = {arena_zarray(f->arena, u64, f->opt_live_words), def}; + bits_clear(use, words); + bits_clear(def, words); + BitsCtx db = {use, def}; walk_inst_operands(f, &in, collect_bits, &db); RewriteList before, after; memset(&before, 0, sizeof before); @@ -1194,40 +1252,67 @@ static void rewrite_func(Func* f) { } else { walk_inst_operands(f, &in, rewrite_one_operand, &ctx); } + RewriteList seq; + memset(&seq, 0, sizeof seq); for (u32 k = 0; k < before.n; ++k) { - Inst* dst = list_push(f, &out, (IROp)before.data[k].op); + Inst* dst = list_push(f, &seq, (IROp)before.data[k].op); *dst = before.data[k]; } - if ((IROp)in.op == IR_CALL) { - for (Val v = 1; v < f->nvals; ++v) { - if (!bit_has(live_after[i], v) || bit_has(def, v)) continue; - if (f->val_info[v].alloc_kind == OPT_ALLOC_HARD && - is_caller_saved(f, f->val_info[v].cls, f->val_info[v].hard_reg)) - append_store_val(f, &out, v); - } - } - Inst* dst = list_push(f, &out, (IROp)in.op); + if ((IROp)in.op == IR_CALL) + append_live_call_saves(f, &seq, live, def, words, 0); + Inst* dst = list_push(f, &seq, (IROp)in.op); *dst = in; - if ((IROp)in.op == IR_CALL) { - for (Val v = 1; v < f->nvals; ++v) { - if (!bit_has(live_after[i], v) || bit_has(def, v)) continue; - if (f->val_info[v].alloc_kind == OPT_ALLOC_HARD && - is_caller_saved(f, f->val_info[v].cls, f->val_info[v].hard_reg)) - append_load_val(f, &out, v); - } - } + if ((IROp)in.op == IR_CALL) + append_live_call_saves(f, &seq, live, def, words, 1); for (u32 k = 0; k < after.n; ++k) { - Inst* ad = list_push(f, &out, (IROp)after.data[k].op); + Inst* ad = list_push(f, &seq, (IROp)after.data[k].op); *ad = after.data[k]; } + + for (u32 k = seq.n; k > 0; --k) { + Inst* rd = list_push(f, &rev, (IROp)seq.data[k - 1u].op); + *rd = seq.data[k - 1u]; + } + live_update_before(live, use, def, words); } - bl->insts = out.data; - bl->ninsts = out.n; - bl->cap = out.cap; + for (u32 i = 0; i < rev.n / 2u; ++i) { + Inst tmp = rev.data[i]; + rev.data[i] = rev.data[rev.n - 1u - i]; + rev.data[rev.n - 1u - i] = tmp; + } + bl->insts = rev.data; + bl->ninsts = rev.n; + bl->cap = rev.cap; + bl->live_after = NULL; } f->opt_rewritten = 1; } +static void rewrite_dump_write(Writer* w, const char* s) { + cfree_writer_write(w, s, strlen(s)); +} + +void opt_rewrite_dump(Func* f, Writer* w) { + if (!f || !w) return; + char buf[96]; + snprintf(buf, sizeof buf, "rewrite blocks=%u vals=%u rewritten=%u\n", + (unsigned)f->nblocks, (unsigned)f->nvals, + (unsigned)f->opt_rewritten); + rewrite_dump_write(w, buf); + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + snprintf(buf, sizeof buf, "block %u insts=%u\n", (unsigned)b, + (unsigned)bl->ninsts); + rewrite_dump_write(w, buf); + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* in = &bl->insts[i]; + snprintf(buf, sizeof buf, " %u op=%u operands=%u\n", (unsigned)i, + (unsigned)in->op, (unsigned)in->nopnds); + rewrite_dump_write(w, buf); + } + } +} + static int inst_has_side_effect(Func* f, const Inst* in); static int all_defs_dead(Func* f, Inst* in, u64* live) { @@ -1297,14 +1382,14 @@ void opt_regalloc(Func* f, int allow_live_range_split) { OptLiveRangeSet ranges; opt_live_ranges_build(f, &live, &ranges); opt_init_val_info_from_ranges(f, &ranges); - opt_build_live_after_only(f, &live); + opt_apply_asm_constraints_from_live(f, &live); metrics_count(f->c, "opt.live_words", f->opt_live_words); metrics_count(f->c, "opt.conflict_bytes", 0); metrics_scope_end(f->c, "opt.live_ranges.regalloc"); OptAllocator alloc; opt_assign_ranges(f, &ranges, &alloc); - rewrite_func(f); + rewrite_func(f, &live); } static int same_reg_operand(const Operand* a, const Operand* b) { diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c @@ -1216,15 +1216,26 @@ static void opt_rewrite_spill_use_def(void) { emit_ret_val(f, f->entry, c, tc.i32); opt_build_cfg(f); opt_build_loop_tree(f); - opt_live_info(f); opt_regalloc(f, 0); EXPECT(f->opt_rewritten, "regalloc should rewrite pseudos"); + EXPECT(f->blocks[f->entry].live_after == NULL, + "range rewrite should not require per-instruction live_after sets"); EXPECT(count_op(f, IR_STORE) >= 1, "spill def should insert store"); EXPECT(count_op(f, IR_LOAD) >= 1, "spill use should insert reload"); int saw_spill_slot = 0; for (u32 i = 0; i < f->nframe_slots; ++i) if (f->frame_slots[i].kind == FS_SPILL) saw_spill_slot = 1; EXPECT(saw_spill_slot, "rewrite should allocate FS_SPILL slot"); + + CfreeWriter* w = cfree_writer_mem(&g_heap); + opt_rewrite_dump(f, w); + size_t len = 0; + const unsigned char* bytes = cfree_writer_mem_bytes(w, &len); + EXPECT(bytes_contains(bytes, len, "rewrite blocks="), + "rewrite dump should include summary"); + EXPECT(bytes_contains(bytes, len, "op="), + "rewrite dump should include rewritten instructions"); + cfree_writer_close(w); tc_fini(&tc); } @@ -1245,10 +1256,11 @@ static void opt_call_clobber_preservation(void) { emit_ret_val(f, f->entry, live, tc.i32); opt_build_cfg(f); opt_build_loop_tree(f); - opt_live_info(f); opt_regalloc(f, 0); Block* b = &f->blocks[f->entry]; + EXPECT(b->live_after == NULL, + "call rewrite should keep live-after state pass-local"); int saw_call_save_restore = 0; for (u32 i = 1; i + 1 < b->ninsts; ++i) { if ((IROp)b->insts[i].op == IR_CALL && @@ -1279,10 +1291,11 @@ static void opt_call_clobber_caller_saved(void) { emit_ret_val(f, f->entry, live, tc.i32); opt_build_cfg(f); opt_build_loop_tree(f); - opt_live_info(f); opt_regalloc(f, 0); Block* b = &f->blocks[f->entry]; + EXPECT(b->live_after == NULL, + "call rewrite should keep live-after state pass-local"); int saw_call_save_restore = 0; for (u32 i = 1; i + 1 < b->ninsts; ++i) { if ((IROp)b->insts[i].op == IR_CALL && @@ -1320,7 +1333,6 @@ static void opt_spill_pressure(void) { emit_ret_val(f, f->entry, d, tc.i32); opt_build_cfg(f); opt_build_loop_tree(f); - opt_live_info(f); opt_regalloc(f, 0); int spills = 0; @@ -1437,7 +1449,6 @@ static void opt_inline_asm_constraints_and_clobbers(void) { opt_machinize(f, &mock.base); opt_build_cfg(f); opt_build_loop_tree(f); - opt_live_info(f); opt_regalloc(f, 0); aux = (IRAsmAux*)in->extra.aux; @@ -1560,7 +1571,6 @@ static void opt_regalloc_spill_requires_scratch(void) { emit_ret_val(f, f->entry, c, tc.i32); opt_build_cfg(f); opt_build_loop_tree(f); - opt_live_info(f); NoScratchCtx nctx = {f}; EXPECT(expect_panic(tc.c, run_no_scratch_regalloc, &nctx),