commit a11e9f6d86f9aa5c4d8d543fee63730bfa8b3991
parent 0bfde91143b108d562e7bc2754f9335c212745d5
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 14 May 2026 19:42:22 -0700
Implement range rewrite without live_after
Diffstat:
5 files changed, 339 insertions(+), 96 deletions(-)
diff --git a/doc/MIR_RA_REPORT.md b/doc/MIR_RA_REPORT.md
@@ -486,17 +486,20 @@ Deliverables:
- noop move deletion
- no required per-instruction full live-after storage
-### Phase 5: Coalescing and Splitting
+### Phase 5: O1 Cleanup and Scaling
-After O1 has the right shape, add O2 allocator features:
+After O1 has the right shape, remove transition scaffolding and fix the
+remaining O1 scaling buckets before adding O2 allocator features:
-- move collection
-- move-only liveness
-- capped conflict matrix for coalescing
-- live-range splitting
-- edge/block spill placement
+- rewrite without extra per-instruction temporary list churn
+- delete the old dense liveness/conflict path instead of preserving
+ compatibility for old optimizer-internal tests
+- narrow or compress `used_locs` so stack occupancy does not scale as
+ `points * candidates`
-This mirrors MIR's division: the matrix is an optional coalescing tool, not the
+Only after that should O2 add move collection, move-only liveness, capped
+coalescing matrices, live-range splitting, and edge/block spill placement. This
+mirrors MIR's division: the matrix is an optional coalescing tool, not the
allocator's main model.
## Implementation Checklist
@@ -504,6 +507,12 @@ allocator's main model.
Use this as the working tracker for the structural rewrite. Each phase should
land with focused tests and metrics before moving to the next phase.
+Compatibility policy: compatibility with the old O1 optimizer internals is not
+a goal. The public compiler/API behavior must stay correct, but internal helper
+APIs, dumps, tests, metrics, and persistent IR fields may be broken, renamed, or
+deleted when they preserve the dense-conflict shape or add runtime cost. Prefer
+a clean fast implementation over shims for old pass boundaries.
+
### Phase 0: Baseline and Guardrails
- [x] Preserve current O0 behavior as the reference path.
@@ -548,12 +557,13 @@ coverage belongs in a later pass once the range allocator owns rewrite.
- [x] Compute `live_in`/`live_out` without allocating conflicts.
- [x] Make pre-DDE use only block liveness.
- [x] Keep old full `opt_live_info` available for regalloc during transition.
+ This is now transition debt, not a compatibility requirement.
- [x] Add dump output for block liveness.
- [x] Add tests for branch, loop, and call liveness.
Exit criteria:
-- [x] Existing O1 behavior is unchanged.
+- [x] Existing public O1 behavior is unchanged.
- [x] Pre-DDE no longer builds `val_conflicts`.
- [x] Metrics show separate block-liveness timing and no pre-DDE conflict bytes.
@@ -608,43 +618,102 @@ Exit criteria:
### Phase 4: Rewrite From Assignment Map
-- [ ] Rewrite hard-assigned values to hard registers.
-- [ ] Insert reloads for stack-assigned uses.
-- [ ] Insert stores for stack-assigned defs.
-- [ ] Reserve target scratch registers per class.
-- [ ] Handle operands nested in:
- - [ ] normal operands
- - [ ] indirect operands
- - [ ] call descriptors
- - [ ] return descriptors
- - [ ] intrinsic descriptors
- - [ ] inline asm descriptors
-- [ ] Preserve call-clobber saves/restores with a rolling live set.
-- [ ] Delete noop moves after rewrite.
-- [ ] Avoid persistent per-instruction full `live_after` storage.
-- [ ] Add rewrite dump output.
+- [x] Rewrite hard-assigned values to hard registers.
+- [x] Insert reloads for stack-assigned uses.
+- [x] Insert stores for stack-assigned defs.
+- [x] Reserve target scratch registers per class.
+- [x] Handle operands nested in:
+ - [x] normal operands
+ - [x] indirect operands
+ - [x] call descriptors
+ - [x] return descriptors
+ - [x] intrinsic descriptors
+ - [x] inline asm descriptors
+- [x] Preserve call-clobber saves/restores with a rolling live set.
+- [x] Delete noop moves after rewrite.
+- [x] Avoid persistent per-instruction full `live_after` storage.
+- [x] Add rewrite dump output.
+
+Exit criteria:
+
+- [x] O1 emits correct code without `Block.live_after`.
+- [x] Reload/store metrics are stable and understandable.
+- [x] Public O1 behavior and smoke JIT tests pass.
+
+### Phase 5: Reduce Phase-4 Rewrite Overhead
+
+Phase-4 timings show the no-`live_after` rewrite improves the large
+straight-line case but regresses many-small-function/function-table inputs. Keep
+the rolling-live design, but remove the temporary-list and reverse-copy cost
+introduced by the first implementation.
+
+- [ ] Rewrite blocks with a single block-local output buffer or pre-sized edit
+ buffer.
+- [ ] Avoid per-instruction `RewriteList` allocation/copy churn.
+- [ ] Preserve rolling-live call-clobber behavior without persistent
+ `Block.live_after`.
+- [ ] Keep scratch-register selection deterministic after the rewrite cleanup.
+- [ ] Update tests that asserted old internal storage or exact intermediate
+ dump details.
+- [ ] Re-run the same-input phase-3 vs phase-5 timing comparison from
+ `doc/PERF.md`.
Exit criteria:
-- [ ] O1 emits correct code without `Block.live_after`.
-- [ ] Reload/store metrics are stable and understandable.
-- [ ] Existing O1 tests and smoke JIT tests pass.
+- [ ] Function-table/main-many-small `opt.regalloc` returns to phase-3 parity
+ or better.
+- [ ] Large straight-line phase-4 gains are retained.
+- [ ] No persistent per-instruction full live-after sets are rebuilt in normal
+ O1.
-### Phase 5: Remove Old Dense Path
+### Phase 6: Delete Old Dense Path
- [ ] Remove normal-path allocation of `Func.val_conflicts`.
-- [ ] Remove or demote `opt_conflict_words` from persistent `Func` state.
-- [ ] Remove old all-values conflict construction from O1.
-- [ ] Keep only compatibility helpers still required by unported passes.
+- [ ] Delete old all-values conflict construction from O1, including
+ compatibility-only helpers and tests.
+- [ ] Remove or demote `opt_live_words`, `opt_conflict_words`,
+ `Block.live_in`, `Block.live_out`, `Block.live_use`, `Block.live_def`,
+ and `Block.live_after` from persistent IR state where no public behavior
+ requires them.
+- [ ] Replace `opt_live_info` callers with pass-local liveness/ranges, then
+ delete or narrow `opt_live_info`.
+- [ ] Make tests assert public behavior and new pass dumps/metrics, not dense
+ conflict matrix details.
- [ ] Update docs and metrics names to reflect range-based RA.
Exit criteria:
+- [ ] Normal O1 has no reachable dense pseudo-conflict construction.
+- [ ] The source tree no longer contains tests whose purpose is preserving the
+ old dense interference model.
+- [ ] `opt.conflict_bytes` is absent or always zero on normal O1 paths.
+
+### Phase 7: Narrow `used_locs`
+
+Phase-3 and phase-4 timings show the remaining large-function bend tracks
+`opt.alloc.used_loc_words`. The first range allocator sizes occupied-location
+bitmaps as hard-register bits plus one possible stack bit per candidate; that
+preserves correctness but reintroduces superlinear memory/time for one large
+function.
+
+- [ ] Split hard-register occupancy from stack-slot occupancy.
+- [ ] Make stack occupancy demand-sized by allocated stack slots, not candidate
+ count.
+- [ ] Consider sparse or interval-indexed stack occupancy for long straight-line
+ functions.
+- [ ] Keep hard-register occupancy compact and class-aware.
+- [ ] Add metrics separating hard occupancy words, stack occupancy words, and
+ allocated stack slot count.
+- [ ] Re-run straight-line and table-main ladders after each shape change.
+
+Exit criteria:
+
- [ ] Large straight-line benchmark scales by range/point count, not
- `nvals * live_words`.
+ `points * candidates`, `nvals * live_words`, or dense conflict bytes.
- [ ] `doc/PERF.md` can be updated with before/after O1 scaling numbers.
+- [ ] Table-main remains at phase-5 parity or better.
-### Phase 6: O2 Coalescing
+### Phase 8: O2 Coalescing
- [ ] Collect move candidates after machinize.
- [ ] Add move-only liveness using `scan_vars`-style restricted variables.
@@ -662,7 +731,7 @@ Exit criteria:
- [ ] Coalescing is optional and does not affect O1 compile-time shape.
- [ ] Matrix memory is bounded and reported.
-### Phase 7: O2 Splitting and Spill Placement
+### Phase 9: O2 Splitting and Spill Placement
- [ ] Add split-capable occupancy state similar to MIR's `busy_used_locs`.
- [ ] Identify profitable live-range gaps.
@@ -676,7 +745,7 @@ Exit criteria:
- [ ] O2 can reduce spills without changing O1 assignment behavior.
- [ ] Edge/block placement tests cover diamonds, loops, and critical edges.
-### Phase 8: Cleanup and Documentation
+### Phase 10: Cleanup and Documentation
- [ ] Update `doc/OPT.md` with the final implemented module names and pass
order.
@@ -693,6 +762,7 @@ Avoid spending early effort on:
- micro-optimizing the existing full conflict matrix
- adding sparse rows to `val_conflicts` as a long-term solution
- tuning candidate priority before range-based allocation exists
+- preserving optimizer-internal compatibility with the dense allocator era
- link/JIT performance
- parser/CG data-structure changes
@@ -740,8 +810,10 @@ knowledge behind explicit helpers:
## Bottom Line
MIR's RA performance shape comes from avoiding a full pseudo interference graph
-in the normal allocation path. cfree should mirror that. The next serious O1
-work should be a structural rewrite around pass-local liveness, compressed live
-ranges, occupied-location assignment, and rewrite from an allocation map.
+in the normal allocation path. cfree should mirror that, even when doing so
+breaks old optimizer-internal APIs, fields, dumps, or tests. The next serious
+O1 work should keep the pass-local liveness, compressed live ranges,
+occupied-location assignment, and assignment-map rewrite shape, then remove the
+transition scaffolding and remaining scaling buckets.
Once that shape exists, incremental tuning will have a much better foundation.
diff --git a/doc/PERF.md b/doc/PERF.md
@@ -414,12 +414,87 @@ Next useful compile instrumentation:
- Add pool/arena allocation counters or high-water marks for frontend and CG
arenas.
+### Phase-4 Rewrite Rerun
+
+After `doc/MIR_RA_REPORT.md` phase 4, O1 rewrite no longer materializes
+`Block.live_after`. Inline-asm clobber constraints and call-clobber
+save/restore insertion now use rolling backward live sets. The old full
+per-instruction live-after storage is still available through `opt_live_info`
+for compatibility tests, but the normal O1 `opt_regalloc` path no longer
+requires it.
+
+Samples: 5 runs per point. The table uses p50 milliseconds. O1 scopes and O1
+counters are summed across functions. The generator is the same direct-call vs
+function-table family used for the phase-3 rerun, but these are fresh generated
+inputs and not byte-for-byte identical to the earlier tables.
+
+Straight-line main:
+
+```text
+funcs input_bytes insts vals pre_block_bytes ranges range_points used_loc_words conflict_bytes run.total compile.tu parse_codegen opt.o1.total live_blocks_pre live_ranges_reg regalloc link_jit
+1 218 61 38 576 37 52 104 0 0.584 0.475 0.319 0.208 0.059 0.033 0.067 0.062
+4 731 226 140 2016 142 196 392 0 1.050 0.943 0.800 0.530 0.146 0.090 0.193 0.058
+16 2810 886 548 7872 562 772 1776 0 3.044 2.912 2.754 1.855 0.507 0.345 0.726 0.065
+64 11191 3526 2180 31392 2242 3076 8864 0 10.963 10.756 10.599 7.140 1.924 1.375 2.906 0.089
+128 22450 7046 4356 62688 4482 6148 23096 0 21.189 20.676 20.524 13.718 3.682 2.690 5.706 0.110
+256 45191 14086 8708 125280 8962 12292 67688 0 41.840 41.173 41.019 27.476 7.238 5.464 11.777 0.157
+512 90668 28166 17412 250464 17922 24580 221384 0 87.461 86.405 86.252 58.701 14.896 11.760 26.346 0.243
+1024 181691 56326 34820 500832 35842 49156 786824 0 190.569 188.638 188.479 129.835 32.046 26.404 61.906 0.425
+```
+
+Function-table main:
+
+```text
+funcs input_bytes insts vals pre_block_bytes ranges range_points used_loc_words conflict_bytes run.total compile.tu parse_codegen opt.o1.total live_blocks_pre live_ranges_reg regalloc link_jit
+1 311 82 49 832 48 67 134 0 0.665 0.545 0.381 0.230 0.064 0.036 0.077 0.066
+4 788 226 133 2272 135 190 380 0 1.122 0.989 0.839 0.550 0.149 0.094 0.203 0.068
+16 2718 802 469 8032 483 682 1364 0 2.886 2.744 2.592 1.737 0.481 0.316 0.661 0.076
+64 10475 3106 1813 31072 1875 2650 5300 0 10.054 9.828 9.670 6.569 1.827 1.207 2.491 0.097
+128 20875 6178 3605 61792 3731 5274 10548 0 18.978 18.667 18.515 12.271 3.328 2.275 4.800 0.117
+256 41824 12322 7189 123232 7443 10522 21044 0 37.247 36.757 36.593 24.043 6.552 4.505 9.416 0.158
+512 83717 24610 14357 246112 14867 21018 42036 0 74.068 73.190 73.035 47.519 12.995 8.945 18.699 0.236
+1024 167549 49186 28693 491872 29715 42010 84020 0 153.281 151.622 151.471 96.303 26.390 17.895 37.547 0.382
+```
+
+To isolate the phase-4 change, the same generated inputs were also run against
+a temporary clean worktree at the phase-3 baseline (`HEAD` before the phase-4
+working-tree edits). The table below shows phase-3 p50 to phase-4 p50 on those
+identical inputs.
+
+```text
+straight_main:
+funcs run.total opt.o1.total live_ranges_reg regalloc
+512 88.544 -> 87.461 (-1.2%) 60.910 -> 58.701 (-3.6%) 12.268 -> 11.760 (-4.1%) 29.589 -> 26.346 (-11.0%)
+1024 206.333 -> 190.569 (-7.6%) 147.862 -> 129.835 (-12.2%) 27.774 -> 26.404 (-4.9%) 82.750 -> 61.906 (-25.2%)
+
+table_main:
+funcs run.total opt.o1.total live_ranges_reg regalloc
+512 69.043 -> 74.068 (+7.3%) 43.713 -> 47.519 (+8.7%) 9.240 -> 8.945 (-3.2%) 15.729 -> 18.699 (+18.9%)
+1024 143.333 -> 153.281 (+6.9%) 88.574 -> 96.303 (+8.7%) 18.549 -> 17.895 (-3.5%) 31.645 -> 37.547 (+18.7%)
+```
+
+An extra 7-sample repeat at 512 and 1024 confirmed the shape: large
+straight-line `regalloc` improves by about 13-27%, while the table-shaped case
+regresses by about 16-17% in `regalloc`. The live-range analysis bucket itself
+improves modestly in both shapes. The table-main regression is therefore in
+rewrite mechanics, not range construction.
+
+The most likely cause is the first phase-4 implementation strategy: it rewrites
+each block while walking backward, builds per-instruction temporary rewrite
+lists, appends them in reverse, then reverses the block. That removes the large
+persistent `live_after` allocation and helps the large straight-line case, but
+it adds list-copy overhead that is visible when many compact functions dominate
+the workload. A forward rewrite with a precomputed rolling-live cursor or a
+block-local edit buffer should keep the no-`live_after` shape without the
+table-main tax.
+
## Performance Priorities
-1. Finish phase-4 rewrite without persistent `live_after`.
- Phase 3 removed the dense conflict matrix from O1 assignment. Rewrite still
- needs per-instruction full live-after sets for call preservation and spill
- insertion, so that is the next correctness-preserving structural target.
+1. Reduce phase-4 rewrite overhead.
+ Phase 4 removed persistent `Block.live_after` from the normal O1 path and
+ improved the large straight-line case, but the current backward rewrite
+ regresses many-small-function/table-shaped inputs. Keep the rolling-live
+ design, but remove the extra temporary-list/reverse-copy cost.
2. Compress or narrow `used_locs`.
The remaining straight-line superlinear bucket now tracks
diff --git a/src/opt/opt.h b/src/opt/opt.h
@@ -151,6 +151,7 @@ void opt_live_blocks(Func*, OptLiveInfo*);
void opt_live_dump_blocks(Func*, const OptLiveInfo*, Writer*);
void opt_live_ranges_build(Func*, const OptLiveInfo*, OptLiveRangeSet*);
void opt_live_dump_ranges(Func*, const OptLiveRangeSet*, Writer*);
+void opt_rewrite_dump(Func*, Writer*);
void opt_live_info(Func*);
void opt_coalesce(Func*);
void opt_regalloc(Func*, int allow_live_range_split);
diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c
@@ -1,3 +1,4 @@
+#include <stdio.h>
#include <string.h>
#include "core/arena.h"
@@ -825,34 +826,48 @@ static void opt_init_val_info_from_ranges(Func* f,
f->opt_conflict_words = 0;
}
-static void opt_build_live_after_only(Func* f, const OptLiveInfo* live_info) {
+static void bits_clear(u64* bits, u32 words) {
+ for (u32 i = 0; i < words; ++i) bits[i] = 0;
+}
+
+static void live_update_before(u64* live, const u64* use, const u64* def,
+ u32 words) {
+ for (u32 w = 0; w < words; ++w) live[w] = (live[w] & ~def[w]) | use[w];
+}
+
+static void live_copy_block_out(Func* f, const OptLiveInfo* live_info, u32 b,
+ u64* live, u32 words) {
+ bits_clear(live, words);
+ if (live_info) {
+ const OptBitset* out = &live_info->blocks[b].live_out;
+ for (u32 w = 0; w < words && w < out->nwords; ++w) live[w] = out->words[w];
+ } else if (b < f->nblocks && f->blocks[b].live_out) {
+ copy_bits(live, f->blocks[b].live_out, words);
+ }
+}
+
+static void opt_apply_asm_constraints_from_live(Func* f,
+ const OptLiveInfo* live_info) {
u32 words = live_info ? live_info->words : f->opt_live_words;
if (!words) words = bit_words(f->nvals);
f->opt_live_words = (u16)words;
+
+ u64* live = arena_zarray(f->arena, u64, words ? words : 1u);
+ u64* use = arena_zarray(f->arena, u64, words ? words : 1u);
+ u64* def = arena_zarray(f->arena, u64, words ? words : 1u);
for (u32 b = 0; b < f->nblocks; ++b) {
Block* bl = &f->blocks[b];
- bl->live_after = arena_array(f->arena, u64*, bl->ninsts ? bl->ninsts : 1u);
- u64* live = arena_zarray(f->arena, u64, words);
- if (live_info) {
- const OptBitset* out = &live_info->blocks[b].live_out;
- for (u32 w = 0; w < words && w < out->nwords; ++w) live[w] = out->words[w];
- } else if (bl->live_out) {
- copy_bits(live, bl->live_out, words);
- }
+ live_copy_block_out(f, live_info, b, live, words);
for (u32 ri = bl->ninsts; ri > 0; --ri) {
u32 i = ri - 1u;
Inst* in = &bl->insts[i];
- u64* after = arena_zarray(f->arena, u64, words);
- copy_bits(after, live, words);
- bl->live_after[i] = after;
-
- u64* use = arena_zarray(f->arena, u64, words);
- u64* def = arena_zarray(f->arena, u64, words);
+ bits_clear(use, words);
+ bits_clear(def, words);
BitsCtx bc = {use, def};
walk_inst_operands(f, in, collect_bits, &bc);
if ((IROp)in->op == IR_ASM_BLOCK)
- apply_asm_register_constraints(f, in, use, def, after);
- for (u32 w = 0; w < words; ++w) live[w] = (live[w] & ~def[w]) | use[w];
+ apply_asm_register_constraints(f, in, use, def, live);
+ live_update_before(live, use, def, words);
}
}
}
@@ -1163,17 +1178,60 @@ static void rewrite_call_arg_value(Func* f, Inst* owner, CGABIValue* v,
}
}
-static void rewrite_func(Func* f) {
+typedef struct RewriteCallSaveCtx {
+ Func* f;
+ RewriteList* out;
+ const u64* def;
+ int emit_restore;
+} RewriteCallSaveCtx;
+
+static void rewrite_call_save_one(Val v, void* arg) {
+ RewriteCallSaveCtx* c = (RewriteCallSaveCtx*)arg;
+ Func* f = c->f;
+ if (v == VAL_NONE || v >= f->nvals) return;
+ if (bit_has(c->def, v)) return;
+ if (f->val_info[v].alloc_kind != OPT_ALLOC_HARD) return;
+ if (!is_caller_saved(f, f->val_info[v].cls, f->val_info[v].hard_reg)) return;
+ if (c->emit_restore)
+ append_load_val(f, c->out, v);
+ else
+ append_store_val(f, c->out, v);
+}
+
+static void append_live_call_saves(Func* f, RewriteList* out,
+ const u64* live_after, const u64* def,
+ u32 words, int emit_restore) {
+ RewriteCallSaveCtx ctx = {f, out, def, emit_restore};
+ for (u32 w = 0; w < words; ++w) {
+ u64 bits = live_after[w];
+ while (bits) {
+ u32 bit = (u32)__builtin_ctzll(bits);
+ rewrite_call_save_one(w * 64u + bit, &ctx);
+ bits &= bits - 1u;
+ }
+ }
+}
+
+static void rewrite_func(Func* f, const OptLiveInfo* live_info) {
+ u32 words = live_info ? live_info->words : f->opt_live_words;
+ if (!words) words = bit_words(f->nvals);
+ f->opt_live_words = (u16)words;
+
+ u64* live = arena_zarray(f->arena, u64, words ? words : 1u);
+ u64* use = arena_zarray(f->arena, u64, words ? words : 1u);
+ u64* def = arena_zarray(f->arena, u64, words ? words : 1u);
for (u32 b = 0; b < f->nblocks; ++b) {
Block* bl = &f->blocks[b];
- u64** live_after = bl->live_after;
+ RewriteList rev;
+ memset(&rev, 0, sizeof rev);
+ live_copy_block_out(f, live_info, b, live, words);
- RewriteList out;
- memset(&out, 0, sizeof out);
- for (u32 i = 0; i < bl->ninsts; ++i) {
+ for (u32 ri = bl->ninsts; ri > 0; --ri) {
+ u32 i = ri - 1u;
Inst in = bl->insts[i];
- u64* def = arena_zarray(f->arena, u64, f->opt_live_words);
- BitsCtx db = {arena_zarray(f->arena, u64, f->opt_live_words), def};
+ bits_clear(use, words);
+ bits_clear(def, words);
+ BitsCtx db = {use, def};
walk_inst_operands(f, &in, collect_bits, &db);
RewriteList before, after;
memset(&before, 0, sizeof before);
@@ -1194,40 +1252,67 @@ static void rewrite_func(Func* f) {
} else {
walk_inst_operands(f, &in, rewrite_one_operand, &ctx);
}
+ RewriteList seq;
+ memset(&seq, 0, sizeof seq);
for (u32 k = 0; k < before.n; ++k) {
- Inst* dst = list_push(f, &out, (IROp)before.data[k].op);
+ Inst* dst = list_push(f, &seq, (IROp)before.data[k].op);
*dst = before.data[k];
}
- if ((IROp)in.op == IR_CALL) {
- for (Val v = 1; v < f->nvals; ++v) {
- if (!bit_has(live_after[i], v) || bit_has(def, v)) continue;
- if (f->val_info[v].alloc_kind == OPT_ALLOC_HARD &&
- is_caller_saved(f, f->val_info[v].cls, f->val_info[v].hard_reg))
- append_store_val(f, &out, v);
- }
- }
- Inst* dst = list_push(f, &out, (IROp)in.op);
+ if ((IROp)in.op == IR_CALL)
+ append_live_call_saves(f, &seq, live, def, words, 0);
+ Inst* dst = list_push(f, &seq, (IROp)in.op);
*dst = in;
- if ((IROp)in.op == IR_CALL) {
- for (Val v = 1; v < f->nvals; ++v) {
- if (!bit_has(live_after[i], v) || bit_has(def, v)) continue;
- if (f->val_info[v].alloc_kind == OPT_ALLOC_HARD &&
- is_caller_saved(f, f->val_info[v].cls, f->val_info[v].hard_reg))
- append_load_val(f, &out, v);
- }
- }
+ if ((IROp)in.op == IR_CALL)
+ append_live_call_saves(f, &seq, live, def, words, 1);
for (u32 k = 0; k < after.n; ++k) {
- Inst* ad = list_push(f, &out, (IROp)after.data[k].op);
+ Inst* ad = list_push(f, &seq, (IROp)after.data[k].op);
*ad = after.data[k];
}
+
+ for (u32 k = seq.n; k > 0; --k) {
+ Inst* rd = list_push(f, &rev, (IROp)seq.data[k - 1u].op);
+ *rd = seq.data[k - 1u];
+ }
+ live_update_before(live, use, def, words);
}
- bl->insts = out.data;
- bl->ninsts = out.n;
- bl->cap = out.cap;
+ for (u32 i = 0; i < rev.n / 2u; ++i) {
+ Inst tmp = rev.data[i];
+ rev.data[i] = rev.data[rev.n - 1u - i];
+ rev.data[rev.n - 1u - i] = tmp;
+ }
+ bl->insts = rev.data;
+ bl->ninsts = rev.n;
+ bl->cap = rev.cap;
+ bl->live_after = NULL;
}
f->opt_rewritten = 1;
}
+static void rewrite_dump_write(Writer* w, const char* s) {
+ cfree_writer_write(w, s, strlen(s));
+}
+
+void opt_rewrite_dump(Func* f, Writer* w) {
+ if (!f || !w) return;
+ char buf[96];
+ snprintf(buf, sizeof buf, "rewrite blocks=%u vals=%u rewritten=%u\n",
+ (unsigned)f->nblocks, (unsigned)f->nvals,
+ (unsigned)f->opt_rewritten);
+ rewrite_dump_write(w, buf);
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ snprintf(buf, sizeof buf, "block %u insts=%u\n", (unsigned)b,
+ (unsigned)bl->ninsts);
+ rewrite_dump_write(w, buf);
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* in = &bl->insts[i];
+ snprintf(buf, sizeof buf, " %u op=%u operands=%u\n", (unsigned)i,
+ (unsigned)in->op, (unsigned)in->nopnds);
+ rewrite_dump_write(w, buf);
+ }
+ }
+}
+
static int inst_has_side_effect(Func* f, const Inst* in);
static int all_defs_dead(Func* f, Inst* in, u64* live) {
@@ -1297,14 +1382,14 @@ void opt_regalloc(Func* f, int allow_live_range_split) {
OptLiveRangeSet ranges;
opt_live_ranges_build(f, &live, &ranges);
opt_init_val_info_from_ranges(f, &ranges);
- opt_build_live_after_only(f, &live);
+ opt_apply_asm_constraints_from_live(f, &live);
metrics_count(f->c, "opt.live_words", f->opt_live_words);
metrics_count(f->c, "opt.conflict_bytes", 0);
metrics_scope_end(f->c, "opt.live_ranges.regalloc");
OptAllocator alloc;
opt_assign_ranges(f, &ranges, &alloc);
- rewrite_func(f);
+ rewrite_func(f, &live);
}
static int same_reg_operand(const Operand* a, const Operand* b) {
diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c
@@ -1216,15 +1216,26 @@ static void opt_rewrite_spill_use_def(void) {
emit_ret_val(f, f->entry, c, tc.i32);
opt_build_cfg(f);
opt_build_loop_tree(f);
- opt_live_info(f);
opt_regalloc(f, 0);
EXPECT(f->opt_rewritten, "regalloc should rewrite pseudos");
+ EXPECT(f->blocks[f->entry].live_after == NULL,
+ "range rewrite should not require per-instruction live_after sets");
EXPECT(count_op(f, IR_STORE) >= 1, "spill def should insert store");
EXPECT(count_op(f, IR_LOAD) >= 1, "spill use should insert reload");
int saw_spill_slot = 0;
for (u32 i = 0; i < f->nframe_slots; ++i)
if (f->frame_slots[i].kind == FS_SPILL) saw_spill_slot = 1;
EXPECT(saw_spill_slot, "rewrite should allocate FS_SPILL slot");
+
+ CfreeWriter* w = cfree_writer_mem(&g_heap);
+ opt_rewrite_dump(f, w);
+ size_t len = 0;
+ const unsigned char* bytes = cfree_writer_mem_bytes(w, &len);
+ EXPECT(bytes_contains(bytes, len, "rewrite blocks="),
+ "rewrite dump should include summary");
+ EXPECT(bytes_contains(bytes, len, "op="),
+ "rewrite dump should include rewritten instructions");
+ cfree_writer_close(w);
tc_fini(&tc);
}
@@ -1245,10 +1256,11 @@ static void opt_call_clobber_preservation(void) {
emit_ret_val(f, f->entry, live, tc.i32);
opt_build_cfg(f);
opt_build_loop_tree(f);
- opt_live_info(f);
opt_regalloc(f, 0);
Block* b = &f->blocks[f->entry];
+ EXPECT(b->live_after == NULL,
+ "call rewrite should keep live-after state pass-local");
int saw_call_save_restore = 0;
for (u32 i = 1; i + 1 < b->ninsts; ++i) {
if ((IROp)b->insts[i].op == IR_CALL &&
@@ -1279,10 +1291,11 @@ static void opt_call_clobber_caller_saved(void) {
emit_ret_val(f, f->entry, live, tc.i32);
opt_build_cfg(f);
opt_build_loop_tree(f);
- opt_live_info(f);
opt_regalloc(f, 0);
Block* b = &f->blocks[f->entry];
+ EXPECT(b->live_after == NULL,
+ "call rewrite should keep live-after state pass-local");
int saw_call_save_restore = 0;
for (u32 i = 1; i + 1 < b->ninsts; ++i) {
if ((IROp)b->insts[i].op == IR_CALL &&
@@ -1320,7 +1333,6 @@ static void opt_spill_pressure(void) {
emit_ret_val(f, f->entry, d, tc.i32);
opt_build_cfg(f);
opt_build_loop_tree(f);
- opt_live_info(f);
opt_regalloc(f, 0);
int spills = 0;
@@ -1437,7 +1449,6 @@ static void opt_inline_asm_constraints_and_clobbers(void) {
opt_machinize(f, &mock.base);
opt_build_cfg(f);
opt_build_loop_tree(f);
- opt_live_info(f);
opt_regalloc(f, 0);
aux = (IRAsmAux*)in->extra.aux;
@@ -1560,7 +1571,6 @@ static void opt_regalloc_spill_requires_scratch(void) {
emit_ret_val(f, f->entry, c, tc.i32);
opt_build_cfg(f);
opt_build_loop_tree(f);
- opt_live_info(f);
NoScratchCtx nctx = {f};
EXPECT(expect_panic(tc.c, run_no_scratch_regalloc, &nctx),