Implement range rewrite without live_after - kit

commit a11e9f6d86f9aa5c4d8d543fee63730bfa8b3991
parent 0bfde91143b108d562e7bc2754f9335c212745d5
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 14 May 2026 19:42:22 -0700

Implement range rewrite without live_after

Diffstat:
M doc/MIR_RA_REPORT.md  | 148 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
M doc/PERF.md  | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
M src/opt/opt.h  | 1 +
M src/opt/pass_lower.c  | 181 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
M test/opt/opt_test.c  | 22 ++++++++++++++++------

5 files changed, 339 insertions(+), 96 deletions(-)
diff --git a/doc/MIR_RA_REPORT.md b/doc/MIR_RA_REPORT.md
@@ -486,17 +486,20 @@ Deliverables:
 - noop move deletion
 - no required per-instruction full live-after storage
 
-### Phase 5: Coalescing and Splitting
+### Phase 5: O1 Cleanup and Scaling
 
-After O1 has the right shape, add O2 allocator features:
+After O1 has the right shape, remove transition scaffolding and fix the
+remaining O1 scaling buckets before adding O2 allocator features:
 
-- move collection
-- move-only liveness
-- capped conflict matrix for coalescing
-- live-range splitting
-- edge/block spill placement
+- rewrite without extra per-instruction temporary list churn
+- delete the old dense liveness/conflict path instead of preserving
+  compatibility for old optimizer-internal tests
+- narrow or compress `used_locs` so stack occupancy does not scale as
+  `points * candidates`
 
-This mirrors MIR's division: the matrix is an optional coalescing tool, not the
+Only after that should O2 add move collection, move-only liveness, capped
+coalescing matrices, live-range splitting, and edge/block spill placement. This
+mirrors MIR's division: the matrix is an optional coalescing tool, not the
 allocator's main model.
 
 ## Implementation Checklist
@@ -504,6 +507,12 @@ allocator's main model.
 Use this as the working tracker for the structural rewrite. Each phase should
 land with focused tests and metrics before moving to the next phase.
 
+Compatibility policy: compatibility with the old O1 optimizer internals is not
+a goal. The public compiler/API behavior must stay correct, but internal helper
+APIs, dumps, tests, metrics, and persistent IR fields may be broken, renamed, or
+deleted when they preserve the dense-conflict shape or add runtime cost. Prefer
+a clean fast implementation over shims for old pass boundaries.
+
 ### Phase 0: Baseline and Guardrails
 
 - [x] Preserve current O0 behavior as the reference path.
@@ -548,12 +557,13 @@ coverage belongs in a later pass once the range allocator owns rewrite.
 - [x] Compute `live_in`/`live_out` without allocating conflicts.
 - [x] Make pre-DDE use only block liveness.
 - [x] Keep old full `opt_live_info` available for regalloc during transition.
+      This is now transition debt, not a compatibility requirement.
 - [x] Add dump output for block liveness.
 - [x] Add tests for branch, loop, and call liveness.
 
 Exit criteria:
 
-- [x] Existing O1 behavior is unchanged.
+- [x] Existing public O1 behavior is unchanged.
 - [x] Pre-DDE no longer builds `val_conflicts`.
 - [x] Metrics show separate block-liveness timing and no pre-DDE conflict bytes.
 
@@ -608,43 +618,102 @@ Exit criteria:
 
 ### Phase 4: Rewrite From Assignment Map
 
-- [ ] Rewrite hard-assigned values to hard registers.
-- [ ] Insert reloads for stack-assigned uses.
-- [ ] Insert stores for stack-assigned defs.
-- [ ] Reserve target scratch registers per class.
-- [ ] Handle operands nested in:
-  - [ ] normal operands
-  - [ ] indirect operands
-  - [ ] call descriptors
-  - [ ] return descriptors
-  - [ ] intrinsic descriptors
-  - [ ] inline asm descriptors
-- [ ] Preserve call-clobber saves/restores with a rolling live set.
-- [ ] Delete noop moves after rewrite.
-- [ ] Avoid persistent per-instruction full `live_after` storage.
-- [ ] Add rewrite dump output.
+- [x] Rewrite hard-assigned values to hard registers.
+- [x] Insert reloads for stack-assigned uses.
+- [x] Insert stores for stack-assigned defs.
+- [x] Reserve target scratch registers per class.
+- [x] Handle operands nested in:
+  - [x] normal operands
+  - [x] indirect operands
+  - [x] call descriptors
+  - [x] return descriptors
+  - [x] intrinsic descriptors
+  - [x] inline asm descriptors
+- [x] Preserve call-clobber saves/restores with a rolling live set.
+- [x] Delete noop moves after rewrite.
+- [x] Avoid persistent per-instruction full `live_after` storage.
+- [x] Add rewrite dump output.
+
+Exit criteria:
+
+- [x] O1 emits correct code without `Block.live_after`.
+- [x] Reload/store metrics are stable and understandable.
+- [x] Public O1 behavior and smoke JIT tests pass.
+
+### Phase 5: Reduce Phase-4 Rewrite Overhead
+
+Phase-4 timings show the no-`live_after` rewrite improves the large
+straight-line case but regresses many-small-function/function-table inputs. Keep
+the rolling-live design, but remove the temporary-list and reverse-copy cost
+introduced by the first implementation.
+
+- [ ] Rewrite blocks with a single block-local output buffer or pre-sized edit
+      buffer.
+- [ ] Avoid per-instruction `RewriteList` allocation/copy churn.
+- [ ] Preserve rolling-live call-clobber behavior without persistent
+      `Block.live_after`.
+- [ ] Keep scratch-register selection deterministic after the rewrite cleanup.
+- [ ] Update tests that asserted old internal storage or exact intermediate
+      dump details.
+- [ ] Re-run the same-input phase-3 vs phase-5 timing comparison from
+      `doc/PERF.md`.
 
 Exit criteria:
 
-- [ ] O1 emits correct code without `Block.live_after`.
-- [ ] Reload/store metrics are stable and understandable.
-- [ ] Existing O1 tests and smoke JIT tests pass.
+- [ ] Function-table/main-many-small `opt.regalloc` returns to phase-3 parity
+      or better.
+- [ ] Large straight-line phase-4 gains are retained.
+- [ ] No persistent per-instruction full live-after sets are rebuilt in normal
+      O1.
 
-### Phase 5: Remove Old Dense Path
+### Phase 6: Delete Old Dense Path
 
 - [ ] Remove normal-path allocation of `Func.val_conflicts`.
-- [ ] Remove or demote `opt_conflict_words` from persistent `Func` state.
-- [ ] Remove old all-values conflict construction from O1.
-- [ ] Keep only compatibility helpers still required by unported passes.
+- [ ] Delete old all-values conflict construction from O1, including
+      compatibility-only helpers and tests.
+- [ ] Remove or demote `opt_live_words`, `opt_conflict_words`,
+      `Block.live_in`, `Block.live_out`, `Block.live_use`, `Block.live_def`,
+      and `Block.live_after` from persistent IR state where no public behavior
+      requires them.
+- [ ] Replace `opt_live_info` callers with pass-local liveness/ranges, then
+      delete or narrow `opt_live_info`.
+- [ ] Make tests assert public behavior and new pass dumps/metrics, not dense
+      conflict matrix details.
 - [ ] Update docs and metrics names to reflect range-based RA.
 
 Exit criteria:
 
+- [ ] Normal O1 has no reachable dense pseudo-conflict construction.
+- [ ] The source tree no longer contains tests whose purpose is preserving the
+      old dense interference model.
+- [ ] `opt.conflict_bytes` is absent or always zero on normal O1 paths.
+
+### Phase 7: Narrow `used_locs`
+
+Phase-3 and phase-4 timings show the remaining large-function bend tracks
+`opt.alloc.used_loc_words`. The first range allocator sizes occupied-location
+bitmaps as hard-register bits plus one possible stack bit per candidate; that
+preserves correctness but reintroduces superlinear memory/time for one large
+function.
+
+- [ ] Split hard-register occupancy from stack-slot occupancy.
+- [ ] Make stack occupancy demand-sized by allocated stack slots, not candidate
+      count.
+- [ ] Consider sparse or interval-indexed stack occupancy for long straight-line
+      functions.
+- [ ] Keep hard-register occupancy compact and class-aware.
+- [ ] Add metrics separating hard occupancy words, stack occupancy words, and
+      allocated stack slot count.
+- [ ] Re-run straight-line and table-main ladders after each shape change.
+
+Exit criteria:
+
 - [ ] Large straight-line benchmark scales by range/point count, not
-      `nvals * live_words`.
+      `points * candidates`, `nvals * live_words`, or dense conflict bytes.
 - [ ] `doc/PERF.md` can be updated with before/after O1 scaling numbers.
+- [ ] Table-main remains at phase-5 parity or better.
 
-### Phase 6: O2 Coalescing
+### Phase 8: O2 Coalescing
 
 - [ ] Collect move candidates after machinize.
 - [ ] Add move-only liveness using `scan_vars`-style restricted variables.
@@ -662,7 +731,7 @@ Exit criteria:
 - [ ] Coalescing is optional and does not affect O1 compile-time shape.
 - [ ] Matrix memory is bounded and reported.
 
-### Phase 7: O2 Splitting and Spill Placement
+### Phase 9: O2 Splitting and Spill Placement
 
 - [ ] Add split-capable occupancy state similar to MIR's `busy_used_locs`.
 - [ ] Identify profitable live-range gaps.
@@ -676,7 +745,7 @@ Exit criteria:
 - [ ] O2 can reduce spills without changing O1 assignment behavior.
 - [ ] Edge/block placement tests cover diamonds, loops, and critical edges.
 
-### Phase 8: Cleanup and Documentation
+### Phase 10: Cleanup and Documentation
 
 - [ ] Update `doc/OPT.md` with the final implemented module names and pass
       order.
@@ -693,6 +762,7 @@ Avoid spending early effort on:
 - micro-optimizing the existing full conflict matrix
 - adding sparse rows to `val_conflicts` as a long-term solution
 - tuning candidate priority before range-based allocation exists
+- preserving optimizer-internal compatibility with the dense allocator era
 - link/JIT performance
 - parser/CG data-structure changes
 
@@ -740,8 +810,10 @@ knowledge behind explicit helpers:
 ## Bottom Line
 
 MIR's RA performance shape comes from avoiding a full pseudo interference graph
-in the normal allocation path. cfree should mirror that. The next serious O1
-work should be a structural rewrite around pass-local liveness, compressed live
-ranges, occupied-location assignment, and rewrite from an allocation map.
+in the normal allocation path. cfree should mirror that, even when doing so
+breaks old optimizer-internal APIs, fields, dumps, or tests. The next serious
+O1 work should keep the pass-local liveness, compressed live ranges,
+occupied-location assignment, and assignment-map rewrite shape, then remove the
+transition scaffolding and remaining scaling buckets.
 
 Once that shape exists, incremental tuning will have a much better foundation.
diff --git a/doc/PERF.md b/doc/PERF.md
@@ -414,12 +414,87 @@ Next useful compile instrumentation:
 - Add pool/arena allocation counters or high-water marks for frontend and CG
   arenas.
 
+### Phase-4 Rewrite Rerun
+
+After `doc/MIR_RA_REPORT.md` phase 4, O1 rewrite no longer materializes
+`Block.live_after`. Inline-asm clobber constraints and call-clobber
+save/restore insertion now use rolling backward live sets. The old full
+per-instruction live-after storage is still available through `opt_live_info`
+for compatibility tests, but the normal O1 `opt_regalloc` path no longer
+requires it.
+
+Samples: 5 runs per point. Both tables report p50 in milliseconds. O1 scopes and O1
+counters are summed across functions. The generator is the same direct-call vs
+function-table family used for the phase-3 rerun, but these are fresh generated
+inputs and not byte-for-byte identical to the earlier tables.
+
+Straight-line main:
+
+```text
+funcs input_bytes insts vals  pre_block_bytes ranges range_points used_loc_words conflict_bytes run.total compile.tu parse_codegen opt.o1.total live_blocks_pre live_ranges_reg regalloc link_jit
+1     218         61    38    576             37     52           104            0              0.584     0.475      0.319         0.208        0.059           0.033           0.067    0.062
+4     731         226   140   2016            142    196          392            0              1.050     0.943      0.800         0.530        0.146           0.090           0.193    0.058
+16    2810        886   548   7872            562    772          1776           0              3.044     2.912      2.754         1.855        0.507           0.345           0.726    0.065
+64    11191       3526  2180  31392           2242   3076         8864           0              10.963    10.756     10.599        7.140        1.924           1.375           2.906    0.089
+128   22450       7046  4356  62688           4482   6148         23096          0              21.189    20.676     20.524        13.718       3.682           2.690           5.706    0.110
+256   45191       14086 8708  125280          8962   12292        67688          0              41.840    41.173     41.019        27.476       7.238           5.464           11.777   0.157
+512   90668       28166 17412 250464          17922  24580        221384         0              87.461    86.405     86.252        58.701       14.896          11.760          26.346   0.243
+1024  181691      56326 34820 500832          35842  49156        786824         0              190.569   188.638    188.479       129.835      32.046          26.404          61.906   0.425
+```
+
+Function-table main:
+
+```text
+funcs input_bytes insts vals  pre_block_bytes ranges range_points used_loc_words conflict_bytes run.total compile.tu parse_codegen opt.o1.total live_blocks_pre live_ranges_reg regalloc link_jit
+1     311         82    49    832             48     67           134            0              0.665     0.545      0.381         0.230        0.064           0.036           0.077    0.066
+4     788         226   133   2272            135    190          380            0              1.122     0.989      0.839         0.550        0.149           0.094           0.203    0.068
+16    2718        802   469   8032            483    682          1364           0              2.886     2.744      2.592         1.737        0.481           0.316           0.661    0.076
+64    10475       3106  1813  31072           1875   2650         5300           0              10.054    9.828      9.670         6.569        1.827           1.207           2.491    0.097
+128   20875       6178  3605  61792           3731   5274         10548          0              18.978    18.667     18.515        12.271       3.328           2.275           4.800    0.117
+256   41824       12322 7189  123232          7443   10522        21044          0              37.247    36.757     36.593        24.043       6.552           4.505           9.416    0.158
+512   83717       24610 14357 246112          14867  21018        42036          0              74.068    73.190     73.035        47.519       12.995          8.945           18.699   0.236
+1024  167549      49186 28693 491872          29715  42010        84020          0              153.281   151.622    151.471       96.303       26.390          17.895          37.547   0.382
+```
+
+To isolate the phase-4 change, the same generated inputs were also run against
+a temporary clean worktree at the phase-3 baseline (`HEAD` before the phase-4
+working-tree edits). The table below shows phase-3 p50 to phase-4 p50 on those
+identical inputs.
+
+```text
+straight_main:
+funcs run.total                opt.o1.total              live_ranges_reg        regalloc
+512   88.544 -> 87.461 (-1.2%) 60.910 -> 58.701 (-3.6%)  12.268 -> 11.760 (-4.1%) 29.589 -> 26.346 (-11.0%)
+1024  206.333 -> 190.569 (-7.6%) 147.862 -> 129.835 (-12.2%) 27.774 -> 26.404 (-4.9%) 82.750 -> 61.906 (-25.2%)
+
+table_main:
+funcs run.total                opt.o1.total             live_ranges_reg        regalloc
+512   69.043 -> 74.068 (+7.3%) 43.713 -> 47.519 (+8.7%) 9.240 -> 8.945 (-3.2%)  15.729 -> 18.699 (+18.9%)
+1024  143.333 -> 153.281 (+6.9%) 88.574 -> 96.303 (+8.7%) 18.549 -> 17.895 (-3.5%) 31.645 -> 37.547 (+18.7%)
+```
+
+An extra 7-sample repeat at 512 and 1024 confirmed the shape: large
+straight-line `regalloc` improves by about 13-27%, while the table-shaped case
+regresses by about 16-17% in `regalloc`. The live-range analysis bucket itself
+improves modestly in both shapes. The table-main regression is therefore in
+rewrite mechanics, not range construction.
+
+The most likely cause is the first phase-4 implementation strategy: it rewrites
+each block while walking backward, builds per-instruction temporary rewrite
+lists, appends them in reverse, then reverses the block. That removes the large
+persistent `live_after` allocation and helps the large straight-line case, but
+it adds list-copy overhead that is visible when many compact functions dominate
+the workload. A forward rewrite with a precomputed rolling-live cursor or a
+block-local edit buffer should keep the no-`live_after` shape without the
+table-main tax.
+
 ## Performance Priorities
 
-1. Finish phase-4 rewrite without persistent `live_after`.
-   Phase 3 removed the dense conflict matrix from O1 assignment. Rewrite still
-   needs per-instruction full live-after sets for call preservation and spill
-   insertion, so that is the next correctness-preserving structural target.
+1. Reduce phase-4 rewrite overhead.
+   Phase 4 removed persistent `Block.live_after` from the normal O1 path and
+   improved the large straight-line case, but the current backward rewrite
+   regresses many-small-function/table-shaped inputs. Keep the rolling-live
+   design, but remove the extra temporary-list/reverse-copy cost.
 
 2. Compress or narrow `used_locs`.
    The remaining straight-line superlinear bucket now tracks
diff --git a/src/opt/opt.h b/src/opt/opt.h
@@ -151,6 +151,7 @@ void opt_live_blocks(Func*, OptLiveInfo*);
 void opt_live_dump_blocks(Func*, const OptLiveInfo*, Writer*);
 void opt_live_ranges_build(Func*, const OptLiveInfo*, OptLiveRangeSet*);
 void opt_live_dump_ranges(Func*, const OptLiveRangeSet*, Writer*);
+void opt_rewrite_dump(Func*, Writer*);
 void opt_live_info(Func*);
 void opt_coalesce(Func*);
 void opt_regalloc(Func*, int allow_live_range_split);
diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c
@@ -1,3 +1,4 @@
+#include <stdio.h>
 #include <string.h>
 
 #include "core/arena.h"
@@ -825,34 +826,48 @@ static void opt_init_val_info_from_ranges(Func* f,
   f->opt_conflict_words = 0;
 }
 
-static void opt_build_live_after_only(Func* f, const OptLiveInfo* live_info) {
+static void bits_clear(u64* bits, u32 words) {
+  for (u32 i = 0; i < words; ++i) bits[i] = 0;
+}
+
+static void live_update_before(u64* live, const u64* use, const u64* def,
+                               u32 words) {
+  for (u32 w = 0; w < words; ++w) live[w] = (live[w] & ~def[w]) | use[w];
+}
+
+static void live_copy_block_out(Func* f, const OptLiveInfo* live_info, u32 b,
+                                u64* live, u32 words) {
+  bits_clear(live, words);
+  if (live_info) {
+    const OptBitset* out = &live_info->blocks[b].live_out;
+    for (u32 w = 0; w < words && w < out->nwords; ++w) live[w] = out->words[w];
+  } else if (b < f->nblocks && f->blocks[b].live_out) {
+    copy_bits(live, f->blocks[b].live_out, words);
+  }
+}
+
+static void opt_apply_asm_constraints_from_live(Func* f,
+                                                const OptLiveInfo* live_info) {
   u32 words = live_info ? live_info->words : f->opt_live_words;
   if (!words) words = bit_words(f->nvals);
   f->opt_live_words = (u16)words;
+
+  u64* live = arena_zarray(f->arena, u64, words ? words : 1u);
+  u64* use = arena_zarray(f->arena, u64, words ? words : 1u);
+  u64* def = arena_zarray(f->arena, u64, words ? words : 1u);
   for (u32 b = 0; b < f->nblocks; ++b) {
     Block* bl = &f->blocks[b];
-    bl->live_after = arena_array(f->arena, u64*, bl->ninsts ? bl->ninsts : 1u);
-    u64* live = arena_zarray(f->arena, u64, words);
-    if (live_info) {
-      const OptBitset* out = &live_info->blocks[b].live_out;
-      for (u32 w = 0; w < words && w < out->nwords; ++w) live[w] = out->words[w];
-    } else if (bl->live_out) {
-      copy_bits(live, bl->live_out, words);
-    }
+    live_copy_block_out(f, live_info, b, live, words);
     for (u32 ri = bl->ninsts; ri > 0; --ri) {
       u32 i = ri - 1u;
       Inst* in = &bl->insts[i];
-      u64* after = arena_zarray(f->arena, u64, words);
-      copy_bits(after, live, words);
-      bl->live_after[i] = after;
-
-      u64* use = arena_zarray(f->arena, u64, words);
-      u64* def = arena_zarray(f->arena, u64, words);
+      bits_clear(use, words);
+      bits_clear(def, words);
       BitsCtx bc = {use, def};
       walk_inst_operands(f, in, collect_bits, &bc);
       if ((IROp)in->op == IR_ASM_BLOCK)
-        apply_asm_register_constraints(f, in, use, def, after);
-      for (u32 w = 0; w < words; ++w) live[w] = (live[w] & ~def[w]) | use[w];
+        apply_asm_register_constraints(f, in, use, def, live);
+      live_update_before(live, use, def, words);
     }
   }
 }
@@ -1163,17 +1178,60 @@ static void rewrite_call_arg_value(Func* f, Inst* owner, CGABIValue* v,
   }
 }
 
-static void rewrite_func(Func* f) {
+typedef struct RewriteCallSaveCtx {
+  Func* f;
+  RewriteList* out;
+  const u64* def;
+  int emit_restore;
+} RewriteCallSaveCtx;
+
+static void rewrite_call_save_one(Val v, void* arg) {
+  RewriteCallSaveCtx* c = (RewriteCallSaveCtx*)arg;
+  Func* f = c->f;
+  if (v == VAL_NONE || v >= f->nvals) return;
+  if (bit_has(c->def, v)) return;
+  if (f->val_info[v].alloc_kind != OPT_ALLOC_HARD) return;
+  if (!is_caller_saved(f, f->val_info[v].cls, f->val_info[v].hard_reg)) return;
+  if (c->emit_restore)
+    append_load_val(f, c->out, v);
+  else
+    append_store_val(f, c->out, v);
+}
+
+static void append_live_call_saves(Func* f, RewriteList* out,
+                                   const u64* live_after, const u64* def,
+                                   u32 words, int emit_restore) {
+  RewriteCallSaveCtx ctx = {f, out, def, emit_restore};
+  for (u32 w = 0; w < words; ++w) {
+    u64 bits = live_after[w];
+    while (bits) {
+      u32 bit = (u32)__builtin_ctzll(bits);
+      rewrite_call_save_one(w * 64u + bit, &ctx);
+      bits &= bits - 1u;
+    }
+  }
+}
+
+static void rewrite_func(Func* f, const OptLiveInfo* live_info) {
+  u32 words = live_info ? live_info->words : f->opt_live_words;
+  if (!words) words = bit_words(f->nvals);
+  f->opt_live_words = (u16)words;
+
+  u64* live = arena_zarray(f->arena, u64, words ? words : 1u);
+  u64* use = arena_zarray(f->arena, u64, words ? words : 1u);
+  u64* def = arena_zarray(f->arena, u64, words ? words : 1u);
   for (u32 b = 0; b < f->nblocks; ++b) {
     Block* bl = &f->blocks[b];
-    u64** live_after = bl->live_after;
+    RewriteList rev;
+    memset(&rev, 0, sizeof rev);
+    live_copy_block_out(f, live_info, b, live, words);
 
-    RewriteList out;
-    memset(&out, 0, sizeof out);
-    for (u32 i = 0; i < bl->ninsts; ++i) {
+    for (u32 ri = bl->ninsts; ri > 0; --ri) {
+      u32 i = ri - 1u;
       Inst in = bl->insts[i];
-      u64* def = arena_zarray(f->arena, u64, f->opt_live_words);
-      BitsCtx db = {arena_zarray(f->arena, u64, f->opt_live_words), def};
+      bits_clear(use, words);
+      bits_clear(def, words);
+      BitsCtx db = {use, def};
       walk_inst_operands(f, &in, collect_bits, &db);
       RewriteList before, after;
       memset(&before, 0, sizeof before);
@@ -1194,40 +1252,67 @@ static void rewrite_func(Func* f) {
       } else {
         walk_inst_operands(f, &in, rewrite_one_operand, &ctx);
       }
+      RewriteList seq;
+      memset(&seq, 0, sizeof seq);
       for (u32 k = 0; k < before.n; ++k) {
-        Inst* dst = list_push(f, &out, (IROp)before.data[k].op);
+        Inst* dst = list_push(f, &seq, (IROp)before.data[k].op);
         *dst = before.data[k];
       }
-      if ((IROp)in.op == IR_CALL) {
-        for (Val v = 1; v < f->nvals; ++v) {
-          if (!bit_has(live_after[i], v) || bit_has(def, v)) continue;
-          if (f->val_info[v].alloc_kind == OPT_ALLOC_HARD &&
-              is_caller_saved(f, f->val_info[v].cls, f->val_info[v].hard_reg))
-            append_store_val(f, &out, v);
-        }
-      }
-      Inst* dst = list_push(f, &out, (IROp)in.op);
+      if ((IROp)in.op == IR_CALL)
+        append_live_call_saves(f, &seq, live, def, words, 0);
+      Inst* dst = list_push(f, &seq, (IROp)in.op);
       *dst = in;
-      if ((IROp)in.op == IR_CALL) {
-        for (Val v = 1; v < f->nvals; ++v) {
-          if (!bit_has(live_after[i], v) || bit_has(def, v)) continue;
-          if (f->val_info[v].alloc_kind == OPT_ALLOC_HARD &&
-              is_caller_saved(f, f->val_info[v].cls, f->val_info[v].hard_reg))
-            append_load_val(f, &out, v);
-        }
-      }
+      if ((IROp)in.op == IR_CALL)
+        append_live_call_saves(f, &seq, live, def, words, 1);
       for (u32 k = 0; k < after.n; ++k) {
-        Inst* ad = list_push(f, &out, (IROp)after.data[k].op);
+        Inst* ad = list_push(f, &seq, (IROp)after.data[k].op);
         *ad = after.data[k];
       }
+
+      for (u32 k = seq.n; k > 0; --k) {
+        Inst* rd = list_push(f, &rev, (IROp)seq.data[k - 1u].op);
+        *rd = seq.data[k - 1u];
+      }
+      live_update_before(live, use, def, words);
     }
-    bl->insts = out.data;
-    bl->ninsts = out.n;
-    bl->cap = out.cap;
+    for (u32 i = 0; i < rev.n / 2u; ++i) {
+      Inst tmp = rev.data[i];
+      rev.data[i] = rev.data[rev.n - 1u - i];
+      rev.data[rev.n - 1u - i] = tmp;
+    }
+    bl->insts = rev.data;
+    bl->ninsts = rev.n;
+    bl->cap = rev.cap;
+    bl->live_after = NULL;
   }
   f->opt_rewritten = 1;
 }
 
+static void rewrite_dump_write(Writer* w, const char* s) {
+  cfree_writer_write(w, s, strlen(s));
+}
+
+void opt_rewrite_dump(Func* f, Writer* w) {
+  if (!f || !w) return;
+  char buf[96];
+  snprintf(buf, sizeof buf, "rewrite blocks=%u vals=%u rewritten=%u\n",
+           (unsigned)f->nblocks, (unsigned)f->nvals,
+           (unsigned)f->opt_rewritten);
+  rewrite_dump_write(w, buf);
+  for (u32 b = 0; b < f->nblocks; ++b) {
+    Block* bl = &f->blocks[b];
+    snprintf(buf, sizeof buf, "block %u insts=%u\n", (unsigned)b,
+             (unsigned)bl->ninsts);
+    rewrite_dump_write(w, buf);
+    for (u32 i = 0; i < bl->ninsts; ++i) {
+      Inst* in = &bl->insts[i];
+      snprintf(buf, sizeof buf, "  %u op=%u operands=%u\n", (unsigned)i,
+               (unsigned)in->op, (unsigned)in->nopnds);
+      rewrite_dump_write(w, buf);
+    }
+  }
+}
+
 static int inst_has_side_effect(Func* f, const Inst* in);
 
 static int all_defs_dead(Func* f, Inst* in, u64* live) {
@@ -1297,14 +1382,14 @@ void opt_regalloc(Func* f, int allow_live_range_split) {
   OptLiveRangeSet ranges;
   opt_live_ranges_build(f, &live, &ranges);
   opt_init_val_info_from_ranges(f, &ranges);
-  opt_build_live_after_only(f, &live);
+  opt_apply_asm_constraints_from_live(f, &live);
   metrics_count(f->c, "opt.live_words", f->opt_live_words);
   metrics_count(f->c, "opt.conflict_bytes", 0);
   metrics_scope_end(f->c, "opt.live_ranges.regalloc");
 
   OptAllocator alloc;
   opt_assign_ranges(f, &ranges, &alloc);
-  rewrite_func(f);
+  rewrite_func(f, &live);
 }
 
 static int same_reg_operand(const Operand* a, const Operand* b) {
diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c
@@ -1216,15 +1216,26 @@ static void opt_rewrite_spill_use_def(void) {
   emit_ret_val(f, f->entry, c, tc.i32);
   opt_build_cfg(f);
   opt_build_loop_tree(f);
-  opt_live_info(f);
   opt_regalloc(f, 0);
   EXPECT(f->opt_rewritten, "regalloc should rewrite pseudos");
+  EXPECT(f->blocks[f->entry].live_after == NULL,
+         "range rewrite should not require per-instruction live_after sets");
   EXPECT(count_op(f, IR_STORE) >= 1, "spill def should insert store");
   EXPECT(count_op(f, IR_LOAD) >= 1, "spill use should insert reload");
   int saw_spill_slot = 0;
   for (u32 i = 0; i < f->nframe_slots; ++i)
     if (f->frame_slots[i].kind == FS_SPILL) saw_spill_slot = 1;
   EXPECT(saw_spill_slot, "rewrite should allocate FS_SPILL slot");
+
+  CfreeWriter* w = cfree_writer_mem(&g_heap);
+  opt_rewrite_dump(f, w);
+  size_t len = 0;
+  const unsigned char* bytes = cfree_writer_mem_bytes(w, &len);
+  EXPECT(bytes_contains(bytes, len, "rewrite blocks="),
+         "rewrite dump should include summary");
+  EXPECT(bytes_contains(bytes, len, "op="),
+         "rewrite dump should include rewritten instructions");
+  cfree_writer_close(w);
   tc_fini(&tc);
 }
 
@@ -1245,10 +1256,11 @@ static void opt_call_clobber_preservation(void) {
   emit_ret_val(f, f->entry, live, tc.i32);
   opt_build_cfg(f);
   opt_build_loop_tree(f);
-  opt_live_info(f);
   opt_regalloc(f, 0);
 
   Block* b = &f->blocks[f->entry];
+  EXPECT(b->live_after == NULL,
+         "call rewrite should keep live-after state pass-local");
   int saw_call_save_restore = 0;
   for (u32 i = 1; i + 1 < b->ninsts; ++i) {
     if ((IROp)b->insts[i].op == IR_CALL &&
@@ -1279,10 +1291,11 @@ static void opt_call_clobber_caller_saved(void) {
   emit_ret_val(f, f->entry, live, tc.i32);
   opt_build_cfg(f);
   opt_build_loop_tree(f);
-  opt_live_info(f);
   opt_regalloc(f, 0);
 
   Block* b = &f->blocks[f->entry];
+  EXPECT(b->live_after == NULL,
+         "call rewrite should keep live-after state pass-local");
   int saw_call_save_restore = 0;
   for (u32 i = 1; i + 1 < b->ninsts; ++i) {
     if ((IROp)b->insts[i].op == IR_CALL &&
@@ -1320,7 +1333,6 @@ static void opt_spill_pressure(void) {
   emit_ret_val(f, f->entry, d, tc.i32);
   opt_build_cfg(f);
   opt_build_loop_tree(f);
-  opt_live_info(f);
   opt_regalloc(f, 0);
 
   int spills = 0;
@@ -1437,7 +1449,6 @@ static void opt_inline_asm_constraints_and_clobbers(void) {
   opt_machinize(f, &mock.base);
   opt_build_cfg(f);
   opt_build_loop_tree(f);
-  opt_live_info(f);
   opt_regalloc(f, 0);
   aux = (IRAsmAux*)in->extra.aux;
 
@@ -1560,7 +1571,6 @@ static void opt_regalloc_spill_requires_scratch(void) {
   emit_ret_val(f, f->entry, c, tc.i32);
   opt_build_cfg(f);
   opt_build_loop_tree(f);
-  opt_live_info(f);
 
   NoScratchCtx nctx = {f};
   EXPECT(expect_panic(tc.c, run_no_scratch_regalloc, &nctx),

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	doc/MIR_RA_REPORT.md	\|	148	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
M	doc/PERF.md	\|	83	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
M	src/opt/opt.h	\|	1	+
M	src/opt/pass_lower.c	\|	181	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
M	test/opt/opt_test.c	\|	22	++++++++++++++++------