Implement range-based O1 assignment - kit

commit 064cd29d7879a73a15ed06cbd9860f4ac4d59384
parent ed6d5b24fc8b242a6b5ba8405673c0b5b1836d2d
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 14 May 2026 19:25:09 -0700

Implement range-based O1 assignment

Diffstat:
M doc/MIR_RA_REPORT.md  | 36 ++++++++++++++++++------------------
M doc/PERF.md  | 213 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------
M src/opt/ir.h  | 1 +
M src/opt/opt.c  | 3 ++-
M src/opt/opt.h  | 13 +++++++++++++
M src/opt/pass_lower.c  | 399 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
M test/opt/opt_test.c  | 36 ++++++++++++++++++++++++++++++++++++

7 files changed, 511 insertions(+), 190 deletions(-)
diff --git a/doc/MIR_RA_REPORT.md b/doc/MIR_RA_REPORT.md
@@ -584,27 +584,27 @@ Exit criteria:
 
 ### Phase 3: Range-Based Assignment
 
-- [ ] Define `OptLoc` assignment mapping for every `Val`.
-- [ ] Build allocation candidate list from values with ranges.
-- [ ] Sort candidates by:
-  - [ ] tied hard-register requirement
-  - [ ] frequency/spill cost
-  - [ ] shorter live length
-  - [ ] stable value id
-- [ ] Add `used_locs[point]` bitmaps.
-- [ ] Mark hard-register live ranges in `used_locs`.
-- [ ] For each candidate, union occupied locations across its ranges.
-- [ ] Assign a legal hard register when available.
-- [ ] Assign/reuse stack slots when no hard register is available.
-- [ ] Preserve tied hard-register validation.
-- [ ] Preserve forbidden hard-register constraints from asm/calls.
-- [ ] Keep live-range splitting disabled for O1.
+- [x] Define `OptLoc` assignment mapping for every `Val`.
+- [x] Build allocation candidate list from values with ranges.
+- [x] Sort candidates by:
+  - [x] tied hard-register requirement
+  - [x] frequency/spill cost
+  - [x] shorter live length
+  - [x] stable value id
+- [x] Add `used_locs[point]` bitmaps.
+- [x] Mark hard-register live ranges in `used_locs`.
+- [x] For each candidate, union occupied locations across its ranges.
+- [x] Assign a legal hard register when available.
+- [x] Assign/reuse stack slots when no hard register is available.
+- [x] Preserve tied hard-register validation.
+- [x] Preserve forbidden hard-register constraints from asm/calls.
+- [x] Keep live-range splitting disabled for O1.
 
 Exit criteria:
 
-- [ ] O1 regalloc no longer depends on `val_conflicts`.
-- [ ] `opt.conflict_bytes` is zero or absent on the normal O1 allocation path.
-- [ ] Branch, loop, call-clobber, spill, and tied-register tests pass.
+- [x] O1 regalloc no longer depends on `val_conflicts`.
+- [x] `opt.conflict_bytes` is zero or absent on the normal O1 allocation path.
+- [x] Branch, loop, call-clobber, spill, and tied-register tests pass.
 
 ### Phase 4: Rewrite From Assignment Map
 
diff --git a/doc/PERF.md b/doc/PERF.md
@@ -30,6 +30,9 @@ Important counters:
 - `opt.funcs`, `opt.blocks`, `opt.insts`, `opt.vals`
 - `opt.live_words`, `opt.live.blocks`, `opt.live.active_words`
 - `opt.live.block_bytes`, `opt.live.set_bit_scans`
+- `opt.ranges`, `opt.range_points`, `opt.range_raw_points`
+- `opt.alloc.used_loc_words`, `opt.alloc.spills`
+- `opt.rewrite.reloads`, `opt.rewrite.stores`
 - `opt.conflict_bytes`
 - `link.inputs`, `link.sections`, `link.segments`, `link.syms`, `link.relocs`
 - `jit.master_size`, `jit.nsegments`, `jit.input_section_bytes`,
@@ -43,19 +46,19 @@ Times are `min / p50 / p95 / mean / max` in milliseconds.
 `int main(){return 0;}`
 
 ```text
-run.total             0.298 / 0.312 / 0.344 / 0.319 / 0.390
-run.compile_and_jit   0.283 / 0.295 / 0.323 / 0.300 / 0.346
-compile.tu            0.209 / 0.222 / 0.242 / 0.224 / 0.244
+run.total             0.290 / 0.311 / 0.345 / 0.316 / 0.351
+run.compile_and_jit   0.274 / 0.293 / 0.313 / 0.296 / 0.329
+compile.tu            0.218 / 0.235 / 0.246 / 0.235 / 0.257
 compile.c.parse_codegen
-                      0.073 / 0.081 / 0.092 / 0.082 / 0.095
-opt.o1.total          0.040 / 0.048 / 0.057 / 0.048 / 0.059
+                      0.082 / 0.091 / 0.097 / 0.091 / 0.107
+opt.o1.total          0.048 / 0.056 / 0.061 / 0.055 / 0.065
 opt.live_blocks.pre_dde
-                      0.008 / 0.008 / 0.017 / 0.010 / 0.019
-opt.live_info.regalloc
-                      0.003 / 0.003 / 0.004 / 0.004 / 0.007
-opt.regalloc          0.008 / 0.012 / 0.016 / 0.012 / 0.019
-link_jit.all          0.043 / 0.049 / 0.058 / 0.051 / 0.059
-link.resolve.total    0.020 / 0.021 / 0.027 / 0.023 / 0.029
+                      0.013 / 0.017 / 0.023 / 0.017 / 0.024
+opt.live_ranges.regalloc
+                      0.003 / 0.004 / 0.004 / 0.004 / 0.005
+opt.regalloc          0.009 / 0.010 / 0.012 / 0.010 / 0.015
+link_jit.all          0.047 / 0.050 / 0.056 / 0.051 / 0.061
+link.resolve.total    0.023 / 0.025 / 0.028 / 0.025 / 0.029
 ```
 
 One small loop:
@@ -65,19 +68,19 @@ int main(){int s=0; for(int i=0;i<10;i++) s+=i; return s==45?0:1;}
 ```
 
 ```text
-run.total             0.351 / 0.365 / 0.394 / 0.371 / 0.409
-run.compile_and_jit   0.334 / 0.349 / 0.377 / 0.353 / 0.389
-compile.tu            0.266 / 0.277 / 0.292 / 0.279 / 0.315
+run.total             0.369 / 0.400 / 0.446 / 0.401 / 0.448
+run.compile_and_jit   0.351 / 0.381 / 0.414 / 0.381 / 0.427
+compile.tu            0.293 / 0.314 / 0.349 / 0.315 / 0.353
 compile.c.parse_codegen
-                      0.131 / 0.140 / 0.155 / 0.141 / 0.157
-opt.o1.total          0.077 / 0.083 / 0.094 / 0.084 / 0.095
+                      0.151 / 0.165 / 0.196 / 0.167 / 0.212
+opt.o1.total          0.090 / 0.097 / 0.107 / 0.098 / 0.122
 opt.live_blocks.pre_dde
-                      0.011 / 0.012 / 0.018 / 0.013 / 0.018
-opt.live_info.regalloc
-                      0.016 / 0.016 / 0.016 / 0.016 / 0.017
-opt.regalloc          0.027 / 0.028 / 0.034 / 0.029 / 0.034
-link_jit.all          0.043 / 0.046 / 0.057 / 0.049 / 0.064
-link.resolve.total    0.021 / 0.021 / 0.028 / 0.023 / 0.031
+                      0.024 / 0.026 / 0.029 / 0.027 / 0.032
+opt.live_ranges.regalloc
+                      0.012 / 0.013 / 0.015 / 0.013 / 0.015
+opt.regalloc          0.025 / 0.027 / 0.030 / 0.028 / 0.042
+link_jit.all          0.048 / 0.052 / 0.066 / 0.055 / 0.088
+link.resolve.total    0.023 / 0.026 / 0.034 / 0.026 / 0.035
 ```
 
 For tiny programs, compile dominates, with O1 and JIT/link image setup as the
@@ -221,6 +224,78 @@ conflict matrix and its block-live storage scales linearly on the large
 straight-line case. The remaining superlinear behavior is now concentrated in
 the regalloc-side full liveness/conflict path.
 
+### Phase-3 Range-Allocator Rerun
+
+After `doc/MIR_RA_REPORT.md` phase 3, O1 assignment no longer uses the dense
+pseudo conflict matrix. Regalloc rebuilds pass-local block liveness and live
+ranges, assigns against `used_locs[point]`, and reports
+`opt.conflict_bytes=0` on the normal allocation path. Rewrite still uses
+per-instruction `live_after`, so this is not the final phase-4 shape.
+
+Samples: 5 runs per point. The table uses p50 milliseconds. O1 scopes and O1
+counters are summed across functions. The generator is the same direct-call vs
+function-table family as above, but these are fresh generated inputs and not
+byte-for-byte identical to the earlier tables.
+
+Straight-line main:
+
+```text
+funcs  input_bytes  insts  vals   pre_block_bytes  ranges  range_points  used_loc_words  conflict_bytes  run.total  compile.tu  parse_codegen  opt.o1.total  live_blocks_pre  live_ranges_reg  regalloc  link_jit
+1      139          55     32     576              28      41            82              0               0.551      0.444       0.296          0.184         0.048            0.025            0.051     0.057
+4      430          178    110    1632             100     143           286             0               0.887      0.784       0.642          0.418         0.114            0.069            0.132     0.055
+16     1624         670    422    6080             388     551           1189            0               2.433      2.306       2.147          1.414         0.391            0.254            0.480     0.069
+64     6520         2638   1670   23648            1540    2183          5674            0               8.195      8.042       7.894          5.236         1.435            0.992            1.894     0.084
+128    13188        5262   3334   47072            3076    4359          13894           0              15.634     15.399      15.250         10.068         2.657            1.936            3.893     0.102
+256    26884        10510  6662   93920            6148    8711          38014           0              31.114     30.795      30.648         20.296         5.097            3.876            8.415     0.139
+512    54276        21006  13318  187616           12292   17415         116974          0              66.957     66.408      66.265         44.891        10.516            8.204           20.599     0.219
+1024   109180       41998  26630  375008           24580   34823         397774          0             157.795    156.418     156.251        109.919        22.778           18.122           57.799     0.374
+```
+
+Function-table main:
+
+```text
+funcs  input_bytes  insts  vals   pre_block_bytes  ranges  range_points  used_loc_words  conflict_bytes  run.total  compile.tu  parse_codegen  opt.o1.total  live_blocks_pre  live_ranges_reg  regalloc  link_jit
+1      212          76     43     832              39      56            112             0               0.594      0.486       0.335          0.194         0.051            0.031            0.060     0.061
+4      482          184    109    1888             99      143           286             0               0.944      0.833       0.685          0.437         0.118            0.072            0.140     0.061
+16     1587         616    373    6112             339     491           982             0               2.266      2.145       1.993          1.316         0.375            0.230            0.432     0.064
+64     6099         2344   1429   23008            1299    1883          3766            0               7.565      7.414       7.259          4.806         1.347            0.861            1.605     0.077
+128    12228        4648   2837   45536            2579    3739          7478            0              14.055     13.851      13.703          8.959         2.491            1.659            3.058     0.098
+256    24772        9256   5653   90592            5139    7451          14902           0              27.074     26.765      26.620         17.091         4.677            3.237            5.942     0.141
+512    49860        18472  11285  180704           10259   14875         29750           0              54.364     53.839      53.682         33.996         9.330            6.422           11.850     0.225
+1024   100133       36904  22549  360928           20499   29723         59446           0             114.081    112.879     112.706         68.967        18.959           12.909           23.842     0.367
+```
+
+The 512 to 1024 step after phase 3:
+
+```text
+straight_main:
+compile.tu               66.408 -> 156.418 ms  2.36x
+opt.o1.total             44.891 -> 109.919 ms  2.45x
+opt.live_blocks.pre_dde  10.516 ->  22.778 ms  2.17x
+opt.live_ranges.regalloc  8.204 ->  18.122 ms  2.21x
+opt.regalloc             20.599 ->  57.799 ms  2.81x
+pre_block_bytes          187616 -> 375008      2.00x
+used_loc_words           116974 -> 397774      3.40x
+conflict_bytes                0 -> 0
+
+table_main:
+compile.tu               53.839 -> 112.879 ms  2.10x
+opt.o1.total             33.996 ->  68.967 ms  2.03x
+opt.live_blocks.pre_dde   9.330 ->  18.959 ms  2.03x
+opt.live_ranges.regalloc  6.422 ->  12.909 ms  2.01x
+opt.regalloc             11.850 ->  23.842 ms  2.01x
+pre_block_bytes          180704 -> 360928      2.00x
+used_loc_words            29750 -> 59446       2.00x
+conflict_bytes                0 -> 0
+```
+
+This is the intended phase-3 result: the old conflict matrix is gone from O1,
+and the large straight-line case is much faster than the phase-1 rerun. The
+remaining non-linear straight-line bucket now tracks `used_loc_words`, because
+the first range allocator uses dense occupied-location bitmaps whose stack-slot
+portion is sized by candidate count. Table-main stays close to linear because
+each function remains small.
+
 ## Compile Perf
 
 `compile.tu` is now split into generic and C-specific scopes:
@@ -245,35 +320,36 @@ compile.tu
   compile.obj_relocs
 ```
 
-Updated 1024-function single-run examples with the detailed compile scopes:
+Updated phase-3 1024-function p50 examples with the detailed compile scopes:
 
 ```text
 metric                         straight_main  table_main
-run.total                      828.387 ms     314.678 ms
-compile.tu                     825.154 ms     309.944 ms
-compile.frontend               825.129 ms     309.825 ms
-compile.c.setup                  0.156 ms       0.946 ms
-compile.c.pp_options             0.021 ms       0.021 ms
-compile.c.parse_codegen        824.900 ms     308.676 ms
-compile.c.cleanup                0.026 ms       0.015 ms
-compile.obj_finalize             0.000 ms       0.001 ms
-opt.o1.total summed            746.533 ms     231.538 ms
+run.total                      157.795 ms     114.081 ms
+compile.tu                     156.418 ms     112.879 ms
+compile.frontend               156.411 ms     112.868 ms
+compile.c.setup                  0.128 ms       0.130 ms
+compile.c.pp_options             0.003 ms       0.004 ms
+compile.c.parse_codegen        156.251 ms     112.706 ms
+compile.c.cleanup                0.020 ms       0.015 ms
+compile.obj_finalize             0.000 ms       0.000 ms
+opt.o1.total summed            109.919 ms      68.967 ms
 ```
 
-Subtracting summed O1 from `compile.tu` leaves about `78 ms` in both variants:
+Subtracting summed O1 from `compile.tu` leaves about `44-46 ms` in both
+variants:
 
 ```text
-straight_main: 825.154 - 746.533 = 78.621 ms
-table_main:    309.944 - 231.538 = 78.406 ms
+straight_main: 156.418 - 109.919 = 46.499 ms
+table_main:    112.879 -  68.967 = 43.912 ms
 ```
 
 That residual is the current frontend/PP/parser/CG API cost for roughly
-330 KiB of generated C. It appears linear with input size and number of small
-functions in these tests. The setup, PP option application, cleanup, and object
-finalization scopes are negligible. The opaque part that remains is inside
-`compile.c.parse_codegen`, which streams PP tokens, parses declarations and
-statements, performs type work, drives the CG API, and triggers per-function O1
-when function bodies end.
+100 KiB of generated C in these phase-3 inputs. It appears linear with input
+size and number of small functions in these tests. The setup, PP option
+application, cleanup, and object finalization scopes are negligible. The opaque
+part that remains is inside `compile.c.parse_codegen`, which streams PP tokens,
+parses declarations and statements, performs type work, drives the CG API, and
+triggers per-function O1 when function bodies end.
 
 ### Compile Scope Scaling
 
@@ -285,28 +361,28 @@ Straight-line main:
 
 ```text
 funcs  input_bytes  compile.tu  parse_codegen  opt.o1.total  non_o1_residual
-1      383            0.743        0.521          0.383          0.360
-4      1338           1.748        1.519          1.148          0.600
-16     5193           5.485        5.264          4.026          1.459
-64     20697         21.657       21.447         16.719          4.938
-128    41599         46.748       46.530         37.164          9.584
-256    83839        109.467      109.241         90.519         18.948
-512    168319       281.974      281.750        244.471         37.503
-1024   337481       824.834      824.586        746.447         78.387
+1      139            0.444        0.296          0.184          0.260
+4      430            0.784        0.642          0.418          0.366
+16     1624           2.306        2.147          1.414          0.892
+64     6520           8.042        7.894          5.236          2.806
+128    13188         15.399       15.250         10.068          5.331
+256    26884         30.795       30.648         20.296         10.499
+512    54276         66.408       66.265         44.891         21.517
+1024   109180       156.418      156.251        109.919         46.499
 ```
 
 Function-table main:
 
 ```text
 funcs  input_bytes  compile.tu  parse_codegen  opt.o1.total  non_o1_residual
-1      504            0.783        0.563          0.402          0.381
-4      1429           1.781        1.561          1.171          0.610
-16     5166           5.272        5.062          3.839          1.433
-64     20190         19.208       18.990         14.467          4.741
-128    40454         38.143       37.929         28.862          9.281
-256    81414         75.035       74.816         56.935         18.100
-512    163334       150.504      150.275        114.101         36.403
-1024   327378       304.716      304.492        227.790         76.926
+1      212            0.486        0.335          0.194          0.292
+4      482            0.833        0.685          0.437          0.396
+16     1587           2.145        1.993          1.316          0.829
+64     6099           7.414        7.259          4.806          2.608
+128    12228         13.851       13.703          8.959          4.892
+256    24772         26.765       26.620         17.091          9.674
+512    49860         53.839       53.682         33.996         19.843
+1024   100133       112.879      112.706         68.967         43.912
 ```
 
 The detailed compile scopes show:
@@ -320,7 +396,7 @@ The detailed compile scopes show:
   options.
 - `compile.c.cleanup` grows only to about `0.01-0.02 ms`.
 - `non_o1_residual` is near-linear with source size at larger sizes, around
-  `228-241 us/KiB` from 64 through 1024 functions.
+  `430-460 us/KiB` from 64 through 1024 functions in the phase-3 inputs.
 
 So the compile-side scaling story is two-layered: the broad C parse/codegen
 bucket scales linearly with input size after startup overhead, while the nested
@@ -340,16 +416,17 @@ Next useful compile instrumentation:
 
 ## Performance Priorities
 
-1. Move regalloc off full `opt_live_info`.
-   Phase 1 removed the full liveness/conflict build from pre-DDE. The remaining
-   superlinear bucket is now `opt.live_info.regalloc`, which still builds
-   `live_after` and the dense conflict matrix before assignment/rewrite.
-
-2. Replace or narrow dense conflict structures.
-   Regalloc-side `opt.conflict_bytes` still tracks the observed curve closely.
-   Investigate sparse sets, segmented bitsets, per-block live sets, or
-   interval-style structures that avoid touching full dense rows for values that
-   cannot overlap.
+1. Finish phase-4 rewrite without persistent `live_after`.
+   Phase 3 removed the dense conflict matrix from O1 assignment. Rewrite still
+   needs per-instruction full live-after sets for call preservation and spill
+   insertion, so that is the next correctness-preserving structural target.
+
+2. Compress or narrow `used_locs`.
+   The remaining straight-line superlinear bucket now tracks
+   `opt.alloc.used_loc_words`. The first range allocator sizes the
+   occupied-location bitmap by hard-register bits plus candidate stack bits.
+   The stack side should become demand-sized, sparse, or split-capable instead
+   of paying for every candidate at every point.
 
 3. Avoid whole-function contiguous growth where hot passes scan repeatedly.
    Large `Val`/block/instruction arrays and dense bit matrices are likely to
diff --git a/src/opt/ir.h b/src/opt/ir.h
@@ -328,6 +328,7 @@ typedef struct Func {
   u8 opt_rewritten;
   u16 opt_live_words;
   u16 opt_conflict_words;
+  u32 opt_used_loc_words;
   u32 opt_position_count;
   OptValInfo* val_info; /* indexed by Val, length nvals */
   u64* val_conflicts;   /* nvals x opt_conflict_words bit matrix */
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -1424,7 +1424,8 @@ static void w_func_end(CGTarget* t) {
     metrics_scope_end(o->c, "opt.dead_def_elim");
     metrics_scope_begin(o->c, "opt.regalloc");
     opt_regalloc(o->f, 0);
-    metrics_count(o->c, "opt.alloc.used_loc_words", 0);
+    metrics_count(o->c, "opt.alloc.used_loc_words",
+                  o->f->opt_used_loc_words);
     metrics_count(o->c, "opt.alloc.spills", func_spill_alloc_count(o->f));
     metrics_count(o->c, "opt.rewrite.reloads", func_spill_load_count(o->f));
     metrics_count(o->c, "opt.rewrite.stores", func_spill_store_count(o->f));
diff --git a/src/opt/opt.h b/src/opt/opt.h
@@ -123,6 +123,19 @@ typedef struct OptLiveRangeSet {
   u32 whole_block_spans;
 } OptLiveRangeSet;
 
+typedef enum OptLocKind {
+  OPT_LOC_NONE,
+  OPT_LOC_HARD,
+  OPT_LOC_STACK,
+} OptLocKind;
+
+typedef struct OptLoc {
+  u8 kind;
+  u8 cls;
+  Reg hard_reg;
+  FrameSlot spill_slot;
+} OptLoc;
+
 typedef void (*OptBitsetIterFn)(Val, void*);
 
 void opt_bitset_clear(OptBitset*);
diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c
@@ -704,39 +704,6 @@ void opt_live_info(Func* f) {
   }
 }
 
-static int ranges_overlap(const OptValInfo* a, const OptValInfo* b) {
-  if (!a->first_pos || !b->first_pos) return 0;
-  return a->first_pos <= b->last_pos && b->first_pos <= a->last_pos;
-}
-
-static int val_higher_priority(Func* f, Val a, Val b) {
-  OptValInfo* av = &f->val_info[a];
-  OptValInfo* bv = &f->val_info[b];
-  int at = av->tied_hard_reg >= 0;
-  int bt = bv->tied_hard_reg >= 0;
-  if (at != bt) return at > bt;
-  if (av->frequency != bv->frequency) return av->frequency > bv->frequency;
-  if (av->live_across_call_freq != bv->live_across_call_freq)
-    return av->live_across_call_freq > bv->live_across_call_freq;
-  if (av->live_length != bv->live_length)
-    return av->live_length < bv->live_length;
-  return a < b;
-}
-
-static int hard_conflicts(Func* f, Val* assigned, u32 nassigned, Val v, Reg r) {
-  for (u32 i = 0; i < nassigned; ++i) {
-    Val ov = assigned[i];
-    if (f->val_info[ov].alloc_kind != OPT_ALLOC_HARD) continue;
-    if (f->val_info[ov].hard_reg != r) continue;
-    if (f->val_conflicts && f->opt_conflict_words) {
-      if (bit_has(conflict_row(f, v), ov)) return 1;
-    } else if (ranges_overlap(&f->val_info[ov], &f->val_info[v])) {
-      return 1;
-    }
-  }
-  return 0;
-}
-
 static int hard_available(Func* f, u8 cls, Reg r) {
   if (cls >= OPT_REG_CLASSES) return 0;
   for (u32 i = 0; i < f->opt_hard_reg_count[cls]; ++i)
@@ -757,6 +724,289 @@ static FrameSlot spill_slot_for(Func* f, Val v) {
   return f->val_info[v].spill_slot;
 }
 
+static u32 loc_bit_words(u32 nbits) { return (nbits + 63u) / 64u; }
+
+static u32 hard_loc_bit(u8 cls, Reg r) { return ((u32)cls * 32u) + (u32)r; }
+
+static void loc_bit_set(u64* bits, u32 bit) {
+  bits[bit / 64u] |= 1ull << (bit % 64u);
+}
+
+static int loc_bit_has(const u64* bits, u32 bit) {
+  return (bits[bit / 64u] & (1ull << (bit % 64u))) != 0;
+}
+
+static void loc_bits_clear(u64* bits, u32 words) {
+  for (u32 i = 0; i < words; ++i) bits[i] = 0;
+}
+
+static void loc_bits_or(u64* dst, const u64* src, u32 words) {
+  for (u32 i = 0; i < words; ++i) dst[i] |= src[i];
+}
+
+typedef struct OptAllocCandidate {
+  Val v;
+  u32 spill_cost;
+  u32 live_length;
+  u8 tied;
+  u8 pad[3];
+} OptAllocCandidate;
+
+typedef struct OptAllocator {
+  OptLoc* locs;
+  u64* used_locs;
+  u32 point_count;
+  u32 loc_words;
+  u32 hard_loc_bits;
+  FrameSlot* stack_slots;
+  u32 stack_slot_count;
+} OptAllocator;
+
+static int alloc_candidate_higher(const OptAllocCandidate* a,
+                                  const OptAllocCandidate* b) {
+  if (a->tied != b->tied) return a->tied > b->tied;
+  if (a->spill_cost != b->spill_cost) return a->spill_cost > b->spill_cost;
+  if (a->live_length != b->live_length) return a->live_length < b->live_length;
+  return a->v < b->v;
+}
+
+static void alloc_sort_candidates(OptAllocCandidate* cands, u32 n) {
+  for (u32 i = 1; i < n; ++i) {
+    OptAllocCandidate key = cands[i];
+    u32 j = i;
+    while (j > 0 && alloc_candidate_higher(&key, &cands[j - 1u])) {
+      cands[j] = cands[j - 1u];
+      --j;
+    }
+    cands[j] = key;
+  }
+}
+
+static void opt_init_val_info_from_ranges(Func* f,
+                                          const OptLiveRangeSet* ranges) {
+  OptValInfo* old = f->val_info;
+  OptValInfo* info = arena_zarray(f->arena, OptValInfo, f->nvals ? f->nvals : 1u);
+  for (Val v = 0; v < f->nvals; ++v) {
+    i32 tied = old ? old[v].tied_hard_reg : -1;
+    u32 forbidden = old ? old[v].forbidden_hard_regs : 0;
+    u32 old_frequency = old ? old[v].frequency : 0;
+    OptValInfo* vi = &info[v];
+    vi->tied_hard_reg = tied;
+    vi->hard_reg = REG_NONE;
+    vi->spill_slot = FRAME_SLOT_NONE;
+    vi->alloc_kind = OPT_ALLOC_NONE;
+    vi->cls = f->val_cls[v];
+    vi->forbidden_hard_regs = forbidden;
+    if (!ranges || v == VAL_NONE ||
+        ranges->first_range_by_val[v] == OPT_RANGE_NONE) {
+      continue;
+    }
+    u32 first = (u32)~0u;
+    u32 last = 0;
+    for (u32 r = ranges->first_range_by_val[v]; r != OPT_RANGE_NONE;
+         r = ranges->ranges[r].next) {
+      const OptLiveRange* lr = &ranges->ranges[r];
+      if (lr->start < first) first = lr->start;
+      if (lr->end > last) last = lr->end;
+    }
+    vi->first_pos = first == (u32)~0u ? 0 : first + 1u;
+    vi->last_pos = last;
+    vi->live_length = ranges->live_length_by_val[v];
+    vi->use_freq = ranges->use_freq_by_val[v];
+    vi->def_freq = ranges->def_freq_by_val[v];
+    vi->live_block_freq = ranges->live_block_freq_by_val[v];
+    vi->live_across_call_freq = ranges->live_across_call_freq_by_val[v];
+    vi->spill_cost = ranges->spill_cost_by_val[v];
+    vi->frequency = vi->spill_cost;
+    if (old_frequency > vi->frequency) vi->frequency = old_frequency;
+  }
+  f->val_info = info;
+  f->val_conflicts = NULL;
+  f->opt_conflict_words = 0;
+}
+
+static void opt_build_live_after_only(Func* f, const OptLiveInfo* live_info) {
+  u32 words = live_info ? live_info->words : f->opt_live_words;
+  if (!words) words = bit_words(f->nvals);
+  f->opt_live_words = (u16)words;
+  for (u32 b = 0; b < f->nblocks; ++b) {
+    Block* bl = &f->blocks[b];
+    bl->live_after = arena_array(f->arena, u64*, bl->ninsts ? bl->ninsts : 1u);
+    u64* live = arena_zarray(f->arena, u64, words);
+    if (live_info) {
+      const OptBitset* out = &live_info->blocks[b].live_out;
+      for (u32 w = 0; w < words && w < out->nwords; ++w) live[w] = out->words[w];
+    } else if (bl->live_out) {
+      copy_bits(live, bl->live_out, words);
+    }
+    for (u32 ri = bl->ninsts; ri > 0; --ri) {
+      u32 i = ri - 1u;
+      Inst* in = &bl->insts[i];
+      u64* after = arena_zarray(f->arena, u64, words);
+      copy_bits(after, live, words);
+      bl->live_after[i] = after;
+
+      u64* use = arena_zarray(f->arena, u64, words);
+      u64* def = arena_zarray(f->arena, u64, words);
+      BitsCtx bc = {use, def};
+      walk_inst_operands(f, in, collect_bits, &bc);
+      if ((IROp)in->op == IR_ASM_BLOCK)
+        apply_asm_register_constraints(f, in, use, def, after);
+      for (u32 w = 0; w < words; ++w) live[w] = (live[w] & ~def[w]) | use[w];
+    }
+  }
+}
+
+static int spill_slot_compatible(Func* f, FrameSlot fs, Val v) {
+  if (fs == FRAME_SLOT_NONE || fs > f->nframe_slots) return 0;
+  IRFrameSlot* s = &f->frame_slots[fs - 1u];
+  u32 size = type_size_fallback(f, f->val_type[v]);
+  u32 align = size >= 8 ? 8 : size;
+  if (s->kind != FS_SPILL) return 0;
+  if (s->size < size) return 0;
+  if (s->align < align) return 0;
+  return 1;
+}
+
+static void alloc_collect_occupied(const OptAllocator* a,
+                                   const OptLiveRangeSet* ranges, Val v,
+                                   u64* occupied) {
+  loc_bits_clear(occupied, a->loc_words);
+  for (u32 r = ranges->first_range_by_val[v]; r != OPT_RANGE_NONE;
+       r = ranges->ranges[r].next) {
+    const OptLiveRange* lr = &ranges->ranges[r];
+    for (u32 p = lr->start; p < lr->end && p < a->point_count; ++p)
+      loc_bits_or(occupied, a->used_locs + ((size_t)p * a->loc_words),
+                  a->loc_words);
+  }
+}
+
+static void alloc_mark_loc(OptAllocator* a, const OptLiveRangeSet* ranges,
+                           Val v, u32 bit) {
+  for (u32 r = ranges->first_range_by_val[v]; r != OPT_RANGE_NONE;
+       r = ranges->ranges[r].next) {
+    const OptLiveRange* lr = &ranges->ranges[r];
+    for (u32 p = lr->start; p < lr->end && p < a->point_count; ++p)
+      loc_bit_set(a->used_locs + ((size_t)p * a->loc_words), bit);
+  }
+}
+
+static void alloc_assign_hard(Func* f, OptAllocator* a,
+                              const OptLiveRangeSet* ranges, Val v, Reg r) {
+  OptValInfo* vi = &f->val_info[v];
+  vi->alloc_kind = OPT_ALLOC_HARD;
+  vi->hard_reg = r;
+  a->locs[v].kind = OPT_LOC_HARD;
+  a->locs[v].cls = vi->cls;
+  a->locs[v].hard_reg = r;
+  a->locs[v].spill_slot = FRAME_SLOT_NONE;
+  alloc_mark_loc(a, ranges, v, hard_loc_bit(vi->cls, r));
+}
+
+static void alloc_assign_stack(Func* f, OptAllocator* a,
+                               const OptLiveRangeSet* ranges, Val v,
+                               const u64* occupied) {
+  OptValInfo* vi = &f->val_info[v];
+  u32 stack_idx = a->stack_slot_count;
+  FrameSlot slot = FRAME_SLOT_NONE;
+  for (u32 i = 0; i < a->stack_slot_count; ++i) {
+    u32 bit = a->hard_loc_bits + i;
+    if (loc_bit_has(occupied, bit)) continue;
+    if (!spill_slot_compatible(f, a->stack_slots[i], v)) continue;
+    stack_idx = i;
+    slot = a->stack_slots[i];
+    break;
+  }
+  if (slot == FRAME_SLOT_NONE) {
+    slot = spill_slot_for(f, v);
+    a->stack_slots[a->stack_slot_count++] = slot;
+    stack_idx = a->stack_slot_count - 1u;
+  } else {
+    vi->spill_slot = slot;
+  }
+  vi->alloc_kind = OPT_ALLOC_SPILL;
+  a->locs[v].kind = OPT_LOC_STACK;
+  a->locs[v].cls = vi->cls;
+  a->locs[v].hard_reg = REG_NONE;
+  a->locs[v].spill_slot = slot;
+  alloc_mark_loc(a, ranges, v, a->hard_loc_bits + stack_idx);
+}
+
+static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges,
+                              OptAllocator* a) {
+  memset(a, 0, sizeof *a);
+  a->point_count = ranges->point_count ? ranges->point_count : 1u;
+  a->hard_loc_bits = OPT_REG_CLASSES * 32u;
+  u32 ncands = 0;
+  for (Val v = 1; v < f->nvals; ++v)
+    if (ranges->first_range_by_val[v] != OPT_RANGE_NONE) ++ncands;
+  u32 loc_bits = a->hard_loc_bits + ncands;
+  a->loc_words = loc_bit_words(loc_bits);
+  f->opt_used_loc_words = a->point_count * a->loc_words;
+  a->locs = arena_zarray(f->arena, OptLoc, f->nvals ? f->nvals : 1u);
+  a->used_locs =
+      arena_zarray(f->arena, u64, (size_t)a->point_count * a->loc_words);
+  a->stack_slots = arena_array(f->arena, FrameSlot, ncands ? ncands : 1u);
+
+  OptAllocCandidate* cands =
+      arena_array(f->arena, OptAllocCandidate, ncands ? ncands : 1u);
+  u32 n = 0;
+  for (Val v = 1; v < f->nvals; ++v) {
+    if (ranges->first_range_by_val[v] == OPT_RANGE_NONE) continue;
+    OptValInfo* vi = &f->val_info[v];
+    cands[n].v = v;
+    cands[n].spill_cost = vi->frequency ? vi->frequency : vi->spill_cost;
+    cands[n].live_length = vi->live_length;
+    cands[n].tied = vi->tied_hard_reg >= 0;
+    memset(cands[n].pad, 0, sizeof cands[n].pad);
+    ++n;
+  }
+  alloc_sort_candidates(cands, n);
+
+  u64* occupied = arena_zarray(f->arena, u64, a->loc_words ? a->loc_words : 1u);
+  for (u32 i = 0; i < n; ++i) {
+    Val v = cands[i].v;
+    OptValInfo* vi = &f->val_info[v];
+    u8 cls = vi->cls;
+    alloc_collect_occupied(a, ranges, v, occupied);
+    if (vi->tied_hard_reg >= 0) {
+      Reg fixed = (Reg)vi->tied_hard_reg;
+      if (!hard_available(f, cls, fixed)) {
+        SrcLoc loc = {0, 0, 0};
+        compiler_panic(
+            f->c, loc,
+            "opt regalloc: fixed hard reg %u unavailable in class %u",
+            (unsigned)fixed, (unsigned)cls);
+      }
+      if (fixed < 32 && (vi->forbidden_hard_regs & (1u << fixed))) {
+        SrcLoc loc = {0, 0, 0};
+        compiler_panic(f->c, loc,
+                       "opt regalloc: fixed hard reg %u is clobbered",
+                       (unsigned)fixed);
+      }
+      if (fixed >= 32 || loc_bit_has(occupied, hard_loc_bit(cls, fixed))) {
+        SrcLoc loc = {0, 0, 0};
+        compiler_panic(f->c, loc, "opt regalloc: conflicting fixed hard reg %u",
+                       (unsigned)fixed);
+      }
+      alloc_assign_hard(f, a, ranges, v, fixed);
+      continue;
+    }
+
+    int found = 0;
+    for (u32 r = 0; r < f->opt_hard_reg_count[cls]; ++r) {
+      Reg hr = f->opt_hard_regs[cls][r];
+      if (hr >= 32) continue;
+      if (vi->forbidden_hard_regs & (1u << hr)) continue;
+      if (loc_bit_has(occupied, hard_loc_bit(cls, hr))) continue;
+      alloc_assign_hard(f, a, ranges, v, hr);
+      found = 1;
+      break;
+    }
+    if (!found) alloc_assign_stack(f, a, ranges, v, occupied);
+  }
+}
+
 typedef struct RewriteList {
   Inst* data;
   u32 n;
@@ -1041,76 +1291,19 @@ void opt_dead_def_elim(Func* f) {
 
 void opt_regalloc(Func* f, int allow_live_range_split) {
   (void)allow_live_range_split;
-  if (!f->val_info) {
-    metrics_scope_begin(f->c, "opt.live_info.regalloc");
-    opt_live_info(f);
-    metrics_count(f->c, "opt.live_words", f->opt_live_words);
-    metrics_count(
-        f->c, "opt.conflict_bytes",
-        (u64)f->nvals * (u64)f->opt_conflict_words * (u64)sizeof(u64));
-    metrics_scope_end(f->c, "opt.live_info.regalloc");
-  }
-  Val* order = arena_array(f->arena, Val, f->nvals ? f->nvals : 1u);
-  u32 norder = 0;
-  for (Val v = 1; v < f->nvals; ++v)
-    if (f->val_info[v].first_pos) order[norder++] = v;
-  for (u32 i = 1; i < norder; ++i) {
-    Val key = order[i];
-    u32 j = i;
-    while (j > 0 && val_higher_priority(f, key, order[j - 1u])) {
-      order[j] = order[j - 1u];
-      --j;
-    }
-    order[j] = key;
-  }
-  Val* assigned = arena_array(f->arena, Val, norder ? norder : 1u);
-  u32 nassigned = 0;
-  for (u32 i = 0; i < norder; ++i) {
-    Val v = order[i];
-    OptValInfo* vi = &f->val_info[v];
-    u8 cls = vi->cls;
-    if (vi->tied_hard_reg >= 0) {
-      Reg fixed = (Reg)vi->tied_hard_reg;
-      if (!hard_available(f, cls, fixed)) {
-        SrcLoc loc = {0, 0, 0};
-        compiler_panic(
-            f->c, loc,
-            "opt regalloc: fixed hard reg %u unavailable in class %u",
-            (unsigned)fixed, (unsigned)cls);
-      }
-      if (fixed < 32 && (vi->forbidden_hard_regs & (1u << fixed))) {
-        SrcLoc loc = {0, 0, 0};
-        compiler_panic(f->c, loc,
-                       "opt regalloc: fixed hard reg %u is clobbered",
-                       (unsigned)fixed);
-      }
-      if (hard_conflicts(f, assigned, nassigned, v, fixed)) {
-        SrcLoc loc = {0, 0, 0};
-        compiler_panic(f->c, loc, "opt regalloc: conflicting fixed hard reg %u",
-                       (unsigned)fixed);
-      }
-      vi->alloc_kind = OPT_ALLOC_HARD;
-      vi->hard_reg = fixed;
-      assigned[nassigned++] = v;
-      continue;
-    }
-    int found = 0;
-    for (u32 r = 0; r < f->opt_hard_reg_count[cls]; ++r) {
-      Reg hr = f->opt_hard_regs[cls][r];
-      if (hr < 32 && (vi->forbidden_hard_regs & (1u << hr))) continue;
-      if (!hard_conflicts(f, assigned, nassigned, v, hr)) {
-        vi->alloc_kind = OPT_ALLOC_HARD;
-        vi->hard_reg = hr;
-        assigned[nassigned++] = v;
-        found = 1;
-        break;
-      }
-    }
-    if (!found) {
-      vi->alloc_kind = OPT_ALLOC_SPILL;
-      vi->spill_slot = spill_slot_for(f, v);
-    }
-  }
+  metrics_scope_begin(f->c, "opt.live_ranges.regalloc");
+  OptLiveInfo live;
+  opt_live_blocks(f, &live);
+  OptLiveRangeSet ranges;
+  opt_live_ranges_build(f, &live, &ranges);
+  opt_init_val_info_from_ranges(f, &ranges);
+  opt_build_live_after_only(f, &live);
+  metrics_count(f->c, "opt.live_words", f->opt_live_words);
+  metrics_count(f->c, "opt.conflict_bytes", 0);
+  metrics_scope_end(f->c, "opt.live_ranges.regalloc");
+
+  OptAllocator alloc;
+  opt_assign_ranges(f, &ranges, &alloc);
   rewrite_func(f);
 }
 
diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c
@@ -1161,6 +1161,41 @@ static void opt_regalloc_priority(void) {
   tc_fini(&tc);
 }
 
+static void opt_range_regalloc_no_conflicts_and_stack_reuse(void) {
+  TestCtx tc;
+  tc_init(&tc);
+  MockCGTarget mock;
+  mock_init(&mock, tc.c);
+  static const Reg scratch[] = {9, 10};
+  mock_set_pool(&mock, RC_INT, NULL, 0, scratch, 2, 0);
+
+  Func* f = new_func(&tc);
+  opt_machinize(f, &mock.base);
+  Val a = add_val(f, tc.i32);
+  Val b = add_val(f, tc.i32);
+  emit_load_imm(f, f->entry, a, tc.i32, 1);
+  emit_load_imm(f, f->entry, b, tc.i32, 2);
+  emit_ret_val(f, f->entry, b, tc.i32);
+  opt_build_cfg(f);
+  opt_build_loop_tree(f);
+
+  opt_regalloc(f, 0);
+
+  EXPECT(f->val_conflicts == NULL,
+         "range allocator should not allocate a conflict matrix");
+  EXPECT(f->opt_conflict_words == 0,
+         "range allocator should leave conflict words at zero");
+  EXPECT(f->opt_used_loc_words != 0,
+         "range allocator should record used-location storage");
+  EXPECT(f->val_info[a].alloc_kind == OPT_ALLOC_SPILL,
+         "no hard regs should force v%u to stack", (unsigned)a);
+  EXPECT(f->val_info[b].alloc_kind == OPT_ALLOC_SPILL,
+         "no hard regs should force v%u to stack", (unsigned)b);
+  EXPECT(f->val_info[a].spill_slot == f->val_info[b].spill_slot,
+         "disjoint stack ranges should reuse a spill slot");
+  tc_fini(&tc);
+}
+
 static void opt_rewrite_spill_use_def(void) {
   TestCtx tc;
   tc_init(&tc);
@@ -2078,6 +2113,7 @@ int main(void) {
   opt_live_across_call_frequency();
   opt_conflict_symmetry_and_class();
   opt_regalloc_priority();
+  opt_range_regalloc_no_conflicts_and_stack_reuse();
   opt_rewrite_spill_use_def();
   opt_call_clobber_preservation();
   opt_call_clobber_caller_saved();

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	doc/MIR_RA_REPORT.md	\|	36	++++++++++++++++++------------------
M	doc/PERF.md	\|	213	++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------
M	src/opt/ir.h	\|	1	+
M	src/opt/opt.c	\|	3	++-
M	src/opt/opt.h	\|	13	+++++++++++++
M	src/opt/pass_lower.c	\|	399	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
M	test/opt/opt_test.c	\|	36	++++++++++++++++++++++++++++++++++++