commit 064cd29d7879a73a15ed06cbd9860f4ac4d59384
parent ed6d5b24fc8b242a6b5ba8405673c0b5b1836d2d
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 14 May 2026 19:25:09 -0700
Implement range-based O1 assignment
Diffstat:
7 files changed, 511 insertions(+), 190 deletions(-)
diff --git a/doc/MIR_RA_REPORT.md b/doc/MIR_RA_REPORT.md
@@ -584,27 +584,27 @@ Exit criteria:
### Phase 3: Range-Based Assignment
-- [ ] Define `OptLoc` assignment mapping for every `Val`.
-- [ ] Build allocation candidate list from values with ranges.
-- [ ] Sort candidates by:
- - [ ] tied hard-register requirement
- - [ ] frequency/spill cost
- - [ ] shorter live length
- - [ ] stable value id
-- [ ] Add `used_locs[point]` bitmaps.
-- [ ] Mark hard-register live ranges in `used_locs`.
-- [ ] For each candidate, union occupied locations across its ranges.
-- [ ] Assign a legal hard register when available.
-- [ ] Assign/reuse stack slots when no hard register is available.
-- [ ] Preserve tied hard-register validation.
-- [ ] Preserve forbidden hard-register constraints from asm/calls.
-- [ ] Keep live-range splitting disabled for O1.
+- [x] Define `OptLoc` assignment mapping for every `Val`.
+- [x] Build allocation candidate list from values with ranges.
+- [x] Sort candidates by:
+ - [x] tied hard-register requirement
+ - [x] frequency/spill cost
+ - [x] shorter live length
+ - [x] stable value id
+- [x] Add `used_locs[point]` bitmaps.
+- [x] Mark hard-register live ranges in `used_locs`.
+- [x] For each candidate, union occupied locations across its ranges.
+- [x] Assign a legal hard register when available.
+- [x] Assign/reuse stack slots when no hard register is available.
+- [x] Preserve tied hard-register validation.
+- [x] Preserve forbidden hard-register constraints from asm/calls.
+- [x] Keep live-range splitting disabled for O1.
Exit criteria:
-- [ ] O1 regalloc no longer depends on `val_conflicts`.
-- [ ] `opt.conflict_bytes` is zero or absent on the normal O1 allocation path.
-- [ ] Branch, loop, call-clobber, spill, and tied-register tests pass.
+- [x] O1 regalloc no longer depends on `val_conflicts`.
+- [x] `opt.conflict_bytes` is zero or absent on the normal O1 allocation path.
+- [x] Branch, loop, call-clobber, spill, and tied-register tests pass.
### Phase 4: Rewrite From Assignment Map
diff --git a/doc/PERF.md b/doc/PERF.md
@@ -30,6 +30,9 @@ Important counters:
- `opt.funcs`, `opt.blocks`, `opt.insts`, `opt.vals`
- `opt.live_words`, `opt.live.blocks`, `opt.live.active_words`
- `opt.live.block_bytes`, `opt.live.set_bit_scans`
+- `opt.ranges`, `opt.range_points`, `opt.range_raw_points`
+- `opt.alloc.used_loc_words`, `opt.alloc.spills`
+- `opt.rewrite.reloads`, `opt.rewrite.stores`
- `opt.conflict_bytes`
- `link.inputs`, `link.sections`, `link.segments`, `link.syms`, `link.relocs`
- `jit.master_size`, `jit.nsegments`, `jit.input_section_bytes`,
@@ -43,19 +46,19 @@ Times are `min / p50 / p95 / mean / max` in milliseconds.
`int main(){return 0;}`
```text
-run.total 0.298 / 0.312 / 0.344 / 0.319 / 0.390
-run.compile_and_jit 0.283 / 0.295 / 0.323 / 0.300 / 0.346
-compile.tu 0.209 / 0.222 / 0.242 / 0.224 / 0.244
+run.total 0.290 / 0.311 / 0.345 / 0.316 / 0.351
+run.compile_and_jit 0.274 / 0.293 / 0.313 / 0.296 / 0.329
+compile.tu 0.218 / 0.235 / 0.246 / 0.235 / 0.257
compile.c.parse_codegen
- 0.073 / 0.081 / 0.092 / 0.082 / 0.095
-opt.o1.total 0.040 / 0.048 / 0.057 / 0.048 / 0.059
+ 0.082 / 0.091 / 0.097 / 0.091 / 0.107
+opt.o1.total 0.048 / 0.056 / 0.061 / 0.055 / 0.065
opt.live_blocks.pre_dde
- 0.008 / 0.008 / 0.017 / 0.010 / 0.019
-opt.live_info.regalloc
- 0.003 / 0.003 / 0.004 / 0.004 / 0.007
-opt.regalloc 0.008 / 0.012 / 0.016 / 0.012 / 0.019
-link_jit.all 0.043 / 0.049 / 0.058 / 0.051 / 0.059
-link.resolve.total 0.020 / 0.021 / 0.027 / 0.023 / 0.029
+ 0.013 / 0.017 / 0.023 / 0.017 / 0.024
+opt.live_ranges.regalloc
+ 0.003 / 0.004 / 0.004 / 0.004 / 0.005
+opt.regalloc 0.009 / 0.010 / 0.012 / 0.010 / 0.015
+link_jit.all 0.047 / 0.050 / 0.056 / 0.051 / 0.061
+link.resolve.total 0.023 / 0.025 / 0.028 / 0.025 / 0.029
```
One small loop:
@@ -65,19 +68,19 @@ int main(){int s=0; for(int i=0;i<10;i++) s+=i; return s==45?0:1;}
```
```text
-run.total 0.351 / 0.365 / 0.394 / 0.371 / 0.409
-run.compile_and_jit 0.334 / 0.349 / 0.377 / 0.353 / 0.389
-compile.tu 0.266 / 0.277 / 0.292 / 0.279 / 0.315
+run.total 0.369 / 0.400 / 0.446 / 0.401 / 0.448
+run.compile_and_jit 0.351 / 0.381 / 0.414 / 0.381 / 0.427
+compile.tu 0.293 / 0.314 / 0.349 / 0.315 / 0.353
compile.c.parse_codegen
- 0.131 / 0.140 / 0.155 / 0.141 / 0.157
-opt.o1.total 0.077 / 0.083 / 0.094 / 0.084 / 0.095
+ 0.151 / 0.165 / 0.196 / 0.167 / 0.212
+opt.o1.total 0.090 / 0.097 / 0.107 / 0.098 / 0.122
opt.live_blocks.pre_dde
- 0.011 / 0.012 / 0.018 / 0.013 / 0.018
-opt.live_info.regalloc
- 0.016 / 0.016 / 0.016 / 0.016 / 0.017
-opt.regalloc 0.027 / 0.028 / 0.034 / 0.029 / 0.034
-link_jit.all 0.043 / 0.046 / 0.057 / 0.049 / 0.064
-link.resolve.total 0.021 / 0.021 / 0.028 / 0.023 / 0.031
+ 0.024 / 0.026 / 0.029 / 0.027 / 0.032
+opt.live_ranges.regalloc
+ 0.012 / 0.013 / 0.015 / 0.013 / 0.015
+opt.regalloc 0.025 / 0.027 / 0.030 / 0.028 / 0.042
+link_jit.all 0.048 / 0.052 / 0.066 / 0.055 / 0.088
+link.resolve.total 0.023 / 0.026 / 0.034 / 0.026 / 0.035
```
For tiny programs, compile dominates, with O1 and JIT/link image setup as the
@@ -221,6 +224,78 @@ conflict matrix and its block-live storage scales linearly on the large
straight-line case. The remaining superlinear behavior is now concentrated in
the regalloc-side full liveness/conflict path.
+### Phase-3 Range-Allocator Rerun
+
+After `doc/MIR_RA_REPORT.md` phase 3, O1 assignment no longer uses the dense
+pseudo conflict matrix. Regalloc rebuilds pass-local block liveness and live
+ranges, assigns against `used_locs[point]`, and reports
+`opt.conflict_bytes=0` on the normal allocation path. Rewrite still uses
+per-instruction `live_after`, so this is not the final phase-4 shape.
+
+Samples: 5 runs per point. The table uses p50 milliseconds. O1 scopes and O1
+counters are summed across functions. The generator is the same direct-call vs
+function-table family as above, but these are fresh generated inputs and not
+byte-for-byte identical to the earlier tables.
+
+Straight-line main:
+
+```text
+funcs input_bytes insts vals pre_block_bytes ranges range_points used_loc_words conflict_bytes run.total compile.tu parse_codegen opt.o1.total live_blocks_pre live_ranges_reg regalloc link_jit
+1 139 55 32 576 28 41 82 0 0.551 0.444 0.296 0.184 0.048 0.025 0.051 0.057
+4 430 178 110 1632 100 143 286 0 0.887 0.784 0.642 0.418 0.114 0.069 0.132 0.055
+16 1624 670 422 6080 388 551 1189 0 2.433 2.306 2.147 1.414 0.391 0.254 0.480 0.069
+64 6520 2638 1670 23648 1540 2183 5674 0 8.195 8.042 7.894 5.236 1.435 0.992 1.894 0.084
+128 13188 5262 3334 47072 3076 4359 13894 0 15.634 15.399 15.250 10.068 2.657 1.936 3.893 0.102
+256 26884 10510 6662 93920 6148 8711 38014 0 31.114 30.795 30.648 20.296 5.097 3.876 8.415 0.139
+512 54276 21006 13318 187616 12292 17415 116974 0 66.957 66.408 66.265 44.891 10.516 8.204 20.599 0.219
+1024 109180 41998 26630 375008 24580 34823 397774 0 157.795 156.418 156.251 109.919 22.778 18.122 57.799 0.374
+```
+
+Function-table main:
+
+```text
+funcs input_bytes insts vals pre_block_bytes ranges range_points used_loc_words conflict_bytes run.total compile.tu parse_codegen opt.o1.total live_blocks_pre live_ranges_reg regalloc link_jit
+1 212 76 43 832 39 56 112 0 0.594 0.486 0.335 0.194 0.051 0.031 0.060 0.061
+4 482 184 109 1888 99 143 286 0 0.944 0.833 0.685 0.437 0.118 0.072 0.140 0.061
+16 1587 616 373 6112 339 491 982 0 2.266 2.145 1.993 1.316 0.375 0.230 0.432 0.064
+64 6099 2344 1429 23008 1299 1883 3766 0 7.565 7.414 7.259 4.806 1.347 0.861 1.605 0.077
+128 12228 4648 2837 45536 2579 3739 7478 0 14.055 13.851 13.703 8.959 2.491 1.659 3.058 0.098
+256 24772 9256 5653 90592 5139 7451 14902 0 27.074 26.765 26.620 17.091 4.677 3.237 5.942 0.141
+512 49860 18472 11285 180704 10259 14875 29750 0 54.364 53.839 53.682 33.996 9.330 6.422 11.850 0.225
+1024 100133 36904 22549 360928 20499 29723 59446 0 114.081 112.879 112.706 68.967 18.959 12.909 23.842 0.367
+```
+
+The 512 to 1024 step after phase 3:
+
+```text
+straight_main:
+compile.tu 66.408 -> 156.418 ms 2.36x
+opt.o1.total 44.891 -> 109.919 ms 2.45x
+opt.live_blocks.pre_dde 10.516 -> 22.778 ms 2.17x
+opt.live_ranges.regalloc 8.204 -> 18.122 ms 2.21x
+opt.regalloc 20.599 -> 57.799 ms 2.81x
+pre_block_bytes 187616 -> 375008 2.00x
+used_loc_words 116974 -> 397774 3.40x
+conflict_bytes 0 -> 0
+
+table_main:
+compile.tu 53.839 -> 112.879 ms 2.10x
+opt.o1.total 33.996 -> 68.967 ms 2.03x
+opt.live_blocks.pre_dde 9.330 -> 18.959 ms 2.03x
+opt.live_ranges.regalloc 6.422 -> 12.909 ms 2.01x
+opt.regalloc 11.850 -> 23.842 ms 2.01x
+pre_block_bytes 180704 -> 360928 2.00x
+used_loc_words 29750 -> 59446 2.00x
+conflict_bytes 0 -> 0
+```
+
+This is the intended phase-3 result: the old conflict matrix is gone from O1,
+and the large straight-line case is much faster than the phase-1 rerun. The
+remaining non-linear straight-line bucket now tracks `used_loc_words`, because
+the first range allocator uses dense occupied-location bitmaps whose stack-slot
+portion is sized by candidate count. Table-main stays close to linear because
+each function remains small.
+
## Compile Perf
`compile.tu` is now split into generic and C-specific scopes:
@@ -245,35 +320,36 @@ compile.tu
compile.obj_relocs
```
-Updated 1024-function single-run examples with the detailed compile scopes:
+Updated phase-3 1024-function p50 examples with the detailed compile scopes:
```text
metric straight_main table_main
-run.total 828.387 ms 314.678 ms
-compile.tu 825.154 ms 309.944 ms
-compile.frontend 825.129 ms 309.825 ms
-compile.c.setup 0.156 ms 0.946 ms
-compile.c.pp_options 0.021 ms 0.021 ms
-compile.c.parse_codegen 824.900 ms 308.676 ms
-compile.c.cleanup 0.026 ms 0.015 ms
-compile.obj_finalize 0.000 ms 0.001 ms
-opt.o1.total summed 746.533 ms 231.538 ms
+run.total 157.795 ms 114.081 ms
+compile.tu 156.418 ms 112.879 ms
+compile.frontend 156.411 ms 112.868 ms
+compile.c.setup 0.128 ms 0.130 ms
+compile.c.pp_options 0.003 ms 0.004 ms
+compile.c.parse_codegen 156.251 ms 112.706 ms
+compile.c.cleanup 0.020 ms 0.015 ms
+compile.obj_finalize 0.000 ms 0.000 ms
+opt.o1.total summed 109.919 ms 68.967 ms
```
-Subtracting summed O1 from `compile.tu` leaves about `78 ms` in both variants:
+Subtracting summed O1 from `compile.tu` leaves about `44-46 ms` in both
+variants:
```text
-straight_main: 825.154 - 746.533 = 78.621 ms
-table_main: 309.944 - 231.538 = 78.406 ms
+straight_main: 156.418 - 109.919 = 46.499 ms
+table_main: 112.879 - 68.967 = 43.912 ms
```
That residual is the current frontend/PP/parser/CG API cost for roughly
-330 KiB of generated C. It appears linear with input size and number of small
-functions in these tests. The setup, PP option application, cleanup, and object
-finalization scopes are negligible. The opaque part that remains is inside
-`compile.c.parse_codegen`, which streams PP tokens, parses declarations and
-statements, performs type work, drives the CG API, and triggers per-function O1
-when function bodies end.
+100 KiB of generated C in these phase-3 inputs. It appears linear with input
+size and number of small functions in these tests. The setup, PP option
+application, cleanup, and object finalization scopes are negligible. The opaque
+part that remains is inside `compile.c.parse_codegen`, which streams PP tokens,
+parses declarations and statements, performs type work, drives the CG API, and
+triggers per-function O1 when function bodies end.
### Compile Scope Scaling
@@ -285,28 +361,28 @@ Straight-line main:
```text
funcs input_bytes compile.tu parse_codegen opt.o1.total non_o1_residual
-1 383 0.743 0.521 0.383 0.360
-4 1338 1.748 1.519 1.148 0.600
-16 5193 5.485 5.264 4.026 1.459
-64 20697 21.657 21.447 16.719 4.938
-128 41599 46.748 46.530 37.164 9.584
-256 83839 109.467 109.241 90.519 18.948
-512 168319 281.974 281.750 244.471 37.503
-1024 337481 824.834 824.586 746.447 78.387
+1 139 0.444 0.296 0.184 0.260
+4 430 0.784 0.642 0.418 0.366
+16 1624 2.306 2.147 1.414 0.892
+64 6520 8.042 7.894 5.236 2.806
+128 13188 15.399 15.250 10.068 5.331
+256 26884 30.795 30.648 20.296 10.499
+512 54276 66.408 66.265 44.891 21.517
+1024 109180 156.418 156.251 109.919 46.499
```
Function-table main:
```text
funcs input_bytes compile.tu parse_codegen opt.o1.total non_o1_residual
-1 504 0.783 0.563 0.402 0.381
-4 1429 1.781 1.561 1.171 0.610
-16 5166 5.272 5.062 3.839 1.433
-64 20190 19.208 18.990 14.467 4.741
-128 40454 38.143 37.929 28.862 9.281
-256 81414 75.035 74.816 56.935 18.100
-512 163334 150.504 150.275 114.101 36.403
-1024 327378 304.716 304.492 227.790 76.926
+1 212 0.486 0.335 0.194 0.292
+4 482 0.833 0.685 0.437 0.396
+16 1587 2.145 1.993 1.316 0.829
+64 6099 7.414 7.259 4.806 2.608
+128 12228 13.851 13.703 8.959 4.892
+256 24772 26.765 26.620 17.091 9.674
+512 49860 53.839 53.682 33.996 19.843
+1024 100133 112.879 112.706 68.967 43.912
```
The detailed compile scopes show:
@@ -320,7 +396,7 @@ The detailed compile scopes show:
options.
- `compile.c.cleanup` grows only to about `0.01-0.02 ms`.
- `non_o1_residual` is near-linear with source size at larger sizes, around
- `228-241 us/KiB` from 64 through 1024 functions.
+ `430-460 us/KiB` from 64 through 1024 functions in the phase-3 inputs.
So the compile-side scaling story is two-layered: the broad C parse/codegen
bucket scales linearly with input size after startup overhead, while the nested
@@ -340,16 +416,17 @@ Next useful compile instrumentation:
## Performance Priorities
-1. Move regalloc off full `opt_live_info`.
- Phase 1 removed the full liveness/conflict build from pre-DDE. The remaining
- superlinear bucket is now `opt.live_info.regalloc`, which still builds
- `live_after` and the dense conflict matrix before assignment/rewrite.
-
-2. Replace or narrow dense conflict structures.
- Regalloc-side `opt.conflict_bytes` still tracks the observed curve closely.
- Investigate sparse sets, segmented bitsets, per-block live sets, or
- interval-style structures that avoid touching full dense rows for values that
- cannot overlap.
+1. Finish phase-4 rewrite without persistent `live_after`.
+ Phase 3 removed the dense conflict matrix from O1 assignment. Rewrite still
+ needs per-instruction full live-after sets for call preservation and spill
+ insertion, so that is the next correctness-preserving structural target.
+
+2. Compress or narrow `used_locs`.
+ The remaining straight-line superlinear bucket now tracks
+ `opt.alloc.used_loc_words`. The first range allocator sizes the
+ occupied-location bitmap by hard-register bits plus candidate stack bits.
+ The stack side should become demand-sized, sparse, or split-capable instead
+ of paying for every candidate at every point.
3. Avoid whole-function contiguous growth where hot passes scan repeatedly.
Large `Val`/block/instruction arrays and dense bit matrices are likely to
diff --git a/src/opt/ir.h b/src/opt/ir.h
@@ -328,6 +328,7 @@ typedef struct Func {
u8 opt_rewritten;
u16 opt_live_words;
u16 opt_conflict_words;
+ u32 opt_used_loc_words;
u32 opt_position_count;
OptValInfo* val_info; /* indexed by Val, length nvals */
u64* val_conflicts; /* nvals x opt_conflict_words bit matrix */
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -1424,7 +1424,8 @@ static void w_func_end(CGTarget* t) {
metrics_scope_end(o->c, "opt.dead_def_elim");
metrics_scope_begin(o->c, "opt.regalloc");
opt_regalloc(o->f, 0);
- metrics_count(o->c, "opt.alloc.used_loc_words", 0);
+ metrics_count(o->c, "opt.alloc.used_loc_words",
+ o->f->opt_used_loc_words);
metrics_count(o->c, "opt.alloc.spills", func_spill_alloc_count(o->f));
metrics_count(o->c, "opt.rewrite.reloads", func_spill_load_count(o->f));
metrics_count(o->c, "opt.rewrite.stores", func_spill_store_count(o->f));
diff --git a/src/opt/opt.h b/src/opt/opt.h
@@ -123,6 +123,19 @@ typedef struct OptLiveRangeSet {
u32 whole_block_spans;
} OptLiveRangeSet;
+typedef enum OptLocKind {
+ OPT_LOC_NONE,
+ OPT_LOC_HARD,
+ OPT_LOC_STACK,
+} OptLocKind;
+
+typedef struct OptLoc {
+ u8 kind;
+ u8 cls;
+ Reg hard_reg;
+ FrameSlot spill_slot;
+} OptLoc;
+
typedef void (*OptBitsetIterFn)(Val, void*);
void opt_bitset_clear(OptBitset*);
diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c
@@ -704,39 +704,6 @@ void opt_live_info(Func* f) {
}
}
-static int ranges_overlap(const OptValInfo* a, const OptValInfo* b) {
- if (!a->first_pos || !b->first_pos) return 0;
- return a->first_pos <= b->last_pos && b->first_pos <= a->last_pos;
-}
-
-static int val_higher_priority(Func* f, Val a, Val b) {
- OptValInfo* av = &f->val_info[a];
- OptValInfo* bv = &f->val_info[b];
- int at = av->tied_hard_reg >= 0;
- int bt = bv->tied_hard_reg >= 0;
- if (at != bt) return at > bt;
- if (av->frequency != bv->frequency) return av->frequency > bv->frequency;
- if (av->live_across_call_freq != bv->live_across_call_freq)
- return av->live_across_call_freq > bv->live_across_call_freq;
- if (av->live_length != bv->live_length)
- return av->live_length < bv->live_length;
- return a < b;
-}
-
-static int hard_conflicts(Func* f, Val* assigned, u32 nassigned, Val v, Reg r) {
- for (u32 i = 0; i < nassigned; ++i) {
- Val ov = assigned[i];
- if (f->val_info[ov].alloc_kind != OPT_ALLOC_HARD) continue;
- if (f->val_info[ov].hard_reg != r) continue;
- if (f->val_conflicts && f->opt_conflict_words) {
- if (bit_has(conflict_row(f, v), ov)) return 1;
- } else if (ranges_overlap(&f->val_info[ov], &f->val_info[v])) {
- return 1;
- }
- }
- return 0;
-}
-
static int hard_available(Func* f, u8 cls, Reg r) {
if (cls >= OPT_REG_CLASSES) return 0;
for (u32 i = 0; i < f->opt_hard_reg_count[cls]; ++i)
@@ -757,6 +724,289 @@ static FrameSlot spill_slot_for(Func* f, Val v) {
return f->val_info[v].spill_slot;
}
+static u32 loc_bit_words(u32 nbits) { return (nbits + 63u) / 64u; }
+
+static u32 hard_loc_bit(u8 cls, Reg r) { return ((u32)cls * 32u) + (u32)r; }
+
+static void loc_bit_set(u64* bits, u32 bit) {
+ bits[bit / 64u] |= 1ull << (bit % 64u);
+}
+
+static int loc_bit_has(const u64* bits, u32 bit) {
+ return (bits[bit / 64u] & (1ull << (bit % 64u))) != 0;
+}
+
+static void loc_bits_clear(u64* bits, u32 words) {
+ for (u32 i = 0; i < words; ++i) bits[i] = 0;
+}
+
+static void loc_bits_or(u64* dst, const u64* src, u32 words) {
+ for (u32 i = 0; i < words; ++i) dst[i] |= src[i];
+}
+
+typedef struct OptAllocCandidate {
+ Val v;
+ u32 spill_cost;
+ u32 live_length;
+ u8 tied;
+ u8 pad[3];
+} OptAllocCandidate;
+
+typedef struct OptAllocator {
+ OptLoc* locs;
+ u64* used_locs;
+ u32 point_count;
+ u32 loc_words;
+ u32 hard_loc_bits;
+ FrameSlot* stack_slots;
+ u32 stack_slot_count;
+} OptAllocator;
+
+static int alloc_candidate_higher(const OptAllocCandidate* a,
+ const OptAllocCandidate* b) {
+ if (a->tied != b->tied) return a->tied > b->tied;
+ if (a->spill_cost != b->spill_cost) return a->spill_cost > b->spill_cost;
+ if (a->live_length != b->live_length) return a->live_length < b->live_length;
+ return a->v < b->v;
+}
+
+static void alloc_sort_candidates(OptAllocCandidate* cands, u32 n) {
+ for (u32 i = 1; i < n; ++i) {
+ OptAllocCandidate key = cands[i];
+ u32 j = i;
+ while (j > 0 && alloc_candidate_higher(&key, &cands[j - 1u])) {
+ cands[j] = cands[j - 1u];
+ --j;
+ }
+ cands[j] = key;
+ }
+}
+
+static void opt_init_val_info_from_ranges(Func* f,
+ const OptLiveRangeSet* ranges) {
+ OptValInfo* old = f->val_info;
+ OptValInfo* info = arena_zarray(f->arena, OptValInfo, f->nvals ? f->nvals : 1u);
+ for (Val v = 0; v < f->nvals; ++v) {
+ i32 tied = old ? old[v].tied_hard_reg : -1;
+ u32 forbidden = old ? old[v].forbidden_hard_regs : 0;
+ u32 old_frequency = old ? old[v].frequency : 0;
+ OptValInfo* vi = &info[v];
+ vi->tied_hard_reg = tied;
+ vi->hard_reg = REG_NONE;
+ vi->spill_slot = FRAME_SLOT_NONE;
+ vi->alloc_kind = OPT_ALLOC_NONE;
+ vi->cls = f->val_cls[v];
+ vi->forbidden_hard_regs = forbidden;
+ if (!ranges || v == VAL_NONE ||
+ ranges->first_range_by_val[v] == OPT_RANGE_NONE) {
+ continue;
+ }
+ u32 first = (u32)~0u;
+ u32 last = 0;
+ for (u32 r = ranges->first_range_by_val[v]; r != OPT_RANGE_NONE;
+ r = ranges->ranges[r].next) {
+ const OptLiveRange* lr = &ranges->ranges[r];
+ if (lr->start < first) first = lr->start;
+ if (lr->end > last) last = lr->end;
+ }
+ vi->first_pos = first == (u32)~0u ? 0 : first + 1u;
+ vi->last_pos = last;
+ vi->live_length = ranges->live_length_by_val[v];
+ vi->use_freq = ranges->use_freq_by_val[v];
+ vi->def_freq = ranges->def_freq_by_val[v];
+ vi->live_block_freq = ranges->live_block_freq_by_val[v];
+ vi->live_across_call_freq = ranges->live_across_call_freq_by_val[v];
+ vi->spill_cost = ranges->spill_cost_by_val[v];
+ vi->frequency = vi->spill_cost;
+ if (old_frequency > vi->frequency) vi->frequency = old_frequency;
+ }
+ f->val_info = info;
+ f->val_conflicts = NULL;
+ f->opt_conflict_words = 0;
+}
+
+static void opt_build_live_after_only(Func* f, const OptLiveInfo* live_info) {
+ u32 words = live_info ? live_info->words : f->opt_live_words;
+ if (!words) words = bit_words(f->nvals);
+ f->opt_live_words = (u16)words;
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ bl->live_after = arena_array(f->arena, u64*, bl->ninsts ? bl->ninsts : 1u);
+ u64* live = arena_zarray(f->arena, u64, words);
+ if (live_info) {
+ const OptBitset* out = &live_info->blocks[b].live_out;
+ for (u32 w = 0; w < words && w < out->nwords; ++w) live[w] = out->words[w];
+ } else if (bl->live_out) {
+ copy_bits(live, bl->live_out, words);
+ }
+ for (u32 ri = bl->ninsts; ri > 0; --ri) {
+ u32 i = ri - 1u;
+ Inst* in = &bl->insts[i];
+ u64* after = arena_zarray(f->arena, u64, words);
+ copy_bits(after, live, words);
+ bl->live_after[i] = after;
+
+ u64* use = arena_zarray(f->arena, u64, words);
+ u64* def = arena_zarray(f->arena, u64, words);
+ BitsCtx bc = {use, def};
+ walk_inst_operands(f, in, collect_bits, &bc);
+ if ((IROp)in->op == IR_ASM_BLOCK)
+ apply_asm_register_constraints(f, in, use, def, after);
+ for (u32 w = 0; w < words; ++w) live[w] = (live[w] & ~def[w]) | use[w];
+ }
+ }
+}
+
+static int spill_slot_compatible(Func* f, FrameSlot fs, Val v) {
+ if (fs == FRAME_SLOT_NONE || fs > f->nframe_slots) return 0;
+ IRFrameSlot* s = &f->frame_slots[fs - 1u];
+ u32 size = type_size_fallback(f, f->val_type[v]);
+ u32 align = size >= 8 ? 8 : size;
+ if (s->kind != FS_SPILL) return 0;
+ if (s->size < size) return 0;
+ if (s->align < align) return 0;
+ return 1;
+}
+
+static void alloc_collect_occupied(const OptAllocator* a,
+ const OptLiveRangeSet* ranges, Val v,
+ u64* occupied) {
+ loc_bits_clear(occupied, a->loc_words);
+ for (u32 r = ranges->first_range_by_val[v]; r != OPT_RANGE_NONE;
+ r = ranges->ranges[r].next) {
+ const OptLiveRange* lr = &ranges->ranges[r];
+ for (u32 p = lr->start; p < lr->end && p < a->point_count; ++p)
+ loc_bits_or(occupied, a->used_locs + ((size_t)p * a->loc_words),
+ a->loc_words);
+ }
+}
+
+static void alloc_mark_loc(OptAllocator* a, const OptLiveRangeSet* ranges,
+ Val v, u32 bit) {
+ for (u32 r = ranges->first_range_by_val[v]; r != OPT_RANGE_NONE;
+ r = ranges->ranges[r].next) {
+ const OptLiveRange* lr = &ranges->ranges[r];
+ for (u32 p = lr->start; p < lr->end && p < a->point_count; ++p)
+ loc_bit_set(a->used_locs + ((size_t)p * a->loc_words), bit);
+ }
+}
+
+static void alloc_assign_hard(Func* f, OptAllocator* a,
+ const OptLiveRangeSet* ranges, Val v, Reg r) {
+ OptValInfo* vi = &f->val_info[v];
+ vi->alloc_kind = OPT_ALLOC_HARD;
+ vi->hard_reg = r;
+ a->locs[v].kind = OPT_LOC_HARD;
+ a->locs[v].cls = vi->cls;
+ a->locs[v].hard_reg = r;
+ a->locs[v].spill_slot = FRAME_SLOT_NONE;
+ alloc_mark_loc(a, ranges, v, hard_loc_bit(vi->cls, r));
+}
+
+static void alloc_assign_stack(Func* f, OptAllocator* a,
+ const OptLiveRangeSet* ranges, Val v,
+ const u64* occupied) {
+ OptValInfo* vi = &f->val_info[v];
+ u32 stack_idx = a->stack_slot_count;
+ FrameSlot slot = FRAME_SLOT_NONE;
+ for (u32 i = 0; i < a->stack_slot_count; ++i) {
+ u32 bit = a->hard_loc_bits + i;
+ if (loc_bit_has(occupied, bit)) continue;
+ if (!spill_slot_compatible(f, a->stack_slots[i], v)) continue;
+ stack_idx = i;
+ slot = a->stack_slots[i];
+ break;
+ }
+ if (slot == FRAME_SLOT_NONE) {
+ slot = spill_slot_for(f, v);
+ a->stack_slots[a->stack_slot_count++] = slot;
+ stack_idx = a->stack_slot_count - 1u;
+ } else {
+ vi->spill_slot = slot;
+ }
+ vi->alloc_kind = OPT_ALLOC_SPILL;
+ a->locs[v].kind = OPT_LOC_STACK;
+ a->locs[v].cls = vi->cls;
+ a->locs[v].hard_reg = REG_NONE;
+ a->locs[v].spill_slot = slot;
+ alloc_mark_loc(a, ranges, v, a->hard_loc_bits + stack_idx);
+}
+
+static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges,
+ OptAllocator* a) {
+ memset(a, 0, sizeof *a);
+ a->point_count = ranges->point_count ? ranges->point_count : 1u;
+ a->hard_loc_bits = OPT_REG_CLASSES * 32u;
+ u32 ncands = 0;
+ for (Val v = 1; v < f->nvals; ++v)
+ if (ranges->first_range_by_val[v] != OPT_RANGE_NONE) ++ncands;
+ u32 loc_bits = a->hard_loc_bits + ncands;
+ a->loc_words = loc_bit_words(loc_bits);
+ f->opt_used_loc_words = a->point_count * a->loc_words;
+ a->locs = arena_zarray(f->arena, OptLoc, f->nvals ? f->nvals : 1u);
+ a->used_locs =
+ arena_zarray(f->arena, u64, (size_t)a->point_count * a->loc_words);
+ a->stack_slots = arena_array(f->arena, FrameSlot, ncands ? ncands : 1u);
+
+ OptAllocCandidate* cands =
+ arena_array(f->arena, OptAllocCandidate, ncands ? ncands : 1u);
+ u32 n = 0;
+ for (Val v = 1; v < f->nvals; ++v) {
+ if (ranges->first_range_by_val[v] == OPT_RANGE_NONE) continue;
+ OptValInfo* vi = &f->val_info[v];
+ cands[n].v = v;
+ cands[n].spill_cost = vi->frequency ? vi->frequency : vi->spill_cost;
+ cands[n].live_length = vi->live_length;
+ cands[n].tied = vi->tied_hard_reg >= 0;
+ memset(cands[n].pad, 0, sizeof cands[n].pad);
+ ++n;
+ }
+ alloc_sort_candidates(cands, n);
+
+ u64* occupied = arena_zarray(f->arena, u64, a->loc_words ? a->loc_words : 1u);
+ for (u32 i = 0; i < n; ++i) {
+ Val v = cands[i].v;
+ OptValInfo* vi = &f->val_info[v];
+ u8 cls = vi->cls;
+ alloc_collect_occupied(a, ranges, v, occupied);
+ if (vi->tied_hard_reg >= 0) {
+ Reg fixed = (Reg)vi->tied_hard_reg;
+ if (!hard_available(f, cls, fixed)) {
+ SrcLoc loc = {0, 0, 0};
+ compiler_panic(
+ f->c, loc,
+ "opt regalloc: fixed hard reg %u unavailable in class %u",
+ (unsigned)fixed, (unsigned)cls);
+ }
+ if (fixed < 32 && (vi->forbidden_hard_regs & (1u << fixed))) {
+ SrcLoc loc = {0, 0, 0};
+ compiler_panic(f->c, loc,
+ "opt regalloc: fixed hard reg %u is clobbered",
+ (unsigned)fixed);
+ }
+ if (fixed >= 32 || loc_bit_has(occupied, hard_loc_bit(cls, fixed))) {
+ SrcLoc loc = {0, 0, 0};
+ compiler_panic(f->c, loc, "opt regalloc: conflicting fixed hard reg %u",
+ (unsigned)fixed);
+ }
+ alloc_assign_hard(f, a, ranges, v, fixed);
+ continue;
+ }
+
+ int found = 0;
+ for (u32 r = 0; r < f->opt_hard_reg_count[cls]; ++r) {
+ Reg hr = f->opt_hard_regs[cls][r];
+ if (hr >= 32) continue;
+ if (vi->forbidden_hard_regs & (1u << hr)) continue;
+ if (loc_bit_has(occupied, hard_loc_bit(cls, hr))) continue;
+ alloc_assign_hard(f, a, ranges, v, hr);
+ found = 1;
+ break;
+ }
+ if (!found) alloc_assign_stack(f, a, ranges, v, occupied);
+ }
+}
+
typedef struct RewriteList {
Inst* data;
u32 n;
@@ -1041,76 +1291,19 @@ void opt_dead_def_elim(Func* f) {
void opt_regalloc(Func* f, int allow_live_range_split) {
(void)allow_live_range_split;
- if (!f->val_info) {
- metrics_scope_begin(f->c, "opt.live_info.regalloc");
- opt_live_info(f);
- metrics_count(f->c, "opt.live_words", f->opt_live_words);
- metrics_count(
- f->c, "opt.conflict_bytes",
- (u64)f->nvals * (u64)f->opt_conflict_words * (u64)sizeof(u64));
- metrics_scope_end(f->c, "opt.live_info.regalloc");
- }
- Val* order = arena_array(f->arena, Val, f->nvals ? f->nvals : 1u);
- u32 norder = 0;
- for (Val v = 1; v < f->nvals; ++v)
- if (f->val_info[v].first_pos) order[norder++] = v;
- for (u32 i = 1; i < norder; ++i) {
- Val key = order[i];
- u32 j = i;
- while (j > 0 && val_higher_priority(f, key, order[j - 1u])) {
- order[j] = order[j - 1u];
- --j;
- }
- order[j] = key;
- }
- Val* assigned = arena_array(f->arena, Val, norder ? norder : 1u);
- u32 nassigned = 0;
- for (u32 i = 0; i < norder; ++i) {
- Val v = order[i];
- OptValInfo* vi = &f->val_info[v];
- u8 cls = vi->cls;
- if (vi->tied_hard_reg >= 0) {
- Reg fixed = (Reg)vi->tied_hard_reg;
- if (!hard_available(f, cls, fixed)) {
- SrcLoc loc = {0, 0, 0};
- compiler_panic(
- f->c, loc,
- "opt regalloc: fixed hard reg %u unavailable in class %u",
- (unsigned)fixed, (unsigned)cls);
- }
- if (fixed < 32 && (vi->forbidden_hard_regs & (1u << fixed))) {
- SrcLoc loc = {0, 0, 0};
- compiler_panic(f->c, loc,
- "opt regalloc: fixed hard reg %u is clobbered",
- (unsigned)fixed);
- }
- if (hard_conflicts(f, assigned, nassigned, v, fixed)) {
- SrcLoc loc = {0, 0, 0};
- compiler_panic(f->c, loc, "opt regalloc: conflicting fixed hard reg %u",
- (unsigned)fixed);
- }
- vi->alloc_kind = OPT_ALLOC_HARD;
- vi->hard_reg = fixed;
- assigned[nassigned++] = v;
- continue;
- }
- int found = 0;
- for (u32 r = 0; r < f->opt_hard_reg_count[cls]; ++r) {
- Reg hr = f->opt_hard_regs[cls][r];
- if (hr < 32 && (vi->forbidden_hard_regs & (1u << hr))) continue;
- if (!hard_conflicts(f, assigned, nassigned, v, hr)) {
- vi->alloc_kind = OPT_ALLOC_HARD;
- vi->hard_reg = hr;
- assigned[nassigned++] = v;
- found = 1;
- break;
- }
- }
- if (!found) {
- vi->alloc_kind = OPT_ALLOC_SPILL;
- vi->spill_slot = spill_slot_for(f, v);
- }
- }
+ metrics_scope_begin(f->c, "opt.live_ranges.regalloc");
+ OptLiveInfo live;
+ opt_live_blocks(f, &live);
+ OptLiveRangeSet ranges;
+ opt_live_ranges_build(f, &live, &ranges);
+ opt_init_val_info_from_ranges(f, &ranges);
+ opt_build_live_after_only(f, &live);
+ metrics_count(f->c, "opt.live_words", f->opt_live_words);
+ metrics_count(f->c, "opt.conflict_bytes", 0);
+ metrics_scope_end(f->c, "opt.live_ranges.regalloc");
+
+ OptAllocator alloc;
+ opt_assign_ranges(f, &ranges, &alloc);
rewrite_func(f);
}
diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c
@@ -1161,6 +1161,41 @@ static void opt_regalloc_priority(void) {
tc_fini(&tc);
}
+static void opt_range_regalloc_no_conflicts_and_stack_reuse(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ MockCGTarget mock;
+ mock_init(&mock, tc.c);
+ static const Reg scratch[] = {9, 10};
+ mock_set_pool(&mock, RC_INT, NULL, 0, scratch, 2, 0);
+
+ Func* f = new_func(&tc);
+ opt_machinize(f, &mock.base);
+ Val a = add_val(f, tc.i32);
+ Val b = add_val(f, tc.i32);
+ emit_load_imm(f, f->entry, a, tc.i32, 1);
+ emit_load_imm(f, f->entry, b, tc.i32, 2);
+ emit_ret_val(f, f->entry, b, tc.i32);
+ opt_build_cfg(f);
+ opt_build_loop_tree(f);
+
+ opt_regalloc(f, 0);
+
+ EXPECT(f->val_conflicts == NULL,
+ "range allocator should not allocate a conflict matrix");
+ EXPECT(f->opt_conflict_words == 0,
+ "range allocator should leave conflict words at zero");
+ EXPECT(f->opt_used_loc_words != 0,
+ "range allocator should record used-location storage");
+ EXPECT(f->val_info[a].alloc_kind == OPT_ALLOC_SPILL,
+ "no hard regs should force v%u to stack", (unsigned)a);
+ EXPECT(f->val_info[b].alloc_kind == OPT_ALLOC_SPILL,
+ "no hard regs should force v%u to stack", (unsigned)b);
+ EXPECT(f->val_info[a].spill_slot == f->val_info[b].spill_slot,
+ "disjoint stack ranges should reuse a spill slot");
+ tc_fini(&tc);
+}
+
static void opt_rewrite_spill_use_def(void) {
TestCtx tc;
tc_init(&tc);
@@ -2078,6 +2113,7 @@ int main(void) {
opt_live_across_call_frequency();
opt_conflict_symmetry_and_class();
opt_regalloc_priority();
+ opt_range_regalloc_no_conflicts_and_stack_reuse();
opt_rewrite_spill_use_def();
opt_call_clobber_preservation();
opt_call_clobber_caller_saved();