kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 064cd29d7879a73a15ed06cbd9860f4ac4d59384
parent ed6d5b24fc8b242a6b5ba8405673c0b5b1836d2d
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 14 May 2026 19:25:09 -0700

Implement range-based O1 assignment

Diffstat:
Mdoc/MIR_RA_REPORT.md | 36++++++++++++++++++------------------
Mdoc/PERF.md | 213++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------
Msrc/opt/ir.h | 1+
Msrc/opt/opt.c | 3++-
Msrc/opt/opt.h | 13+++++++++++++
Msrc/opt/pass_lower.c | 399++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
Mtest/opt/opt_test.c | 36++++++++++++++++++++++++++++++++++++
7 files changed, 511 insertions(+), 190 deletions(-)

diff --git a/doc/MIR_RA_REPORT.md b/doc/MIR_RA_REPORT.md @@ -584,27 +584,27 @@ Exit criteria: ### Phase 3: Range-Based Assignment -- [ ] Define `OptLoc` assignment mapping for every `Val`. -- [ ] Build allocation candidate list from values with ranges. -- [ ] Sort candidates by: - - [ ] tied hard-register requirement - - [ ] frequency/spill cost - - [ ] shorter live length - - [ ] stable value id -- [ ] Add `used_locs[point]` bitmaps. -- [ ] Mark hard-register live ranges in `used_locs`. -- [ ] For each candidate, union occupied locations across its ranges. -- [ ] Assign a legal hard register when available. -- [ ] Assign/reuse stack slots when no hard register is available. -- [ ] Preserve tied hard-register validation. -- [ ] Preserve forbidden hard-register constraints from asm/calls. -- [ ] Keep live-range splitting disabled for O1. +- [x] Define `OptLoc` assignment mapping for every `Val`. +- [x] Build allocation candidate list from values with ranges. +- [x] Sort candidates by: + - [x] tied hard-register requirement + - [x] frequency/spill cost + - [x] shorter live length + - [x] stable value id +- [x] Add `used_locs[point]` bitmaps. +- [x] Mark hard-register live ranges in `used_locs`. +- [x] For each candidate, union occupied locations across its ranges. +- [x] Assign a legal hard register when available. +- [x] Assign/reuse stack slots when no hard register is available. +- [x] Preserve tied hard-register validation. +- [x] Preserve forbidden hard-register constraints from asm/calls. +- [x] Keep live-range splitting disabled for O1. Exit criteria: -- [ ] O1 regalloc no longer depends on `val_conflicts`. -- [ ] `opt.conflict_bytes` is zero or absent on the normal O1 allocation path. -- [ ] Branch, loop, call-clobber, spill, and tied-register tests pass. +- [x] O1 regalloc no longer depends on `val_conflicts`. +- [x] `opt.conflict_bytes` is zero or absent on the normal O1 allocation path. +- [x] Branch, loop, call-clobber, spill, and tied-register tests pass. ### Phase 4: Rewrite From Assignment Map diff --git a/doc/PERF.md b/doc/PERF.md @@ -30,6 +30,9 @@ Important counters: - `opt.funcs`, `opt.blocks`, `opt.insts`, `opt.vals` - `opt.live_words`, `opt.live.blocks`, `opt.live.active_words` - `opt.live.block_bytes`, `opt.live.set_bit_scans` +- `opt.ranges`, `opt.range_points`, `opt.range_raw_points` +- `opt.alloc.used_loc_words`, `opt.alloc.spills` +- `opt.rewrite.reloads`, `opt.rewrite.stores` - `opt.conflict_bytes` - `link.inputs`, `link.sections`, `link.segments`, `link.syms`, `link.relocs` - `jit.master_size`, `jit.nsegments`, `jit.input_section_bytes`, @@ -43,19 +46,19 @@ Times are `min / p50 / p95 / mean / max` in milliseconds. `int main(){return 0;}` ```text -run.total 0.298 / 0.312 / 0.344 / 0.319 / 0.390 -run.compile_and_jit 0.283 / 0.295 / 0.323 / 0.300 / 0.346 -compile.tu 0.209 / 0.222 / 0.242 / 0.224 / 0.244 +run.total 0.290 / 0.311 / 0.345 / 0.316 / 0.351 +run.compile_and_jit 0.274 / 0.293 / 0.313 / 0.296 / 0.329 +compile.tu 0.218 / 0.235 / 0.246 / 0.235 / 0.257 compile.c.parse_codegen - 0.073 / 0.081 / 0.092 / 0.082 / 0.095 -opt.o1.total 0.040 / 0.048 / 0.057 / 0.048 / 0.059 + 0.082 / 0.091 / 0.097 / 0.091 / 0.107 +opt.o1.total 0.048 / 0.056 / 0.061 / 0.055 / 0.065 opt.live_blocks.pre_dde - 0.008 / 0.008 / 0.017 / 0.010 / 0.019 -opt.live_info.regalloc - 0.003 / 0.003 / 0.004 / 0.004 / 0.007 -opt.regalloc 0.008 / 0.012 / 0.016 / 0.012 / 0.019 -link_jit.all 0.043 / 0.049 / 0.058 / 0.051 / 0.059 -link.resolve.total 0.020 / 0.021 / 0.027 / 0.023 / 0.029 + 0.013 / 0.017 / 0.023 / 0.017 / 0.024 +opt.live_ranges.regalloc + 0.003 / 0.004 / 0.004 / 0.004 / 0.005 +opt.regalloc 0.009 / 0.010 / 0.012 / 0.010 / 0.015 +link_jit.all 0.047 / 0.050 / 0.056 / 0.051 / 0.061 +link.resolve.total 0.023 / 0.025 / 0.028 / 0.025 / 0.029 ``` One small loop: @@ -65,19 +68,19 @@ int main(){int s=0; for(int i=0;i<10;i++) s+=i; return s==45?0:1;} ``` ```text -run.total 0.351 / 0.365 / 0.394 / 0.371 / 0.409 -run.compile_and_jit 0.334 / 0.349 / 0.377 / 0.353 / 0.389 -compile.tu 0.266 / 0.277 / 0.292 / 0.279 / 0.315 +run.total 0.369 / 0.400 / 0.446 / 0.401 / 0.448 +run.compile_and_jit 0.351 / 0.381 / 0.414 / 0.381 / 0.427 +compile.tu 0.293 / 0.314 / 0.349 / 0.315 / 0.353 compile.c.parse_codegen - 0.131 / 0.140 / 0.155 / 0.141 / 0.157 -opt.o1.total 0.077 / 0.083 / 0.094 / 0.084 / 0.095 + 0.151 / 0.165 / 0.196 / 0.167 / 0.212 +opt.o1.total 0.090 / 0.097 / 0.107 / 0.098 / 0.122 opt.live_blocks.pre_dde - 0.011 / 0.012 / 0.018 / 0.013 / 0.018 -opt.live_info.regalloc - 0.016 / 0.016 / 0.016 / 0.016 / 0.017 -opt.regalloc 0.027 / 0.028 / 0.034 / 0.029 / 0.034 -link_jit.all 0.043 / 0.046 / 0.057 / 0.049 / 0.064 -link.resolve.total 0.021 / 0.021 / 0.028 / 0.023 / 0.031 + 0.024 / 0.026 / 0.029 / 0.027 / 0.032 +opt.live_ranges.regalloc + 0.012 / 0.013 / 0.015 / 0.013 / 0.015 +opt.regalloc 0.025 / 0.027 / 0.030 / 0.028 / 0.042 +link_jit.all 0.048 / 0.052 / 0.066 / 0.055 / 0.088 +link.resolve.total 0.023 / 0.026 / 0.034 / 0.026 / 0.035 ``` For tiny programs, compile dominates, with O1 and JIT/link image setup as the @@ -221,6 +224,78 @@ conflict matrix and its block-live storage scales linearly on the large straight-line case. The remaining superlinear behavior is now concentrated in the regalloc-side full liveness/conflict path. +### Phase-3 Range-Allocator Rerun + +After `doc/MIR_RA_REPORT.md` phase 3, O1 assignment no longer uses the dense +pseudo conflict matrix. Regalloc rebuilds pass-local block liveness and live +ranges, assigns against `used_locs[point]`, and reports +`opt.conflict_bytes=0` on the normal allocation path. Rewrite still uses +per-instruction `live_after`, so this is not the final phase-4 shape. + +Samples: 5 runs per point. The table uses p50 milliseconds. O1 scopes and O1 +counters are summed across functions. The generator is the same direct-call vs +function-table family as above, but these are fresh generated inputs and not +byte-for-byte identical to the earlier tables. + +Straight-line main: + +```text +funcs input_bytes insts vals pre_block_bytes ranges range_points used_loc_words conflict_bytes run.total compile.tu parse_codegen opt.o1.total live_blocks_pre live_ranges_reg regalloc link_jit +1 139 55 32 576 28 41 82 0 0.551 0.444 0.296 0.184 0.048 0.025 0.051 0.057 +4 430 178 110 1632 100 143 286 0 0.887 0.784 0.642 0.418 0.114 0.069 0.132 0.055 +16 1624 670 422 6080 388 551 1189 0 2.433 2.306 2.147 1.414 0.391 0.254 0.480 0.069 +64 6520 2638 1670 23648 1540 2183 5674 0 8.195 8.042 7.894 5.236 1.435 0.992 1.894 0.084 +128 13188 5262 3334 47072 3076 4359 13894 0 15.634 15.399 15.250 10.068 2.657 1.936 3.893 0.102 +256 26884 10510 6662 93920 6148 8711 38014 0 31.114 30.795 30.648 20.296 5.097 3.876 8.415 0.139 +512 54276 21006 13318 187616 12292 17415 116974 0 66.957 66.408 66.265 44.891 10.516 8.204 20.599 0.219 +1024 109180 41998 26630 375008 24580 34823 397774 0 157.795 156.418 156.251 109.919 22.778 18.122 57.799 0.374 +``` + +Function-table main: + +```text +funcs input_bytes insts vals pre_block_bytes ranges range_points used_loc_words conflict_bytes run.total compile.tu parse_codegen opt.o1.total live_blocks_pre live_ranges_reg regalloc link_jit +1 212 76 43 832 39 56 112 0 0.594 0.486 0.335 0.194 0.051 0.031 0.060 0.061 +4 482 184 109 1888 99 143 286 0 0.944 0.833 0.685 0.437 0.118 0.072 0.140 0.061 +16 1587 616 373 6112 339 491 982 0 2.266 2.145 1.993 1.316 0.375 0.230 0.432 0.064 +64 6099 2344 1429 23008 1299 1883 3766 0 7.565 7.414 7.259 4.806 1.347 0.861 1.605 0.077 +128 12228 4648 2837 45536 2579 3739 7478 0 14.055 13.851 13.703 8.959 2.491 1.659 3.058 0.098 +256 24772 9256 5653 90592 5139 7451 14902 0 27.074 26.765 26.620 17.091 4.677 3.237 5.942 0.141 +512 49860 18472 11285 180704 10259 14875 29750 0 54.364 53.839 53.682 33.996 9.330 6.422 11.850 0.225 +1024 100133 36904 22549 360928 20499 29723 59446 0 114.081 112.879 112.706 68.967 18.959 12.909 23.842 0.367 +``` + +The 512 to 1024 step after phase 3: + +```text +straight_main: +compile.tu 66.408 -> 156.418 ms 2.36x +opt.o1.total 44.891 -> 109.919 ms 2.45x +opt.live_blocks.pre_dde 10.516 -> 22.778 ms 2.17x +opt.live_ranges.regalloc 8.204 -> 18.122 ms 2.21x +opt.regalloc 20.599 -> 57.799 ms 2.81x +pre_block_bytes 187616 -> 375008 2.00x +used_loc_words 116974 -> 397774 3.40x +conflict_bytes 0 -> 0 + +table_main: +compile.tu 53.839 -> 112.879 ms 2.10x +opt.o1.total 33.996 -> 68.967 ms 2.03x +opt.live_blocks.pre_dde 9.330 -> 18.959 ms 2.03x +opt.live_ranges.regalloc 6.422 -> 12.909 ms 2.01x +opt.regalloc 11.850 -> 23.842 ms 2.01x +pre_block_bytes 180704 -> 360928 2.00x +used_loc_words 29750 -> 59446 2.00x +conflict_bytes 0 -> 0 +``` + +This is the intended phase-3 result: the old conflict matrix is gone from O1, +and the large straight-line case is much faster than the phase-1 rerun. The +remaining non-linear straight-line bucket now tracks `used_loc_words`, because +the first range allocator uses dense occupied-location bitmaps whose stack-slot +portion is sized by candidate count. Table-main stays close to linear because +each function remains small. + ## Compile Perf `compile.tu` is now split into generic and C-specific scopes: @@ -245,35 +320,36 @@ compile.tu compile.obj_relocs ``` -Updated 1024-function single-run examples with the detailed compile scopes: +Updated phase-3 1024-function p50 examples with the detailed compile scopes: ```text metric straight_main table_main -run.total 828.387 ms 314.678 ms -compile.tu 825.154 ms 309.944 ms -compile.frontend 825.129 ms 309.825 ms -compile.c.setup 0.156 ms 0.946 ms -compile.c.pp_options 0.021 ms 0.021 ms -compile.c.parse_codegen 824.900 ms 308.676 ms -compile.c.cleanup 0.026 ms 0.015 ms -compile.obj_finalize 0.000 ms 0.001 ms -opt.o1.total summed 746.533 ms 231.538 ms +run.total 157.795 ms 114.081 ms +compile.tu 156.418 ms 112.879 ms +compile.frontend 156.411 ms 112.868 ms +compile.c.setup 0.128 ms 0.130 ms +compile.c.pp_options 0.003 ms 0.004 ms +compile.c.parse_codegen 156.251 ms 112.706 ms +compile.c.cleanup 0.020 ms 0.015 ms +compile.obj_finalize 0.000 ms 0.000 ms +opt.o1.total summed 109.919 ms 68.967 ms ``` -Subtracting summed O1 from `compile.tu` leaves about `78 ms` in both variants: +Subtracting summed O1 from `compile.tu` leaves about `44-46 ms` in both +variants: ```text -straight_main: 825.154 - 746.533 = 78.621 ms -table_main: 309.944 - 231.538 = 78.406 ms +straight_main: 156.418 - 109.919 = 46.499 ms +table_main: 112.879 - 68.967 = 43.912 ms ``` That residual is the current frontend/PP/parser/CG API cost for roughly -330 KiB of generated C. It appears linear with input size and number of small -functions in these tests. The setup, PP option application, cleanup, and object -finalization scopes are negligible. The opaque part that remains is inside -`compile.c.parse_codegen`, which streams PP tokens, parses declarations and -statements, performs type work, drives the CG API, and triggers per-function O1 -when function bodies end. +100 KiB of generated C in these phase-3 inputs. It appears linear with input +size and number of small functions in these tests. The setup, PP option +application, cleanup, and object finalization scopes are negligible. The opaque +part that remains is inside `compile.c.parse_codegen`, which streams PP tokens, +parses declarations and statements, performs type work, drives the CG API, and +triggers per-function O1 when function bodies end. ### Compile Scope Scaling @@ -285,28 +361,28 @@ Straight-line main: ```text funcs input_bytes compile.tu parse_codegen opt.o1.total non_o1_residual -1 383 0.743 0.521 0.383 0.360 -4 1338 1.748 1.519 1.148 0.600 -16 5193 5.485 5.264 4.026 1.459 -64 20697 21.657 21.447 16.719 4.938 -128 41599 46.748 46.530 37.164 9.584 -256 83839 109.467 109.241 90.519 18.948 -512 168319 281.974 281.750 244.471 37.503 -1024 337481 824.834 824.586 746.447 78.387 +1 139 0.444 0.296 0.184 0.260 +4 430 0.784 0.642 0.418 0.366 +16 1624 2.306 2.147 1.414 0.892 +64 6520 8.042 7.894 5.236 2.806 +128 13188 15.399 15.250 10.068 5.331 +256 26884 30.795 30.648 20.296 10.499 +512 54276 66.408 66.265 44.891 21.517 +1024 109180 156.418 156.251 109.919 46.499 ``` Function-table main: ```text funcs input_bytes compile.tu parse_codegen opt.o1.total non_o1_residual -1 504 0.783 0.563 0.402 0.381 -4 1429 1.781 1.561 1.171 0.610 -16 5166 5.272 5.062 3.839 1.433 -64 20190 19.208 18.990 14.467 4.741 -128 40454 38.143 37.929 28.862 9.281 -256 81414 75.035 74.816 56.935 18.100 -512 163334 150.504 150.275 114.101 36.403 -1024 327378 304.716 304.492 227.790 76.926 +1 212 0.486 0.335 0.194 0.292 +4 482 0.833 0.685 0.437 0.396 +16 1587 2.145 1.993 1.316 0.829 +64 6099 7.414 7.259 4.806 2.608 +128 12228 13.851 13.703 8.959 4.892 +256 24772 26.765 26.620 17.091 9.674 +512 49860 53.839 53.682 33.996 19.843 +1024 100133 112.879 112.706 68.967 43.912 ``` The detailed compile scopes show: @@ -320,7 +396,7 @@ The detailed compile scopes show: options. - `compile.c.cleanup` grows only to about `0.01-0.02 ms`. - `non_o1_residual` is near-linear with source size at larger sizes, around - `228-241 us/KiB` from 64 through 1024 functions. + `430-460 us/KiB` from 64 through 1024 functions in the phase-3 inputs. So the compile-side scaling story is two-layered: the broad C parse/codegen bucket scales linearly with input size after startup overhead, while the nested @@ -340,16 +416,17 @@ Next useful compile instrumentation: ## Performance Priorities -1. Move regalloc off full `opt_live_info`. - Phase 1 removed the full liveness/conflict build from pre-DDE. The remaining - superlinear bucket is now `opt.live_info.regalloc`, which still builds - `live_after` and the dense conflict matrix before assignment/rewrite. - -2. Replace or narrow dense conflict structures. - Regalloc-side `opt.conflict_bytes` still tracks the observed curve closely. - Investigate sparse sets, segmented bitsets, per-block live sets, or - interval-style structures that avoid touching full dense rows for values that - cannot overlap. +1. Finish phase-4 rewrite without persistent `live_after`. + Phase 3 removed the dense conflict matrix from O1 assignment. Rewrite still + needs per-instruction full live-after sets for call preservation and spill + insertion, so that is the next correctness-preserving structural target. + +2. Compress or narrow `used_locs`. + The remaining straight-line superlinear bucket now tracks + `opt.alloc.used_loc_words`. The first range allocator sizes the + occupied-location bitmap by hard-register bits plus candidate stack bits. + The stack side should become demand-sized, sparse, or split-capable instead + of paying for every candidate at every point. 3. Avoid whole-function contiguous growth where hot passes scan repeatedly. Large `Val`/block/instruction arrays and dense bit matrices are likely to diff --git a/src/opt/ir.h b/src/opt/ir.h @@ -328,6 +328,7 @@ typedef struct Func { u8 opt_rewritten; u16 opt_live_words; u16 opt_conflict_words; + u32 opt_used_loc_words; u32 opt_position_count; OptValInfo* val_info; /* indexed by Val, length nvals */ u64* val_conflicts; /* nvals x opt_conflict_words bit matrix */ diff --git a/src/opt/opt.c b/src/opt/opt.c @@ -1424,7 +1424,8 @@ static void w_func_end(CGTarget* t) { metrics_scope_end(o->c, "opt.dead_def_elim"); metrics_scope_begin(o->c, "opt.regalloc"); opt_regalloc(o->f, 0); - metrics_count(o->c, "opt.alloc.used_loc_words", 0); + metrics_count(o->c, "opt.alloc.used_loc_words", + o->f->opt_used_loc_words); metrics_count(o->c, "opt.alloc.spills", func_spill_alloc_count(o->f)); metrics_count(o->c, "opt.rewrite.reloads", func_spill_load_count(o->f)); metrics_count(o->c, "opt.rewrite.stores", func_spill_store_count(o->f)); diff --git a/src/opt/opt.h b/src/opt/opt.h @@ -123,6 +123,19 @@ typedef struct OptLiveRangeSet { u32 whole_block_spans; } OptLiveRangeSet; +typedef enum OptLocKind { + OPT_LOC_NONE, + OPT_LOC_HARD, + OPT_LOC_STACK, +} OptLocKind; + +typedef struct OptLoc { + u8 kind; + u8 cls; + Reg hard_reg; + FrameSlot spill_slot; +} OptLoc; + typedef void (*OptBitsetIterFn)(Val, void*); void opt_bitset_clear(OptBitset*); diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c @@ -704,39 +704,6 @@ void opt_live_info(Func* f) { } } -static int ranges_overlap(const OptValInfo* a, const OptValInfo* b) { - if (!a->first_pos || !b->first_pos) return 0; - return a->first_pos <= b->last_pos && b->first_pos <= a->last_pos; -} - -static int val_higher_priority(Func* f, Val a, Val b) { - OptValInfo* av = &f->val_info[a]; - OptValInfo* bv = &f->val_info[b]; - int at = av->tied_hard_reg >= 0; - int bt = bv->tied_hard_reg >= 0; - if (at != bt) return at > bt; - if (av->frequency != bv->frequency) return av->frequency > bv->frequency; - if (av->live_across_call_freq != bv->live_across_call_freq) - return av->live_across_call_freq > bv->live_across_call_freq; - if (av->live_length != bv->live_length) - return av->live_length < bv->live_length; - return a < b; -} - -static int hard_conflicts(Func* f, Val* assigned, u32 nassigned, Val v, Reg r) { - for (u32 i = 0; i < nassigned; ++i) { - Val ov = assigned[i]; - if (f->val_info[ov].alloc_kind != OPT_ALLOC_HARD) continue; - if (f->val_info[ov].hard_reg != r) continue; - if (f->val_conflicts && f->opt_conflict_words) { - if (bit_has(conflict_row(f, v), ov)) return 1; - } else if (ranges_overlap(&f->val_info[ov], &f->val_info[v])) { - return 1; - } - } - return 0; -} - static int hard_available(Func* f, u8 cls, Reg r) { if (cls >= OPT_REG_CLASSES) return 0; for (u32 i = 0; i < f->opt_hard_reg_count[cls]; ++i) @@ -757,6 +724,289 @@ static FrameSlot spill_slot_for(Func* f, Val v) { return f->val_info[v].spill_slot; } +static u32 loc_bit_words(u32 nbits) { return (nbits + 63u) / 64u; } + +static u32 hard_loc_bit(u8 cls, Reg r) { return ((u32)cls * 32u) + (u32)r; } + +static void loc_bit_set(u64* bits, u32 bit) { + bits[bit / 64u] |= 1ull << (bit % 64u); +} + +static int loc_bit_has(const u64* bits, u32 bit) { + return (bits[bit / 64u] & (1ull << (bit % 64u))) != 0; +} + +static void loc_bits_clear(u64* bits, u32 words) { + for (u32 i = 0; i < words; ++i) bits[i] = 0; +} + +static void loc_bits_or(u64* dst, const u64* src, u32 words) { + for (u32 i = 0; i < words; ++i) dst[i] |= src[i]; +} + +typedef struct OptAllocCandidate { + Val v; + u32 spill_cost; + u32 live_length; + u8 tied; + u8 pad[3]; +} OptAllocCandidate; + +typedef struct OptAllocator { + OptLoc* locs; + u64* used_locs; + u32 point_count; + u32 loc_words; + u32 hard_loc_bits; + FrameSlot* stack_slots; + u32 stack_slot_count; +} OptAllocator; + +static int alloc_candidate_higher(const OptAllocCandidate* a, + const OptAllocCandidate* b) { + if (a->tied != b->tied) return a->tied > b->tied; + if (a->spill_cost != b->spill_cost) return a->spill_cost > b->spill_cost; + if (a->live_length != b->live_length) return a->live_length < b->live_length; + return a->v < b->v; +} + +static void alloc_sort_candidates(OptAllocCandidate* cands, u32 n) { + for (u32 i = 1; i < n; ++i) { + OptAllocCandidate key = cands[i]; + u32 j = i; + while (j > 0 && alloc_candidate_higher(&key, &cands[j - 1u])) { + cands[j] = cands[j - 1u]; + --j; + } + cands[j] = key; + } +} + +static void opt_init_val_info_from_ranges(Func* f, + const OptLiveRangeSet* ranges) { + OptValInfo* old = f->val_info; + OptValInfo* info = arena_zarray(f->arena, OptValInfo, f->nvals ? f->nvals : 1u); + for (Val v = 0; v < f->nvals; ++v) { + i32 tied = old ? old[v].tied_hard_reg : -1; + u32 forbidden = old ? old[v].forbidden_hard_regs : 0; + u32 old_frequency = old ? old[v].frequency : 0; + OptValInfo* vi = &info[v]; + vi->tied_hard_reg = tied; + vi->hard_reg = REG_NONE; + vi->spill_slot = FRAME_SLOT_NONE; + vi->alloc_kind = OPT_ALLOC_NONE; + vi->cls = f->val_cls[v]; + vi->forbidden_hard_regs = forbidden; + if (!ranges || v == VAL_NONE || + ranges->first_range_by_val[v] == OPT_RANGE_NONE) { + continue; + } + u32 first = (u32)~0u; + u32 last = 0; + for (u32 r = ranges->first_range_by_val[v]; r != OPT_RANGE_NONE; + r = ranges->ranges[r].next) { + const OptLiveRange* lr = &ranges->ranges[r]; + if (lr->start < first) first = lr->start; + if (lr->end > last) last = lr->end; + } + vi->first_pos = first == (u32)~0u ? 0 : first + 1u; + vi->last_pos = last; + vi->live_length = ranges->live_length_by_val[v]; + vi->use_freq = ranges->use_freq_by_val[v]; + vi->def_freq = ranges->def_freq_by_val[v]; + vi->live_block_freq = ranges->live_block_freq_by_val[v]; + vi->live_across_call_freq = ranges->live_across_call_freq_by_val[v]; + vi->spill_cost = ranges->spill_cost_by_val[v]; + vi->frequency = vi->spill_cost; + if (old_frequency > vi->frequency) vi->frequency = old_frequency; + } + f->val_info = info; + f->val_conflicts = NULL; + f->opt_conflict_words = 0; +} + +static void opt_build_live_after_only(Func* f, const OptLiveInfo* live_info) { + u32 words = live_info ? live_info->words : f->opt_live_words; + if (!words) words = bit_words(f->nvals); + f->opt_live_words = (u16)words; + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + bl->live_after = arena_array(f->arena, u64*, bl->ninsts ? bl->ninsts : 1u); + u64* live = arena_zarray(f->arena, u64, words); + if (live_info) { + const OptBitset* out = &live_info->blocks[b].live_out; + for (u32 w = 0; w < words && w < out->nwords; ++w) live[w] = out->words[w]; + } else if (bl->live_out) { + copy_bits(live, bl->live_out, words); + } + for (u32 ri = bl->ninsts; ri > 0; --ri) { + u32 i = ri - 1u; + Inst* in = &bl->insts[i]; + u64* after = arena_zarray(f->arena, u64, words); + copy_bits(after, live, words); + bl->live_after[i] = after; + + u64* use = arena_zarray(f->arena, u64, words); + u64* def = arena_zarray(f->arena, u64, words); + BitsCtx bc = {use, def}; + walk_inst_operands(f, in, collect_bits, &bc); + if ((IROp)in->op == IR_ASM_BLOCK) + apply_asm_register_constraints(f, in, use, def, after); + for (u32 w = 0; w < words; ++w) live[w] = (live[w] & ~def[w]) | use[w]; + } + } +} + +static int spill_slot_compatible(Func* f, FrameSlot fs, Val v) { + if (fs == FRAME_SLOT_NONE || fs > f->nframe_slots) return 0; + IRFrameSlot* s = &f->frame_slots[fs - 1u]; + u32 size = type_size_fallback(f, f->val_type[v]); + u32 align = size >= 8 ? 8 : size; + if (s->kind != FS_SPILL) return 0; + if (s->size < size) return 0; + if (s->align < align) return 0; + return 1; +} + +static void alloc_collect_occupied(const OptAllocator* a, + const OptLiveRangeSet* ranges, Val v, + u64* occupied) { + loc_bits_clear(occupied, a->loc_words); + for (u32 r = ranges->first_range_by_val[v]; r != OPT_RANGE_NONE; + r = ranges->ranges[r].next) { + const OptLiveRange* lr = &ranges->ranges[r]; + for (u32 p = lr->start; p < lr->end && p < a->point_count; ++p) + loc_bits_or(occupied, a->used_locs + ((size_t)p * a->loc_words), + a->loc_words); + } +} + +static void alloc_mark_loc(OptAllocator* a, const OptLiveRangeSet* ranges, + Val v, u32 bit) { + for (u32 r = ranges->first_range_by_val[v]; r != OPT_RANGE_NONE; + r = ranges->ranges[r].next) { + const OptLiveRange* lr = &ranges->ranges[r]; + for (u32 p = lr->start; p < lr->end && p < a->point_count; ++p) + loc_bit_set(a->used_locs + ((size_t)p * a->loc_words), bit); + } +} + +static void alloc_assign_hard(Func* f, OptAllocator* a, + const OptLiveRangeSet* ranges, Val v, Reg r) { + OptValInfo* vi = &f->val_info[v]; + vi->alloc_kind = OPT_ALLOC_HARD; + vi->hard_reg = r; + a->locs[v].kind = OPT_LOC_HARD; + a->locs[v].cls = vi->cls; + a->locs[v].hard_reg = r; + a->locs[v].spill_slot = FRAME_SLOT_NONE; + alloc_mark_loc(a, ranges, v, hard_loc_bit(vi->cls, r)); +} + +static void alloc_assign_stack(Func* f, OptAllocator* a, + const OptLiveRangeSet* ranges, Val v, + const u64* occupied) { + OptValInfo* vi = &f->val_info[v]; + u32 stack_idx = a->stack_slot_count; + FrameSlot slot = FRAME_SLOT_NONE; + for (u32 i = 0; i < a->stack_slot_count; ++i) { + u32 bit = a->hard_loc_bits + i; + if (loc_bit_has(occupied, bit)) continue; + if (!spill_slot_compatible(f, a->stack_slots[i], v)) continue; + stack_idx = i; + slot = a->stack_slots[i]; + break; + } + if (slot == FRAME_SLOT_NONE) { + slot = spill_slot_for(f, v); + a->stack_slots[a->stack_slot_count++] = slot; + stack_idx = a->stack_slot_count - 1u; + } else { + vi->spill_slot = slot; + } + vi->alloc_kind = OPT_ALLOC_SPILL; + a->locs[v].kind = OPT_LOC_STACK; + a->locs[v].cls = vi->cls; + a->locs[v].hard_reg = REG_NONE; + a->locs[v].spill_slot = slot; + alloc_mark_loc(a, ranges, v, a->hard_loc_bits + stack_idx); +} + +static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges, + OptAllocator* a) { + memset(a, 0, sizeof *a); + a->point_count = ranges->point_count ? ranges->point_count : 1u; + a->hard_loc_bits = OPT_REG_CLASSES * 32u; + u32 ncands = 0; + for (Val v = 1; v < f->nvals; ++v) + if (ranges->first_range_by_val[v] != OPT_RANGE_NONE) ++ncands; + u32 loc_bits = a->hard_loc_bits + ncands; + a->loc_words = loc_bit_words(loc_bits); + f->opt_used_loc_words = a->point_count * a->loc_words; + a->locs = arena_zarray(f->arena, OptLoc, f->nvals ? f->nvals : 1u); + a->used_locs = + arena_zarray(f->arena, u64, (size_t)a->point_count * a->loc_words); + a->stack_slots = arena_array(f->arena, FrameSlot, ncands ? ncands : 1u); + + OptAllocCandidate* cands = + arena_array(f->arena, OptAllocCandidate, ncands ? ncands : 1u); + u32 n = 0; + for (Val v = 1; v < f->nvals; ++v) { + if (ranges->first_range_by_val[v] == OPT_RANGE_NONE) continue; + OptValInfo* vi = &f->val_info[v]; + cands[n].v = v; + cands[n].spill_cost = vi->frequency ? vi->frequency : vi->spill_cost; + cands[n].live_length = vi->live_length; + cands[n].tied = vi->tied_hard_reg >= 0; + memset(cands[n].pad, 0, sizeof cands[n].pad); + ++n; + } + alloc_sort_candidates(cands, n); + + u64* occupied = arena_zarray(f->arena, u64, a->loc_words ? a->loc_words : 1u); + for (u32 i = 0; i < n; ++i) { + Val v = cands[i].v; + OptValInfo* vi = &f->val_info[v]; + u8 cls = vi->cls; + alloc_collect_occupied(a, ranges, v, occupied); + if (vi->tied_hard_reg >= 0) { + Reg fixed = (Reg)vi->tied_hard_reg; + if (!hard_available(f, cls, fixed)) { + SrcLoc loc = {0, 0, 0}; + compiler_panic( + f->c, loc, + "opt regalloc: fixed hard reg %u unavailable in class %u", + (unsigned)fixed, (unsigned)cls); + } + if (fixed < 32 && (vi->forbidden_hard_regs & (1u << fixed))) { + SrcLoc loc = {0, 0, 0}; + compiler_panic(f->c, loc, + "opt regalloc: fixed hard reg %u is clobbered", + (unsigned)fixed); + } + if (fixed >= 32 || loc_bit_has(occupied, hard_loc_bit(cls, fixed))) { + SrcLoc loc = {0, 0, 0}; + compiler_panic(f->c, loc, "opt regalloc: conflicting fixed hard reg %u", + (unsigned)fixed); + } + alloc_assign_hard(f, a, ranges, v, fixed); + continue; + } + + int found = 0; + for (u32 r = 0; r < f->opt_hard_reg_count[cls]; ++r) { + Reg hr = f->opt_hard_regs[cls][r]; + if (hr >= 32) continue; + if (vi->forbidden_hard_regs & (1u << hr)) continue; + if (loc_bit_has(occupied, hard_loc_bit(cls, hr))) continue; + alloc_assign_hard(f, a, ranges, v, hr); + found = 1; + break; + } + if (!found) alloc_assign_stack(f, a, ranges, v, occupied); + } +} + typedef struct RewriteList { Inst* data; u32 n; @@ -1041,76 +1291,19 @@ void opt_dead_def_elim(Func* f) { void opt_regalloc(Func* f, int allow_live_range_split) { (void)allow_live_range_split; - if (!f->val_info) { - metrics_scope_begin(f->c, "opt.live_info.regalloc"); - opt_live_info(f); - metrics_count(f->c, "opt.live_words", f->opt_live_words); - metrics_count( - f->c, "opt.conflict_bytes", - (u64)f->nvals * (u64)f->opt_conflict_words * (u64)sizeof(u64)); - metrics_scope_end(f->c, "opt.live_info.regalloc"); - } - Val* order = arena_array(f->arena, Val, f->nvals ? f->nvals : 1u); - u32 norder = 0; - for (Val v = 1; v < f->nvals; ++v) - if (f->val_info[v].first_pos) order[norder++] = v; - for (u32 i = 1; i < norder; ++i) { - Val key = order[i]; - u32 j = i; - while (j > 0 && val_higher_priority(f, key, order[j - 1u])) { - order[j] = order[j - 1u]; - --j; - } - order[j] = key; - } - Val* assigned = arena_array(f->arena, Val, norder ? norder : 1u); - u32 nassigned = 0; - for (u32 i = 0; i < norder; ++i) { - Val v = order[i]; - OptValInfo* vi = &f->val_info[v]; - u8 cls = vi->cls; - if (vi->tied_hard_reg >= 0) { - Reg fixed = (Reg)vi->tied_hard_reg; - if (!hard_available(f, cls, fixed)) { - SrcLoc loc = {0, 0, 0}; - compiler_panic( - f->c, loc, - "opt regalloc: fixed hard reg %u unavailable in class %u", - (unsigned)fixed, (unsigned)cls); - } - if (fixed < 32 && (vi->forbidden_hard_regs & (1u << fixed))) { - SrcLoc loc = {0, 0, 0}; - compiler_panic(f->c, loc, - "opt regalloc: fixed hard reg %u is clobbered", - (unsigned)fixed); - } - if (hard_conflicts(f, assigned, nassigned, v, fixed)) { - SrcLoc loc = {0, 0, 0}; - compiler_panic(f->c, loc, "opt regalloc: conflicting fixed hard reg %u", - (unsigned)fixed); - } - vi->alloc_kind = OPT_ALLOC_HARD; - vi->hard_reg = fixed; - assigned[nassigned++] = v; - continue; - } - int found = 0; - for (u32 r = 0; r < f->opt_hard_reg_count[cls]; ++r) { - Reg hr = f->opt_hard_regs[cls][r]; - if (hr < 32 && (vi->forbidden_hard_regs & (1u << hr))) continue; - if (!hard_conflicts(f, assigned, nassigned, v, hr)) { - vi->alloc_kind = OPT_ALLOC_HARD; - vi->hard_reg = hr; - assigned[nassigned++] = v; - found = 1; - break; - } - } - if (!found) { - vi->alloc_kind = OPT_ALLOC_SPILL; - vi->spill_slot = spill_slot_for(f, v); - } - } + metrics_scope_begin(f->c, "opt.live_ranges.regalloc"); + OptLiveInfo live; + opt_live_blocks(f, &live); + OptLiveRangeSet ranges; + opt_live_ranges_build(f, &live, &ranges); + opt_init_val_info_from_ranges(f, &ranges); + opt_build_live_after_only(f, &live); + metrics_count(f->c, "opt.live_words", f->opt_live_words); + metrics_count(f->c, "opt.conflict_bytes", 0); + metrics_scope_end(f->c, "opt.live_ranges.regalloc"); + + OptAllocator alloc; + opt_assign_ranges(f, &ranges, &alloc); rewrite_func(f); } diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c @@ -1161,6 +1161,41 @@ static void opt_regalloc_priority(void) { tc_fini(&tc); } +static void opt_range_regalloc_no_conflicts_and_stack_reuse(void) { + TestCtx tc; + tc_init(&tc); + MockCGTarget mock; + mock_init(&mock, tc.c); + static const Reg scratch[] = {9, 10}; + mock_set_pool(&mock, RC_INT, NULL, 0, scratch, 2, 0); + + Func* f = new_func(&tc); + opt_machinize(f, &mock.base); + Val a = add_val(f, tc.i32); + Val b = add_val(f, tc.i32); + emit_load_imm(f, f->entry, a, tc.i32, 1); + emit_load_imm(f, f->entry, b, tc.i32, 2); + emit_ret_val(f, f->entry, b, tc.i32); + opt_build_cfg(f); + opt_build_loop_tree(f); + + opt_regalloc(f, 0); + + EXPECT(f->val_conflicts == NULL, + "range allocator should not allocate a conflict matrix"); + EXPECT(f->opt_conflict_words == 0, + "range allocator should leave conflict words at zero"); + EXPECT(f->opt_used_loc_words != 0, + "range allocator should record used-location storage"); + EXPECT(f->val_info[a].alloc_kind == OPT_ALLOC_SPILL, + "no hard regs should force v%u to stack", (unsigned)a); + EXPECT(f->val_info[b].alloc_kind == OPT_ALLOC_SPILL, + "no hard regs should force v%u to stack", (unsigned)b); + EXPECT(f->val_info[a].spill_slot == f->val_info[b].spill_slot, + "disjoint stack ranges should reuse a spill slot"); + tc_fini(&tc); +} + static void opt_rewrite_spill_use_def(void) { TestCtx tc; tc_init(&tc); @@ -2078,6 +2113,7 @@ int main(void) { opt_live_across_call_frequency(); opt_conflict_symmetry_and_class(); opt_regalloc_priority(); + opt_range_regalloc_no_conflicts_and_stack_reuse(); opt_rewrite_spill_use_def(); opt_call_clobber_preservation(); opt_call_clobber_caller_saved();