commit 03096a7125d3ed99ee15e2b6b39b6198c5f4cc59
parent a11e9f6d86f9aa5c4d8d543fee63730bfa8b3991
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 14 May 2026 20:16:09 -0700
Improve O1 wide-function allocation perf
Diffstat:
10 files changed, 772 insertions(+), 496 deletions(-)
diff --git a/doc/MIR_RA_REPORT.md b/doc/MIR_RA_REPORT.md
@@ -531,6 +531,9 @@ a clean fast implementation over shims for old pass boundaries.
- [x] `opt.ranges`
- [x] `opt.range_points`
- [x] `opt.alloc.used_loc_words`
+ - [x] `opt.alloc.hard_loc_words`
+ - [x] `opt.alloc.stack_loc_words`
+ - [x] `opt.alloc.stack_slots`
- [x] `opt.alloc.spills`
- [x] `opt.rewrite.reloads`
- [x] `opt.rewrite.stores`
@@ -557,7 +560,7 @@ coverage belongs in a later pass once the range allocator owns rewrite.
- [x] Compute `live_in`/`live_out` without allocating conflicts.
- [x] Make pre-DDE use only block liveness.
- [x] Keep old full `opt_live_info` available for regalloc during transition.
- This is now transition debt, not a compatibility requirement.
+ This transition debt was retired in phase 6.
- [x] Add dump output for block liveness.
- [x] Add tests for branch, loop, and call liveness.
@@ -647,13 +650,13 @@ straight-line case but regresses many-small-function/function-table inputs. Keep
the rolling-live design, but remove the temporary-list and reverse-copy cost
introduced by the first implementation.
-- [ ] Rewrite blocks with a single block-local output buffer or pre-sized edit
+- [x] Rewrite blocks with a single block-local output buffer or pre-sized edit
buffer.
-- [ ] Avoid per-instruction `RewriteList` allocation/copy churn.
-- [ ] Preserve rolling-live call-clobber behavior without persistent
+- [x] Avoid per-instruction `RewriteList` allocation/copy churn.
+- [x] Preserve rolling-live call-clobber behavior without persistent
`Block.live_after`.
-- [ ] Keep scratch-register selection deterministic after the rewrite cleanup.
-- [ ] Update tests that asserted old internal storage or exact intermediate
+- [x] Keep scratch-register selection deterministic after the rewrite cleanup.
+- [x] Update tests that asserted old internal storage or exact intermediate
dump details.
- [ ] Re-run the same-input phase-3 vs phase-5 timing comparison from
`doc/PERF.md`.
@@ -663,30 +666,39 @@ Exit criteria:
- [ ] Function-table/main-many-small `opt.regalloc` returns to phase-3 parity
or better.
- [ ] Large straight-line phase-4 gains are retained.
-- [ ] No persistent per-instruction full live-after sets are rebuilt in normal
+- [x] No persistent per-instruction full live-after sets are rebuilt in normal
O1.
+Implemented by rewriting each block into one backward-filled block output
+buffer. Per-instruction before/after insertion lists are reused per block, and
+the final reverse-copy pass is gone.
+
### Phase 6: Delete Old Dense Path
-- [ ] Remove normal-path allocation of `Func.val_conflicts`.
-- [ ] Delete old all-values conflict construction from O1, including
+- [x] Remove normal-path allocation of `Func.val_conflicts`.
+- [x] Delete old all-values conflict construction from O1, including
compatibility-only helpers and tests.
-- [ ] Remove or demote `opt_live_words`, `opt_conflict_words`,
+- [x] Remove or demote `opt_live_words`, `opt_conflict_words`,
`Block.live_in`, `Block.live_out`, `Block.live_use`, `Block.live_def`,
and `Block.live_after` from persistent IR state where no public behavior
requires them.
-- [ ] Replace `opt_live_info` callers with pass-local liveness/ranges, then
+- [x] Replace `opt_live_info` callers with pass-local liveness/ranges, then
delete or narrow `opt_live_info`.
-- [ ] Make tests assert public behavior and new pass dumps/metrics, not dense
+- [x] Make tests assert public behavior and new pass dumps/metrics, not dense
conflict matrix details.
-- [ ] Update docs and metrics names to reflect range-based RA.
+- [x] Update docs and metrics names to reflect range-based RA.
Exit criteria:
-- [ ] Normal O1 has no reachable dense pseudo-conflict construction.
-- [ ] The source tree no longer contains tests whose purpose is preserving the
+- [x] Normal O1 has no reachable dense pseudo-conflict construction.
+- [x] The source tree no longer contains tests whose purpose is preserving the
old dense interference model.
-- [ ] `opt.conflict_bytes` is absent or always zero on normal O1 paths.
+- [x] `opt.conflict_bytes` is absent or always zero on normal O1 paths.
+
+Implemented by deleting `opt_live_info`, `Func.val_conflicts`,
+`Func.opt_conflict_words`, and persistent block liveness/live-after fields.
+Post-rewrite combine/DCE now build pass-local hard-register liveness when they
+need block live-out information.
### Phase 7: Narrow `used_locs`
@@ -696,13 +708,13 @@ bitmaps as hard-register bits plus one possible stack bit per candidate; that
preserves correctness but reintroduces superlinear memory/time for one large
function.
-- [ ] Split hard-register occupancy from stack-slot occupancy.
-- [ ] Make stack occupancy demand-sized by allocated stack slots, not candidate
+- [x] Split hard-register occupancy from stack-slot occupancy.
+- [x] Make stack occupancy demand-sized by allocated stack slots, not candidate
count.
-- [ ] Consider sparse or interval-indexed stack occupancy for long straight-line
+- [x] Consider sparse or interval-indexed stack occupancy for long straight-line
functions.
-- [ ] Keep hard-register occupancy compact and class-aware.
-- [ ] Add metrics separating hard occupancy words, stack occupancy words, and
+- [x] Keep hard-register occupancy compact and class-aware.
+- [x] Add metrics separating hard occupancy words, stack occupancy words, and
allocated stack slot count.
- [ ] Re-run straight-line and table-main ladders after each shape change.
@@ -713,6 +725,11 @@ Exit criteria:
- [ ] `doc/PERF.md` can be updated with before/after O1 scaling numbers.
- [ ] Table-main remains at phase-5 parity or better.
+Implemented with separate `hard_used_locs` and `stack_used_locs` allocator
+tables. Hard occupancy remains `OPT_REG_CLASSES * 32` bits per point; stack
+occupancy grows only when a new spill slot crosses a word boundary. The sparse
+interval form is deferred until timing data shows it is needed.
+
### Phase 8: O2 Coalescing
- [ ] Collect move candidates after machinize.
diff --git a/doc/PERF.md b/doc/PERF.md
@@ -32,6 +32,8 @@ Important counters:
- `opt.live.block_bytes`, `opt.live.set_bit_scans`
- `opt.ranges`, `opt.range_points`, `opt.range_raw_points`
- `opt.alloc.used_loc_words`, `opt.alloc.spills`
+- `opt.alloc.hard_loc_words`, `opt.alloc.stack_loc_words`,
+ `opt.alloc.stack_slots`
- `opt.rewrite.reloads`, `opt.rewrite.stores`
- `opt.conflict_bytes`
- `link.inputs`, `link.sections`, `link.segments`, `link.syms`, `link.relocs`
@@ -488,33 +490,235 @@ the workload. A forward rewrite with a precomputed rolling-live cursor or a
block-local edit buffer should keep the no-`live_after` shape without the
table-main tax.
-## Performance Priorities
+### Phase-7 Fresh Rerun
+
+After phases 5, 6, and the implemented phase-7 occupancy split, the normal O1
+path no longer contains the old dense pseudo-conflict/live-after state. Rewrite
+uses a block-local output buffer, `opt_live_info` and persistent block liveness
+fields are gone, and allocator occupancy is split into hard-register and
+demand-sized stack-slot tables.
+
+Samples: 5 runs per point. The table uses p50 milliseconds. The direct-call and
+function-table generators are the same family as earlier sections, but these
+are fresh generated inputs and not byte-for-byte identical to prior phase
+tables.
+
+Straight-line main:
+
+```text
+funcs input_bytes insts vals block_bytes ranges range_points used_loc_words hard_words stack_words stack_slots conflict_bytes run.total compile.tu parse_codegen opt.o1.total live_pre live_ranges_reg regalloc link_jit
+1 331 84 57 576 53 70 140 140 0 0 0 0.625 0.517 0.365 0.230 0.061 0.033 0.073 0.058
+4 1051 297 213 1632 203 262 524 524 0 0 0 1.217 1.100 0.943 0.603 0.150 0.105 0.218 0.060
+16 3971 1149 837 6080 803 1030 2060 2060 0 0 0 3.562 3.445 3.301 2.148 0.539 0.375 0.803 0.061
+64 15749 4557 3333 23648 3203 4102 8204 8204 0 0 0 12.945 12.762 12.612 8.143 1.997 1.499 3.197 0.083
+128 31598 9101 6661 47072 6403 8198 16396 16396 0 0 0 24.576 24.306 24.158 15.474 3.730 2.955 6.252 0.118
+256 63654 18189 13317 93920 12803 16390 32780 32780 0 0 0 48.921 48.385 48.237 30.977 7.385 6.008 12.770 0.159
+512 127759 36365 26629 187616 25603 32774 65548 65548 0 0 0 98.418 97.658 97.503 63.030 14.960 12.268 26.552 0.249
+1024 256093 72717 53253 375008 51203 65542 131084 131084 0 0 0 206.580 204.832 204.682 131.904 30.624 25.894 57.609 0.421
+```
+
+Function-table main:
+
+```text
+funcs input_bytes insts vals block_bytes ranges range_points used_loc_words hard_words stack_words stack_slots conflict_bytes run.total compile.tu parse_codegen opt.o1.total live_pre live_ranges_reg regalloc link_jit
+1 400 106 69 832 65 86 172 172 0 0 0 0.680 0.571 0.413 0.252 0.064 0.040 0.087 0.062
+4 1096 304 213 1888 203 263 526 526 0 0 0 1.244 1.132 0.988 0.625 0.162 0.107 0.224 0.062
+16 3914 1096 789 6112 755 971 1942 1942 0 0 0 3.473 3.346 3.188 2.061 0.516 0.358 0.765 0.066
+64 15260 4264 3093 23008 2963 3803 7606 7606 0 0 0 12.162 11.990 11.840 7.621 1.864 1.382 2.980 0.086
+128 30505 8488 6165 45536 5907 7579 15158 15158 0 0 0 23.249 22.999 22.848 14.553 3.484 2.701 5.777 0.110
+256 61281 16936 12309 90592 11795 15131 30262 30262 0 0 0 45.760 45.318 45.169 28.765 6.871 5.386 11.538 0.150
+512 122826 33832 24597 180704 23571 30235 60470 60470 0 0 0 91.469 90.751 90.601 57.470 13.717 10.718 23.064 0.240
+1024 246016 67624 49173 360928 47123 60443 120886 120886 0 0 0 190.086 187.863 187.699 116.461 27.974 21.462 46.345 0.417
+```
+
+The 512 to 1024 step is now close to linear in both normal shapes:
-1. Reduce phase-4 rewrite overhead.
- Phase 4 removed persistent `Block.live_after` from the normal O1 path and
- improved the large straight-line case, but the current backward rewrite
- regresses many-small-function/table-shaped inputs. Keep the rolling-live
- design, but remove the extra temporary-list/reverse-copy cost.
+```text
+straight_main:
+compile.tu 97.658 -> 204.832 ms 2.10x
+opt.o1.total 63.030 -> 131.904 ms 2.09x
+opt.live_blocks.pre_dde 14.960 -> 30.624 ms 2.05x
+opt.live_ranges.regalloc 12.268 -> 25.894 ms 2.11x
+opt.regalloc 26.552 -> 57.609 ms 2.17x
+used_loc_words 65548 -> 131084 2.00x
+hard_loc_words 65548 -> 131084 2.00x
+stack_loc_words 0 -> 0
+
+table_main:
+compile.tu 90.751 -> 187.863 ms 2.07x
+opt.o1.total 57.470 -> 116.461 ms 2.03x
+opt.live_blocks.pre_dde 13.717 -> 27.974 ms 2.04x
+opt.live_ranges.regalloc 10.718 -> 21.462 ms 2.00x
+opt.regalloc 23.064 -> 46.345 ms 2.01x
+used_loc_words 60470 -> 120886 2.00x
+hard_loc_words 60470 -> 120886 2.00x
+stack_loc_words 0 -> 0
+```
+
+This confirms that phases 5 and 7 removed the prior table-main rewrite tax and
+the direct-call `used_locs` superlinear growth for this generator. These inputs
+do not force spills, so the stack side of the occupancy split stays at zero.
+
+To exercise stack occupancy, a separate spill-pressure ladder used one large
+`main` that repeatedly forms 24 live arguments and calls a local `sink24`.
+Some larger cases return non-zero guest status because the generated integer
+result is intentionally ignored by the benchmark harness; compile/JIT metrics
+are still emitted before process exit.
+
+```text
+groups input_bytes insts vals block_bytes ranges range_points used_loc_words hard_words stack_words stack_slots spills stores compile.tu opt.o1.total live_pre live_ranges_reg regalloc link_jit
+1 1103 204 176 288 172 200 551 400 151 16 16 16 0.815 0.384 0.097 0.075 0.150 0.055
+4 3377 651 548 864 544 647 1892 1294 598 16 64 64 2.062 1.127 0.242 0.271 0.584 0.060
+16 12905 2439 2036 3168 2032 2435 7256 4870 2386 16 256 256 9.368 6.436 1.072 1.452 4.212 0.071
+64 52745 9591 7988 12096 7984 9587 28712 19174 9538 16 1024 1024 76.878 64.917 7.948 12.695 49.800 0.110
+128 107881 19127 15924 24000 15920 19123 57320 38246 19074 16 2048 2048 262.764 234.319 25.701 43.068 186.860 0.168
+256 223337 38199 31796 47808 31792 38195 114536 76390 38146 16 4096 4096 964.562 893.449 89.543 156.086 732.317 0.255
+512 454249 76343 63540 95424 63536 76339 228968 152678 76290 16 8192 8192 3737.556 3536.356 332.393 595.509 2902.790 0.445
+```
+
+The spill ladder tells a different story:
+
+```text
+256 -> 512:
+compile.tu 964.562 -> 3737.556 ms 3.87x
+opt.o1.total 893.449 -> 3536.356 ms 3.96x
+opt.live_blocks.pre_dde 89.543 -> 332.393 ms 3.71x
+opt.live_ranges.regalloc 156.086 -> 595.509 ms 3.82x
+opt.regalloc 732.317 -> 2902.790 ms 3.96x
+used_loc_words 114536 -> 228968 2.00x
+hard_loc_words 76390 -> 152678 2.00x
+stack_loc_words 38146 -> 76290 2.00x
+stack_slots 16 -> 16 1.00x
+spills 4096 -> 8192 2.00x
+```
-2. Compress or narrow `used_locs`.
- The remaining straight-line superlinear bucket now tracks
- `opt.alloc.used_loc_words`. The first range allocator sizes the
- occupied-location bitmap by hard-register bits plus candidate stack bits.
- The stack side should become demand-sized, sparse, or split-capable instead
- of paying for every candidate at every point.
+The stack occupancy split is doing its job: stack words scale with allocated
+stack slots and points, not candidate count, and the slot count stays flat at
+16 while spills double. The remaining superlinear behavior is now elsewhere:
+wide single functions still drive full-width value bitset work in block
+liveness, live-range construction, DDE, and the rewrite/allocation walk. In
+other words, phase 7 fixed the specific `points * candidates` occupancy growth,
+but not the broader `instructions * live_words` shape for values that are live
+across very wide regions.
+
+### Phase-8 O1 Wide-Function Rerun
+
+Phase 8 attacks three remaining O1 costs exposed by the spill-pressure ladder:
+
+- allocator candidate ordering now uses `qsort` instead of insertion sort;
+- live-range construction uses per-instruction use/def value lists with
+ generation marks instead of clearing fixed-width temporary bitsets;
+- asm register-constraint application first checks whether the function has any
+ `IR_ASM_BLOCK`, avoiding a full backward live walk for normal C functions.
+
+Normal direct-call and table-shaped ladders remain close to linear at the
+512-to-1024 step:
+
+```text
+straight_main:
+compile.tu 97.938 -> 202.409 ms 2.07x
+opt.o1.total 61.803 -> 125.589 ms 2.03x
+opt.live_blocks.pre_dde 15.131 -> 30.724 ms 2.03x
+opt.live_ranges.regalloc 10.268 -> 20.757 ms 2.02x
+opt.regalloc 23.644 -> 47.980 ms 2.03x
+
+table_main:
+compile.tu 90.607 -> 188.297 ms 2.08x
+opt.o1.total 56.265 -> 115.105 ms 2.05x
+opt.live_blocks.pre_dde 13.915 -> 28.431 ms 2.04x
+opt.live_ranges.regalloc 9.156 -> 18.499 ms 2.02x
+opt.regalloc 21.062 -> 42.722 ms 2.03x
+```
+
+The spill-pressure ladder improves substantially in absolute time:
+
+```text
+groups compile.tu opt.o1.total live_pre live_ranges_reg regalloc hard_words stack_words stack_slots spills
+128 91.797 62.914 9.021 9.213 32.036 38246 19074 16 2048
+256 275.634 200.878 24.166 24.887 104.204 76390 38146 16 4096
+512 958.535 754.326 74.544 76.522 374.253 152678 76290 16 8192
+```
+
+Compared with the phase-7 512-group spill row, this cuts:
+
+```text
+compile.tu 3737.556 -> 958.535 ms 3.90x faster
+opt.o1.total 3536.356 -> 754.326 ms 4.69x faster
+opt.live_blocks.pre_dde 332.393 -> 74.544 ms 4.46x faster
+opt.live_ranges.regalloc 595.509 -> 76.522 ms 7.78x faster
+opt.regalloc 2902.790 -> 374.253 ms 7.76x faster
+```
+
+The 256-to-512 spill step is still superlinear:
+
+```text
+compile.tu 275.634 -> 958.535 ms 3.48x
+opt.o1.total 200.878 -> 754.326 ms 3.76x
+opt.live_blocks.pre_dde 24.166 -> 74.544 ms 3.08x
+opt.live_ranges.regalloc 24.887 -> 76.522 ms 3.07x
+opt.regalloc 104.204 -> 374.253 ms 3.59x
+hard_loc_words 76390 -> 152678 2.00x
+stack_loc_words 38146 -> 76290 2.00x
+stack_slots 16 -> 16 1.00x
+spills 4096 -> 8192 2.00x
+```
+
+The remaining wide-function cost is no longer the old conflict matrix,
+candidate sort, stack occupancy, or no-asm constraint scan. It is concentrated
+in repeated dense/high-watermark live-set work and range-point occupancy walks
+inside one very wide function.
+
+Next investigation should start in:
+
+- `src/opt/pass_live.c`: `opt_live_blocks`, `live_recompute_metrics`, and the
+ final per-value range summary in `opt_live_ranges_build`. These still have
+ high-watermark or all-value loops that show up on wide single functions.
+- `src/opt/pass_lower.c`: `alloc_collect_occupied_hard`,
+ `alloc_collect_occupied_stack`, `alloc_mark_hard_loc`, and
+ `alloc_mark_stack_loc`. These still walk every point in every candidate range;
+ interval/event state per location should replace repeated point scans.
+- `src/opt/pass_lower.c`: `rewrite_func` and `opt_dead_def_elim_with_live`.
+ Both still maintain rolling live bitsets with function-width scratch storage;
+ after range/occupancy work, convert these to set-bit or operand-event updates.
+
+## Performance Priorities
-3. Avoid whole-function contiguous growth where hot passes scan repeatedly.
+1. Continue reducing wide-function high-watermark bitset work.
+ Phase 8 removed several avoidable dense scans and the candidate-sort
+ quadratic, but the spill-pressure ladder is still superlinear. The next
+ target is persistent rolling-live updates and live/block metrics that still
+ scale with the highest live value id rather than the number of live values.
+
+2. Reduce range-point occupancy walks.
+ `hard_loc_words` and `stack_loc_words` scale linearly, but assignment still
+ revisits range points for every candidate. Per-location interval lists or
+ event sweeps should replace repeated point-by-point OR/mark loops on wide
+ functions.
+
+3. Keep the split `used_locs` design.
+ `hard_loc_words` and `stack_loc_words` now scale linearly in both normal and
+ spill-pressure ladders. Stack occupancy is demand-sized by allocated stack
+ slots, so it is no longer the explanation for the wide-spill superlinear
+ timings.
+
+4. Add live-range event metrics.
+ The next timing question is how much work is coming from `instructions *
+ live_words` versus range-point scans and rewrite insertions. Add counters for
+ bitset words touched during live updates, range point visits, assignment
+ point visits, and rewrite inserted instructions.
+
+5. Avoid whole-function contiguous growth where hot passes scan repeatedly.
Large `Val`/block/instruction arrays and dense bit matrices are likely to
hurt cache locality. Segmented arrays may help by reducing large copies and
stabilizing addresses, but pass algorithms need to avoid turning segment
traversal into extra indirection in inner loops.
-4. Keep link/JIT lower priority for now.
+6. Keep link/JIT lower priority for now.
Even at 1024 functions, `link_jit.all` is around `0.5 ms`; copy, reloc, and
icache flush are small and roughly linear. They are not the bottleneck for
submit-to-execution latency in these tests.
-5. Split `compile.c.parse_codegen` further before changing parser/CG data
+7. Split `compile.c.parse_codegen` further before changing parser/CG data
structures.
The non-O1 compile residual is meaningful at large source sizes, but it is
currently broad. More counters are needed before choosing parser, PP, type,
diff --git a/src/opt/ir.h b/src/opt/ir.h
@@ -252,11 +252,6 @@ typedef struct Block {
u8 loop_depth;
u16 pad;
u32 frequency;
- u64* live_in;
- u64* live_out;
- u64* live_use;
- u64* live_def;
- u64** live_after; /* live set after each instruction, length ninsts */
} Block;
typedef enum OptAllocKind {
@@ -327,11 +322,12 @@ typedef struct Func {
u8 opt_has_target;
u8 opt_rewritten;
u16 opt_live_words;
- u16 opt_conflict_words;
u32 opt_used_loc_words;
+ u32 opt_alloc_hard_loc_words;
+ u32 opt_alloc_stack_loc_words;
+ u32 opt_alloc_stack_slots;
u32 opt_position_count;
OptValInfo* val_info; /* indexed by Val, length nvals */
- u64* val_conflicts; /* nvals x opt_conflict_words bit matrix */
Reg opt_hard_regs[OPT_REG_CLASSES][OPT_MAX_HARD_REGS];
u32 opt_hard_reg_count[OPT_REG_CLASSES];
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -1426,6 +1426,12 @@ static void w_func_end(CGTarget* t) {
opt_regalloc(o->f, 0);
metrics_count(o->c, "opt.alloc.used_loc_words",
o->f->opt_used_loc_words);
+ metrics_count(o->c, "opt.alloc.hard_loc_words",
+ o->f->opt_alloc_hard_loc_words);
+ metrics_count(o->c, "opt.alloc.stack_loc_words",
+ o->f->opt_alloc_stack_loc_words);
+ metrics_count(o->c, "opt.alloc.stack_slots",
+ o->f->opt_alloc_stack_slots);
metrics_count(o->c, "opt.alloc.spills", func_spill_alloc_count(o->f));
metrics_count(o->c, "opt.rewrite.reloads", func_spill_load_count(o->f));
metrics_count(o->c, "opt.rewrite.stores", func_spill_store_count(o->f));
diff --git a/src/opt/opt.h b/src/opt/opt.h
@@ -152,7 +152,6 @@ void opt_live_dump_blocks(Func*, const OptLiveInfo*, Writer*);
void opt_live_ranges_build(Func*, const OptLiveInfo*, OptLiveRangeSet*);
void opt_live_dump_ranges(Func*, const OptLiveRangeSet*, Writer*);
void opt_rewrite_dump(Func*, Writer*);
-void opt_live_info(Func*);
void opt_coalesce(Func*);
void opt_regalloc(Func*, int allow_live_range_split);
void opt_combine(Func*); /* code selection: merge dependent insns */
diff --git a/src/opt/pass_cfg.c b/src/opt/pass_cfg.c
@@ -106,7 +106,6 @@ static void prune_unreachable(Func* f, const u8* reachable) {
bl->succ[0] = 0;
bl->succ[1] = 0;
bl->nsucc = 0;
- bl->live_after = NULL;
}
u32 w = 0;
diff --git a/src/opt/pass_live.c b/src/opt/pass_live.c
@@ -343,22 +343,78 @@ typedef struct RangeBuildCtx {
u32 raw_points_cap;
} RangeBuildCtx;
-typedef struct RangeBitsCtx {
- OptBitset* use;
- OptBitset* def;
-} RangeBitsCtx;
+typedef struct RangeInstRefs {
+ Val* uses;
+ Val* defs;
+ u32 nuses;
+ u32 ndefs;
+ u32 use_cap;
+ u32 def_cap;
+ u32* use_mark;
+ u32* def_mark;
+ u32 gen;
+} RangeInstRefs;
+
+static void range_refs_init(Func* f, RangeInstRefs* refs) {
+ memset(refs, 0, sizeof *refs);
+ refs->use_mark = arena_zarray(f->arena, u32, f->nvals ? f->nvals : 1u);
+ refs->def_mark = arena_zarray(f->arena, u32, f->nvals ? f->nvals : 1u);
+ refs->gen = 1;
+}
+
+static void range_refs_reset(Func* f, RangeInstRefs* refs) {
+ refs->nuses = 0;
+ refs->ndefs = 0;
+ ++refs->gen;
+ if (refs->gen) return;
+ memset(refs->use_mark, 0, sizeof(refs->use_mark[0]) *
+ (f->nvals ? f->nvals : 1u));
+ memset(refs->def_mark, 0, sizeof(refs->def_mark[0]) *
+ (f->nvals ? f->nvals : 1u));
+ refs->gen = 1;
+}
+
+static void range_refs_push_use(Func* f, RangeInstRefs* refs, Val v) {
+ if (refs->use_mark[v] == refs->gen) return;
+ refs->use_mark[v] = refs->gen;
+ if (refs->nuses == refs->use_cap) {
+ u32 ncap = refs->use_cap ? refs->use_cap * 2u : 8u;
+ Val* nv = arena_array(f->arena, Val, ncap);
+ if (refs->uses) memcpy(nv, refs->uses, sizeof(refs->uses[0]) * refs->nuses);
+ refs->uses = nv;
+ refs->use_cap = ncap;
+ }
+ refs->uses[refs->nuses++] = v;
+}
+
+static void range_refs_push_def(Func* f, RangeInstRefs* refs, Val v) {
+ if (refs->def_mark[v] == refs->gen) return;
+ refs->def_mark[v] = refs->gen;
+ if (refs->ndefs == refs->def_cap) {
+ u32 ncap = refs->def_cap ? refs->def_cap * 2u : 4u;
+ Val* nv = arena_array(f->arena, Val, ncap);
+ if (refs->defs) memcpy(nv, refs->defs, sizeof(refs->defs[0]) * refs->ndefs);
+ refs->defs = nv;
+ refs->def_cap = ncap;
+ }
+ refs->defs[refs->ndefs++] = v;
+}
+
+static int range_refs_has_def(const RangeInstRefs* refs, Val v) {
+ return refs->def_mark[v] == refs->gen;
+}
static void range_collect_bits(Func* f, Inst* in, Operand* op, int is_def,
void* arg) {
(void)in;
- RangeBitsCtx* c = (RangeBitsCtx*)arg;
+ RangeInstRefs* refs = (RangeInstRefs*)arg;
if (op->kind != OPK_REG) return;
Val v = (Val)op->v.reg;
if (v == VAL_NONE || v >= f->nvals) return;
if (is_def)
- opt_bitset_set(c->def, v);
+ range_refs_push_def(f, refs, v);
else
- opt_bitset_set(c->use, v);
+ range_refs_push_use(f, refs, v);
}
static void range_push_raw_point(RangeBuildCtx* c, u32 p) {
@@ -462,17 +518,25 @@ static void range_add_block_freq(Val v, void* arg) {
typedef struct RangeCallCtx {
Func* f;
OptLiveRangeSet* ranges;
- const OptBitset* def;
+ const RangeInstRefs* refs;
u32 freq;
} RangeCallCtx;
static void range_add_live_across_call(Val v, void* arg) {
RangeCallCtx* c = (RangeCallCtx*)arg;
if (v == VAL_NONE || v >= c->f->nvals) return;
- if (opt_bitset_has(c->def, v)) return;
+ if (range_refs_has_def(c->refs, v)) return;
c->ranges->live_across_call_freq_by_val[v] += c->freq;
}
+static void range_update_live_before(OptBitset* live_after,
+ const RangeInstRefs* refs) {
+ for (u32 i = 0; i < refs->ndefs; ++i)
+ opt_bitset_clear_bit(live_after, refs->defs[i]);
+ for (u32 i = 0; i < refs->nuses; ++i)
+ opt_bitset_set(live_after, refs->uses[i]);
+}
+
static int u32_cmp(const void* a, const void* b) {
u32 av = *(const u32*)a;
u32 bv = *(const u32*)b;
@@ -557,14 +621,12 @@ void opt_live_ranges_build(Func* f, const OptLiveInfo* live,
}
ranges->raw_point_count = raw + 1u;
- OptBitset use;
- OptBitset def;
OptBitset live_after;
OptBitset block_live;
- opt_bitset_init(f->arena, &use, live->words);
- opt_bitset_init(f->arena, &def, live->words);
opt_bitset_init(f->arena, &live_after, live->words);
opt_bitset_init(f->arena, &block_live, live->words);
+ RangeInstRefs refs;
+ range_refs_init(f, &refs);
for (u32 b = 0; b < f->nblocks; ++b) {
Block* bl = &f->blocks[b];
@@ -585,24 +647,21 @@ void opt_live_ranges_build(Func* f, const OptLiveInfo* live,
for (u32 ri = bl->ninsts; ri > 0; --ri) {
u32 i = ri - 1u;
Inst* in = &bl->insts[i];
- opt_bitset_clear(&use);
- opt_bitset_clear(&def);
- RangeBitsCtx bits = {&use, &def};
- live_walk_inst_operands(f, in, range_collect_bits, &bits);
+ range_refs_reset(f, &refs);
+ live_walk_inst_operands(f, in, range_collect_bits, &refs);
if ((IROp)in->op == IR_CALL) {
- RangeCallCtx call_ctx = {f, ranges, &def, bl->frequency};
+ RangeCallCtx call_ctx = {f, ranges, &refs, bl->frequency};
opt_bitset_iter_set(&live_after, range_add_live_across_call, &call_ctx);
}
RangeCloseCtx close_ctx = {&build, raw_start + i, b};
- opt_bitset_iter_set(&def, range_close_def, &close_ctx);
+ for (u32 k = 0; k < refs.ndefs; ++k)
+ range_close_def(refs.defs[k], &close_ctx);
RangeUseCtx use_ctx = {&build, raw_start + i + 1u, b};
- opt_bitset_iter_set(&use, range_open_use, &use_ctx);
- for (u32 wi = 0; wi < live->words; ++wi)
- live_after.words[wi] =
- (live_after.words[wi] & ~def.words[wi]) | use.words[wi];
- opt_bitset_trim(&live_after);
+ for (u32 k = 0; k < refs.nuses; ++k)
+ range_open_use(refs.uses[k], &use_ctx);
+ range_update_live_before(&live_after, &refs);
}
for (Val v = 1; v < f->nvals; ++v) {
diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c
@@ -1,4 +1,5 @@
#include <stdio.h>
+#include <stdlib.h>
#include <string.h>
#include "core/arena.h"
@@ -49,23 +50,6 @@ static void bit_set(u64* bits, Val v) { bits[v / 64u] |= 1ull << (v % 64u); }
static int bit_has(const u64* bits, Val v) {
return (bits[v / 64u] & (1ull << (v % 64u))) != 0;
}
-static int bit_or_changed(u64* dst, const u64* src, u32 n) {
- int changed = 0;
- for (u32 i = 0; i < n; ++i) {
- u64 old = dst[i];
- dst[i] |= src[i];
- changed |= dst[i] != old;
- }
- return changed;
-}
-static int bit_copy_changed(u64* dst, const u64* src, u32 n) {
- int changed = 0;
- for (u32 i = 0; i < n; ++i) {
- changed |= dst[i] != src[i];
- dst[i] = src[i];
- }
- return changed;
-}
typedef void (*OperandWalkFn)(Func*, Inst*, Operand*, int is_def, void*);
@@ -156,35 +140,6 @@ static void walk_inst_operands(Func* f, Inst* in, OperandWalkFn fn, void* ctx) {
}
}
-typedef struct UseDefCtx {
- u64* use;
- u64* def;
- u32 pos;
- u32 freq;
-} UseDefCtx;
-
-static void collect_use_def(Func* f, Inst* in, Operand* op, int is_def,
- void* arg) {
- (void)in;
- UseDefCtx* c = (UseDefCtx*)arg;
- if (op->kind != OPK_REG) return;
- Val v = (Val)op->v.reg;
- if (v == VAL_NONE || v >= f->nvals) return;
- if (!f->val_info) return;
- OptValInfo* vi = &f->val_info[v];
- if (vi->first_pos == 0 || c->pos < vi->first_pos) vi->first_pos = c->pos;
- if (c->pos > vi->last_pos) vi->last_pos = c->pos;
- u32 freq = c->freq ? c->freq : 1u;
- vi->cls = f->val_cls[v];
- if (is_def) {
- vi->def_freq += freq;
- bit_set(c->def, v);
- } else {
- vi->use_freq += freq;
- if (!bit_has(c->def, v)) bit_set(c->use, v);
- }
-}
-
typedef struct BitsCtx {
u64* use;
u64* def;
@@ -203,40 +158,6 @@ static void collect_bits(Func* f, Inst* in, Operand* op, int is_def,
bit_set(c->use, v);
}
-static u64* conflict_row(Func* f, Val v) {
- return f->val_conflicts + ((size_t)v * f->opt_conflict_words);
-}
-
-static void add_conflict(Func* f, Val a, Val b) {
- if (a == VAL_NONE || b == VAL_NONE || a == b) return;
- if (a >= f->nvals || b >= f->nvals) return;
- if (f->val_cls[a] != f->val_cls[b]) return;
- bit_set(conflict_row(f, a), b);
- bit_set(conflict_row(f, b), a);
-}
-
-static void add_conflicts_with_set(Func* f, Val v, const u64* live,
- const u64* skip) {
- if (v == VAL_NONE || v >= f->nvals) return;
- for (Val other = 1; other < f->nvals; ++other) {
- if (skip && bit_has(skip, other)) continue;
- if (bit_has(live, other)) add_conflict(f, v, other);
- }
-}
-
-static void add_pairwise_conflicts(Func* f, const u64* live) {
- for (Val a = 1; a < f->nvals; ++a) {
- if (!bit_has(live, a)) continue;
- for (Val b = a + 1u; b < f->nvals; ++b) {
- if (bit_has(live, b)) add_conflict(f, a, b);
- }
- }
-}
-
-static void copy_bits(u64* dst, const u64* src, u32 words) {
- for (u32 w = 0; w < words; ++w) dst[w] = src[w];
-}
-
static void forbid_val_reg(Func* f, Val v, u8 cls, Reg r) {
if (v == VAL_NONE || v >= f->nvals || cls >= OPT_REG_CLASSES || r >= 32)
return;
@@ -564,147 +485,6 @@ void opt_build_loop_tree(Func* f) {
f->blocks[b].frequency = loop_frequency(f->blocks[b].loop_depth);
}
-void opt_live_info(Func* f) {
- f->opt_live_words = (u16)bit_words(f->nvals);
- f->opt_conflict_words = f->opt_live_words;
- f->opt_position_count = 0;
- u32 words = f->opt_live_words;
- int new_val_info = f->val_info == NULL;
- if (new_val_info) f->val_info = arena_zarray(f->arena, OptValInfo, f->nvals);
- f->val_conflicts =
- arena_zarray(f->arena, u64, (size_t)f->nvals * f->opt_conflict_words);
- for (u32 v = 0; v < f->nvals; ++v) {
- i32 tied = new_val_info ? -1 : f->val_info[v].tied_hard_reg;
- f->val_info[v].first_pos = 0;
- f->val_info[v].last_pos = 0;
- f->val_info[v].live_length = 0;
- f->val_info[v].frequency = 0;
- f->val_info[v].use_freq = 0;
- f->val_info[v].def_freq = 0;
- f->val_info[v].live_block_freq = 0;
- f->val_info[v].live_across_call_freq = 0;
- f->val_info[v].spill_cost = 0;
- f->val_info[v].tied_hard_reg = tied;
- f->val_info[v].hard_reg = REG_NONE;
- f->val_info[v].spill_slot = FRAME_SLOT_NONE;
- f->val_info[v].alloc_kind = OPT_ALLOC_NONE;
- f->val_info[v].cls = f->val_cls[v];
- f->val_info[v].forbidden_hard_regs = 0;
- }
- for (u32 b = 0; b < f->nblocks; ++b) {
- Block* bl = &f->blocks[b];
- bl->live_in = arena_zarray(f->arena, u64, words);
- bl->live_out = arena_zarray(f->arena, u64, words);
- bl->live_use = arena_zarray(f->arena, u64, words);
- bl->live_def = arena_zarray(f->arena, u64, words);
- bl->live_after = NULL;
- UseDefCtx c;
- memset(&c, 0, sizeof c);
- c.use = bl->live_use;
- c.def = bl->live_def;
- c.freq = bl->frequency;
- for (u32 i = 0; i < bl->ninsts; ++i) {
- c.pos = ++f->opt_position_count;
- walk_inst_operands(f, &bl->insts[i], collect_use_def, &c);
- }
- }
- int changed;
- do {
- changed = 0;
- for (u32 bi = f->nblocks; bi > 0; --bi) {
- u32 b = bi - 1u;
- Block* bl = &f->blocks[b];
- u64* new_out = arena_zarray(f->arena, u64, words);
- u64* new_in = arena_zarray(f->arena, u64, words);
- for (u32 s = 0; s < bl->nsucc; ++s) {
- u32 t = bl->succ[s];
- if (t < f->nblocks)
- bit_or_changed(new_out, f->blocks[t].live_in, words);
- }
- for (u32 w = 0; w < words; ++w)
- new_in[w] = bl->live_use[w] | (new_out[w] & ~bl->live_def[w]);
- changed |= bit_copy_changed(bl->live_out, new_out, words);
- changed |= bit_copy_changed(bl->live_in, new_in, words);
- }
- } while (changed);
-
- for (u32 b = 0; b < f->nblocks; ++b) {
- Block* bl = &f->blocks[b];
- for (Val v = 1; v < f->nvals; ++v) {
- if (bit_has(bl->live_in, v) || bit_has(bl->live_out, v)) {
- OptValInfo* vi = &f->val_info[v];
- if (vi->first_pos == 0) vi->first_pos = 1;
- if (vi->last_pos < f->opt_position_count)
- vi->last_pos = f->opt_position_count;
- vi->live_block_freq += bl->frequency;
- }
- }
- }
-
- for (u32 b = 0; b < f->nblocks; ++b) {
- Block* bl = &f->blocks[b];
- bl->live_after = arena_array(f->arena, u64*, bl->ninsts ? bl->ninsts : 1u);
- u64* live = arena_zarray(f->arena, u64, words);
- copy_bits(live, bl->live_out, words);
-
- add_pairwise_conflicts(f, bl->live_in);
-
- for (u32 ri = bl->ninsts; ri > 0; --ri) {
- u32 i = ri - 1u;
- Inst* in = &bl->insts[i];
- u64* after = arena_zarray(f->arena, u64, words);
- copy_bits(after, live, words);
- bl->live_after[i] = after;
-
- u64* use = arena_zarray(f->arena, u64, words);
- u64* def = arena_zarray(f->arena, u64, words);
- BitsCtx bc = {use, def};
- walk_inst_operands(f, in, collect_bits, &bc);
-
- for (Val d = 1; d < f->nvals; ++d) {
- if (!bit_has(def, d)) continue;
- for (Val u = 1; u < f->nvals; ++u)
- if (bit_has(use, u)) add_conflict(f, d, u);
- }
-
- for (Val v = 1; v < f->nvals; ++v) {
- if (bit_has(def, v)) {
- add_conflicts_with_set(f, v, after, def);
- for (Val other = v + 1u; other < f->nvals; ++other)
- if (bit_has(after, v) && bit_has(def, other) &&
- bit_has(after, other))
- add_conflict(f, v, other);
- }
- if (bit_has(use, v)) {
- add_conflicts_with_set(f, v, after, def);
- }
- }
- add_pairwise_conflicts(f, use);
-
- if ((IROp)in->op == IR_CALL) {
- for (Val v = 1; v < f->nvals; ++v) {
- if (bit_has(def, v)) continue;
- if (bit_has(after, v))
- f->val_info[v].live_across_call_freq += bl->frequency;
- }
- }
- if ((IROp)in->op == IR_ASM_BLOCK)
- apply_asm_register_constraints(f, in, use, def, after);
-
- for (u32 w = 0; w < words; ++w) live[w] = (live[w] & ~def[w]) | use[w];
- }
- }
-
- for (Val v = 1; v < f->nvals; ++v) {
- OptValInfo* vi = &f->val_info[v];
- if (vi->first_pos && vi->last_pos >= vi->first_pos)
- vi->live_length = vi->last_pos - vi->first_pos + 1u;
- vi->spill_cost = (vi->use_freq * 2u) + vi->def_freq +
- vi->live_across_call_freq + vi->live_block_freq;
- vi->frequency = vi->spill_cost;
- }
-}
-
static int hard_available(Func* f, u8 cls, Reg r) {
if (cls >= OPT_REG_CLASSES) return 0;
for (u32 i = 0; i < f->opt_hard_reg_count[cls]; ++i)
@@ -755,12 +535,15 @@ typedef struct OptAllocCandidate {
typedef struct OptAllocator {
OptLoc* locs;
- u64* used_locs;
+ u64* hard_used_locs;
+ u64* stack_used_locs;
u32 point_count;
- u32 loc_words;
+ u32 hard_loc_words;
+ u32 stack_loc_words;
u32 hard_loc_bits;
FrameSlot* stack_slots;
u32 stack_slot_count;
+ u32 stack_slot_cap;
} OptAllocator;
static int alloc_candidate_higher(const OptAllocCandidate* a,
@@ -771,16 +554,16 @@ static int alloc_candidate_higher(const OptAllocCandidate* a,
return a->v < b->v;
}
+static int alloc_candidate_cmp(const void* va, const void* vb) {
+ const OptAllocCandidate* a = (const OptAllocCandidate*)va;
+ const OptAllocCandidate* b = (const OptAllocCandidate*)vb;
+ if (alloc_candidate_higher(a, b)) return -1;
+ if (alloc_candidate_higher(b, a)) return 1;
+ return 0;
+}
+
static void alloc_sort_candidates(OptAllocCandidate* cands, u32 n) {
- for (u32 i = 1; i < n; ++i) {
- OptAllocCandidate key = cands[i];
- u32 j = i;
- while (j > 0 && alloc_candidate_higher(&key, &cands[j - 1u])) {
- cands[j] = cands[j - 1u];
- --j;
- }
- cands[j] = key;
- }
+ if (n > 1) qsort(cands, n, sizeof(cands[0]), alloc_candidate_cmp);
}
static void opt_init_val_info_from_ranges(Func* f,
@@ -822,8 +605,6 @@ static void opt_init_val_info_from_ranges(Func* f,
if (old_frequency > vi->frequency) vi->frequency = old_frequency;
}
f->val_info = info;
- f->val_conflicts = NULL;
- f->opt_conflict_words = 0;
}
static void bits_clear(u64* bits, u32 words) {
@@ -837,17 +618,28 @@ static void live_update_before(u64* live, const u64* use, const u64* def,
static void live_copy_block_out(Func* f, const OptLiveInfo* live_info, u32 b,
u64* live, u32 words) {
+ (void)f;
bits_clear(live, words);
if (live_info) {
const OptBitset* out = &live_info->blocks[b].live_out;
for (u32 w = 0; w < words && w < out->nwords; ++w) live[w] = out->words[w];
- } else if (b < f->nblocks && f->blocks[b].live_out) {
- copy_bits(live, f->blocks[b].live_out, words);
}
}
static void opt_apply_asm_constraints_from_live(Func* f,
const OptLiveInfo* live_info) {
+ int has_asm = 0;
+ for (u32 b = 0; b < f->nblocks && !has_asm; ++b) {
+ Block* bl = &f->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ if ((IROp)bl->insts[i].op == IR_ASM_BLOCK) {
+ has_asm = 1;
+ break;
+ }
+ }
+ }
+ if (!has_asm) return;
+
u32 words = live_info ? live_info->words : f->opt_live_words;
if (!words) words = bit_words(f->nvals);
f->opt_live_words = (u16)words;
@@ -883,26 +675,68 @@ static int spill_slot_compatible(Func* f, FrameSlot fs, Val v) {
return 1;
}
-static void alloc_collect_occupied(const OptAllocator* a,
- const OptLiveRangeSet* ranges, Val v,
- u64* occupied) {
- loc_bits_clear(occupied, a->loc_words);
+static void alloc_collect_occupied_hard(const OptAllocator* a,
+ const OptLiveRangeSet* ranges, Val v,
+ u64* occupied) {
+ loc_bits_clear(occupied, a->hard_loc_words);
for (u32 r = ranges->first_range_by_val[v]; r != OPT_RANGE_NONE;
r = ranges->ranges[r].next) {
const OptLiveRange* lr = &ranges->ranges[r];
for (u32 p = lr->start; p < lr->end && p < a->point_count; ++p)
- loc_bits_or(occupied, a->used_locs + ((size_t)p * a->loc_words),
- a->loc_words);
+ loc_bits_or(occupied,
+ a->hard_used_locs + ((size_t)p * a->hard_loc_words),
+ a->hard_loc_words);
}
}
-static void alloc_mark_loc(OptAllocator* a, const OptLiveRangeSet* ranges,
- Val v, u32 bit) {
+static void alloc_collect_occupied_stack(const OptAllocator* a,
+ const OptLiveRangeSet* ranges, Val v,
+ u64* occupied) {
+ loc_bits_clear(occupied, a->stack_loc_words);
+ if (!a->stack_loc_words) return;
for (u32 r = ranges->first_range_by_val[v]; r != OPT_RANGE_NONE;
r = ranges->ranges[r].next) {
const OptLiveRange* lr = &ranges->ranges[r];
for (u32 p = lr->start; p < lr->end && p < a->point_count; ++p)
- loc_bit_set(a->used_locs + ((size_t)p * a->loc_words), bit);
+ loc_bits_or(occupied,
+ a->stack_used_locs + ((size_t)p * a->stack_loc_words),
+ a->stack_loc_words);
+ }
+}
+
+static void alloc_mark_hard_loc(OptAllocator* a, const OptLiveRangeSet* ranges,
+ Val v, u32 bit) {
+ for (u32 r = ranges->first_range_by_val[v]; r != OPT_RANGE_NONE;
+ r = ranges->ranges[r].next) {
+ const OptLiveRange* lr = &ranges->ranges[r];
+ for (u32 p = lr->start; p < lr->end && p < a->point_count; ++p)
+ loc_bit_set(a->hard_used_locs + ((size_t)p * a->hard_loc_words), bit);
+ }
+}
+
+static void alloc_grow_stack_words(Func* f, OptAllocator* a, u32 need_words) {
+ if (need_words <= a->stack_loc_words) return;
+ u64* next =
+ arena_zarray(f->arena, u64, (size_t)a->point_count * need_words);
+ for (u32 p = 0; p < a->point_count; ++p) {
+ u64* dst = next + ((size_t)p * need_words);
+ if (a->stack_loc_words) {
+ const u64* src = a->stack_used_locs + ((size_t)p * a->stack_loc_words);
+ for (u32 w = 0; w < a->stack_loc_words; ++w) dst[w] = src[w];
+ }
+ }
+ a->stack_used_locs = next;
+ a->stack_loc_words = need_words;
+}
+
+static void alloc_mark_stack_loc(OptAllocator* a, const OptLiveRangeSet* ranges,
+ Val v, u32 stack_idx) {
+ for (u32 r = ranges->first_range_by_val[v]; r != OPT_RANGE_NONE;
+ r = ranges->ranges[r].next) {
+ const OptLiveRange* lr = &ranges->ranges[r];
+ for (u32 p = lr->start; p < lr->end && p < a->point_count; ++p)
+ loc_bit_set(a->stack_used_locs + ((size_t)p * a->stack_loc_words),
+ stack_idx);
}
}
@@ -915,7 +749,7 @@ static void alloc_assign_hard(Func* f, OptAllocator* a,
a->locs[v].cls = vi->cls;
a->locs[v].hard_reg = r;
a->locs[v].spill_slot = FRAME_SLOT_NONE;
- alloc_mark_loc(a, ranges, v, hard_loc_bit(vi->cls, r));
+ alloc_mark_hard_loc(a, ranges, v, hard_loc_bit(vi->cls, r));
}
static void alloc_assign_stack(Func* f, OptAllocator* a,
@@ -925,8 +759,7 @@ static void alloc_assign_stack(Func* f, OptAllocator* a,
u32 stack_idx = a->stack_slot_count;
FrameSlot slot = FRAME_SLOT_NONE;
for (u32 i = 0; i < a->stack_slot_count; ++i) {
- u32 bit = a->hard_loc_bits + i;
- if (loc_bit_has(occupied, bit)) continue;
+ if (loc_bit_has(occupied, i)) continue;
if (!spill_slot_compatible(f, a->stack_slots[i], v)) continue;
stack_idx = i;
slot = a->stack_slots[i];
@@ -934,8 +767,18 @@ static void alloc_assign_stack(Func* f, OptAllocator* a,
}
if (slot == FRAME_SLOT_NONE) {
slot = spill_slot_for(f, v);
+ if (a->stack_slot_count == a->stack_slot_cap) {
+ u32 ncap = a->stack_slot_cap ? a->stack_slot_cap * 2u : 16u;
+ FrameSlot* ns = arena_array(f->arena, FrameSlot, ncap);
+ if (a->stack_slots)
+ memcpy(ns, a->stack_slots, sizeof(a->stack_slots[0]) *
+ a->stack_slot_count);
+ a->stack_slots = ns;
+ a->stack_slot_cap = ncap;
+ }
a->stack_slots[a->stack_slot_count++] = slot;
stack_idx = a->stack_slot_count - 1u;
+ alloc_grow_stack_words(f, a, loc_bit_words(a->stack_slot_count));
} else {
vi->spill_slot = slot;
}
@@ -944,7 +787,7 @@ static void alloc_assign_stack(Func* f, OptAllocator* a,
a->locs[v].cls = vi->cls;
a->locs[v].hard_reg = REG_NONE;
a->locs[v].spill_slot = slot;
- alloc_mark_loc(a, ranges, v, a->hard_loc_bits + stack_idx);
+ alloc_mark_stack_loc(a, ranges, v, stack_idx);
}
static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges,
@@ -955,13 +798,12 @@ static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges,
u32 ncands = 0;
for (Val v = 1; v < f->nvals; ++v)
if (ranges->first_range_by_val[v] != OPT_RANGE_NONE) ++ncands;
- u32 loc_bits = a->hard_loc_bits + ncands;
- a->loc_words = loc_bit_words(loc_bits);
- f->opt_used_loc_words = a->point_count * a->loc_words;
+ a->hard_loc_words = loc_bit_words(a->hard_loc_bits);
a->locs = arena_zarray(f->arena, OptLoc, f->nvals ? f->nvals : 1u);
- a->used_locs =
- arena_zarray(f->arena, u64, (size_t)a->point_count * a->loc_words);
- a->stack_slots = arena_array(f->arena, FrameSlot, ncands ? ncands : 1u);
+ a->hard_used_locs =
+ arena_zarray(f->arena, u64, (size_t)a->point_count * a->hard_loc_words);
+ a->stack_slots = NULL;
+ a->stack_slot_cap = 0;
OptAllocCandidate* cands =
arena_array(f->arena, OptAllocCandidate, ncands ? ncands : 1u);
@@ -978,12 +820,15 @@ static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges,
}
alloc_sort_candidates(cands, n);
- u64* occupied = arena_zarray(f->arena, u64, a->loc_words ? a->loc_words : 1u);
+ u64* hard_occupied =
+ arena_zarray(f->arena, u64, a->hard_loc_words ? a->hard_loc_words : 1u);
+ u64* stack_occupied =
+ arena_zarray(f->arena, u64, loc_bit_words(ncands ? ncands : 1u));
for (u32 i = 0; i < n; ++i) {
Val v = cands[i].v;
OptValInfo* vi = &f->val_info[v];
u8 cls = vi->cls;
- alloc_collect_occupied(a, ranges, v, occupied);
+ alloc_collect_occupied_hard(a, ranges, v, hard_occupied);
if (vi->tied_hard_reg >= 0) {
Reg fixed = (Reg)vi->tied_hard_reg;
if (!hard_available(f, cls, fixed)) {
@@ -999,7 +844,8 @@ static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges,
"opt regalloc: fixed hard reg %u is clobbered",
(unsigned)fixed);
}
- if (fixed >= 32 || loc_bit_has(occupied, hard_loc_bit(cls, fixed))) {
+ if (fixed >= 32 ||
+ loc_bit_has(hard_occupied, hard_loc_bit(cls, fixed))) {
SrcLoc loc = {0, 0, 0};
compiler_panic(f->c, loc, "opt regalloc: conflicting fixed hard reg %u",
(unsigned)fixed);
@@ -1013,13 +859,21 @@ static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges,
Reg hr = f->opt_hard_regs[cls][r];
if (hr >= 32) continue;
if (vi->forbidden_hard_regs & (1u << hr)) continue;
- if (loc_bit_has(occupied, hard_loc_bit(cls, hr))) continue;
+ if (loc_bit_has(hard_occupied, hard_loc_bit(cls, hr))) continue;
alloc_assign_hard(f, a, ranges, v, hr);
found = 1;
break;
}
- if (!found) alloc_assign_stack(f, a, ranges, v, occupied);
+ if (!found) {
+ alloc_collect_occupied_stack(a, ranges, v, stack_occupied);
+ alloc_assign_stack(f, a, ranges, v, stack_occupied);
+ }
}
+ f->opt_alloc_hard_loc_words = a->point_count * a->hard_loc_words;
+ f->opt_alloc_stack_loc_words = a->point_count * a->stack_loc_words;
+ f->opt_alloc_stack_slots = a->stack_slot_count;
+ f->opt_used_loc_words =
+ f->opt_alloc_hard_loc_words + f->opt_alloc_stack_loc_words;
}
typedef struct RewriteList {
@@ -1028,6 +882,12 @@ typedef struct RewriteList {
u32 cap;
} RewriteList;
+typedef struct RewriteOut {
+ Inst* data;
+ u32 cap;
+ u32 start;
+} RewriteOut;
+
static Inst* list_push(Func* f, RewriteList* l, IROp op) {
if (l->n == l->cap) {
u32 ncap = l->cap ? l->cap * 2u : 16u;
@@ -1042,6 +902,46 @@ static Inst* list_push(Func* f, RewriteList* l, IROp op) {
return in;
}
+static void list_reset(RewriteList* l) {
+ if (l) l->n = 0;
+}
+
+static void out_init(Func* f, RewriteOut* out, u32 cap) {
+ out->cap = cap ? cap : 16u;
+ out->data = arena_zarray(f->arena, Inst, out->cap);
+ out->start = out->cap;
+}
+
+static Inst* out_push_front(Func* f, RewriteOut* out, IROp op) {
+ if (out->start == 0) {
+ u32 n = out->cap;
+ u32 ncap = out->cap ? out->cap * 2u : 16u;
+ Inst* nb = arena_zarray(f->arena, Inst, ncap);
+ if (out->data && n)
+ memcpy(nb + (ncap - n), out->data + out->start, sizeof(Inst) * n);
+ out->data = nb;
+ out->cap = ncap;
+ out->start = ncap - n;
+ }
+ Inst* in = &out->data[--out->start];
+ memset(in, 0, sizeof *in);
+ in->op = (u16)op;
+ return in;
+}
+
+static void out_prepend_list_reverse(Func* f, RewriteOut* out,
+ const RewriteList* list) {
+ for (u32 i = list->n; i > 0; --i) {
+ Inst* dst = out_push_front(f, out, (IROp)list->data[i - 1u].op);
+ *dst = list->data[i - 1u];
+ }
+}
+
+static void out_prepend_inst(Func* f, RewriteOut* out, const Inst* in) {
+ Inst* dst = out_push_front(f, out, (IROp)in->op);
+ *dst = *in;
+}
+
static MemAccess spill_mem(Func* f, Val v) {
MemAccess m;
memset(&m, 0, sizeof m);
@@ -1222,8 +1122,13 @@ static void rewrite_func(Func* f, const OptLiveInfo* live_info) {
u64* def = arena_zarray(f->arena, u64, words ? words : 1u);
for (u32 b = 0; b < f->nblocks; ++b) {
Block* bl = &f->blocks[b];
- RewriteList rev;
- memset(&rev, 0, sizeof rev);
+ RewriteOut out;
+ out_init(f, &out, bl->ninsts + 16u);
+ RewriteList before, after, call_saves, call_restores;
+ memset(&before, 0, sizeof before);
+ memset(&after, 0, sizeof after);
+ memset(&call_saves, 0, sizeof call_saves);
+ memset(&call_restores, 0, sizeof call_restores);
live_copy_block_out(f, live_info, b, live, words);
for (u32 ri = bl->ninsts; ri > 0; --ri) {
@@ -1233,9 +1138,10 @@ static void rewrite_func(Func* f, const OptLiveInfo* live_info) {
bits_clear(def, words);
BitsCtx db = {use, def};
walk_inst_operands(f, &in, collect_bits, &db);
- RewriteList before, after;
- memset(&before, 0, sizeof before);
- memset(&after, 0, sizeof after);
+ list_reset(&before);
+ list_reset(&after);
+ list_reset(&call_saves);
+ list_reset(&call_restores);
RewriteCtx ctx;
memset(&ctx, 0, sizeof ctx);
ctx.before = &before;
@@ -1252,38 +1158,21 @@ static void rewrite_func(Func* f, const OptLiveInfo* live_info) {
} else {
walk_inst_operands(f, &in, rewrite_one_operand, &ctx);
}
- RewriteList seq;
- memset(&seq, 0, sizeof seq);
- for (u32 k = 0; k < before.n; ++k) {
- Inst* dst = list_push(f, &seq, (IROp)before.data[k].op);
- *dst = before.data[k];
- }
- if ((IROp)in.op == IR_CALL)
- append_live_call_saves(f, &seq, live, def, words, 0);
- Inst* dst = list_push(f, &seq, (IROp)in.op);
- *dst = in;
- if ((IROp)in.op == IR_CALL)
- append_live_call_saves(f, &seq, live, def, words, 1);
- for (u32 k = 0; k < after.n; ++k) {
- Inst* ad = list_push(f, &seq, (IROp)after.data[k].op);
- *ad = after.data[k];
+ if ((IROp)in.op == IR_CALL) {
+ append_live_call_saves(f, &call_saves, live, def, words, 0);
+ append_live_call_saves(f, &call_restores, live, def, words, 1);
}
- for (u32 k = seq.n; k > 0; --k) {
- Inst* rd = list_push(f, &rev, (IROp)seq.data[k - 1u].op);
- *rd = seq.data[k - 1u];
- }
+ out_prepend_list_reverse(f, &out, &after);
+ out_prepend_list_reverse(f, &out, &call_restores);
+ out_prepend_inst(f, &out, &in);
+ out_prepend_list_reverse(f, &out, &call_saves);
+ out_prepend_list_reverse(f, &out, &before);
live_update_before(live, use, def, words);
}
- for (u32 i = 0; i < rev.n / 2u; ++i) {
- Inst tmp = rev.data[i];
- rev.data[i] = rev.data[rev.n - 1u - i];
- rev.data[rev.n - 1u - i] = tmp;
- }
- bl->insts = rev.data;
- bl->ninsts = rev.n;
- bl->cap = rev.cap;
- bl->live_after = NULL;
+ bl->insts = out.data + out.start;
+ bl->ninsts = out.cap - out.start;
+ bl->cap = bl->ninsts;
}
f->opt_rewritten = 1;
}
@@ -1328,8 +1217,7 @@ void opt_dead_def_elim_with_live(Func* f, const OptLiveInfo* live_info) {
for (u32 b = 0; b < f->nblocks; ++b) {
Block* bl = &f->blocks[b];
u64* live = arena_zarray(f->arena, u64, words);
- const u64* live_out =
- live_info ? live_info->blocks[b].live_out.words : bl->live_out;
+ const u64* live_out = live_info ? live_info->blocks[b].live_out.words : NULL;
if (live_out) {
for (u32 w = 0; w < words; ++w) live[w] = live_out[w];
}
@@ -1365,13 +1253,9 @@ void opt_dead_def_elim_with_live(Func* f, const OptLiveInfo* live_info) {
}
void opt_dead_def_elim(Func* f) {
- if (f->nblocks && !f->blocks[0].live_out) {
- OptLiveInfo live;
- opt_live_blocks(f, &live);
- opt_dead_def_elim_with_live(f, &live);
- return;
- }
- opt_dead_def_elim_with_live(f, NULL);
+ OptLiveInfo live;
+ opt_live_blocks(f, &live);
+ opt_dead_def_elim_with_live(f, &live);
}
void opt_regalloc(Func* f, int allow_live_range_split) {
@@ -1624,17 +1508,10 @@ static int inst_defines_phys_reg(const Inst* in, const Operand* r) {
}
}
-static int block_live_out_has_phys_reg(Func* f, const Block* bl,
- const Operand* r) {
- if (!f->val_info || !bl->live_out || !r || r->kind != OPK_REG) return 0;
- for (Val v = 1; v < f->nvals; ++v) {
- if (!bit_has(bl->live_out, v)) continue;
- if (f->val_info[v].alloc_kind != OPT_ALLOC_HARD) continue;
- if (f->val_info[v].cls == r->cls && f->val_info[v].hard_reg == r->v.reg)
- return 1;
- }
- return 0;
-}
+typedef struct HardBlockLive HardBlockLive;
+static HardBlockLive* maybe_build_hard_live(Func* f);
+static int block_live_out_has_phys_reg(Func* f, const HardBlockLive* hard_live,
+ u32 block, const Operand* r);
static int copy_fold_slot(const Inst* in, u32 idx) {
switch ((IROp)in->op) {
@@ -1680,7 +1557,8 @@ static int identical_convert_pair(const Inst* a, const Inst* b) {
a->opnds[0].type == b->opnds[0].type;
}
-static int find_single_direct_use(Func* f, Block* bl, u32 def_i,
+static int find_single_direct_use(Func* f, Block* bl,
+ const HardBlockLive* hard_live, u32 def_i,
const Operand* def, const Operand* src,
int check_src, int imm_fold, int conv_fold,
u32* use_i_out, u32* op_i_out) {
@@ -1726,13 +1604,15 @@ static int find_single_direct_use(Func* f, Block* bl, u32 def_i,
if (total_uses != 1) return 0;
if (!found) return 0;
- if (!killed && block_live_out_has_phys_reg(f, bl, def)) return 0;
+ if (!killed && block_live_out_has_phys_reg(f, hard_live, bl->id, def))
+ return 0;
*use_i_out = found_i;
*op_i_out = found_op;
return 1;
}
-static void opt_combine_fold_block(Func* f, Block* bl) {
+static void opt_combine_fold_block(Func* f, Block* bl,
+ const HardBlockLive* hard_live) {
for (u32 i = 0; i < bl->ninsts; ++i) {
Inst* in = &bl->insts[i];
u32 use_i = 0;
@@ -1740,16 +1620,16 @@ static void opt_combine_fold_block(Func* f, Block* bl) {
if ((IROp)in->op == IR_COPY && in->nopnds >= 2 &&
in->opnds[0].kind == OPK_REG && in->opnds[1].kind == OPK_REG &&
!same_phys_reg(&in->opnds[0], &in->opnds[1]) &&
- find_single_direct_use(f, bl, i, &in->opnds[0], &in->opnds[1], 1, 0, 0,
- &use_i, &op_i)) {
+ find_single_direct_use(f, bl, hard_live, i, &in->opnds[0],
+ &in->opnds[1], 1, 0, 0, &use_i, &op_i)) {
bl->insts[use_i].opnds[op_i] = in->opnds[1];
continue;
}
if ((IROp)in->op == IR_LOAD_IMM && in->nopnds >= 1 &&
in->opnds[0].kind == OPK_REG &&
- find_single_direct_use(f, bl, i, &in->opnds[0], NULL, 0, 1, 0, &use_i,
- &op_i)) {
+ find_single_direct_use(f, bl, hard_live, i, &in->opnds[0], NULL, 0, 1,
+ 0, &use_i, &op_i)) {
Operand imm = in->opnds[0];
imm.kind = OPK_IMM;
imm.v.imm = in->extra.imm;
@@ -1759,17 +1639,18 @@ static void opt_combine_fold_block(Func* f, Block* bl) {
if ((IROp)in->op == IR_CONVERT && in->nopnds >= 2 &&
in->opnds[0].kind == OPK_REG && in->opnds[1].kind == OPK_REG &&
- find_single_direct_use(f, bl, i, &in->opnds[0], &in->opnds[1], 1, 0, 1,
- &use_i, &op_i)) {
+ find_single_direct_use(f, bl, hard_live, i, &in->opnds[0],
+ &in->opnds[1], 1, 0, 1, &use_i, &op_i)) {
bl->insts[use_i].opnds[op_i] = in->opnds[1];
}
}
}
void opt_combine(Func* f) {
+ HardBlockLive* hard_live = maybe_build_hard_live(f);
for (u32 b = 0; b < f->nblocks; ++b) {
Block* bl = &f->blocks[b];
- opt_combine_fold_block(f, bl);
+ opt_combine_fold_block(f, bl, hard_live);
u32 w = 0;
for (u32 i = 0; i < bl->ninsts; ++i) {
Inst* in = &bl->insts[i];
@@ -1853,6 +1734,13 @@ typedef struct HardRegSet {
u32 cls[OPT_REG_CLASSES];
} HardRegSet;
+struct HardBlockLive {
+ HardRegSet live_in;
+ HardRegSet live_out;
+ HardRegSet live_use;
+ HardRegSet live_def;
+};
+
static void hard_add(HardRegSet* s, u8 cls, Reg r) {
if (cls >= OPT_REG_CLASSES || r >= 32) return;
s->cls[cls] |= 1u << r;
@@ -1870,6 +1758,23 @@ static int hard_intersects(const HardRegSet* a, const HardRegSet* b) {
return 0;
}
+static int hard_eq(const HardRegSet* a, const HardRegSet* b) {
+ for (u32 c = 0; c < OPT_REG_CLASSES; ++c)
+ if (a->cls[c] != b->cls[c]) return 0;
+ return 1;
+}
+
+static void hard_or(HardRegSet* dst, const HardRegSet* src) {
+ for (u32 c = 0; c < OPT_REG_CLASSES; ++c) dst->cls[c] |= src->cls[c];
+}
+
+static void hard_live_in_from_out(HardRegSet* dst, const HardRegSet* use,
+ const HardRegSet* out,
+ const HardRegSet* def) {
+ for (u32 c = 0; c < OPT_REG_CLASSES; ++c)
+ dst->cls[c] = use->cls[c] | (out->cls[c] & ~def->cls[c]);
+}
+
static void hard_live_step(HardRegSet* live, const HardRegSet* use,
const HardRegSet* def) {
for (u32 c = 0; c < OPT_REG_CLASSES; ++c)
@@ -2008,23 +1913,84 @@ static void hard_inst_use_def(Func* f, const Inst* in, HardRegSet* use,
}
}
-static HardRegSet hard_live_out_from_values(Func* f, const Block* bl) {
- HardRegSet live;
- memset(&live, 0, sizeof live);
- if (!bl->live_out || !f->val_info) return live;
- for (Val v = 1; v < f->nvals; ++v) {
- if (!bit_has(bl->live_out, v)) continue;
- if (f->val_info[v].alloc_kind != OPT_ALLOC_HARD) continue;
- hard_add(&live, f->val_info[v].cls, f->val_info[v].hard_reg);
+static void hard_live_blocks(Func* f, HardBlockLive* live) {
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ HardRegSet seen_def;
+ memset(&seen_def, 0, sizeof seen_def);
+ memset(&live[b], 0, sizeof live[b]);
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ HardRegSet use, def;
+ hard_inst_use_def(f, &bl->insts[i], &use, &def);
+ for (u32 c = 0; c < OPT_REG_CLASSES; ++c)
+ live[b].live_use.cls[c] |= use.cls[c] & ~seen_def.cls[c];
+ hard_or(&seen_def, &def);
+ hard_or(&live[b].live_def, &def);
+ }
}
+
+ int changed;
+ do {
+ changed = 0;
+ for (u32 bi = f->nblocks; bi > 0; --bi) {
+ u32 b = bi - 1u;
+ Block* bl = &f->blocks[b];
+ HardRegSet new_out, new_in;
+ memset(&new_out, 0, sizeof new_out);
+ for (u32 s = 0; s < bl->nsucc; ++s) {
+ u32 t = bl->succ[s];
+ if (t < f->nblocks) hard_or(&new_out, &live[t].live_in);
+ }
+ hard_live_in_from_out(&new_in, &live[b].live_use, &new_out,
+ &live[b].live_def);
+ if (!hard_eq(&live[b].live_out, &new_out)) {
+ live[b].live_out = new_out;
+ changed = 1;
+ }
+ if (!hard_eq(&live[b].live_in, &new_in)) {
+ live[b].live_in = new_in;
+ changed = 1;
+ }
+ }
+ } while (changed);
+}
+
+static int hard_live_out_has_phys_reg(const HardBlockLive* live,
+ const Operand* r) {
+ if (!live || !r || r->kind != OPK_REG || r->cls >= OPT_REG_CLASSES ||
+ r->v.reg >= 32)
+ return 0;
+ return (live->live_out.cls[r->cls] & (1u << r->v.reg)) != 0;
+}
+
+static HardBlockLive* maybe_build_hard_live(Func* f) {
+ if (!f->opt_rewritten) return NULL;
+ HardBlockLive* live = arena_zarray(f->arena, HardBlockLive,
+ f->nblocks ? f->nblocks : 1u);
+ hard_live_blocks(f, live);
return live;
}
+static HardRegSet hard_live_out_for_block(const HardBlockLive* live) {
+ HardRegSet out;
+ memset(&out, 0, sizeof out);
+ if (live) out = live->live_out;
+ return out;
+}
+
+static int block_live_out_has_phys_reg(Func* f, const HardBlockLive* hard_live,
+ u32 block, const Operand* r) {
+ (void)f;
+ if (!hard_live || block >= f->nblocks) return 0;
+ return hard_live_out_has_phys_reg(&hard_live[block], r);
+}
+
void opt_dce(Func* f) {
+ HardBlockLive* hard_live = maybe_build_hard_live(f);
for (u32 b = 0; b < f->nblocks; ++b) {
Block* bl = &f->blocks[b];
if (f->opt_rewritten) {
- HardRegSet live = hard_live_out_from_values(f, bl);
+ HardRegSet live = hard_live_out_for_block(hard_live ? &hard_live[b] : NULL);
Inst* new_insts = arena_array(f->arena, Inst, bl->ninsts);
u32 w = 0;
for (u32 ri = bl->ninsts; ri > 0; --ri) {
diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c
@@ -289,10 +289,6 @@ static void emit_test_branch(Func* f, u32 b, u32 taken, u32 fallthrough,
f->blocks[b].nsucc = 2;
}
-static int bit_has(const u64* bits, Val v) {
- return (bits[v / 64u] & (1ull << (v % 64u))) != 0;
-}
-
static int bytes_contains(const unsigned char* data, size_t len,
const char* needle) {
size_t nlen = strlen(needle);
@@ -302,9 +298,28 @@ static int bytes_contains(const unsigned char* data, size_t len,
return 0;
}
-static int val_conflicts(const Func* f, Val a, Val b) {
- const u64* bits = f->val_conflicts + ((size_t)a * f->opt_conflict_words);
- return bit_has(bits, b);
+static void ensure_test_val_info(Func* f) {
+ if (f->val_info) return;
+ f->val_info = arena_zarray(f->arena, OptValInfo, f->nvals ? f->nvals : 1u);
+ for (Val v = 0; v < f->nvals; ++v) {
+ f->val_info[v].tied_hard_reg = -1;
+ f->val_info[v].hard_reg = REG_NONE;
+ f->val_info[v].spill_slot = FRAME_SLOT_NONE;
+ f->val_info[v].cls = f->val_cls[v];
+ }
+}
+
+static int ranges_overlap(const OptLiveRangeSet* ranges, Val a, Val b) {
+ for (u32 ar = ranges->first_range_by_val[a]; ar != OPT_RANGE_NONE;
+ ar = ranges->ranges[ar].next) {
+ const OptLiveRange* ra = &ranges->ranges[ar];
+ for (u32 br = ranges->first_range_by_val[b]; br != OPT_RANGE_NONE;
+ br = ranges->ranges[br].next) {
+ const OptLiveRange* rb = &ranges->ranges[br];
+ if (ra->start < rb->end && rb->start < ra->end) return 1;
+ }
+ }
+ return 0;
}
static int expect_panic(Compiler* c, void (*fn)(void*), void* arg) {
@@ -536,12 +551,17 @@ static void opt_liveness_branch(void) {
opt_build_cfg(f);
opt_build_loop_tree(f);
- opt_live_info(f);
+ OptLiveInfo live;
+ opt_live_blocks(f, &live);
- EXPECT(bit_has(f->blocks[b0].live_out, v), "v%u live_out of branch block", v);
- EXPECT(bit_has(f->blocks[b1].live_in, v), "v%u live_in true block", v);
- EXPECT(bit_has(f->blocks[b2].live_in, v), "v%u live_in false block", v);
- EXPECT(!bit_has(f->blocks[b1].live_out, v), "v%u dies in true block", v);
+ EXPECT(opt_bitset_has(&live.blocks[b0].live_out, v),
+ "v%u live_out of branch block", v);
+ EXPECT(opt_bitset_has(&live.blocks[b1].live_in, v),
+ "v%u live_in true block", v);
+ EXPECT(opt_bitset_has(&live.blocks[b2].live_in, v),
+ "v%u live_in false block", v);
+ EXPECT(!opt_bitset_has(&live.blocks[b1].live_out, v),
+ "v%u dies in true block", v);
tc_fini(&tc);
}
@@ -586,8 +606,6 @@ static void opt_block_liveness_phase1(void) {
"call-crossing value should remain live-out on loop backedge");
EXPECT(!opt_bitset_has(&live.blocks[exit].live_out, across_call),
"returned value should die at function exit");
- EXPECT(f->val_conflicts == NULL,
- "block liveness should not allocate conflict matrix");
EXPECT(live.block_bytes != 0, "block-liveness byte metric should be set");
EXPECT(live.set_bit_scans != 0,
"block-liveness set-bit scan metric should be set");
@@ -615,8 +633,6 @@ static void opt_block_liveness_phase1(void) {
opt_dead_def_elim_with_live(g, &g_live);
EXPECT(count_op(g, IR_COPY) == 0,
"DDE should consume pass-local block liveness");
- EXPECT(g->val_conflicts == NULL,
- "DDE with block liveness should not allocate conflict matrix");
tc_fini(&tc);
}
@@ -670,8 +686,6 @@ static void opt_live_ranges_phase2(void) {
"range builder should record whole-block spans");
EXPECT(ranges.max_ranges_per_val >= live_range_count_for(&ranges, loop_v),
"range metrics should record max ranges per value");
- EXPECT(f->val_conflicts == NULL,
- "range construction should not allocate conflict matrix");
CfreeWriter* w = cfree_writer_mem(&g_heap);
opt_live_dump_ranges(f, &ranges, w);
@@ -685,7 +699,7 @@ static void opt_live_ranges_phase2(void) {
tc_fini(&tc);
}
-static void opt_live_after_linear(void) {
+static void opt_range_liveness_linear(void) {
TestCtx tc;
tc_init(&tc);
Func* f = new_func(&tc);
@@ -705,22 +719,26 @@ static void opt_live_after_linear(void) {
opt_build_cfg(f);
opt_build_loop_tree(f);
- opt_live_info(f);
-
- EXPECT(f->blocks[b].live_after != NULL,
- "live_info should record per-instruction live_after sets");
- EXPECT(bit_has(f->blocks[b].live_after[ia], a),
- "a should be live after its def");
- EXPECT(!bit_has(f->blocks[b].live_after[ia], bv),
- "b should not be live before its def");
- EXPECT(bit_has(f->blocks[b].live_after[ib], a),
- "a should be live after b's def");
- EXPECT(bit_has(f->blocks[b].live_after[ib], bv),
- "b should be live after its def");
- EXPECT(!bit_has(f->blocks[b].live_after[ic], a), "a should die after add");
- EXPECT(!bit_has(f->blocks[b].live_after[ic], bv), "b should die after add");
- EXPECT(bit_has(f->blocks[b].live_after[ic], c), "c should be live after add");
- EXPECT(!bit_has(f->blocks[b].live_after[ir], c), "return should consume c");
+ OptLiveInfo live;
+ OptLiveRangeSet ranges;
+ opt_live_blocks(f, &live);
+ opt_live_ranges_build(f, &live, &ranges);
+
+ (void)ia;
+ (void)ib;
+ (void)ic;
+ (void)ir;
+ EXPECT(ranges.first_range_by_val[a] != OPT_RANGE_NONE,
+ "a should get a live range");
+ EXPECT(ranges.first_range_by_val[bv] != OPT_RANGE_NONE,
+ "b should get a live range");
+ EXPECT(ranges.first_range_by_val[c] != OPT_RANGE_NONE,
+ "c should get a live range");
+ EXPECT(ranges_overlap(&ranges, a, bv), "a and b ranges should overlap");
+ EXPECT(ranges_overlap(&ranges, a, c),
+ "same-instruction def/use should overlap a and c");
+ EXPECT(ranges_overlap(&ranges, bv, c),
+ "same-instruction def/use should overlap b and c");
tc_fini(&tc);
}
@@ -751,10 +769,13 @@ static void opt_interference_branch_disjoint(void) {
opt_build_cfg(f);
opt_build_loop_tree(f);
- opt_live_info(f);
+ OptLiveInfo live;
+ OptLiveRangeSet ranges;
+ opt_live_blocks(f, &live);
+ opt_live_ranges_build(f, &live, &ranges);
- EXPECT(!val_conflicts(f, a, b),
- "branch-local values v%u and v%u should not conflict", a, b);
+ EXPECT(!ranges_overlap(&ranges, a, b),
+ "branch-local values v%u and v%u should not overlap", a, b);
opt_regalloc(f, 0);
EXPECT(f->val_info[a].alloc_kind == OPT_ALLOC_HARD,
@@ -766,7 +787,7 @@ static void opt_interference_branch_disjoint(void) {
tc_fini(&tc);
}
-static void opt_interference_def_live_out(void) {
+static void opt_range_overlap_def_live_out(void) {
TestCtx tc;
tc_init(&tc);
Func* f = new_func(&tc);
@@ -782,19 +803,21 @@ static void opt_interference_def_live_out(void) {
opt_build_cfg(f);
opt_build_loop_tree(f);
- opt_live_info(f);
-
- EXPECT(val_conflicts(f, a, bv),
- "a and b should conflict while both are live");
- EXPECT(val_conflicts(f, bv, a), "conflicts should be symmetric");
- EXPECT(val_conflicts(f, a, c),
- "a should conflict with c for same-instruction def/use lowering");
- EXPECT(val_conflicts(f, bv, c),
- "b should conflict with c for same-instruction def/use lowering");
+ OptLiveInfo live;
+ OptLiveRangeSet ranges;
+ opt_live_blocks(f, &live);
+ opt_live_ranges_build(f, &live, &ranges);
+
+ EXPECT(ranges_overlap(&ranges, a, bv), "a and b should overlap");
+ EXPECT(ranges_overlap(&ranges, bv, a), "range overlap should be symmetric");
+ EXPECT(ranges_overlap(&ranges, a, c),
+ "a should overlap c for same-instruction def/use lowering");
+ EXPECT(ranges_overlap(&ranges, bv, c),
+ "b should overlap c for same-instruction def/use lowering");
tc_fini(&tc);
}
-static void opt_loop_frequency_weights_live_info(void) {
+static void opt_loop_frequency_weights_ranges(void) {
TestCtx tc;
tc_init(&tc);
Func* f = new_func(&tc);
@@ -819,11 +842,15 @@ static void opt_loop_frequency_weights_live_info(void) {
opt_build_cfg(f);
opt_build_loop_tree(f);
- opt_live_info(f);
+ OptLiveInfo live;
+ OptLiveRangeSet ranges;
+ opt_live_blocks(f, &live);
+ opt_live_ranges_build(f, &live, &ranges);
- EXPECT(f->val_info[loop_v].use_freq > f->val_info[exit_v].use_freq,
+ EXPECT(ranges.use_freq_by_val[loop_v] > ranges.use_freq_by_val[exit_v],
"loop-used value should have higher weighted use frequency");
- EXPECT(f->val_info[loop_v].spill_cost > f->val_info[exit_v].spill_cost,
+ EXPECT(ranges.spill_cost_by_val[loop_v] >
+ ranges.spill_cost_by_val[exit_v],
"loop-used value should have higher spill cost");
tc_fini(&tc);
}
@@ -833,26 +860,29 @@ static void opt_live_across_call_frequency(void) {
tc_init(&tc);
Func* f = new_func(&tc);
u32 b = f->entry;
- Val live = add_val(f, tc.i32);
+ Val live_val = add_val(f, tc.i32);
Val dead = add_val(f, tc.i32);
- emit_load_imm(f, b, live, tc.i32, 11);
+ emit_load_imm(f, b, live_val, tc.i32, 11);
emit_load_imm(f, b, dead, tc.i32, 12);
emit_call_void(f, b);
- emit_ret_val(f, b, live, tc.i32);
+ emit_ret_val(f, b, live_val, tc.i32);
opt_build_cfg(f);
opt_build_loop_tree(f);
- opt_live_info(f);
+ OptLiveInfo live_info;
+ OptLiveRangeSet ranges;
+ opt_live_blocks(f, &live_info);
+ opt_live_ranges_build(f, &live_info, &ranges);
- EXPECT(f->val_info[live].live_across_call_freq > 0,
+ EXPECT(ranges.live_across_call_freq_by_val[live_val] > 0,
"live value should be marked live across call");
- EXPECT(f->val_info[dead].live_across_call_freq == 0,
+ EXPECT(ranges.live_across_call_freq_by_val[dead] == 0,
"dead value should not be marked live across call");
tc_fini(&tc);
}
-static void opt_conflict_symmetry_and_class(void) {
+static void opt_range_overlap_class(void) {
TestCtx tc;
tc_init(&tc);
Func* f = new_func(&tc);
@@ -869,14 +899,15 @@ static void opt_conflict_symmetry_and_class(void) {
opt_build_cfg(f);
opt_build_loop_tree(f);
- opt_live_info(f);
-
- EXPECT(val_conflicts(f, i0, i1), "int values should conflict");
- EXPECT(val_conflicts(f, i1, i0), "int conflict should be symmetric");
- EXPECT(!val_conflicts(f, i0, fp),
- "different register classes should not conflict");
- EXPECT(!val_conflicts(f, fp, i0),
- "different-class conflict should be symmetric false");
+ OptLiveInfo live;
+ OptLiveRangeSet ranges;
+ opt_live_blocks(f, &live);
+ opt_live_ranges_build(f, &live, &ranges);
+
+ EXPECT(ranges_overlap(&ranges, i0, i1), "int values should overlap");
+ EXPECT(ranges_overlap(&ranges, i1, i0), "overlap should be symmetric");
+ EXPECT(f->val_cls[i0] != f->val_cls[fp],
+ "range allocator keeps register classes out of overlap policy");
tc_fini(&tc);
}
@@ -890,20 +921,23 @@ static void opt_cfg_prunes_unreachable(void) {
ir_note_emit(f, b1);
ir_note_emit(f, dead);
- Val live = add_val(f, tc.i32);
+ Val live_val = add_val(f, tc.i32);
Val dead_val = add_val(f, tc.i32);
Inst* br = ir_emit(f, b0, IR_BR);
(void)br;
f->blocks[b0].succ[0] = b1;
f->blocks[b0].nsucc = 1;
- emit_load_imm(f, b1, live, tc.i32, 7);
- emit_ret_val(f, b1, live, tc.i32);
+ emit_load_imm(f, b1, live_val, tc.i32, 7);
+ emit_ret_val(f, b1, live_val, tc.i32);
emit_load_imm(f, dead, dead_val, tc.i32, 99);
emit_ret_val(f, dead, dead_val, tc.i32);
opt_build_cfg(f);
opt_build_loop_tree(f);
- opt_live_info(f);
+ OptLiveInfo live;
+ OptLiveRangeSet ranges;
+ opt_live_blocks(f, &live);
+ opt_live_ranges_build(f, &live, &ranges);
EXPECT(f->blocks[dead].ninsts == 0,
"unreachable block should have no instructions after cfg cleanup");
@@ -912,7 +946,7 @@ static void opt_cfg_prunes_unreachable(void) {
EXPECT(f->emit_order_n == 2,
"emit_order should skip unreachable block, got %u",
(unsigned)f->emit_order_n);
- EXPECT(f->val_info[dead_val].first_pos == 0,
+ EXPECT(ranges.first_range_by_val[dead_val] == OPT_RANGE_NONE,
"unreachable value should not get a live range");
tc_fini(&tc);
}
@@ -964,7 +998,8 @@ static void opt_cfg_preserves_scope_edges(void) {
opt_build_cfg(f);
opt_build_loop_tree(f);
- opt_live_info(f);
+ OptLiveInfo live;
+ opt_live_blocks(f, &live);
EXPECT(f->blocks[b0].nsucc == 2,
"scope_begin IF should preserve two CFG successors, got %u",
@@ -972,9 +1007,9 @@ static void opt_cfg_preserves_scope_edges(void) {
EXPECT(f->blocks[b_join].npreds == 2,
"scope join should have then/else predecessors, got %u",
(unsigned)f->blocks[b_join].npreds);
- EXPECT(bit_has(f->blocks[b_then].live_in, v),
+ EXPECT(opt_bitset_has(&live.blocks[b_then].live_in, v),
"scope then block should remain reachable and see v%u live-in", v);
- EXPECT(bit_has(f->blocks[b_else].live_in, v),
+ EXPECT(opt_bitset_has(&live.blocks[b_else].live_in, v),
"scope else block should remain reachable and see v%u live-in", v);
tc_fini(&tc);
}
@@ -1146,7 +1181,7 @@ static void opt_regalloc_priority(void) {
emit_ret_val(f, f->entry, out, tc.i32);
opt_build_cfg(f);
opt_build_loop_tree(f);
- opt_live_info(f);
+ ensure_test_val_info(f);
f->val_info[pinned].tied_hard_reg = (i32)f->opt_hard_regs[RC_INT][0];
f->val_info[hot].frequency += 1000;
opt_regalloc(f, 0);
@@ -1181,12 +1216,14 @@ static void opt_range_regalloc_no_conflicts_and_stack_reuse(void) {
opt_regalloc(f, 0);
- EXPECT(f->val_conflicts == NULL,
- "range allocator should not allocate a conflict matrix");
- EXPECT(f->opt_conflict_words == 0,
- "range allocator should leave conflict words at zero");
EXPECT(f->opt_used_loc_words != 0,
"range allocator should record used-location storage");
+ EXPECT(f->opt_alloc_hard_loc_words != 0,
+ "range allocator should record hard occupancy words");
+ EXPECT(f->opt_alloc_stack_slots != 0,
+ "range allocator should record allocated stack slots");
+ EXPECT(f->opt_alloc_stack_loc_words <= f->opt_used_loc_words,
+ "stack occupancy should be reported separately");
EXPECT(f->val_info[a].alloc_kind == OPT_ALLOC_SPILL,
"no hard regs should force v%u to stack", (unsigned)a);
EXPECT(f->val_info[b].alloc_kind == OPT_ALLOC_SPILL,
@@ -1218,8 +1255,6 @@ static void opt_rewrite_spill_use_def(void) {
opt_build_loop_tree(f);
opt_regalloc(f, 0);
EXPECT(f->opt_rewritten, "regalloc should rewrite pseudos");
- EXPECT(f->blocks[f->entry].live_after == NULL,
- "range rewrite should not require per-instruction live_after sets");
EXPECT(count_op(f, IR_STORE) >= 1, "spill def should insert store");
EXPECT(count_op(f, IR_LOAD) >= 1, "spill use should insert reload");
int saw_spill_slot = 0;
@@ -1259,8 +1294,6 @@ static void opt_call_clobber_preservation(void) {
opt_regalloc(f, 0);
Block* b = &f->blocks[f->entry];
- EXPECT(b->live_after == NULL,
- "call rewrite should keep live-after state pass-local");
int saw_call_save_restore = 0;
for (u32 i = 1; i + 1 < b->ninsts; ++i) {
if ((IROp)b->insts[i].op == IR_CALL &&
@@ -1294,8 +1327,6 @@ static void opt_call_clobber_caller_saved(void) {
opt_regalloc(f, 0);
Block* b = &f->blocks[f->entry];
- EXPECT(b->live_after == NULL,
- "call rewrite should keep live-after state pass-local");
int saw_call_save_restore = 0;
for (u32 i = 1; i + 1 < b->ninsts; ++i) {
if ((IROp)b->insts[i].op == IR_CALL &&
@@ -1383,7 +1414,7 @@ static void opt_inline_asm_tied_fixed_regs(void) {
opt_build_cfg(f);
opt_build_loop_tree(f);
- opt_live_info(f);
+ ensure_test_val_info(f);
f->val_info[tied].tied_hard_reg = (i32)f->opt_hard_regs[RC_INT][0];
opt_regalloc(f, 0);
aux = (IRAsmAux*)in->extra.aux;
@@ -1481,7 +1512,6 @@ static void opt_post_rewrite_dce(void) {
emit_ret_val(f, f->entry, a, tc.i32);
opt_build_cfg(f);
opt_build_loop_tree(f);
- opt_live_info(f);
opt_regalloc(f, 0);
opt_combine(f);
opt_dce(f);
@@ -1906,7 +1936,6 @@ static void opt_dead_def_keeps_observable_loads(void) {
emit_load_local(f, f->entry, dead, fs, tc.i32, MF_VOLATILE);
opt_build_cfg(f);
opt_build_loop_tree(f);
- opt_live_info(f);
opt_dead_def_elim(f);
EXPECT(count_op(f, IR_LOAD) == 1,
"dead_def_elim should keep volatile loads even when the result dies");
@@ -1918,7 +1947,6 @@ static void opt_dead_def_keeps_observable_loads(void) {
emit_load_local(g, g->entry, dead, fs, tc.i32, MF_ATOMIC);
opt_build_cfg(g);
opt_build_loop_tree(g);
- opt_live_info(g);
opt_dead_def_elim(g);
EXPECT(count_op(g, IR_LOAD) == 1,
"dead_def_elim should keep atomic loads even when the result dies");
@@ -1943,7 +1971,6 @@ static void opt_dead_def_elim_test(void) {
emit_ret_val(f, f->entry, a, tc.i32); /* a stays live */
opt_build_cfg(f);
opt_build_loop_tree(f);
- opt_live_info(f);
opt_dead_def_elim(f);
EXPECT(count_op(f, IR_COPY) == 0,
"dead copy should be eliminated by dead_def_elim");
@@ -2116,12 +2143,12 @@ int main(void) {
opt_liveness_branch();
opt_block_liveness_phase1();
opt_live_ranges_phase2();
- opt_live_after_linear();
+ opt_range_liveness_linear();
opt_interference_branch_disjoint();
- opt_interference_def_live_out();
- opt_loop_frequency_weights_live_info();
+ opt_range_overlap_def_live_out();
+ opt_loop_frequency_weights_ranges();
opt_live_across_call_frequency();
- opt_conflict_symmetry_and_class();
+ opt_range_overlap_class();
opt_regalloc_priority();
opt_range_regalloc_no_conflicts_and_stack_reuse();
opt_rewrite_spill_use_def();
diff --git a/test/opt/phase0_guardrails.sh b/test/opt/phase0_guardrails.sh
@@ -98,6 +98,9 @@ check_metrics() {
opt.range_max_length \
opt.range_whole_block_spans \
opt.alloc.used_loc_words \
+ opt.alloc.hard_loc_words \
+ opt.alloc.stack_loc_words \
+ opt.alloc.stack_slots \
opt.alloc.spills \
opt.rewrite.reloads \
opt.rewrite.stores; do