commit 633d6444262a670bdf4f7517be3d0bb70cfbfcbf
parent 86f37f3912b20a33fd054e34647e7e345b19233f
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 14 May 2026 19:02:45 -0700
opt: add pass-local block liveness
Diffstat:
| M | doc/MIR_RA_REPORT.md | | | 36 | ++++++++++++++++++------------------ |
| M | doc/PERF.md | | | 125 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------- |
| M | src/opt/opt.c | | | 134 | ++++++++++++++++++++++++++++++------------------------------------------------- |
| M | src/opt/opt.h | | | 42 | ++++++++++++++++++++++++++++++++++++++++-- |
| A | src/opt/pass_live.c | | | 333 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| M | src/opt/pass_lower.c | | | 88 | +++++++++++++++++++++++++++++++++++++++++++------------------------------------ |
| M | test/opt/opt_test.c | | | 169 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------- |
7 files changed, 717 insertions(+), 210 deletions(-)
diff --git a/doc/MIR_RA_REPORT.md b/doc/MIR_RA_REPORT.md
@@ -535,27 +535,27 @@ coverage belongs in a later pass once the range allocator owns rewrite.
### Phase 1: Pass-Local Liveness
-- [ ] Add an optimizer bitset abstraction with:
- - [ ] set/clear/test
- - [ ] copy
- - [ ] union
- - [ ] union-and-not
- - [ ] change reporting
- - [ ] set-bit iteration
- - [ ] active-word tracking or trailing-zero trimming
-- [ ] Add pass-local block liveness storage.
-- [ ] Move block `use`/`def` calculation into the new liveness module.
-- [ ] Compute `live_in`/`live_out` without allocating conflicts.
-- [ ] Make pre-DDE use only block liveness.
-- [ ] Keep old full `opt_live_info` available for regalloc during transition.
-- [ ] Add dump output for block liveness.
-- [ ] Add tests for branch, loop, and call liveness.
+- [x] Add an optimizer bitset abstraction with:
+ - [x] set/clear/test
+ - [x] copy
+ - [x] union
+ - [x] union-and-not
+ - [x] change reporting
+ - [x] set-bit iteration
+ - [x] active-word tracking or trailing-zero trimming
+- [x] Add pass-local block liveness storage.
+- [x] Move block `use`/`def` calculation into the new liveness module.
+- [x] Compute `live_in`/`live_out` without allocating conflicts.
+- [x] Make pre-DDE use only block liveness.
+- [x] Keep old full `opt_live_info` available for regalloc during transition.
+- [x] Add dump output for block liveness.
+- [x] Add tests for branch, loop, and call liveness.
Exit criteria:
-- [ ] Existing O1 behavior is unchanged.
-- [ ] Pre-DDE no longer builds `val_conflicts`.
-- [ ] Metrics show separate block-liveness timing and no pre-DDE conflict bytes.
+- [x] Existing O1 behavior is unchanged.
+- [x] Pre-DDE no longer builds `val_conflicts`.
+- [x] Metrics show separate block-liveness timing and no pre-DDE conflict bytes.
### Phase 2: Live Range Model
diff --git a/doc/PERF.md b/doc/PERF.md
@@ -28,7 +28,9 @@ Important counters:
- `compile.input_bytes`
- `compile.obj_sections`, `compile.obj_relocs`
- `opt.funcs`, `opt.blocks`, `opt.insts`, `opt.vals`
-- `opt.live_words`, `opt.conflict_bytes`
+- `opt.live_words`, `opt.live.blocks`, `opt.live.active_words`
+- `opt.live.block_bytes`, `opt.live.set_bit_scans`
+- `opt.conflict_bytes`
- `link.inputs`, `link.sections`, `link.segments`, `link.syms`, `link.relocs`
- `jit.master_size`, `jit.nsegments`, `jit.input_section_bytes`,
`jit.segment_bytes`
@@ -41,12 +43,19 @@ Times are `min / p50 / p95 / mean / max` in milliseconds.
`int main(){return 0;}`
```text
-run.total 0.423 / 0.448 / 0.506 / 0.520 / 1.889
-run.compile_and_jit 0.395 / 0.416 / 0.473 / 0.469 / 1.427
-compile.tu 0.254 / 0.267 / 0.304 / 0.306 / 1.015
-link_jit.all 0.124 / 0.132 / 0.149 / 0.140 / 0.287
-opt.o1.total 0.100 / 0.106 / 0.121 / 0.108 / 0.128
-link.resolve.total 0.064 / 0.067 / 0.076 / 0.072 / 0.159
+run.total 0.298 / 0.312 / 0.344 / 0.319 / 0.390
+run.compile_and_jit 0.283 / 0.295 / 0.323 / 0.300 / 0.346
+compile.tu 0.209 / 0.222 / 0.242 / 0.224 / 0.244
+compile.c.parse_codegen
+ 0.073 / 0.081 / 0.092 / 0.082 / 0.095
+opt.o1.total 0.040 / 0.048 / 0.057 / 0.048 / 0.059
+opt.live_blocks.pre_dde
+ 0.008 / 0.008 / 0.017 / 0.010 / 0.019
+opt.live_info.regalloc
+ 0.003 / 0.003 / 0.004 / 0.004 / 0.007
+opt.regalloc 0.008 / 0.012 / 0.016 / 0.012 / 0.019
+link_jit.all 0.043 / 0.049 / 0.058 / 0.051 / 0.059
+link.resolve.total 0.020 / 0.021 / 0.027 / 0.023 / 0.029
```
One small loop:
@@ -56,12 +65,19 @@ int main(){int s=0; for(int i=0;i<10;i++) s+=i; return s==45?0:1;}
```
```text
-run.total 0.500 / 0.522 / 0.539 / 0.525 / 0.618
-run.compile_and_jit 0.471 / 0.492 / 0.510 / 0.495 / 0.582
-compile.tu 0.324 / 0.340 / 0.356 / 0.345 / 0.428
-opt.o1.total 0.148 / 0.154 / 0.160 / 0.154 / 0.166
-link_jit.all 0.127 / 0.133 / 0.137 / 0.133 / 0.137
-link.resolve.total 0.064 / 0.066 / 0.068 / 0.066 / 0.069
+run.total 0.351 / 0.365 / 0.394 / 0.371 / 0.409
+run.compile_and_jit 0.334 / 0.349 / 0.377 / 0.353 / 0.389
+compile.tu 0.266 / 0.277 / 0.292 / 0.279 / 0.315
+compile.c.parse_codegen
+ 0.131 / 0.140 / 0.155 / 0.141 / 0.157
+opt.o1.total 0.077 / 0.083 / 0.094 / 0.084 / 0.095
+opt.live_blocks.pre_dde
+ 0.011 / 0.012 / 0.018 / 0.013 / 0.018
+opt.live_info.regalloc
+ 0.016 / 0.016 / 0.016 / 0.016 / 0.017
+opt.regalloc 0.027 / 0.028 / 0.034 / 0.029 / 0.034
+link_jit.all 0.043 / 0.046 / 0.057 / 0.049 / 0.064
+link.resolve.total 0.021 / 0.021 / 0.028 / 0.023 / 0.031
```
For tiny programs, compile dominates, with O1 and JIT/link image setup as the
@@ -137,6 +153,74 @@ This indicates that "many functions" is not the current scaling problem. The
problem is a single large function whose liveness/conflict structures become
large enough to bend past linear.
+### Phase-1 Pass-Local Liveness Rerun
+
+After `doc/MIR_RA_REPORT.md` phase 1, pre-DDE uses pass-local block liveness
+instead of full `opt_live_info`. Regalloc still uses the old dense liveness and
+conflict path. The synthetic source generator used here is the same family as
+the tables above, but these are fresh generated inputs rather than byte-for-byte
+identical files, so compare trends and bucket shapes more than individual rows.
+
+Samples: 5 runs per point. The table uses p50 milliseconds. O1 scopes and
+O1 counters are summed across functions. `pre_block_bytes` is
+`opt.live.block_bytes` from `opt.live_blocks.pre_dde`; `reg_conflict_bytes` is
+the remaining `opt.conflict_bytes` from `opt.live_info.regalloc`.
+
+Straight-line main:
+
+```text
+funcs input_bytes insts vals pre_block_bytes reg_conflict_bytes run.total compile.tu parse_codegen opt.o1.total live_blocks_pre live_reg regalloc link_jit
+1 372 91 72 320 576 0.737 0.627 0.465 0.297 0.029 0.128 0.171 0.062
+4 1327 322 270 608 2160 1.538 1.421 1.277 0.837 0.062 0.442 0.578 0.058
+16 5236 1246 1062 1984 9056 4.861 4.756 4.610 3.085 0.174 1.787 2.285 0.056
+64 21028 4942 4230 7264 42224 19.542 19.386 19.249 13.377 0.665 8.199 10.403 0.073
+128 42420 9870 8454 14304 100784 42.077 41.830 41.693 30.284 1.314 19.476 24.434 0.107
+256 85940 19726 16902 28384 267056 98.532 97.987 97.845 74.982 2.603 51.347 63.176 0.157
+512 172980 39438 33798 56544 796208 254.219 252.884 252.723 206.802 5.174 152.154 183.374 0.269
+1024 347352 78862 67590 112864 2640944 742.562 740.519 740.362 646.586 10.425 502.879 599.496 0.436
+```
+
+Function-table main:
+
+```text
+funcs input_bytes insts vals pre_block_bytes reg_conflict_bytes run.total compile.tu parse_codegen opt.o1.total live_blocks_pre live_reg regalloc link_jit
+1 461 112 83 576 664 0.694 0.602 0.465 0.285 0.023 0.129 0.173 0.058
+4 1392 328 269 864 2152 1.513 1.386 1.251 0.805 0.053 0.434 0.565 0.058
+16 5200 1192 1013 2016 8104 4.801 4.687 4.550 3.030 0.194 1.697 2.189 0.060
+64 20560 4648 3989 6624 31912 17.431 17.261 17.111 11.452 0.703 6.560 8.487 0.079
+128 41349 9256 7957 12768 63656 34.170 33.918 33.777 22.640 1.289 13.077 16.946 0.114
+256 83589 18472 15893 25056 127144 67.625 67.144 67.004 45.065 2.567 26.043 33.724 0.156
+512 168069 36904 31765 49632 254120 134.804 134.093 133.941 89.938 5.150 51.976 67.348 0.250
+1024 337298 73768 63509 98784 508072 272.397 271.171 271.026 179.323 10.224 103.779 134.391 0.402
+```
+
+The 512 to 1024 step now separates the fixed part from the remaining problem:
+
+```text
+straight_main:
+compile.tu 252.884 -> 740.519 ms 2.93x
+opt.o1.total 206.802 -> 646.586 ms 3.13x
+opt.live_blocks.pre_dde 5.174 -> 10.425 ms 2.01x
+opt.live_info.regalloc 152.154 -> 502.879 ms 3.31x
+opt.regalloc 183.374 -> 599.496 ms 3.27x
+pre_block_bytes 56544 -> 112864 2.00x
+reg_conflict_bytes 796208 -> 2640944 3.32x
+
+table_main:
+compile.tu 134.093 -> 271.171 ms 2.02x
+opt.o1.total 89.938 -> 179.323 ms 1.99x
+opt.live_blocks.pre_dde 5.150 -> 10.224 ms 1.99x
+opt.live_info.regalloc 51.976 -> 103.779 ms 2.00x
+opt.regalloc 67.348 -> 134.391 ms 2.00x
+pre_block_bytes 49632 -> 98784 1.99x
+reg_conflict_bytes 254120 -> 508072 2.00x
+```
+
+The phase-1 result is the intended partial fix: pre-DDE no longer allocates a
+conflict matrix and its block-live storage scales linearly on the large
+straight-line case. The remaining superlinear behavior is now concentrated in
+the regalloc-side full liveness/conflict path.
+
## Compile Perf
`compile.tu` is now split into generic and C-specific scopes:
@@ -256,15 +340,16 @@ Next useful compile instrumentation:
## Performance Priorities
-1. Make `opt_live_info` cheaper for large functions.
- The same liveness computation currently runs before DDE and again inside
- regalloc. The measurements show both passes growing superlinearly on the
- large straight-line function.
+1. Move regalloc off full `opt_live_info`.
+ Phase 1 removed the full liveness/conflict build from pre-DDE. The remaining
+ superlinear bucket is now `opt.live_info.regalloc`, which still builds
+ `live_after` and the dense conflict matrix before assignment/rewrite.
2. Replace or narrow dense conflict structures.
- `opt.conflict_bytes` tracks the observed curve closely. Investigate sparse
- sets, segmented bitsets, per-block live sets, or interval-style structures
- that avoid touching full dense rows for values that cannot overlap.
+ Regalloc-side `opt.conflict_bytes` still tracks the observed curve closely.
+ Investigate sparse sets, segmented bitsets, per-block live sets, or
+ interval-style structures that avoid touching full dense rows for values that
+ cannot overlap.
3. Avoid whole-function contiguous growth where hot passes scan repeatedly.
Large `Val`/block/instruction arrays and dense bit matrices are likely to
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -34,8 +34,8 @@ typedef struct OptImpl {
/* Current function being recorded. NULL between functions. */
Func* f;
- u32 cur; /* current block id */
- SrcLoc pending_loc; /* most recent set_loc; stamped on each Inst */
+ u32 cur; /* current block id */
+ SrcLoc pending_loc; /* most recent set_loc; stamped on each Inst */
Writer* dump_writer;
} OptImpl;
@@ -44,8 +44,8 @@ static OptImpl* impl_of(CGTarget* t) { return (OptImpl*)t; }
static _Noreturn void panic_unsupported(OptImpl* o, const char* what) {
SrcLoc loc = {0, 0, 0};
- compiler_panic(o->c, loc,
- "opt_cgtarget: %s called under unbounded virtuals", what);
+ compiler_panic(o->c, loc, "opt_cgtarget: %s called under unbounded virtuals",
+ what);
}
/* ---- recording helpers ---- */
@@ -113,9 +113,7 @@ static void set_cur(OptImpl* o, u32 b) {
/* After emitting a terminator, allocate a fresh block for any
* subsequent (likely unreachable) recording. */
-static void after_terminator(OptImpl* o) {
- set_cur(o, ir_block_new(o->f));
-}
+static void after_terminator(OptImpl* o) { set_cur(o, ir_block_new(o->f)); }
/* ---- function lifecycle ---- */
@@ -162,8 +160,8 @@ static void w_reload_reg(CGTarget* t, Operand dst, FrameSlot s, MemAccess m) {
panic_unsupported(impl_of(t), "reload_reg");
}
-static void w_get_allocable_regs(CGTarget* t, RegClass cls,
- const Reg** out, u32* nregs) {
+static void w_get_allocable_regs(CGTarget* t, RegClass cls, const Reg** out,
+ u32* nregs) {
CGTarget* wr = impl_of(t)->target;
if (wr->get_allocable_regs)
wr->get_allocable_regs(wr, cls, out, nregs);
@@ -173,8 +171,8 @@ static void w_get_allocable_regs(CGTarget* t, RegClass cls,
}
}
-static void w_get_scratch_regs(CGTarget* t, RegClass cls,
- const Reg** out, u32* nregs) {
+static void w_get_scratch_regs(CGTarget* t, RegClass cls, const Reg** out,
+ u32* nregs) {
CGTarget* wr = impl_of(t)->target;
if (wr->get_scratch_regs)
wr->get_scratch_regs(wr, cls, out, nregs);
@@ -186,23 +184,20 @@ static void w_get_scratch_regs(CGTarget* t, RegClass cls,
static int w_is_caller_saved(CGTarget* t, RegClass cls, Reg r) {
CGTarget* wr = impl_of(t)->target;
- if (wr->is_caller_saved)
- return wr->is_caller_saved(wr, cls, r);
+ if (wr->is_caller_saved) return wr->is_caller_saved(wr, cls, r);
return 0;
}
-static void w_reserve_hard_regs(CGTarget* t, RegClass cls,
- const Reg* regs, u32 n) {
+static void w_reserve_hard_regs(CGTarget* t, RegClass cls, const Reg* regs,
+ u32 n) {
CGTarget* wr = impl_of(t)->target;
- if (wr->reserve_hard_regs)
- wr->reserve_hard_regs(wr, cls, regs, n);
+ if (wr->reserve_hard_regs) wr->reserve_hard_regs(wr, cls, regs, n);
}
static int w_resolve_reg_name(CGTarget* t, Sym name, Reg* out,
RegClass* cls_out) {
CGTarget* wr = impl_of(t)->target;
- if (wr->resolve_reg_name)
- return wr->resolve_reg_name(wr, name, out, cls_out);
+ if (wr->resolve_reg_name) return wr->resolve_reg_name(wr, name, out, cls_out);
return 1;
}
@@ -218,8 +213,7 @@ static void w_label_place(CGTarget* t, Label l) {
u32 target_blk = (u32)l;
if (target_blk >= o->f->nblocks) {
SrcLoc loc = {0, 0, 0};
- compiler_panic(o->c, loc, "opt: label_place(%u) out of range",
- (unsigned)l);
+ compiler_panic(o->c, loc, "opt: label_place(%u) out of range", (unsigned)l);
}
if (!cur_terminated(o)) {
Block* cb = &o->f->blocks[o->cur];
@@ -739,12 +733,10 @@ static void w_intrinsic(CGTarget* t, IntrinKind kind, Operand* dsts, u32 nd,
in->ndefs = nd;
in->defs = arena_array(o->f->arena, Val, nd);
for (u32 i = 0; i < nd; ++i) {
- in->defs[i] =
- (dsts[i].kind == OPK_REG) ? (Val)dsts[i].v.reg : VAL_NONE;
+ in->defs[i] = (dsts[i].kind == OPK_REG) ? (Val)dsts[i].v.reg : VAL_NONE;
if (in->defs[i] != VAL_NONE && in->defs[i] < o->f->nvals) {
o->f->val_def_block[in->defs[i]] = o->cur;
- o->f->val_def_inst[in->defs[i]] =
- o->f->blocks[o->cur].ninsts - 1u;
+ o->f->val_def_inst[in->defs[i]] = o->f->blocks[o->cur].ninsts - 1u;
}
}
in->def = in->defs[0];
@@ -1049,11 +1041,10 @@ static void replay_inst(ReplayCtx* r, u32 b, Inst* in) {
if (cd.nargs) {
args = arena_array(r->f->arena, CGABIValue, cd.nargs);
for (u32 k = 0; k < cd.nargs; ++k) {
- CGABIPart* parts =
- aux->desc.args[k].nparts
- ? arena_array(r->f->arena, CGABIPart,
- aux->desc.args[k].nparts)
- : NULL;
+ CGABIPart* parts = aux->desc.args[k].nparts
+ ? arena_array(r->f->arena, CGABIPart,
+ aux->desc.args[k].nparts)
+ : NULL;
args[k] = xlat_abivalue(r, &aux->desc.args[k], parts);
}
cd.args = args;
@@ -1061,9 +1052,8 @@ static void replay_inst(ReplayCtx* r, u32 b, Inst* in) {
cd.args = NULL;
}
CGABIPart* ret_parts =
- cd.ret.nparts
- ? arena_array(r->f->arena, CGABIPart, cd.ret.nparts)
- : NULL;
+ cd.ret.nparts ? arena_array(r->f->arena, CGABIPart, cd.ret.nparts)
+ : NULL;
cd.ret = xlat_abivalue(r, &aux->desc.ret, ret_parts);
w->call(w, &cd);
break;
@@ -1088,10 +1078,9 @@ static void replay_inst(ReplayCtx* r, u32 b, Inst* in) {
if (!aux || !aux->present) {
w->ret(w, NULL);
} else {
- CGABIPart* parts =
- aux->val.nparts
- ? arena_array(r->f->arena, CGABIPart, aux->val.nparts)
- : NULL;
+ CGABIPart* parts = aux->val.nparts ? arena_array(r->f->arena, CGABIPart,
+ aux->val.nparts)
+ : NULL;
CGABIValue v = xlat_abivalue(r, &aux->val, parts);
w->ret(w, &v);
}
@@ -1246,7 +1235,8 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) {
if (w->get_allocable_regs)
w->get_allocable_regs(w, (RegClass)cidx, ®s, &nregs);
if (regs && nregs)
- cg_simple_regalloc_set_ordered(&r.regalloc, (RegClass)cidx, regs, nregs);
+ cg_simple_regalloc_set_ordered(&r.regalloc, (RegClass)cidx, regs,
+ nregs);
}
}
@@ -1299,7 +1289,10 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) {
Reg hr = f->val_info[v].hard_reg;
int already = 0;
for (u32 i = 0; i < nused; ++i) {
- if (used[i] == hr) { already = 1; break; }
+ if (used[i] == hr) {
+ already = 1;
+ break;
+ }
}
if (!already) used[nused++] = hr;
}
@@ -1307,7 +1300,10 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) {
Reg hr = f->opt_scratch_regs[c][i];
int already = 0;
for (u32 j = 0; j < nused; ++j) {
- if (used[j] == hr) { already = 1; break; }
+ if (used[j] == hr) {
+ already = 1;
+ break;
+ }
}
if (!already && nused < OPT_MAX_HARD_REGS) used[nused++] = hr;
}
@@ -1340,35 +1336,6 @@ static u64 func_inst_count(Func* f) {
return n;
}
-static u64 bitset_active_words(const u64* bits, u32 words) {
- u64 n = 0;
- if (!bits) return 0;
- for (u32 i = 0; i < words; ++i)
- if (bits[i]) ++n;
- return n;
-}
-
-static u64 func_live_active_words(Func* f) {
- u64 n = 0;
- if (!f) return 0;
- for (u32 b = 0; b < f->nblocks; ++b) {
- Block* bl = &f->blocks[b];
- n += bitset_active_words(bl->live_in, f->opt_live_words);
- n += bitset_active_words(bl->live_out, f->opt_live_words);
- n += bitset_active_words(bl->live_use, f->opt_live_words);
- n += bitset_active_words(bl->live_def, f->opt_live_words);
- }
- return n;
-}
-
-static u64 func_live_value_count(Func* f) {
- u64 n = 0;
- if (!f || !f->val_info) return 0;
- for (Val v = 1; v < f->nvals; ++v)
- if (f->val_info[v].first_pos) ++n;
- return n;
-}
-
static int inst_spill_local(Func* f, const Inst* in, u32 op_idx) {
FrameSlot fs;
if (!f || !in || op_idx >= in->nopnds) return 0;
@@ -1433,22 +1400,21 @@ static void w_func_end(CGTarget* t) {
metrics_scope_begin(o->c, "opt.build_loop_tree");
opt_build_loop_tree(o->f);
metrics_scope_end(o->c, "opt.build_loop_tree");
- metrics_scope_begin(o->c, "opt.live_info.pre_dde");
- opt_live_info(o->f);
+ metrics_scope_begin(o->c, "opt.live_blocks.pre_dde");
+ OptLiveInfo live;
+ opt_live_blocks(o->f, &live);
metrics_count(o->c, "opt.live_words", o->f->opt_live_words);
metrics_count(o->c, "opt.live.blocks", o->f->nblocks);
- metrics_count(o->c, "opt.live.active_words",
- func_live_active_words(o->f));
- metrics_count(o->c, "opt.ranges", func_live_value_count(o->f));
- metrics_count(o->c, "opt.range_points", o->f->opt_position_count);
- metrics_count(o->c, "opt.conflict_bytes",
- (u64)o->f->nvals * (u64)o->f->opt_conflict_words *
- (u64)sizeof(u64));
- metrics_scope_end(o->c, "opt.live_info.pre_dde");
+ metrics_count(o->c, "opt.live.active_words", live.active_words);
+ metrics_count(o->c, "opt.live.block_bytes", live.block_bytes);
+ metrics_count(o->c, "opt.live.set_bit_scans", live.set_bit_scans);
+ metrics_count(o->c, "opt.ranges", 0);
+ metrics_count(o->c, "opt.range_points", 0);
+ metrics_count(o->c, "opt.conflict_bytes", 0);
+ metrics_scope_end(o->c, "opt.live_blocks.pre_dde");
metrics_scope_begin(o->c, "opt.dead_def_elim");
- opt_dead_def_elim(o->f);
+ opt_dead_def_elim_with_live(o->f, &live);
metrics_scope_end(o->c, "opt.dead_def_elim");
- o->f->val_info = NULL; /* force opt_regalloc to recompute liveness */
metrics_scope_begin(o->c, "opt.regalloc");
opt_regalloc(o->f, 0);
metrics_count(o->c, "opt.alloc.used_loc_words", 0);
@@ -1531,9 +1497,9 @@ CGTarget* opt_cgtarget_new(Compiler* c, CGTarget* target, int level) {
t->reload_reg = w_reload_reg;
t->get_allocable_regs = w_get_allocable_regs;
- t->get_scratch_regs = w_get_scratch_regs;
- t->is_caller_saved = w_is_caller_saved;
- t->reserve_hard_regs = w_reserve_hard_regs;
+ t->get_scratch_regs = w_get_scratch_regs;
+ t->is_caller_saved = w_is_caller_saved;
+ t->reserve_hard_regs = w_reserve_hard_regs;
t->label_new = w_label_new;
t->label_place = w_label_place;
diff --git a/src/opt/opt.h b/src/opt/opt.h
@@ -67,12 +67,50 @@ void opt_cleanup(Func*);
* per-OS, so it takes the full Target. */
void opt_machinize(Func*, CGTarget* target);
void opt_build_loop_tree(Func*);
+
+typedef struct OptBitset {
+ u64* words;
+ u32 nwords;
+ u32 active_words;
+} OptBitset;
+
+typedef struct OptBlockLive {
+ OptBitset live_in;
+ OptBitset live_out;
+ OptBitset live_use;
+ OptBitset live_def;
+} OptBlockLive;
+
+typedef struct OptLiveInfo {
+ Arena* arena;
+ Func* f;
+ u32 words;
+ u64 active_words;
+ u64 block_bytes;
+ u64 set_bit_scans;
+ OptBlockLive* blocks;
+} OptLiveInfo;
+
+typedef void (*OptBitsetIterFn)(Val, void*);
+
+void opt_bitset_clear(OptBitset*);
+void opt_bitset_set(OptBitset*, Val);
+void opt_bitset_clear_bit(OptBitset*, Val);
+int opt_bitset_has(const OptBitset*, Val);
+int opt_bitset_copy(OptBitset*, const OptBitset*);
+int opt_bitset_union(OptBitset*, const OptBitset*);
+int opt_bitset_union_and_not(OptBitset*, const OptBitset*, const OptBitset*);
+void opt_bitset_iter_set(const OptBitset*, OptBitsetIterFn, void* arg);
+
+void opt_live_blocks(Func*, OptLiveInfo*);
+void opt_live_dump_blocks(Func*, const OptLiveInfo*, Writer*);
void opt_live_info(Func*);
void opt_coalesce(Func*);
void opt_regalloc(Func*, int allow_live_range_split);
-void opt_combine(Func*); /* code selection: merge dependent insns */
-void opt_dce(Func*); /* post-RA DCE */
+void opt_combine(Func*); /* code selection: merge dependent insns */
+void opt_dce(Func*); /* post-RA DCE */
void opt_dead_def_elim(Func*); /* pre-RA dead-definition elimination */
+void opt_dead_def_elim_with_live(Func*, const OptLiveInfo*);
/* Walks the lowered IR and drives a target CGTarget to emit machine code into
* its ObjBuilder. Inserts prolog/epilog. Splits long insns where the target
diff --git a/src/opt/pass_live.c b/src/opt/pass_live.c
@@ -0,0 +1,333 @@
+#include <stdio.h>
+#include <string.h>
+
+#include "core/arena.h"
+#include "opt/opt.h"
+
+static u32 opt_bit_words(u32 nvals) { return (nvals + 63u) / 64u; }
+
+static void opt_bitset_init(Arena* arena, OptBitset* bs, u32 nwords) {
+ memset(bs, 0, sizeof *bs);
+ bs->nwords = nwords;
+ bs->words = arena_zarray(arena, u64, nwords ? nwords : 1u);
+}
+
+static void opt_bitset_trim(OptBitset* bs) {
+ u32 n = bs->nwords;
+ while (n && bs->words[n - 1u] == 0) --n;
+ bs->active_words = n;
+}
+
+void opt_bitset_clear(OptBitset* bs) {
+ if (!bs || !bs->words) return;
+ for (u32 i = 0; i < bs->active_words; ++i) bs->words[i] = 0;
+ bs->active_words = 0;
+}
+
+void opt_bitset_set(OptBitset* bs, Val v) {
+ if (!bs || !bs->words) return;
+ u32 w = v / 64u;
+ if (w >= bs->nwords) return;
+ bs->words[w] |= 1ull << (v % 64u);
+ if (bs->active_words <= w) bs->active_words = w + 1u;
+}
+
+void opt_bitset_clear_bit(OptBitset* bs, Val v) {
+ if (!bs || !bs->words) return;
+ u32 w = v / 64u;
+ if (w >= bs->active_words) return;
+ bs->words[w] &= ~(1ull << (v % 64u));
+ if (w + 1u == bs->active_words && bs->words[w] == 0) opt_bitset_trim(bs);
+}
+
+int opt_bitset_has(const OptBitset* bs, Val v) {
+ if (!bs || !bs->words) return 0;
+ u32 w = v / 64u;
+ if (w >= bs->active_words) return 0;
+ return (bs->words[w] & (1ull << (v % 64u))) != 0;
+}
+
+int opt_bitset_copy(OptBitset* dst, const OptBitset* src) {
+ int changed = 0;
+ u32 n;
+ if (!dst || !src || !dst->words || !src->words) return 0;
+ n = dst->nwords < src->nwords ? dst->nwords : src->nwords;
+ for (u32 i = 0; i < n; ++i) {
+ changed |= dst->words[i] != src->words[i];
+ dst->words[i] = src->words[i];
+ }
+ for (u32 i = n; i < dst->active_words; ++i) {
+ changed |= dst->words[i] != 0;
+ dst->words[i] = 0;
+ }
+ opt_bitset_trim(dst);
+ return changed;
+}
+
+int opt_bitset_union(OptBitset* dst, const OptBitset* src) {
+ int changed = 0;
+ u32 n;
+ if (!dst || !src || !dst->words || !src->words) return 0;
+ n = dst->nwords < src->active_words ? dst->nwords : src->active_words;
+ for (u32 i = 0; i < n; ++i) {
+ u64 old = dst->words[i];
+ dst->words[i] |= src->words[i];
+ changed |= dst->words[i] != old;
+ }
+ if (changed) opt_bitset_trim(dst);
+ return changed;
+}
+
+int opt_bitset_union_and_not(OptBitset* dst, const OptBitset* src,
+ const OptBitset* not_bits) {
+ int changed = 0;
+ u32 n;
+ if (!dst || !src || !dst->words || !src->words) return 0;
+ n = dst->nwords < src->active_words ? dst->nwords : src->active_words;
+ for (u32 i = 0; i < n; ++i) {
+ u64 mask =
+ (not_bits && i < not_bits->active_words) ? not_bits->words[i] : 0;
+ u64 old = dst->words[i];
+ dst->words[i] |= src->words[i] & ~mask;
+ changed |= dst->words[i] != old;
+ }
+ if (changed) opt_bitset_trim(dst);
+ return changed;
+}
+
+void opt_bitset_iter_set(const OptBitset* bs, OptBitsetIterFn fn, void* arg) {
+ if (!bs || !bs->words || !fn) return;
+ for (u32 w = 0; w < bs->active_words; ++w) {
+ u64 bits = bs->words[w];
+ while (bits) {
+ u32 bit = (u32)__builtin_ctzll(bits);
+ fn(w * 64u + bit, arg);
+ bits &= bits - 1u;
+ }
+ }
+}
+
+typedef void (*LiveOperandWalkFn)(Func*, Inst*, Operand*, int is_def, void*);
+
+static int live_val_in_inst_defs(const Inst* in, Val v) {
+ if (v == VAL_NONE) return 0;
+ if (in->def == v) return 1;
+ for (u32 i = 0; i < in->ndefs; ++i)
+ if (in->defs[i] == v) return 1;
+ return 0;
+}
+
+static void live_walk_operand(Func* f, Inst* in, Operand* op, int is_def,
+ LiveOperandWalkFn fn, void* ctx) {
+ if (!op) return;
+ if (op->kind == OPK_REG) {
+ fn(f, in, op, is_def, ctx);
+ } else if (op->kind == OPK_INDIRECT) {
+ Operand base = *op;
+ base.kind = OPK_REG;
+ base.cls = RC_INT;
+ base.v.reg = op->v.ind.base;
+ fn(f, in, &base, 0, ctx);
+ }
+}
+
+static void live_walk_abivalue(Func* f, Inst* in, CGABIValue* v,
+ int storage_def, LiveOperandWalkFn fn,
+ void* ctx) {
+ live_walk_operand(f, in, &v->storage, storage_def, fn, ctx);
+ for (u32 i = 0; i < v->nparts; ++i)
+ live_walk_operand(f, in, (Operand*)&v->parts[i].op, storage_def, fn, ctx);
+}
+
+static void live_walk_inst_operands(Func* f, Inst* in, LiveOperandWalkFn fn,
+ void* ctx) {
+ for (u32 i = 0; i < in->nopnds; ++i) {
+ int is_def = 0;
+ if (in->opnds[i].kind == OPK_REG)
+ is_def = i == 0 && live_val_in_inst_defs(in, (Val)in->opnds[i].v.reg);
+ if (!is_def) live_walk_operand(f, in, &in->opnds[i], 0, fn, ctx);
+ }
+ for (u32 i = 0; i < in->nopnds; ++i) {
+ int is_def = 0;
+ if (in->opnds[i].kind == OPK_REG)
+ is_def = i == 0 && live_val_in_inst_defs(in, (Val)in->opnds[i].v.reg);
+ if (is_def) live_walk_operand(f, in, &in->opnds[i], 1, fn, ctx);
+ }
+
+ switch ((IROp)in->op) {
+ case IR_CALL: {
+ IRCallAux* aux = (IRCallAux*)in->extra.aux;
+ if (!aux) break;
+ live_walk_operand(f, in, &aux->desc.callee, 0, fn, ctx);
+ for (u32 i = 0; i < aux->desc.nargs; ++i)
+ live_walk_abivalue(f, in, (CGABIValue*)&aux->desc.args[i], 0, fn, ctx);
+ live_walk_abivalue(f, in, &aux->desc.ret, 1, fn, ctx);
+ break;
+ }
+ case IR_RET: {
+ IRRetAux* aux = (IRRetAux*)in->extra.aux;
+ if (aux && aux->present) live_walk_abivalue(f, in, &aux->val, 0, fn, ctx);
+ break;
+ }
+ case IR_SCOPE_BEGIN: {
+ IRScopeAux* aux = (IRScopeAux*)in->extra.aux;
+ if (aux) live_walk_operand(f, in, &aux->desc.cond, 0, fn, ctx);
+ break;
+ }
+ case IR_ASM_BLOCK: {
+ IRAsmAux* aux = (IRAsmAux*)in->extra.aux;
+ if (!aux) break;
+ for (u32 i = 0; i < aux->nin; ++i)
+ live_walk_operand(f, in, &aux->in_ops[i], 0, fn, ctx);
+ for (u32 i = 0; i < aux->nout; ++i)
+ live_walk_operand(f, in, &aux->out_ops[i], 1, fn, ctx);
+ break;
+ }
+ case IR_INTRINSIC: {
+ IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux;
+ if (!aux) break;
+ for (u32 i = 0; i < aux->narg; ++i)
+ live_walk_operand(f, in, &aux->args[i], 0, fn, ctx);
+ for (u32 i = 0; i < aux->ndst; ++i)
+ live_walk_operand(f, in, &aux->dsts[i], 1, fn, ctx);
+ break;
+ }
+ default:
+ break;
+ }
+}
+
+typedef struct LiveUseDefCtx {
+ OptBitset* use;
+ OptBitset* def;
+} LiveUseDefCtx;
+
+static void live_collect_use_def(Func* f, Inst* in, Operand* op, int is_def,
+ void* arg) {
+ (void)in;
+ LiveUseDefCtx* c = (LiveUseDefCtx*)arg;
+ if (op->kind != OPK_REG) return;
+ Val v = (Val)op->v.reg;
+ if (v == VAL_NONE || v >= f->nvals) return;
+ if (is_def) {
+ opt_bitset_set(c->def, v);
+ } else if (!opt_bitset_has(c->def, v)) {
+ opt_bitset_set(c->use, v);
+ }
+}
+
+static void live_count_bit(Val v, void* arg) {
+ (void)v;
+ ++*(u64*)arg;
+}
+
+static u64 live_count_set_bits(const OptBitset* bs) {
+ u64 n = 0;
+ opt_bitset_iter_set(bs, live_count_bit, &n);
+ return n;
+}
+
+static void live_recompute_metrics(OptLiveInfo* live) {
+ live->active_words = 0;
+ live->block_bytes = 0;
+ live->set_bit_scans = 0;
+ for (u32 b = 0; b < live->f->nblocks; ++b) {
+ OptBlockLive* bl = &live->blocks[b];
+ live->active_words += bl->live_in.active_words;
+ live->active_words += bl->live_out.active_words;
+ live->active_words += bl->live_use.active_words;
+ live->active_words += bl->live_def.active_words;
+ live->set_bit_scans += live_count_set_bits(&bl->live_in);
+ live->set_bit_scans += live_count_set_bits(&bl->live_out);
+ live->set_bit_scans += live_count_set_bits(&bl->live_use);
+ live->set_bit_scans += live_count_set_bits(&bl->live_def);
+ }
+ live->block_bytes =
+ (u64)live->f->nblocks * 4u * (u64)live->words * (u64)sizeof(u64);
+}
+
+void opt_live_blocks(Func* f, OptLiveInfo* live) {
+ memset(live, 0, sizeof *live);
+ live->arena = f->arena;
+ live->f = f;
+ live->words = opt_bit_words(f->nvals);
+ f->opt_live_words = (u16)live->words;
+ live->blocks =
+ arena_zarray(f->arena, OptBlockLive, f->nblocks ? f->nblocks : 1u);
+
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ OptBlockLive* lb = &live->blocks[b];
+ opt_bitset_init(f->arena, &lb->live_in, live->words);
+ opt_bitset_init(f->arena, &lb->live_out, live->words);
+ opt_bitset_init(f->arena, &lb->live_use, live->words);
+ opt_bitset_init(f->arena, &lb->live_def, live->words);
+
+ LiveUseDefCtx ctx;
+ ctx.use = &lb->live_use;
+ ctx.def = &lb->live_def;
+ for (u32 i = 0; i < bl->ninsts; ++i)
+ live_walk_inst_operands(f, &bl->insts[i], live_collect_use_def, &ctx);
+ }
+
+ OptBitset new_out;
+ OptBitset tmp;
+ opt_bitset_init(f->arena, &new_out, live->words);
+ opt_bitset_init(f->arena, &tmp, live->words);
+
+ int changed;
+ do {
+ changed = 0;
+ for (u32 bi = f->nblocks; bi > 0; --bi) {
+ u32 b = bi - 1u;
+ Block* bl = &f->blocks[b];
+ OptBlockLive* lb = &live->blocks[b];
+ opt_bitset_clear(&new_out);
+ opt_bitset_clear(&tmp);
+ for (u32 s = 0; s < bl->nsucc; ++s) {
+ u32 t = bl->succ[s];
+ if (t < f->nblocks)
+ opt_bitset_union(&new_out, &live->blocks[t].live_in);
+ }
+ opt_bitset_copy(&tmp, &lb->live_use);
+ opt_bitset_union_and_not(&tmp, &new_out, &lb->live_def);
+ changed |= opt_bitset_copy(&lb->live_out, &new_out);
+ changed |= opt_bitset_copy(&lb->live_in, &tmp);
+ }
+ } while (changed);
+
+ live_recompute_metrics(live);
+}
+
+static void dump_write(Writer* w, const char* s) {
+ cfree_writer_write(w, s, strlen(s));
+}
+
+static void dump_bit(Val v, void* arg) {
+ Writer* w = (Writer*)arg;
+ char buf[32];
+ snprintf(buf, sizeof buf, " v%u", (unsigned)v);
+ dump_write(w, buf);
+}
+
+static void dump_set(Writer* w, const char* name, const OptBitset* bs) {
+ dump_write(w, " ");
+ dump_write(w, name);
+ dump_write(w, ":");
+ opt_bitset_iter_set(bs, dump_bit, w);
+ dump_write(w, "\n");
+}
+
+void opt_live_dump_blocks(Func* f, const OptLiveInfo* live, Writer* w) {
+ (void)f;
+ if (!live || !w) return;
+ for (u32 b = 0; b < live->f->nblocks; ++b) {
+ char buf[64];
+ snprintf(buf, sizeof buf, "block %u\n", (unsigned)b);
+ dump_write(w, buf);
+ dump_set(w, "use", &live->blocks[b].live_use);
+ dump_set(w, "def", &live->blocks[b].live_def);
+ dump_set(w, "in", &live->blocks[b].live_in);
+ dump_set(w, "out", &live->blocks[b].live_out);
+ }
+}
diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c
@@ -1,11 +1,10 @@
-#include "opt/opt.h"
-
#include <string.h>
#include "core/arena.h"
#include "core/core.h"
#include "core/metrics.h"
#include "core/pool.h"
+#include "opt/opt.h"
enum {
OPT_CG_TYPE_SEG_SHIFT = 6,
@@ -299,8 +298,8 @@ static const char* asm_constraint_body(const char* s) {
}
static int asm_resolve_fixed_constraint(Func* f, CGTarget* target,
- const char* constraint,
- Reg* reg_out, RegClass* cls_out) {
+ const char* constraint, Reg* reg_out,
+ RegClass* cls_out) {
const char* body = asm_constraint_body(constraint);
if (!target->resolve_reg_name) return 0;
if (body[0] != '{') return 0;
@@ -557,8 +556,7 @@ void opt_build_loop_tree(Func* f) {
if (!has_loop) continue;
for (u32 b = 0; b < f->nblocks; ++b)
- if (body[b] && f->blocks[b].loop_depth < 31)
- ++f->blocks[b].loop_depth;
+ if (body[b] && f->blocks[b].loop_depth < 31) ++f->blocks[b].loop_depth;
}
for (u32 b = 0; b < f->nblocks; ++b)
@@ -619,7 +617,8 @@ void opt_live_info(Func* f) {
u64* new_in = arena_zarray(f->arena, u64, words);
for (u32 s = 0; s < bl->nsucc; ++s) {
u32 t = bl->succ[s];
- if (t < f->nblocks) bit_or_changed(new_out, f->blocks[t].live_in, words);
+ if (t < f->nblocks)
+ bit_or_changed(new_out, f->blocks[t].live_in, words);
}
for (u32 w = 0; w < words; ++w)
new_in[w] = bl->live_use[w] | (new_out[w] & ~bl->live_def[w]);
@@ -643,8 +642,7 @@ void opt_live_info(Func* f) {
for (u32 b = 0; b < f->nblocks; ++b) {
Block* bl = &f->blocks[b];
- bl->live_after =
- arena_array(f->arena, u64*, bl->ninsts ? bl->ninsts : 1u);
+ bl->live_after = arena_array(f->arena, u64*, bl->ninsts ? bl->ninsts : 1u);
u64* live = arena_zarray(f->arena, u64, words);
copy_bits(live, bl->live_out, words);
@@ -692,8 +690,7 @@ void opt_live_info(Func* f) {
if ((IROp)in->op == IR_ASM_BLOCK)
apply_asm_register_constraints(f, in, use, def, after);
- for (u32 w = 0; w < words; ++w)
- live[w] = (live[w] & ~def[w]) | use[w];
+ for (u32 w = 0; w < words; ++w) live[w] = (live[w] & ~def[w]) | use[w];
}
}
@@ -721,7 +718,8 @@ static int val_higher_priority(Func* f, Val a, Val b) {
if (av->frequency != bv->frequency) return av->frequency > bv->frequency;
if (av->live_across_call_freq != bv->live_across_call_freq)
return av->live_across_call_freq > bv->live_across_call_freq;
- if (av->live_length != bv->live_length) return av->live_length < bv->live_length;
+ if (av->live_length != bv->live_length)
+ return av->live_length < bv->live_length;
return a < b;
}
@@ -954,8 +952,7 @@ static void rewrite_func(Func* f) {
for (Val v = 1; v < f->nvals; ++v) {
if (!bit_has(live_after[i], v) || bit_has(def, v)) continue;
if (f->val_info[v].alloc_kind == OPT_ALLOC_HARD &&
- is_caller_saved(f, f->val_info[v].cls,
- f->val_info[v].hard_reg))
+ is_caller_saved(f, f->val_info[v].cls, f->val_info[v].hard_reg))
append_store_val(f, &out, v);
}
}
@@ -965,8 +962,7 @@ static void rewrite_func(Func* f) {
for (Val v = 1; v < f->nvals; ++v) {
if (!bit_has(live_after[i], v) || bit_has(def, v)) continue;
if (f->val_info[v].alloc_kind == OPT_ALLOC_HARD &&
- is_caller_saved(f, f->val_info[v].cls,
- f->val_info[v].hard_reg))
+ is_caller_saved(f, f->val_info[v].cls, f->val_info[v].hard_reg))
append_load_val(f, &out, v);
}
}
@@ -992,12 +988,16 @@ static int all_defs_dead(Func* f, Inst* in, u64* live) {
return 1;
}
-void opt_dead_def_elim(Func* f) {
- u32 words = f->opt_live_words;
+void opt_dead_def_elim_with_live(Func* f, const OptLiveInfo* live_info) {
+ u32 words = live_info ? live_info->words : f->opt_live_words;
for (u32 b = 0; b < f->nblocks; ++b) {
Block* bl = &f->blocks[b];
u64* live = arena_zarray(f->arena, u64, words);
- for (u32 w = 0; w < words; ++w) live[w] = bl->live_out[w];
+ const u64* live_out =
+ live_info ? live_info->blocks[b].live_out.words : bl->live_out;
+ if (live_out) {
+ for (u32 w = 0; w < words; ++w) live[w] = live_out[w];
+ }
Inst* new_insts = arena_array(f->arena, Inst, bl->ninsts);
u32 w = 0;
@@ -1029,15 +1029,25 @@ void opt_dead_def_elim(Func* f) {
}
}
+void opt_dead_def_elim(Func* f) {
+ if (f->nblocks && !f->blocks[0].live_out) {
+ OptLiveInfo live;
+ opt_live_blocks(f, &live);
+ opt_dead_def_elim_with_live(f, &live);
+ return;
+ }
+ opt_dead_def_elim_with_live(f, NULL);
+}
+
void opt_regalloc(Func* f, int allow_live_range_split) {
(void)allow_live_range_split;
if (!f->val_info) {
metrics_scope_begin(f->c, "opt.live_info.regalloc");
opt_live_info(f);
metrics_count(f->c, "opt.live_words", f->opt_live_words);
- metrics_count(f->c, "opt.conflict_bytes",
- (u64)f->nvals * (u64)f->opt_conflict_words *
- (u64)sizeof(u64));
+ metrics_count(
+ f->c, "opt.conflict_bytes",
+ (u64)f->nvals * (u64)f->opt_conflict_words * (u64)sizeof(u64));
metrics_scope_end(f->c, "opt.live_info.regalloc");
}
Val* order = arena_array(f->arena, Val, f->nvals ? f->nvals : 1u);
@@ -1063,9 +1073,10 @@ void opt_regalloc(Func* f, int allow_live_range_split) {
Reg fixed = (Reg)vi->tied_hard_reg;
if (!hard_available(f, cls, fixed)) {
SrcLoc loc = {0, 0, 0};
- compiler_panic(f->c, loc,
- "opt regalloc: fixed hard reg %u unavailable in class %u",
- (unsigned)fixed, (unsigned)cls);
+ compiler_panic(
+ f->c, loc,
+ "opt regalloc: fixed hard reg %u unavailable in class %u",
+ (unsigned)fixed, (unsigned)cls);
}
if (fixed < 32 && (vi->forbidden_hard_regs & (1u << fixed))) {
SrcLoc loc = {0, 0, 0};
@@ -1075,8 +1086,7 @@ void opt_regalloc(Func* f, int allow_live_range_split) {
}
if (hard_conflicts(f, assigned, nassigned, v, fixed)) {
SrcLoc loc = {0, 0, 0};
- compiler_panic(f->c, loc,
- "opt regalloc: conflicting fixed hard reg %u",
+ compiler_panic(f->c, loc, "opt regalloc: conflicting fixed hard reg %u",
(unsigned)fixed);
}
vi->alloc_kind = OPT_ALLOC_HARD;
@@ -1172,8 +1182,7 @@ static int same_phys_reg(const Operand* a, const Operand* b) {
static int operand_uses_phys_reg(const Operand* op, const Operand* r) {
if (!op || !r || r->kind != OPK_REG) return 0;
- if (op->kind == OPK_REG)
- return op->cls == r->cls && op->v.reg == r->v.reg;
+ if (op->kind == OPK_REG) return op->cls == r->cls && op->v.reg == r->v.reg;
if (op->kind == OPK_INDIRECT)
return r->cls == RC_INT && op->v.ind.base == r->v.reg;
return 0;
@@ -1412,11 +1421,10 @@ static int find_single_direct_use(Func* f, Block* bl, u32 def_i,
total_uses += uses;
if (total_uses > 1) return 0;
for (u32 oi = 0; oi < in->nopnds; ++oi) {
- int ok = conv_fold
- ? (oi == 1 &&
- identical_convert_pair(&bl->insts[def_i], in))
- : (imm_fold ? imm_fold_slot(in, oi)
- : copy_fold_slot(in, oi));
+ int ok =
+ conv_fold
+ ? (oi == 1 && identical_convert_pair(&bl->insts[def_i], in))
+ : (imm_fold ? imm_fold_slot(in, oi) : copy_fold_slot(in, oi));
if (!ok) continue;
if (!same_phys_reg(&in->opnds[oi], def)) continue;
found_i = i;
@@ -1454,16 +1462,16 @@ static void opt_combine_fold_block(Func* f, Block* bl) {
if ((IROp)in->op == IR_COPY && in->nopnds >= 2 &&
in->opnds[0].kind == OPK_REG && in->opnds[1].kind == OPK_REG &&
!same_phys_reg(&in->opnds[0], &in->opnds[1]) &&
- find_single_direct_use(f, bl, i, &in->opnds[0], &in->opnds[1], 1, 0,
- 0, &use_i, &op_i)) {
+ find_single_direct_use(f, bl, i, &in->opnds[0], &in->opnds[1], 1, 0, 0,
+ &use_i, &op_i)) {
bl->insts[use_i].opnds[op_i] = in->opnds[1];
continue;
}
if ((IROp)in->op == IR_LOAD_IMM && in->nopnds >= 1 &&
in->opnds[0].kind == OPK_REG &&
- find_single_direct_use(f, bl, i, &in->opnds[0], NULL, 0, 1, 0,
- &use_i, &op_i)) {
+ find_single_direct_use(f, bl, i, &in->opnds[0], NULL, 0, 1, 0, &use_i,
+ &op_i)) {
Operand imm = in->opnds[0];
imm.kind = OPK_IMM;
imm.v.imm = in->extra.imm;
@@ -1473,8 +1481,8 @@ static void opt_combine_fold_block(Func* f, Block* bl) {
if ((IROp)in->op == IR_CONVERT && in->nopnds >= 2 &&
in->opnds[0].kind == OPK_REG && in->opnds[1].kind == OPK_REG &&
- find_single_direct_use(f, bl, i, &in->opnds[0], &in->opnds[1], 1, 0,
- 1, &use_i, &op_i)) {
+ find_single_direct_use(f, bl, i, &in->opnds[0], &in->opnds[1], 1, 0, 1,
+ &use_i, &op_i)) {
bl->insts[use_i].opnds[op_i] = in->opnds[1];
}
}
diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c
@@ -1,10 +1,11 @@
+#include "opt/opt.h"
+
+#include <cfree.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include <cfree.h>
-
#include "abi/abi.h"
#include "arch/arch.h"
#include "arch/regalloc.h"
@@ -12,7 +13,6 @@
#include "core/heap.h"
#include "core/pool.h"
#include "opt/ir.h"
-#include "opt/opt.h"
static void* h_alloc(CfreeHeap* h, size_t n, size_t a) {
(void)h;
@@ -48,15 +48,15 @@ static CfreeDiagSink g_diag = {diag_emit, NULL, 0, 0};
static int g_fails;
static int g_checks;
-#define EXPECT(cond, ...) \
- do { \
- ++g_checks; \
- if (!(cond)) { \
- ++g_fails; \
- fprintf(stderr, "FAIL %s:%d: ", __FILE__, __LINE__); \
- fprintf(stderr, __VA_ARGS__); \
- fputc('\n', stderr); \
- } \
+#define EXPECT(cond, ...) \
+ do { \
+ ++g_checks; \
+ if (!(cond)) { \
+ ++g_fails; \
+ fprintf(stderr, "FAIL %s:%d: ", __FILE__, __LINE__); \
+ fprintf(stderr, __VA_ARGS__); \
+ fputc('\n', stderr); \
+ } \
} while (0)
typedef struct TestCtx {
@@ -179,8 +179,7 @@ static FrameSlot add_frame_slot(Func* f, CfreeCgTypeId ty, FrameSlotKind kind,
return ir_frame_slot_new(f, &d);
}
-static Inst* emit_load_imm(Func* f, u32 b, Val dst, CfreeCgTypeId ty,
- i64 imm) {
+static Inst* emit_load_imm(Func* f, u32 b, Val dst, CfreeCgTypeId ty, i64 imm) {
Inst* in = ir_emit(f, b, IR_LOAD_IMM);
in->opnds = arena_array(f->arena, Operand, 1);
in->opnds[0] = op_reg_(dst, ty);
@@ -294,6 +293,15 @@ static int bit_has(const u64* bits, Val v) {
return (bits[v / 64u] & (1ull << (v % 64u))) != 0;
}
+static int bytes_contains(const unsigned char* data, size_t len,
+ const char* needle) {
+ size_t nlen = strlen(needle);
+ if (!data || nlen > len) return 0;
+ for (size_t i = 0; i + nlen <= len; ++i)
+ if (memcmp(data + i, needle, nlen) == 0) return 1;
+ return 0;
+}
+
static int val_conflicts(const Func* f, Val a, Val b) {
const u64* bits = f->val_conflicts + ((size_t)a * f->opt_conflict_words);
return bit_has(bits, b);
@@ -350,15 +358,15 @@ static void mock_func_begin(CGTarget* t, const CGFuncDesc* d) {
}
static void mock_func_end(CGTarget* t) { (void)t; }
-static void mock_get_allocable_regs(CGTarget* t, RegClass cls,
- const Reg** out, u32* nregs) {
+static void mock_get_allocable_regs(CGTarget* t, RegClass cls, const Reg** out,
+ u32* nregs) {
MockCGTarget* m = (MockCGTarget*)t;
*out = m->pool[cls];
*nregs = m->pool_n[cls];
}
-static void mock_get_scratch_regs(CGTarget* t, RegClass cls,
- const Reg** out, u32* nregs) {
+static void mock_get_scratch_regs(CGTarget* t, RegClass cls, const Reg** out,
+ u32* nregs) {
MockCGTarget* m = (MockCGTarget*)t;
*out = m->scratch[cls];
*nregs = m->scratch_n[cls];
@@ -370,8 +378,8 @@ static int mock_is_caller_saved(CGTarget* t, RegClass cls, Reg reg) {
return (m->caller_saved_mask[cls] & (1u << reg)) != 0;
}
-static void mock_reserve_hard_regs(CGTarget* t, RegClass cls,
- const Reg* regs, u32 n) {
+static void mock_reserve_hard_regs(CGTarget* t, RegClass cls, const Reg* regs,
+ u32 n) {
MockCGTarget* m = (MockCGTarget*)t;
if (cls < OPT_REG_CLASSES) m->reserve_calls[cls] += (int)n;
(void)regs;
@@ -382,8 +390,7 @@ static int mock_resolve_reg_name(CGTarget* t, Sym name, Reg* out,
size_t len = 0;
const char* s = pool_str(t->c->global, name, &len);
if (!s || len < 2) return 1;
- if ((s[0] != 'r' && s[0] != 'x' && s[0] != 'v') ||
- s[1] < '0' || s[1] > '9')
+ if ((s[0] != 'r' && s[0] != 'x' && s[0] != 'v') || s[1] < '0' || s[1] > '9')
return 1;
u32 n = 0;
for (size_t i = 1; i < len; ++i) {
@@ -474,9 +481,8 @@ static void mock_init(MockCGTarget* m, Compiler* c) {
m->base.resolve_reg_name = mock_resolve_reg_name;
}
-static void mock_set_pool(MockCGTarget* m, RegClass cls,
- const Reg* pool, u32 npool,
- const Reg* scratch_, u32 nscratch,
+static void mock_set_pool(MockCGTarget* m, RegClass cls, const Reg* pool,
+ u32 npool, const Reg* scratch_, u32 nscratch,
u32 caller_mask) {
m->pool[cls] = pool;
m->pool_n[cls] = npool;
@@ -524,14 +530,88 @@ static void opt_liveness_branch(void) {
opt_build_loop_tree(f);
opt_live_info(f);
- EXPECT(bit_has(f->blocks[b0].live_out, v), "v%u live_out of branch block",
- v);
+ EXPECT(bit_has(f->blocks[b0].live_out, v), "v%u live_out of branch block", v);
EXPECT(bit_has(f->blocks[b1].live_in, v), "v%u live_in true block", v);
EXPECT(bit_has(f->blocks[b2].live_in, v), "v%u live_in false block", v);
EXPECT(!bit_has(f->blocks[b1].live_out, v), "v%u dies in true block", v);
tc_fini(&tc);
}
+static void opt_block_liveness_phase1(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ Func* f = new_func(&tc);
+ u32 entry = f->entry;
+ u32 header = ir_block_new(f);
+ u32 body = ir_block_new(f);
+ u32 exit = ir_block_new(f);
+ ir_note_emit(f, header);
+ ir_note_emit(f, body);
+ ir_note_emit(f, exit);
+ Val across_branch = add_val(f, tc.i32);
+ Val across_call = add_val(f, tc.i32);
+ Val out = add_val(f, tc.i32);
+
+ emit_load_imm(f, entry, across_branch, tc.i32, 7);
+ emit_load_imm(f, entry, across_call, tc.i32, 11);
+ emit_br_to(f, entry, header);
+ emit_test_branch(f, header, body, exit, tc.i32);
+ emit_call_void(f, body);
+ emit_binop(f, body, out, across_branch, across_call, tc.i32);
+ emit_br_to(f, body, header);
+ emit_ret_val(f, exit, across_call, tc.i32);
+
+ opt_build_cfg(f);
+ opt_build_loop_tree(f);
+
+ OptLiveInfo live;
+ opt_live_blocks(f, &live);
+ EXPECT(opt_bitset_has(&live.blocks[entry].live_out, across_branch),
+ "branch value should be live-out from entry");
+ EXPECT(opt_bitset_has(&live.blocks[header].live_in, across_branch),
+ "branch value should be live-in to loop header");
+ EXPECT(opt_bitset_has(&live.blocks[body].live_in, across_branch),
+ "branch value should be live-in to loop body");
+ EXPECT(opt_bitset_has(&live.blocks[body].live_in, across_call),
+ "call-crossing value should be live-in to call block");
+ EXPECT(opt_bitset_has(&live.blocks[body].live_out, across_call),
+ "call-crossing value should remain live-out on loop backedge");
+ EXPECT(!opt_bitset_has(&live.blocks[exit].live_out, across_call),
+ "returned value should die at function exit");
+ EXPECT(f->val_conflicts == NULL,
+ "block liveness should not allocate conflict matrix");
+ EXPECT(live.block_bytes != 0, "block-liveness byte metric should be set");
+ EXPECT(live.set_bit_scans != 0,
+ "block-liveness set-bit scan metric should be set");
+
+ CfreeWriter* w = cfree_writer_mem(&g_heap);
+ opt_live_dump_blocks(f, &live, w);
+ size_t len = 0;
+ const unsigned char* bytes = cfree_writer_mem_bytes(w, &len);
+ EXPECT(bytes_contains(bytes, len, "block 0\n"),
+ "block liveness dump should include block header");
+ EXPECT(bytes_contains(bytes, len, " in:"),
+ "block liveness dump should include live-in set");
+ cfree_writer_close(w);
+
+ Func* g = new_func(&tc);
+ Val a = add_val(g, tc.i32);
+ Val dead = add_val(g, tc.i32);
+ emit_load_imm(g, g->entry, a, tc.i32, 1);
+ emit_copy(g, g->entry, dead, a, tc.i32);
+ emit_ret_val(g, g->entry, a, tc.i32);
+ opt_build_cfg(g);
+ opt_build_loop_tree(g);
+ OptLiveInfo g_live;
+ opt_live_blocks(g, &g_live);
+ opt_dead_def_elim_with_live(g, &g_live);
+ EXPECT(count_op(g, IR_COPY) == 0,
+ "DDE should consume pass-local block liveness");
+ EXPECT(g->val_conflicts == NULL,
+ "DDE with block liveness should not allocate conflict matrix");
+ tc_fini(&tc);
+}
+
static void opt_live_after_linear(void) {
TestCtx tc;
tc_init(&tc);
@@ -564,14 +644,10 @@ static void opt_live_after_linear(void) {
"a should be live after b's def");
EXPECT(bit_has(f->blocks[b].live_after[ib], bv),
"b should be live after its def");
- EXPECT(!bit_has(f->blocks[b].live_after[ic], a),
- "a should die after add");
- EXPECT(!bit_has(f->blocks[b].live_after[ic], bv),
- "b should die after add");
- EXPECT(bit_has(f->blocks[b].live_after[ic], c),
- "c should be live after add");
- EXPECT(!bit_has(f->blocks[b].live_after[ir], c),
- "return should consume c");
+ EXPECT(!bit_has(f->blocks[b].live_after[ic], a), "a should die after add");
+ EXPECT(!bit_has(f->blocks[b].live_after[ic], bv), "b should die after add");
+ EXPECT(bit_has(f->blocks[b].live_after[ic], c), "c should be live after add");
+ EXPECT(!bit_has(f->blocks[b].live_after[ir], c), "return should consume c");
tc_fini(&tc);
}
@@ -637,8 +713,7 @@ static void opt_interference_def_live_out(void) {
EXPECT(val_conflicts(f, a, bv),
"a and b should conflict while both are live");
- EXPECT(val_conflicts(f, bv, a),
- "conflicts should be symmetric");
+ EXPECT(val_conflicts(f, bv, a), "conflicts should be symmetric");
EXPECT(val_conflicts(f, a, c),
"a should conflict with c for same-instruction def/use lowering");
EXPECT(val_conflicts(f, bv, c),
@@ -921,8 +996,7 @@ static void opt_loop_tree_nested_depths(void) {
EXPECT(f->blocks[exit].loop_depth == 0,
"exit should not be in nested loops, got depth %u",
(unsigned)f->blocks[exit].loop_depth);
- EXPECT(f->blocks[inner_header].frequency >
- f->blocks[outer_header].frequency,
+ EXPECT(f->blocks[inner_header].frequency > f->blocks[outer_header].frequency,
"inner loop should be hotter than outer-only blocks");
tc_fini(&tc);
}
@@ -1007,8 +1081,8 @@ static void opt_regalloc_priority(void) {
EXPECT(f->val_info[pinned].alloc_kind == OPT_ALLOC_HARD,
"tied value should get a hard register");
EXPECT(f->val_info[pinned].hard_reg == expected_hard,
- "tied value should get hard r%u, got r%u",
- (unsigned)expected_hard, (unsigned)f->val_info[pinned].hard_reg);
+ "tied value should get hard r%u, got r%u", (unsigned)expected_hard,
+ (unsigned)f->val_info[pinned].hard_reg);
EXPECT(f->val_info[hot].alloc_kind == OPT_ALLOC_SPILL,
"overlapping untied value should spill under one-reg pressure");
tc_fini(&tc);
@@ -1110,7 +1184,8 @@ static void opt_call_clobber_caller_saved(void) {
}
}
EXPECT(saw_call_save_restore,
- "live caller-saved hard reg across call should be stored before and loaded after");
+ "live caller-saved hard reg across call should be stored before and "
+ "loaded after");
tc_fini(&tc);
}
@@ -1195,8 +1270,8 @@ static void opt_inline_asm_tied_fixed_regs(void) {
Reg expected = f->opt_hard_regs[RC_INT][0];
EXPECT(f->val_info[tied].alloc_kind == OPT_ALLOC_HARD,
"tied asm val should allocate hard");
- EXPECT(f->val_info[tied].hard_reg == expected,
- "tied asm val should get r%u", (unsigned)expected);
+ EXPECT(f->val_info[tied].hard_reg == expected, "tied asm val should get r%u",
+ (unsigned)expected);
EXPECT(aux->out_ops[0].v.reg == expected && aux->in_ops[0].v.reg == expected,
"asm tied operands should rewrite to the fixed hard reg");
tc_fini(&tc);
@@ -1431,8 +1506,9 @@ static void opt_combine_spill_peeps(void) {
st->extra.mem = mem_local_(fs, tc.i32, 4, 0);
opt_combine(g);
- EXPECT(g->blocks[g->entry].ninsts == 1,
- "spill reload followed by same-reg writeback should combine to one inst");
+ EXPECT(
+ g->blocks[g->entry].ninsts == 1,
+ "spill reload followed by same-reg writeback should combine to one inst");
EXPECT((IROp)g->blocks[g->entry].insts[0].op == IR_LOAD,
"remaining inst should be the spill reload");
tc_fini(&tc);
@@ -1920,6 +1996,7 @@ int main(void) {
opt_loop_tree_nested_depths();
opt_loop_tree_does_not_mutate_cfg();
opt_liveness_branch();
+ opt_block_liveness_phase1();
opt_live_after_linear();
opt_interference_branch_disjoint();
opt_interference_def_live_out();