kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 633d6444262a670bdf4f7517be3d0bb70cfbfcbf
parent 86f37f3912b20a33fd054e34647e7e345b19233f
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 14 May 2026 19:02:45 -0700

opt: add pass-local block liveness

Diffstat:
Mdoc/MIR_RA_REPORT.md | 36++++++++++++++++++------------------
Mdoc/PERF.md | 125++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
Msrc/opt/opt.c | 134++++++++++++++++++++++++++++++-------------------------------------------------
Msrc/opt/opt.h | 42++++++++++++++++++++++++++++++++++++++++--
Asrc/opt/pass_live.c | 333+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/opt/pass_lower.c | 88+++++++++++++++++++++++++++++++++++++++++++------------------------------------
Mtest/opt/opt_test.c | 169+++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
7 files changed, 717 insertions(+), 210 deletions(-)

diff --git a/doc/MIR_RA_REPORT.md b/doc/MIR_RA_REPORT.md @@ -535,27 +535,27 @@ coverage belongs in a later pass once the range allocator owns rewrite. ### Phase 1: Pass-Local Liveness -- [ ] Add an optimizer bitset abstraction with: - - [ ] set/clear/test - - [ ] copy - - [ ] union - - [ ] union-and-not - - [ ] change reporting - - [ ] set-bit iteration - - [ ] active-word tracking or trailing-zero trimming -- [ ] Add pass-local block liveness storage. -- [ ] Move block `use`/`def` calculation into the new liveness module. -- [ ] Compute `live_in`/`live_out` without allocating conflicts. -- [ ] Make pre-DDE use only block liveness. -- [ ] Keep old full `opt_live_info` available for regalloc during transition. -- [ ] Add dump output for block liveness. -- [ ] Add tests for branch, loop, and call liveness. +- [x] Add an optimizer bitset abstraction with: + - [x] set/clear/test + - [x] copy + - [x] union + - [x] union-and-not + - [x] change reporting + - [x] set-bit iteration + - [x] active-word tracking or trailing-zero trimming +- [x] Add pass-local block liveness storage. +- [x] Move block `use`/`def` calculation into the new liveness module. +- [x] Compute `live_in`/`live_out` without allocating conflicts. +- [x] Make pre-DDE use only block liveness. +- [x] Keep old full `opt_live_info` available for regalloc during transition. +- [x] Add dump output for block liveness. +- [x] Add tests for branch, loop, and call liveness. Exit criteria: -- [ ] Existing O1 behavior is unchanged. -- [ ] Pre-DDE no longer builds `val_conflicts`. -- [ ] Metrics show separate block-liveness timing and no pre-DDE conflict bytes. +- [x] Existing O1 behavior is unchanged. +- [x] Pre-DDE no longer builds `val_conflicts`. +- [x] Metrics show separate block-liveness timing and no pre-DDE conflict bytes. ### Phase 2: Live Range Model diff --git a/doc/PERF.md b/doc/PERF.md @@ -28,7 +28,9 @@ Important counters: - `compile.input_bytes` - `compile.obj_sections`, `compile.obj_relocs` - `opt.funcs`, `opt.blocks`, `opt.insts`, `opt.vals` -- `opt.live_words`, `opt.conflict_bytes` +- `opt.live_words`, `opt.live.blocks`, `opt.live.active_words` +- `opt.live.block_bytes`, `opt.live.set_bit_scans` +- `opt.conflict_bytes` - `link.inputs`, `link.sections`, `link.segments`, `link.syms`, `link.relocs` - `jit.master_size`, `jit.nsegments`, `jit.input_section_bytes`, `jit.segment_bytes` @@ -41,12 +43,19 @@ Times are `min / p50 / p95 / mean / max` in milliseconds. `int main(){return 0;}` ```text -run.total 0.423 / 0.448 / 0.506 / 0.520 / 1.889 -run.compile_and_jit 0.395 / 0.416 / 0.473 / 0.469 / 1.427 -compile.tu 0.254 / 0.267 / 0.304 / 0.306 / 1.015 -link_jit.all 0.124 / 0.132 / 0.149 / 0.140 / 0.287 -opt.o1.total 0.100 / 0.106 / 0.121 / 0.108 / 0.128 -link.resolve.total 0.064 / 0.067 / 0.076 / 0.072 / 0.159 +run.total 0.298 / 0.312 / 0.344 / 0.319 / 0.390 +run.compile_and_jit 0.283 / 0.295 / 0.323 / 0.300 / 0.346 +compile.tu 0.209 / 0.222 / 0.242 / 0.224 / 0.244 +compile.c.parse_codegen + 0.073 / 0.081 / 0.092 / 0.082 / 0.095 +opt.o1.total 0.040 / 0.048 / 0.057 / 0.048 / 0.059 +opt.live_blocks.pre_dde + 0.008 / 0.008 / 0.017 / 0.010 / 0.019 +opt.live_info.regalloc + 0.003 / 0.003 / 0.004 / 0.004 / 0.007 +opt.regalloc 0.008 / 0.012 / 0.016 / 0.012 / 0.019 +link_jit.all 0.043 / 0.049 / 0.058 / 0.051 / 0.059 +link.resolve.total 0.020 / 0.021 / 0.027 / 0.023 / 0.029 ``` One small loop: @@ -56,12 +65,19 @@ int main(){int s=0; for(int i=0;i<10;i++) s+=i; return s==45?0:1;} ``` ```text -run.total 0.500 / 0.522 / 0.539 / 0.525 / 0.618 -run.compile_and_jit 0.471 / 0.492 / 0.510 / 0.495 / 0.582 -compile.tu 0.324 / 0.340 / 0.356 / 0.345 / 0.428 -opt.o1.total 0.148 / 0.154 / 0.160 / 0.154 / 0.166 -link_jit.all 0.127 / 0.133 / 0.137 / 0.133 / 0.137 -link.resolve.total 0.064 / 0.066 / 0.068 / 0.066 / 0.069 +run.total 0.351 / 0.365 / 0.394 / 0.371 / 0.409 +run.compile_and_jit 0.334 / 0.349 / 0.377 / 0.353 / 0.389 +compile.tu 0.266 / 0.277 / 0.292 / 0.279 / 0.315 +compile.c.parse_codegen + 0.131 / 0.140 / 0.155 / 0.141 / 0.157 +opt.o1.total 0.077 / 0.083 / 0.094 / 0.084 / 0.095 +opt.live_blocks.pre_dde + 0.011 / 0.012 / 0.018 / 0.013 / 0.018 +opt.live_info.regalloc + 0.016 / 0.016 / 0.016 / 0.016 / 0.017 +opt.regalloc 0.027 / 0.028 / 0.034 / 0.029 / 0.034 +link_jit.all 0.043 / 0.046 / 0.057 / 0.049 / 0.064 +link.resolve.total 0.021 / 0.021 / 0.028 / 0.023 / 0.031 ``` For tiny programs, compile dominates, with O1 and JIT/link image setup as the @@ -137,6 +153,74 @@ This indicates that "many functions" is not the current scaling problem. The problem is a single large function whose liveness/conflict structures become large enough to bend past linear. +### Phase-1 Pass-Local Liveness Rerun + +After `doc/MIR_RA_REPORT.md` phase 1, pre-DDE uses pass-local block liveness +instead of full `opt_live_info`. Regalloc still uses the old dense liveness and +conflict path. The synthetic source generator used here is the same family as +the tables above, but these are fresh generated inputs rather than byte-for-byte +identical files, so compare trends and bucket shapes more than individual rows. + +Samples: 5 runs per point. The table uses p50 milliseconds. O1 scopes and +O1 counters are summed across functions. `pre_block_bytes` is +`opt.live.block_bytes` from `opt.live_blocks.pre_dde`; `reg_conflict_bytes` is +the remaining `opt.conflict_bytes` from `opt.live_info.regalloc`. + +Straight-line main: + +```text +funcs input_bytes insts vals pre_block_bytes reg_conflict_bytes run.total compile.tu parse_codegen opt.o1.total live_blocks_pre live_reg regalloc link_jit +1 372 91 72 320 576 0.737 0.627 0.465 0.297 0.029 0.128 0.171 0.062 +4 1327 322 270 608 2160 1.538 1.421 1.277 0.837 0.062 0.442 0.578 0.058 +16 5236 1246 1062 1984 9056 4.861 4.756 4.610 3.085 0.174 1.787 2.285 0.056 +64 21028 4942 4230 7264 42224 19.542 19.386 19.249 13.377 0.665 8.199 10.403 0.073 +128 42420 9870 8454 14304 100784 42.077 41.830 41.693 30.284 1.314 19.476 24.434 0.107 +256 85940 19726 16902 28384 267056 98.532 97.987 97.845 74.982 2.603 51.347 63.176 0.157 +512 172980 39438 33798 56544 796208 254.219 252.884 252.723 206.802 5.174 152.154 183.374 0.269 +1024 347352 78862 67590 112864 2640944 742.562 740.519 740.362 646.586 10.425 502.879 599.496 0.436 +``` + +Function-table main: + +```text +funcs input_bytes insts vals pre_block_bytes reg_conflict_bytes run.total compile.tu parse_codegen opt.o1.total live_blocks_pre live_reg regalloc link_jit +1 461 112 83 576 664 0.694 0.602 0.465 0.285 0.023 0.129 0.173 0.058 +4 1392 328 269 864 2152 1.513 1.386 1.251 0.805 0.053 0.434 0.565 0.058 +16 5200 1192 1013 2016 8104 4.801 4.687 4.550 3.030 0.194 1.697 2.189 0.060 +64 20560 4648 3989 6624 31912 17.431 17.261 17.111 11.452 0.703 6.560 8.487 0.079 +128 41349 9256 7957 12768 63656 34.170 33.918 33.777 22.640 1.289 13.077 16.946 0.114 +256 83589 18472 15893 25056 127144 67.625 67.144 67.004 45.065 2.567 26.043 33.724 0.156 +512 168069 36904 31765 49632 254120 134.804 134.093 133.941 89.938 5.150 51.976 67.348 0.250 +1024 337298 73768 63509 98784 508072 272.397 271.171 271.026 179.323 10.224 103.779 134.391 0.402 +``` + +The 512 to 1024 step now separates the fixed part from the remaining problem: + +```text +straight_main: +compile.tu 252.884 -> 740.519 ms 2.93x +opt.o1.total 206.802 -> 646.586 ms 3.13x +opt.live_blocks.pre_dde 5.174 -> 10.425 ms 2.01x +opt.live_info.regalloc 152.154 -> 502.879 ms 3.31x +opt.regalloc 183.374 -> 599.496 ms 3.27x +pre_block_bytes 56544 -> 112864 2.00x +reg_conflict_bytes 796208 -> 2640944 3.32x + +table_main: +compile.tu 134.093 -> 271.171 ms 2.02x +opt.o1.total 89.938 -> 179.323 ms 1.99x +opt.live_blocks.pre_dde 5.150 -> 10.224 ms 1.99x +opt.live_info.regalloc 51.976 -> 103.779 ms 2.00x +opt.regalloc 67.348 -> 134.391 ms 2.00x +pre_block_bytes 49632 -> 98784 1.99x +reg_conflict_bytes 254120 -> 508072 2.00x +``` + +The phase-1 result is the intended partial fix: pre-DDE no longer allocates a +conflict matrix and its block-live storage scales linearly on the large +straight-line case. The remaining superlinear behavior is now concentrated in +the regalloc-side full liveness/conflict path. + ## Compile Perf `compile.tu` is now split into generic and C-specific scopes: @@ -256,15 +340,16 @@ Next useful compile instrumentation: ## Performance Priorities -1. Make `opt_live_info` cheaper for large functions. - The same liveness computation currently runs before DDE and again inside - regalloc. The measurements show both passes growing superlinearly on the - large straight-line function. +1. Move regalloc off full `opt_live_info`. + Phase 1 removed the full liveness/conflict build from pre-DDE. The remaining + superlinear bucket is now `opt.live_info.regalloc`, which still builds + `live_after` and the dense conflict matrix before assignment/rewrite. 2. Replace or narrow dense conflict structures. - `opt.conflict_bytes` tracks the observed curve closely. Investigate sparse - sets, segmented bitsets, per-block live sets, or interval-style structures - that avoid touching full dense rows for values that cannot overlap. + Regalloc-side `opt.conflict_bytes` still tracks the observed curve closely. + Investigate sparse sets, segmented bitsets, per-block live sets, or + interval-style structures that avoid touching full dense rows for values that + cannot overlap. 3. Avoid whole-function contiguous growth where hot passes scan repeatedly. Large `Val`/block/instruction arrays and dense bit matrices are likely to diff --git a/src/opt/opt.c b/src/opt/opt.c @@ -34,8 +34,8 @@ typedef struct OptImpl { /* Current function being recorded. NULL between functions. */ Func* f; - u32 cur; /* current block id */ - SrcLoc pending_loc; /* most recent set_loc; stamped on each Inst */ + u32 cur; /* current block id */ + SrcLoc pending_loc; /* most recent set_loc; stamped on each Inst */ Writer* dump_writer; } OptImpl; @@ -44,8 +44,8 @@ static OptImpl* impl_of(CGTarget* t) { return (OptImpl*)t; } static _Noreturn void panic_unsupported(OptImpl* o, const char* what) { SrcLoc loc = {0, 0, 0}; - compiler_panic(o->c, loc, - "opt_cgtarget: %s called under unbounded virtuals", what); + compiler_panic(o->c, loc, "opt_cgtarget: %s called under unbounded virtuals", + what); } /* ---- recording helpers ---- */ @@ -113,9 +113,7 @@ static void set_cur(OptImpl* o, u32 b) { /* After emitting a terminator, allocate a fresh block for any * subsequent (likely unreachable) recording. */ -static void after_terminator(OptImpl* o) { - set_cur(o, ir_block_new(o->f)); -} +static void after_terminator(OptImpl* o) { set_cur(o, ir_block_new(o->f)); } /* ---- function lifecycle ---- */ @@ -162,8 +160,8 @@ static void w_reload_reg(CGTarget* t, Operand dst, FrameSlot s, MemAccess m) { panic_unsupported(impl_of(t), "reload_reg"); } -static void w_get_allocable_regs(CGTarget* t, RegClass cls, - const Reg** out, u32* nregs) { +static void w_get_allocable_regs(CGTarget* t, RegClass cls, const Reg** out, + u32* nregs) { CGTarget* wr = impl_of(t)->target; if (wr->get_allocable_regs) wr->get_allocable_regs(wr, cls, out, nregs); @@ -173,8 +171,8 @@ static void w_get_allocable_regs(CGTarget* t, RegClass cls, } } -static void w_get_scratch_regs(CGTarget* t, RegClass cls, - const Reg** out, u32* nregs) { +static void w_get_scratch_regs(CGTarget* t, RegClass cls, const Reg** out, + u32* nregs) { CGTarget* wr = impl_of(t)->target; if (wr->get_scratch_regs) wr->get_scratch_regs(wr, cls, out, nregs); @@ -186,23 +184,20 @@ static void w_get_scratch_regs(CGTarget* t, RegClass cls, static int w_is_caller_saved(CGTarget* t, RegClass cls, Reg r) { CGTarget* wr = impl_of(t)->target; - if (wr->is_caller_saved) - return wr->is_caller_saved(wr, cls, r); + if (wr->is_caller_saved) return wr->is_caller_saved(wr, cls, r); return 0; } -static void w_reserve_hard_regs(CGTarget* t, RegClass cls, - const Reg* regs, u32 n) { +static void w_reserve_hard_regs(CGTarget* t, RegClass cls, const Reg* regs, + u32 n) { CGTarget* wr = impl_of(t)->target; - if (wr->reserve_hard_regs) - wr->reserve_hard_regs(wr, cls, regs, n); + if (wr->reserve_hard_regs) wr->reserve_hard_regs(wr, cls, regs, n); } static int w_resolve_reg_name(CGTarget* t, Sym name, Reg* out, RegClass* cls_out) { CGTarget* wr = impl_of(t)->target; - if (wr->resolve_reg_name) - return wr->resolve_reg_name(wr, name, out, cls_out); + if (wr->resolve_reg_name) return wr->resolve_reg_name(wr, name, out, cls_out); return 1; } @@ -218,8 +213,7 @@ static void w_label_place(CGTarget* t, Label l) { u32 target_blk = (u32)l; if (target_blk >= o->f->nblocks) { SrcLoc loc = {0, 0, 0}; - compiler_panic(o->c, loc, "opt: label_place(%u) out of range", - (unsigned)l); + compiler_panic(o->c, loc, "opt: label_place(%u) out of range", (unsigned)l); } if (!cur_terminated(o)) { Block* cb = &o->f->blocks[o->cur]; @@ -739,12 +733,10 @@ static void w_intrinsic(CGTarget* t, IntrinKind kind, Operand* dsts, u32 nd, in->ndefs = nd; in->defs = arena_array(o->f->arena, Val, nd); for (u32 i = 0; i < nd; ++i) { - in->defs[i] = - (dsts[i].kind == OPK_REG) ? (Val)dsts[i].v.reg : VAL_NONE; + in->defs[i] = (dsts[i].kind == OPK_REG) ? (Val)dsts[i].v.reg : VAL_NONE; if (in->defs[i] != VAL_NONE && in->defs[i] < o->f->nvals) { o->f->val_def_block[in->defs[i]] = o->cur; - o->f->val_def_inst[in->defs[i]] = - o->f->blocks[o->cur].ninsts - 1u; + o->f->val_def_inst[in->defs[i]] = o->f->blocks[o->cur].ninsts - 1u; } } in->def = in->defs[0]; @@ -1049,11 +1041,10 @@ static void replay_inst(ReplayCtx* r, u32 b, Inst* in) { if (cd.nargs) { args = arena_array(r->f->arena, CGABIValue, cd.nargs); for (u32 k = 0; k < cd.nargs; ++k) { - CGABIPart* parts = - aux->desc.args[k].nparts - ? arena_array(r->f->arena, CGABIPart, - aux->desc.args[k].nparts) - : NULL; + CGABIPart* parts = aux->desc.args[k].nparts + ? arena_array(r->f->arena, CGABIPart, + aux->desc.args[k].nparts) + : NULL; args[k] = xlat_abivalue(r, &aux->desc.args[k], parts); } cd.args = args; @@ -1061,9 +1052,8 @@ static void replay_inst(ReplayCtx* r, u32 b, Inst* in) { cd.args = NULL; } CGABIPart* ret_parts = - cd.ret.nparts - ? arena_array(r->f->arena, CGABIPart, cd.ret.nparts) - : NULL; + cd.ret.nparts ? arena_array(r->f->arena, CGABIPart, cd.ret.nparts) + : NULL; cd.ret = xlat_abivalue(r, &aux->desc.ret, ret_parts); w->call(w, &cd); break; @@ -1088,10 +1078,9 @@ static void replay_inst(ReplayCtx* r, u32 b, Inst* in) { if (!aux || !aux->present) { w->ret(w, NULL); } else { - CGABIPart* parts = - aux->val.nparts - ? arena_array(r->f->arena, CGABIPart, aux->val.nparts) - : NULL; + CGABIPart* parts = aux->val.nparts ? arena_array(r->f->arena, CGABIPart, + aux->val.nparts) + : NULL; CGABIValue v = xlat_abivalue(r, &aux->val, parts); w->ret(w, &v); } @@ -1246,7 +1235,8 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) { if (w->get_allocable_regs) w->get_allocable_regs(w, (RegClass)cidx, &regs, &nregs); if (regs && nregs) - cg_simple_regalloc_set_ordered(&r.regalloc, (RegClass)cidx, regs, nregs); + cg_simple_regalloc_set_ordered(&r.regalloc, (RegClass)cidx, regs, + nregs); } } @@ -1299,7 +1289,10 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) { Reg hr = f->val_info[v].hard_reg; int already = 0; for (u32 i = 0; i < nused; ++i) { - if (used[i] == hr) { already = 1; break; } + if (used[i] == hr) { + already = 1; + break; + } } if (!already) used[nused++] = hr; } @@ -1307,7 +1300,10 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) { Reg hr = f->opt_scratch_regs[c][i]; int already = 0; for (u32 j = 0; j < nused; ++j) { - if (used[j] == hr) { already = 1; break; } + if (used[j] == hr) { + already = 1; + break; + } } if (!already && nused < OPT_MAX_HARD_REGS) used[nused++] = hr; } @@ -1340,35 +1336,6 @@ static u64 func_inst_count(Func* f) { return n; } -static u64 bitset_active_words(const u64* bits, u32 words) { - u64 n = 0; - if (!bits) return 0; - for (u32 i = 0; i < words; ++i) - if (bits[i]) ++n; - return n; -} - -static u64 func_live_active_words(Func* f) { - u64 n = 0; - if (!f) return 0; - for (u32 b = 0; b < f->nblocks; ++b) { - Block* bl = &f->blocks[b]; - n += bitset_active_words(bl->live_in, f->opt_live_words); - n += bitset_active_words(bl->live_out, f->opt_live_words); - n += bitset_active_words(bl->live_use, f->opt_live_words); - n += bitset_active_words(bl->live_def, f->opt_live_words); - } - return n; -} - -static u64 func_live_value_count(Func* f) { - u64 n = 0; - if (!f || !f->val_info) return 0; - for (Val v = 1; v < f->nvals; ++v) - if (f->val_info[v].first_pos) ++n; - return n; -} - static int inst_spill_local(Func* f, const Inst* in, u32 op_idx) { FrameSlot fs; if (!f || !in || op_idx >= in->nopnds) return 0; @@ -1433,22 +1400,21 @@ static void w_func_end(CGTarget* t) { metrics_scope_begin(o->c, "opt.build_loop_tree"); opt_build_loop_tree(o->f); metrics_scope_end(o->c, "opt.build_loop_tree"); - metrics_scope_begin(o->c, "opt.live_info.pre_dde"); - opt_live_info(o->f); + metrics_scope_begin(o->c, "opt.live_blocks.pre_dde"); + OptLiveInfo live; + opt_live_blocks(o->f, &live); metrics_count(o->c, "opt.live_words", o->f->opt_live_words); metrics_count(o->c, "opt.live.blocks", o->f->nblocks); - metrics_count(o->c, "opt.live.active_words", - func_live_active_words(o->f)); - metrics_count(o->c, "opt.ranges", func_live_value_count(o->f)); - metrics_count(o->c, "opt.range_points", o->f->opt_position_count); - metrics_count(o->c, "opt.conflict_bytes", - (u64)o->f->nvals * (u64)o->f->opt_conflict_words * - (u64)sizeof(u64)); - metrics_scope_end(o->c, "opt.live_info.pre_dde"); + metrics_count(o->c, "opt.live.active_words", live.active_words); + metrics_count(o->c, "opt.live.block_bytes", live.block_bytes); + metrics_count(o->c, "opt.live.set_bit_scans", live.set_bit_scans); + metrics_count(o->c, "opt.ranges", 0); + metrics_count(o->c, "opt.range_points", 0); + metrics_count(o->c, "opt.conflict_bytes", 0); + metrics_scope_end(o->c, "opt.live_blocks.pre_dde"); metrics_scope_begin(o->c, "opt.dead_def_elim"); - opt_dead_def_elim(o->f); + opt_dead_def_elim_with_live(o->f, &live); metrics_scope_end(o->c, "opt.dead_def_elim"); - o->f->val_info = NULL; /* force opt_regalloc to recompute liveness */ metrics_scope_begin(o->c, "opt.regalloc"); opt_regalloc(o->f, 0); metrics_count(o->c, "opt.alloc.used_loc_words", 0); @@ -1531,9 +1497,9 @@ CGTarget* opt_cgtarget_new(Compiler* c, CGTarget* target, int level) { t->reload_reg = w_reload_reg; t->get_allocable_regs = w_get_allocable_regs; - t->get_scratch_regs = w_get_scratch_regs; - t->is_caller_saved = w_is_caller_saved; - t->reserve_hard_regs = w_reserve_hard_regs; + t->get_scratch_regs = w_get_scratch_regs; + t->is_caller_saved = w_is_caller_saved; + t->reserve_hard_regs = w_reserve_hard_regs; t->label_new = w_label_new; t->label_place = w_label_place; diff --git a/src/opt/opt.h b/src/opt/opt.h @@ -67,12 +67,50 @@ void opt_cleanup(Func*); * per-OS, so it takes the full Target. */ void opt_machinize(Func*, CGTarget* target); void opt_build_loop_tree(Func*); + +typedef struct OptBitset { + u64* words; + u32 nwords; + u32 active_words; +} OptBitset; + +typedef struct OptBlockLive { + OptBitset live_in; + OptBitset live_out; + OptBitset live_use; + OptBitset live_def; +} OptBlockLive; + +typedef struct OptLiveInfo { + Arena* arena; + Func* f; + u32 words; + u64 active_words; + u64 block_bytes; + u64 set_bit_scans; + OptBlockLive* blocks; +} OptLiveInfo; + +typedef void (*OptBitsetIterFn)(Val, void*); + +void opt_bitset_clear(OptBitset*); +void opt_bitset_set(OptBitset*, Val); +void opt_bitset_clear_bit(OptBitset*, Val); +int opt_bitset_has(const OptBitset*, Val); +int opt_bitset_copy(OptBitset*, const OptBitset*); +int opt_bitset_union(OptBitset*, const OptBitset*); +int opt_bitset_union_and_not(OptBitset*, const OptBitset*, const OptBitset*); +void opt_bitset_iter_set(const OptBitset*, OptBitsetIterFn, void* arg); + +void opt_live_blocks(Func*, OptLiveInfo*); +void opt_live_dump_blocks(Func*, const OptLiveInfo*, Writer*); void opt_live_info(Func*); void opt_coalesce(Func*); void opt_regalloc(Func*, int allow_live_range_split); -void opt_combine(Func*); /* code selection: merge dependent insns */ -void opt_dce(Func*); /* post-RA DCE */ +void opt_combine(Func*); /* code selection: merge dependent insns */ +void opt_dce(Func*); /* post-RA DCE */ void opt_dead_def_elim(Func*); /* pre-RA dead-definition elimination */ +void opt_dead_def_elim_with_live(Func*, const OptLiveInfo*); /* Walks the lowered IR and drives a target CGTarget to emit machine code into * its ObjBuilder. Inserts prolog/epilog. Splits long insns where the target diff --git a/src/opt/pass_live.c b/src/opt/pass_live.c @@ -0,0 +1,333 @@ +#include <stdio.h> +#include <string.h> + +#include "core/arena.h" +#include "opt/opt.h" + +static u32 opt_bit_words(u32 nvals) { return (nvals + 63u) / 64u; } + +static void opt_bitset_init(Arena* arena, OptBitset* bs, u32 nwords) { + memset(bs, 0, sizeof *bs); + bs->nwords = nwords; + bs->words = arena_zarray(arena, u64, nwords ? nwords : 1u); +} + +static void opt_bitset_trim(OptBitset* bs) { + u32 n = bs->nwords; + while (n && bs->words[n - 1u] == 0) --n; + bs->active_words = n; +} + +void opt_bitset_clear(OptBitset* bs) { + if (!bs || !bs->words) return; + for (u32 i = 0; i < bs->active_words; ++i) bs->words[i] = 0; + bs->active_words = 0; +} + +void opt_bitset_set(OptBitset* bs, Val v) { + if (!bs || !bs->words) return; + u32 w = v / 64u; + if (w >= bs->nwords) return; + bs->words[w] |= 1ull << (v % 64u); + if (bs->active_words <= w) bs->active_words = w + 1u; +} + +void opt_bitset_clear_bit(OptBitset* bs, Val v) { + if (!bs || !bs->words) return; + u32 w = v / 64u; + if (w >= bs->active_words) return; + bs->words[w] &= ~(1ull << (v % 64u)); + if (w + 1u == bs->active_words && bs->words[w] == 0) opt_bitset_trim(bs); +} + +int opt_bitset_has(const OptBitset* bs, Val v) { + if (!bs || !bs->words) return 0; + u32 w = v / 64u; + if (w >= bs->active_words) return 0; + return (bs->words[w] & (1ull << (v % 64u))) != 0; +} + +int opt_bitset_copy(OptBitset* dst, const OptBitset* src) { + int changed = 0; + u32 n; + if (!dst || !src || !dst->words || !src->words) return 0; + n = dst->nwords < src->nwords ? dst->nwords : src->nwords; + for (u32 i = 0; i < n; ++i) { + changed |= dst->words[i] != src->words[i]; + dst->words[i] = src->words[i]; + } + for (u32 i = n; i < dst->active_words; ++i) { + changed |= dst->words[i] != 0; + dst->words[i] = 0; + } + opt_bitset_trim(dst); + return changed; +} + +int opt_bitset_union(OptBitset* dst, const OptBitset* src) { + int changed = 0; + u32 n; + if (!dst || !src || !dst->words || !src->words) return 0; + n = dst->nwords < src->active_words ? dst->nwords : src->active_words; + for (u32 i = 0; i < n; ++i) { + u64 old = dst->words[i]; + dst->words[i] |= src->words[i]; + changed |= dst->words[i] != old; + } + if (changed) opt_bitset_trim(dst); + return changed; +} + +int opt_bitset_union_and_not(OptBitset* dst, const OptBitset* src, + const OptBitset* not_bits) { + int changed = 0; + u32 n; + if (!dst || !src || !dst->words || !src->words) return 0; + n = dst->nwords < src->active_words ? dst->nwords : src->active_words; + for (u32 i = 0; i < n; ++i) { + u64 mask = + (not_bits && i < not_bits->active_words) ? not_bits->words[i] : 0; + u64 old = dst->words[i]; + dst->words[i] |= src->words[i] & ~mask; + changed |= dst->words[i] != old; + } + if (changed) opt_bitset_trim(dst); + return changed; +} + +void opt_bitset_iter_set(const OptBitset* bs, OptBitsetIterFn fn, void* arg) { + if (!bs || !bs->words || !fn) return; + for (u32 w = 0; w < bs->active_words; ++w) { + u64 bits = bs->words[w]; + while (bits) { + u32 bit = (u32)__builtin_ctzll(bits); + fn(w * 64u + bit, arg); + bits &= bits - 1u; + } + } +} + +typedef void (*LiveOperandWalkFn)(Func*, Inst*, Operand*, int is_def, void*); + +static int live_val_in_inst_defs(const Inst* in, Val v) { + if (v == VAL_NONE) return 0; + if (in->def == v) return 1; + for (u32 i = 0; i < in->ndefs; ++i) + if (in->defs[i] == v) return 1; + return 0; +} + +static void live_walk_operand(Func* f, Inst* in, Operand* op, int is_def, + LiveOperandWalkFn fn, void* ctx) { + if (!op) return; + if (op->kind == OPK_REG) { + fn(f, in, op, is_def, ctx); + } else if (op->kind == OPK_INDIRECT) { + Operand base = *op; + base.kind = OPK_REG; + base.cls = RC_INT; + base.v.reg = op->v.ind.base; + fn(f, in, &base, 0, ctx); + } +} + +static void live_walk_abivalue(Func* f, Inst* in, CGABIValue* v, + int storage_def, LiveOperandWalkFn fn, + void* ctx) { + live_walk_operand(f, in, &v->storage, storage_def, fn, ctx); + for (u32 i = 0; i < v->nparts; ++i) + live_walk_operand(f, in, (Operand*)&v->parts[i].op, storage_def, fn, ctx); +} + +static void live_walk_inst_operands(Func* f, Inst* in, LiveOperandWalkFn fn, + void* ctx) { + for (u32 i = 0; i < in->nopnds; ++i) { + int is_def = 0; + if (in->opnds[i].kind == OPK_REG) + is_def = i == 0 && live_val_in_inst_defs(in, (Val)in->opnds[i].v.reg); + if (!is_def) live_walk_operand(f, in, &in->opnds[i], 0, fn, ctx); + } + for (u32 i = 0; i < in->nopnds; ++i) { + int is_def = 0; + if (in->opnds[i].kind == OPK_REG) + is_def = i == 0 && live_val_in_inst_defs(in, (Val)in->opnds[i].v.reg); + if (is_def) live_walk_operand(f, in, &in->opnds[i], 1, fn, ctx); + } + + switch ((IROp)in->op) { + case IR_CALL: { + IRCallAux* aux = (IRCallAux*)in->extra.aux; + if (!aux) break; + live_walk_operand(f, in, &aux->desc.callee, 0, fn, ctx); + for (u32 i = 0; i < aux->desc.nargs; ++i) + live_walk_abivalue(f, in, (CGABIValue*)&aux->desc.args[i], 0, fn, ctx); + live_walk_abivalue(f, in, &aux->desc.ret, 1, fn, ctx); + break; + } + case IR_RET: { + IRRetAux* aux = (IRRetAux*)in->extra.aux; + if (aux && aux->present) live_walk_abivalue(f, in, &aux->val, 0, fn, ctx); + break; + } + case IR_SCOPE_BEGIN: { + IRScopeAux* aux = (IRScopeAux*)in->extra.aux; + if (aux) live_walk_operand(f, in, &aux->desc.cond, 0, fn, ctx); + break; + } + case IR_ASM_BLOCK: { + IRAsmAux* aux = (IRAsmAux*)in->extra.aux; + if (!aux) break; + for (u32 i = 0; i < aux->nin; ++i) + live_walk_operand(f, in, &aux->in_ops[i], 0, fn, ctx); + for (u32 i = 0; i < aux->nout; ++i) + live_walk_operand(f, in, &aux->out_ops[i], 1, fn, ctx); + break; + } + case IR_INTRINSIC: { + IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; + if (!aux) break; + for (u32 i = 0; i < aux->narg; ++i) + live_walk_operand(f, in, &aux->args[i], 0, fn, ctx); + for (u32 i = 0; i < aux->ndst; ++i) + live_walk_operand(f, in, &aux->dsts[i], 1, fn, ctx); + break; + } + default: + break; + } +} + +typedef struct LiveUseDefCtx { + OptBitset* use; + OptBitset* def; +} LiveUseDefCtx; + +static void live_collect_use_def(Func* f, Inst* in, Operand* op, int is_def, + void* arg) { + (void)in; + LiveUseDefCtx* c = (LiveUseDefCtx*)arg; + if (op->kind != OPK_REG) return; + Val v = (Val)op->v.reg; + if (v == VAL_NONE || v >= f->nvals) return; + if (is_def) { + opt_bitset_set(c->def, v); + } else if (!opt_bitset_has(c->def, v)) { + opt_bitset_set(c->use, v); + } +} + +static void live_count_bit(Val v, void* arg) { + (void)v; + ++*(u64*)arg; +} + +static u64 live_count_set_bits(const OptBitset* bs) { + u64 n = 0; + opt_bitset_iter_set(bs, live_count_bit, &n); + return n; +} + +static void live_recompute_metrics(OptLiveInfo* live) { + live->active_words = 0; + live->block_bytes = 0; + live->set_bit_scans = 0; + for (u32 b = 0; b < live->f->nblocks; ++b) { + OptBlockLive* bl = &live->blocks[b]; + live->active_words += bl->live_in.active_words; + live->active_words += bl->live_out.active_words; + live->active_words += bl->live_use.active_words; + live->active_words += bl->live_def.active_words; + live->set_bit_scans += live_count_set_bits(&bl->live_in); + live->set_bit_scans += live_count_set_bits(&bl->live_out); + live->set_bit_scans += live_count_set_bits(&bl->live_use); + live->set_bit_scans += live_count_set_bits(&bl->live_def); + } + live->block_bytes = + (u64)live->f->nblocks * 4u * (u64)live->words * (u64)sizeof(u64); +} + +void opt_live_blocks(Func* f, OptLiveInfo* live) { + memset(live, 0, sizeof *live); + live->arena = f->arena; + live->f = f; + live->words = opt_bit_words(f->nvals); + f->opt_live_words = (u16)live->words; + live->blocks = + arena_zarray(f->arena, OptBlockLive, f->nblocks ? f->nblocks : 1u); + + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + OptBlockLive* lb = &live->blocks[b]; + opt_bitset_init(f->arena, &lb->live_in, live->words); + opt_bitset_init(f->arena, &lb->live_out, live->words); + opt_bitset_init(f->arena, &lb->live_use, live->words); + opt_bitset_init(f->arena, &lb->live_def, live->words); + + LiveUseDefCtx ctx; + ctx.use = &lb->live_use; + ctx.def = &lb->live_def; + for (u32 i = 0; i < bl->ninsts; ++i) + live_walk_inst_operands(f, &bl->insts[i], live_collect_use_def, &ctx); + } + + OptBitset new_out; + OptBitset tmp; + opt_bitset_init(f->arena, &new_out, live->words); + opt_bitset_init(f->arena, &tmp, live->words); + + int changed; + do { + changed = 0; + for (u32 bi = f->nblocks; bi > 0; --bi) { + u32 b = bi - 1u; + Block* bl = &f->blocks[b]; + OptBlockLive* lb = &live->blocks[b]; + opt_bitset_clear(&new_out); + opt_bitset_clear(&tmp); + for (u32 s = 0; s < bl->nsucc; ++s) { + u32 t = bl->succ[s]; + if (t < f->nblocks) + opt_bitset_union(&new_out, &live->blocks[t].live_in); + } + opt_bitset_copy(&tmp, &lb->live_use); + opt_bitset_union_and_not(&tmp, &new_out, &lb->live_def); + changed |= opt_bitset_copy(&lb->live_out, &new_out); + changed |= opt_bitset_copy(&lb->live_in, &tmp); + } + } while (changed); + + live_recompute_metrics(live); +} + +static void dump_write(Writer* w, const char* s) { + cfree_writer_write(w, s, strlen(s)); +} + +static void dump_bit(Val v, void* arg) { + Writer* w = (Writer*)arg; + char buf[32]; + snprintf(buf, sizeof buf, " v%u", (unsigned)v); + dump_write(w, buf); +} + +static void dump_set(Writer* w, const char* name, const OptBitset* bs) { + dump_write(w, " "); + dump_write(w, name); + dump_write(w, ":"); + opt_bitset_iter_set(bs, dump_bit, w); + dump_write(w, "\n"); +} + +void opt_live_dump_blocks(Func* f, const OptLiveInfo* live, Writer* w) { + (void)f; + if (!live || !w) return; + for (u32 b = 0; b < live->f->nblocks; ++b) { + char buf[64]; + snprintf(buf, sizeof buf, "block %u\n", (unsigned)b); + dump_write(w, buf); + dump_set(w, "use", &live->blocks[b].live_use); + dump_set(w, "def", &live->blocks[b].live_def); + dump_set(w, "in", &live->blocks[b].live_in); + dump_set(w, "out", &live->blocks[b].live_out); + } +} diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c @@ -1,11 +1,10 @@ -#include "opt/opt.h" - #include <string.h> #include "core/arena.h" #include "core/core.h" #include "core/metrics.h" #include "core/pool.h" +#include "opt/opt.h" enum { OPT_CG_TYPE_SEG_SHIFT = 6, @@ -299,8 +298,8 @@ static const char* asm_constraint_body(const char* s) { } static int asm_resolve_fixed_constraint(Func* f, CGTarget* target, - const char* constraint, - Reg* reg_out, RegClass* cls_out) { + const char* constraint, Reg* reg_out, + RegClass* cls_out) { const char* body = asm_constraint_body(constraint); if (!target->resolve_reg_name) return 0; if (body[0] != '{') return 0; @@ -557,8 +556,7 @@ void opt_build_loop_tree(Func* f) { if (!has_loop) continue; for (u32 b = 0; b < f->nblocks; ++b) - if (body[b] && f->blocks[b].loop_depth < 31) - ++f->blocks[b].loop_depth; + if (body[b] && f->blocks[b].loop_depth < 31) ++f->blocks[b].loop_depth; } for (u32 b = 0; b < f->nblocks; ++b) @@ -619,7 +617,8 @@ void opt_live_info(Func* f) { u64* new_in = arena_zarray(f->arena, u64, words); for (u32 s = 0; s < bl->nsucc; ++s) { u32 t = bl->succ[s]; - if (t < f->nblocks) bit_or_changed(new_out, f->blocks[t].live_in, words); + if (t < f->nblocks) + bit_or_changed(new_out, f->blocks[t].live_in, words); } for (u32 w = 0; w < words; ++w) new_in[w] = bl->live_use[w] | (new_out[w] & ~bl->live_def[w]); @@ -643,8 +642,7 @@ void opt_live_info(Func* f) { for (u32 b = 0; b < f->nblocks; ++b) { Block* bl = &f->blocks[b]; - bl->live_after = - arena_array(f->arena, u64*, bl->ninsts ? bl->ninsts : 1u); + bl->live_after = arena_array(f->arena, u64*, bl->ninsts ? bl->ninsts : 1u); u64* live = arena_zarray(f->arena, u64, words); copy_bits(live, bl->live_out, words); @@ -692,8 +690,7 @@ void opt_live_info(Func* f) { if ((IROp)in->op == IR_ASM_BLOCK) apply_asm_register_constraints(f, in, use, def, after); - for (u32 w = 0; w < words; ++w) - live[w] = (live[w] & ~def[w]) | use[w]; + for (u32 w = 0; w < words; ++w) live[w] = (live[w] & ~def[w]) | use[w]; } } @@ -721,7 +718,8 @@ static int val_higher_priority(Func* f, Val a, Val b) { if (av->frequency != bv->frequency) return av->frequency > bv->frequency; if (av->live_across_call_freq != bv->live_across_call_freq) return av->live_across_call_freq > bv->live_across_call_freq; - if (av->live_length != bv->live_length) return av->live_length < bv->live_length; + if (av->live_length != bv->live_length) + return av->live_length < bv->live_length; return a < b; } @@ -954,8 +952,7 @@ static void rewrite_func(Func* f) { for (Val v = 1; v < f->nvals; ++v) { if (!bit_has(live_after[i], v) || bit_has(def, v)) continue; if (f->val_info[v].alloc_kind == OPT_ALLOC_HARD && - is_caller_saved(f, f->val_info[v].cls, - f->val_info[v].hard_reg)) + is_caller_saved(f, f->val_info[v].cls, f->val_info[v].hard_reg)) append_store_val(f, &out, v); } } @@ -965,8 +962,7 @@ static void rewrite_func(Func* f) { for (Val v = 1; v < f->nvals; ++v) { if (!bit_has(live_after[i], v) || bit_has(def, v)) continue; if (f->val_info[v].alloc_kind == OPT_ALLOC_HARD && - is_caller_saved(f, f->val_info[v].cls, - f->val_info[v].hard_reg)) + is_caller_saved(f, f->val_info[v].cls, f->val_info[v].hard_reg)) append_load_val(f, &out, v); } } @@ -992,12 +988,16 @@ static int all_defs_dead(Func* f, Inst* in, u64* live) { return 1; } -void opt_dead_def_elim(Func* f) { - u32 words = f->opt_live_words; +void opt_dead_def_elim_with_live(Func* f, const OptLiveInfo* live_info) { + u32 words = live_info ? live_info->words : f->opt_live_words; for (u32 b = 0; b < f->nblocks; ++b) { Block* bl = &f->blocks[b]; u64* live = arena_zarray(f->arena, u64, words); - for (u32 w = 0; w < words; ++w) live[w] = bl->live_out[w]; + const u64* live_out = + live_info ? live_info->blocks[b].live_out.words : bl->live_out; + if (live_out) { + for (u32 w = 0; w < words; ++w) live[w] = live_out[w]; + } Inst* new_insts = arena_array(f->arena, Inst, bl->ninsts); u32 w = 0; @@ -1029,15 +1029,25 @@ void opt_dead_def_elim(Func* f) { } } +void opt_dead_def_elim(Func* f) { + if (f->nblocks && !f->blocks[0].live_out) { + OptLiveInfo live; + opt_live_blocks(f, &live); + opt_dead_def_elim_with_live(f, &live); + return; + } + opt_dead_def_elim_with_live(f, NULL); +} + void opt_regalloc(Func* f, int allow_live_range_split) { (void)allow_live_range_split; if (!f->val_info) { metrics_scope_begin(f->c, "opt.live_info.regalloc"); opt_live_info(f); metrics_count(f->c, "opt.live_words", f->opt_live_words); - metrics_count(f->c, "opt.conflict_bytes", - (u64)f->nvals * (u64)f->opt_conflict_words * - (u64)sizeof(u64)); + metrics_count( + f->c, "opt.conflict_bytes", + (u64)f->nvals * (u64)f->opt_conflict_words * (u64)sizeof(u64)); metrics_scope_end(f->c, "opt.live_info.regalloc"); } Val* order = arena_array(f->arena, Val, f->nvals ? f->nvals : 1u); @@ -1063,9 +1073,10 @@ void opt_regalloc(Func* f, int allow_live_range_split) { Reg fixed = (Reg)vi->tied_hard_reg; if (!hard_available(f, cls, fixed)) { SrcLoc loc = {0, 0, 0}; - compiler_panic(f->c, loc, - "opt regalloc: fixed hard reg %u unavailable in class %u", - (unsigned)fixed, (unsigned)cls); + compiler_panic( + f->c, loc, + "opt regalloc: fixed hard reg %u unavailable in class %u", + (unsigned)fixed, (unsigned)cls); } if (fixed < 32 && (vi->forbidden_hard_regs & (1u << fixed))) { SrcLoc loc = {0, 0, 0}; @@ -1075,8 +1086,7 @@ void opt_regalloc(Func* f, int allow_live_range_split) { } if (hard_conflicts(f, assigned, nassigned, v, fixed)) { SrcLoc loc = {0, 0, 0}; - compiler_panic(f->c, loc, - "opt regalloc: conflicting fixed hard reg %u", + compiler_panic(f->c, loc, "opt regalloc: conflicting fixed hard reg %u", (unsigned)fixed); } vi->alloc_kind = OPT_ALLOC_HARD; @@ -1172,8 +1182,7 @@ static int same_phys_reg(const Operand* a, const Operand* b) { static int operand_uses_phys_reg(const Operand* op, const Operand* r) { if (!op || !r || r->kind != OPK_REG) return 0; - if (op->kind == OPK_REG) - return op->cls == r->cls && op->v.reg == r->v.reg; + if (op->kind == OPK_REG) return op->cls == r->cls && op->v.reg == r->v.reg; if (op->kind == OPK_INDIRECT) return r->cls == RC_INT && op->v.ind.base == r->v.reg; return 0; @@ -1412,11 +1421,10 @@ static int find_single_direct_use(Func* f, Block* bl, u32 def_i, total_uses += uses; if (total_uses > 1) return 0; for (u32 oi = 0; oi < in->nopnds; ++oi) { - int ok = conv_fold - ? (oi == 1 && - identical_convert_pair(&bl->insts[def_i], in)) - : (imm_fold ? imm_fold_slot(in, oi) - : copy_fold_slot(in, oi)); + int ok = + conv_fold + ? (oi == 1 && identical_convert_pair(&bl->insts[def_i], in)) + : (imm_fold ? imm_fold_slot(in, oi) : copy_fold_slot(in, oi)); if (!ok) continue; if (!same_phys_reg(&in->opnds[oi], def)) continue; found_i = i; @@ -1454,16 +1462,16 @@ static void opt_combine_fold_block(Func* f, Block* bl) { if ((IROp)in->op == IR_COPY && in->nopnds >= 2 && in->opnds[0].kind == OPK_REG && in->opnds[1].kind == OPK_REG && !same_phys_reg(&in->opnds[0], &in->opnds[1]) && - find_single_direct_use(f, bl, i, &in->opnds[0], &in->opnds[1], 1, 0, - 0, &use_i, &op_i)) { + find_single_direct_use(f, bl, i, &in->opnds[0], &in->opnds[1], 1, 0, 0, + &use_i, &op_i)) { bl->insts[use_i].opnds[op_i] = in->opnds[1]; continue; } if ((IROp)in->op == IR_LOAD_IMM && in->nopnds >= 1 && in->opnds[0].kind == OPK_REG && - find_single_direct_use(f, bl, i, &in->opnds[0], NULL, 0, 1, 0, - &use_i, &op_i)) { + find_single_direct_use(f, bl, i, &in->opnds[0], NULL, 0, 1, 0, &use_i, + &op_i)) { Operand imm = in->opnds[0]; imm.kind = OPK_IMM; imm.v.imm = in->extra.imm; @@ -1473,8 +1481,8 @@ static void opt_combine_fold_block(Func* f, Block* bl) { if ((IROp)in->op == IR_CONVERT && in->nopnds >= 2 && in->opnds[0].kind == OPK_REG && in->opnds[1].kind == OPK_REG && - find_single_direct_use(f, bl, i, &in->opnds[0], &in->opnds[1], 1, 0, - 1, &use_i, &op_i)) { + find_single_direct_use(f, bl, i, &in->opnds[0], &in->opnds[1], 1, 0, 1, + &use_i, &op_i)) { bl->insts[use_i].opnds[op_i] = in->opnds[1]; } } diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c @@ -1,10 +1,11 @@ +#include "opt/opt.h" + +#include <cfree.h> #include <stdarg.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <cfree.h> - #include "abi/abi.h" #include "arch/arch.h" #include "arch/regalloc.h" @@ -12,7 +13,6 @@ #include "core/heap.h" #include "core/pool.h" #include "opt/ir.h" -#include "opt/opt.h" static void* h_alloc(CfreeHeap* h, size_t n, size_t a) { (void)h; @@ -48,15 +48,15 @@ static CfreeDiagSink g_diag = {diag_emit, NULL, 0, 0}; static int g_fails; static int g_checks; -#define EXPECT(cond, ...) \ - do { \ - ++g_checks; \ - if (!(cond)) { \ - ++g_fails; \ - fprintf(stderr, "FAIL %s:%d: ", __FILE__, __LINE__); \ - fprintf(stderr, __VA_ARGS__); \ - fputc('\n', stderr); \ - } \ +#define EXPECT(cond, ...) \ + do { \ + ++g_checks; \ + if (!(cond)) { \ + ++g_fails; \ + fprintf(stderr, "FAIL %s:%d: ", __FILE__, __LINE__); \ + fprintf(stderr, __VA_ARGS__); \ + fputc('\n', stderr); \ + } \ } while (0) typedef struct TestCtx { @@ -179,8 +179,7 @@ static FrameSlot add_frame_slot(Func* f, CfreeCgTypeId ty, FrameSlotKind kind, return ir_frame_slot_new(f, &d); } -static Inst* emit_load_imm(Func* f, u32 b, Val dst, CfreeCgTypeId ty, - i64 imm) { +static Inst* emit_load_imm(Func* f, u32 b, Val dst, CfreeCgTypeId ty, i64 imm) { Inst* in = ir_emit(f, b, IR_LOAD_IMM); in->opnds = arena_array(f->arena, Operand, 1); in->opnds[0] = op_reg_(dst, ty); @@ -294,6 +293,15 @@ static int bit_has(const u64* bits, Val v) { return (bits[v / 64u] & (1ull << (v % 64u))) != 0; } +static int bytes_contains(const unsigned char* data, size_t len, + const char* needle) { + size_t nlen = strlen(needle); + if (!data || nlen > len) return 0; + for (size_t i = 0; i + nlen <= len; ++i) + if (memcmp(data + i, needle, nlen) == 0) return 1; + return 0; +} + static int val_conflicts(const Func* f, Val a, Val b) { const u64* bits = f->val_conflicts + ((size_t)a * f->opt_conflict_words); return bit_has(bits, b); @@ -350,15 +358,15 @@ static void mock_func_begin(CGTarget* t, const CGFuncDesc* d) { } static void mock_func_end(CGTarget* t) { (void)t; } -static void mock_get_allocable_regs(CGTarget* t, RegClass cls, - const Reg** out, u32* nregs) { +static void mock_get_allocable_regs(CGTarget* t, RegClass cls, const Reg** out, + u32* nregs) { MockCGTarget* m = (MockCGTarget*)t; *out = m->pool[cls]; *nregs = m->pool_n[cls]; } -static void mock_get_scratch_regs(CGTarget* t, RegClass cls, - const Reg** out, u32* nregs) { +static void mock_get_scratch_regs(CGTarget* t, RegClass cls, const Reg** out, + u32* nregs) { MockCGTarget* m = (MockCGTarget*)t; *out = m->scratch[cls]; *nregs = m->scratch_n[cls]; @@ -370,8 +378,8 @@ static int mock_is_caller_saved(CGTarget* t, RegClass cls, Reg reg) { return (m->caller_saved_mask[cls] & (1u << reg)) != 0; } -static void mock_reserve_hard_regs(CGTarget* t, RegClass cls, - const Reg* regs, u32 n) { +static void mock_reserve_hard_regs(CGTarget* t, RegClass cls, const Reg* regs, + u32 n) { MockCGTarget* m = (MockCGTarget*)t; if (cls < OPT_REG_CLASSES) m->reserve_calls[cls] += (int)n; (void)regs; @@ -382,8 +390,7 @@ static int mock_resolve_reg_name(CGTarget* t, Sym name, Reg* out, size_t len = 0; const char* s = pool_str(t->c->global, name, &len); if (!s || len < 2) return 1; - if ((s[0] != 'r' && s[0] != 'x' && s[0] != 'v') || - s[1] < '0' || s[1] > '9') + if ((s[0] != 'r' && s[0] != 'x' && s[0] != 'v') || s[1] < '0' || s[1] > '9') return 1; u32 n = 0; for (size_t i = 1; i < len; ++i) { @@ -474,9 +481,8 @@ static void mock_init(MockCGTarget* m, Compiler* c) { m->base.resolve_reg_name = mock_resolve_reg_name; } -static void mock_set_pool(MockCGTarget* m, RegClass cls, - const Reg* pool, u32 npool, - const Reg* scratch_, u32 nscratch, +static void mock_set_pool(MockCGTarget* m, RegClass cls, const Reg* pool, + u32 npool, const Reg* scratch_, u32 nscratch, u32 caller_mask) { m->pool[cls] = pool; m->pool_n[cls] = npool; @@ -524,14 +530,88 @@ static void opt_liveness_branch(void) { opt_build_loop_tree(f); opt_live_info(f); - EXPECT(bit_has(f->blocks[b0].live_out, v), "v%u live_out of branch block", - v); + EXPECT(bit_has(f->blocks[b0].live_out, v), "v%u live_out of branch block", v); EXPECT(bit_has(f->blocks[b1].live_in, v), "v%u live_in true block", v); EXPECT(bit_has(f->blocks[b2].live_in, v), "v%u live_in false block", v); EXPECT(!bit_has(f->blocks[b1].live_out, v), "v%u dies in true block", v); tc_fini(&tc); } +static void opt_block_liveness_phase1(void) { + TestCtx tc; + tc_init(&tc); + Func* f = new_func(&tc); + u32 entry = f->entry; + u32 header = ir_block_new(f); + u32 body = ir_block_new(f); + u32 exit = ir_block_new(f); + ir_note_emit(f, header); + ir_note_emit(f, body); + ir_note_emit(f, exit); + Val across_branch = add_val(f, tc.i32); + Val across_call = add_val(f, tc.i32); + Val out = add_val(f, tc.i32); + + emit_load_imm(f, entry, across_branch, tc.i32, 7); + emit_load_imm(f, entry, across_call, tc.i32, 11); + emit_br_to(f, entry, header); + emit_test_branch(f, header, body, exit, tc.i32); + emit_call_void(f, body); + emit_binop(f, body, out, across_branch, across_call, tc.i32); + emit_br_to(f, body, header); + emit_ret_val(f, exit, across_call, tc.i32); + + opt_build_cfg(f); + opt_build_loop_tree(f); + + OptLiveInfo live; + opt_live_blocks(f, &live); + EXPECT(opt_bitset_has(&live.blocks[entry].live_out, across_branch), + "branch value should be live-out from entry"); + EXPECT(opt_bitset_has(&live.blocks[header].live_in, across_branch), + "branch value should be live-in to loop header"); + EXPECT(opt_bitset_has(&live.blocks[body].live_in, across_branch), + "branch value should be live-in to loop body"); + EXPECT(opt_bitset_has(&live.blocks[body].live_in, across_call), + "call-crossing value should be live-in to call block"); + EXPECT(opt_bitset_has(&live.blocks[body].live_out, across_call), + "call-crossing value should remain live-out on loop backedge"); + EXPECT(!opt_bitset_has(&live.blocks[exit].live_out, across_call), + "returned value should die at function exit"); + EXPECT(f->val_conflicts == NULL, + "block liveness should not allocate conflict matrix"); + EXPECT(live.block_bytes != 0, "block-liveness byte metric should be set"); + EXPECT(live.set_bit_scans != 0, + "block-liveness set-bit scan metric should be set"); + + CfreeWriter* w = cfree_writer_mem(&g_heap); + opt_live_dump_blocks(f, &live, w); + size_t len = 0; + const unsigned char* bytes = cfree_writer_mem_bytes(w, &len); + EXPECT(bytes_contains(bytes, len, "block 0\n"), + "block liveness dump should include block header"); + EXPECT(bytes_contains(bytes, len, " in:"), + "block liveness dump should include live-in set"); + cfree_writer_close(w); + + Func* g = new_func(&tc); + Val a = add_val(g, tc.i32); + Val dead = add_val(g, tc.i32); + emit_load_imm(g, g->entry, a, tc.i32, 1); + emit_copy(g, g->entry, dead, a, tc.i32); + emit_ret_val(g, g->entry, a, tc.i32); + opt_build_cfg(g); + opt_build_loop_tree(g); + OptLiveInfo g_live; + opt_live_blocks(g, &g_live); + opt_dead_def_elim_with_live(g, &g_live); + EXPECT(count_op(g, IR_COPY) == 0, + "DDE should consume pass-local block liveness"); + EXPECT(g->val_conflicts == NULL, + "DDE with block liveness should not allocate conflict matrix"); + tc_fini(&tc); +} + static void opt_live_after_linear(void) { TestCtx tc; tc_init(&tc); @@ -564,14 +644,10 @@ static void opt_live_after_linear(void) { "a should be live after b's def"); EXPECT(bit_has(f->blocks[b].live_after[ib], bv), "b should be live after its def"); - EXPECT(!bit_has(f->blocks[b].live_after[ic], a), - "a should die after add"); - EXPECT(!bit_has(f->blocks[b].live_after[ic], bv), - "b should die after add"); - EXPECT(bit_has(f->blocks[b].live_after[ic], c), - "c should be live after add"); - EXPECT(!bit_has(f->blocks[b].live_after[ir], c), - "return should consume c"); + EXPECT(!bit_has(f->blocks[b].live_after[ic], a), "a should die after add"); + EXPECT(!bit_has(f->blocks[b].live_after[ic], bv), "b should die after add"); + EXPECT(bit_has(f->blocks[b].live_after[ic], c), "c should be live after add"); + EXPECT(!bit_has(f->blocks[b].live_after[ir], c), "return should consume c"); tc_fini(&tc); } @@ -637,8 +713,7 @@ static void opt_interference_def_live_out(void) { EXPECT(val_conflicts(f, a, bv), "a and b should conflict while both are live"); - EXPECT(val_conflicts(f, bv, a), - "conflicts should be symmetric"); + EXPECT(val_conflicts(f, bv, a), "conflicts should be symmetric"); EXPECT(val_conflicts(f, a, c), "a should conflict with c for same-instruction def/use lowering"); EXPECT(val_conflicts(f, bv, c), @@ -921,8 +996,7 @@ static void opt_loop_tree_nested_depths(void) { EXPECT(f->blocks[exit].loop_depth == 0, "exit should not be in nested loops, got depth %u", (unsigned)f->blocks[exit].loop_depth); - EXPECT(f->blocks[inner_header].frequency > - f->blocks[outer_header].frequency, + EXPECT(f->blocks[inner_header].frequency > f->blocks[outer_header].frequency, "inner loop should be hotter than outer-only blocks"); tc_fini(&tc); } @@ -1007,8 +1081,8 @@ static void opt_regalloc_priority(void) { EXPECT(f->val_info[pinned].alloc_kind == OPT_ALLOC_HARD, "tied value should get a hard register"); EXPECT(f->val_info[pinned].hard_reg == expected_hard, - "tied value should get hard r%u, got r%u", - (unsigned)expected_hard, (unsigned)f->val_info[pinned].hard_reg); + "tied value should get hard r%u, got r%u", (unsigned)expected_hard, + (unsigned)f->val_info[pinned].hard_reg); EXPECT(f->val_info[hot].alloc_kind == OPT_ALLOC_SPILL, "overlapping untied value should spill under one-reg pressure"); tc_fini(&tc); @@ -1110,7 +1184,8 @@ static void opt_call_clobber_caller_saved(void) { } } EXPECT(saw_call_save_restore, - "live caller-saved hard reg across call should be stored before and loaded after"); + "live caller-saved hard reg across call should be stored before and " + "loaded after"); tc_fini(&tc); } @@ -1195,8 +1270,8 @@ static void opt_inline_asm_tied_fixed_regs(void) { Reg expected = f->opt_hard_regs[RC_INT][0]; EXPECT(f->val_info[tied].alloc_kind == OPT_ALLOC_HARD, "tied asm val should allocate hard"); - EXPECT(f->val_info[tied].hard_reg == expected, - "tied asm val should get r%u", (unsigned)expected); + EXPECT(f->val_info[tied].hard_reg == expected, "tied asm val should get r%u", + (unsigned)expected); EXPECT(aux->out_ops[0].v.reg == expected && aux->in_ops[0].v.reg == expected, "asm tied operands should rewrite to the fixed hard reg"); tc_fini(&tc); @@ -1431,8 +1506,9 @@ static void opt_combine_spill_peeps(void) { st->extra.mem = mem_local_(fs, tc.i32, 4, 0); opt_combine(g); - EXPECT(g->blocks[g->entry].ninsts == 1, - "spill reload followed by same-reg writeback should combine to one inst"); + EXPECT( + g->blocks[g->entry].ninsts == 1, + "spill reload followed by same-reg writeback should combine to one inst"); EXPECT((IROp)g->blocks[g->entry].insts[0].op == IR_LOAD, "remaining inst should be the spill reload"); tc_fini(&tc); @@ -1920,6 +1996,7 @@ int main(void) { opt_loop_tree_nested_depths(); opt_loop_tree_does_not_mutate_cfg(); opt_liveness_branch(); + opt_block_liveness_phase1(); opt_live_after_linear(); opt_interference_branch_disjoint(); opt_interference_def_live_out();