kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit c72fb5b7d435448fad6f4baf766a7d5670809fc8
parent df0362afce63cd628b21bd4bad1a7ea2f09fe35e
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 21 May 2026 17:51:48 -0700

Implement LICM optimization pass

Diffstat:
Mdoc/OPT.md | 2+-
Msrc/opt/opt.c | 3+++
Msrc/opt/pass_o2.c | 232+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtest/opt/opt_test.c | 88+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 324 insertions(+), 1 deletion(-)

diff --git a/doc/OPT.md b/doc/OPT.md @@ -801,7 +801,7 @@ Implement in order: `opt_simplify` as described in Section 4.3 9. [x] extended memory tracker for joins/path state 10. [x] `opt_dse` -11. [ ] `opt_licm` +11. [x] `opt_licm` 12. [ ] `opt_pressure_relief` 13. [x] `opt_make_conventional_ssa` 14. [ ] `opt_ssa_combine` diff --git a/src/opt/opt.c b/src/opt/opt.c @@ -1443,6 +1443,9 @@ static void opt_run_o2_pipeline(OptImpl* o) { opt_verify(o->f, "o2-dse-ssa"); opt_ssa_dce(o->f); opt_verify(o->f, "o2-dse-dce"); + opt_build_loop_tree(o->f); + opt_licm(o->f); + opt_verify(o->f, "o2-licm-ssa"); opt_make_conventional_ssa(o->f); opt_verify(o->f, "o2-conventional-ssa"); opt_undo_ssa(o->f); diff --git a/src/opt/pass_o2.c b/src/opt/pass_o2.c @@ -155,6 +155,12 @@ typedef struct DseCtx { int changed; } DseCtx; +typedef struct LicmLoop { + u32 header; + u32 preheader; + u8* body; +} LicmLoop; + static u32 opt_inst_count(Func* f) { u32 n = 0; for (u32 b = 0; b < f->nblocks; ++b) n += f->blocks[b].ninsts; @@ -2179,6 +2185,232 @@ void opt_dse(Func* f) { opt_rebuild_def_use(f); } +static void licm_mark_body(Func* f, const u8* reachable, u32 header, u32 latch, + u8* body, u32* stack) { + u32 sp = 0; + if (!body[header]) body[header] = 1; + if (!body[latch]) { + body[latch] = 1; + stack[sp++] = latch; + } + + while (sp) { + u32 b = stack[--sp]; + if (b == header) continue; + Block* bl = &f->blocks[b]; + for (u32 p = 0; p < bl->npreds; ++p) { + u32 pred = bl->preds[p]; + if (pred >= f->nblocks || !reachable[pred] || body[pred]) continue; + body[pred] = 1; + stack[sp++] = pred; + } + } +} + +static u32 licm_collect_loops(Func* f, OptAnalysis* a, LicmLoop** out) { + u32 nloops = 0; + LicmLoop* loops = arena_zarray(f->arena, LicmLoop, f->nblocks ? f->nblocks + : 1u); + u8* scratch = arena_zarray(f->arena, u8, f->nblocks ? f->nblocks : 1u); + u32* stack = arena_array(f->arena, u32, f->nblocks ? f->nblocks : 1u); + + for (u32 header = 0; header < f->nblocks; ++header) { + if (!a->reachable || !a->reachable[header]) continue; + memset(scratch, 0, f->nblocks * sizeof scratch[0]); + int has_loop = 0; + + for (u32 latch = 0; latch < f->nblocks; ++latch) { + if (!a->reachable[latch]) continue; + Block* lb = &f->blocks[latch]; + for (u32 s = 0; s < lb->nsucc; ++s) { + if (lb->succ[s] != header) continue; + if (!opt_analysis_dominates(a, header, latch)) continue; + has_loop = 1; + licm_mark_body(f, a->reachable, header, latch, scratch, stack); + } + } + if (!has_loop) continue; + + u32 preheader = OPT_BLOCK_NONE; + Block* hb = &f->blocks[header]; + for (u32 p = 0; p < hb->npreds; ++p) { + u32 pred = hb->preds[p]; + if (pred >= f->nblocks || scratch[pred]) continue; + if (preheader != OPT_BLOCK_NONE) { + preheader = OPT_BLOCK_NONE; + break; + } + preheader = pred; + } + if (preheader == OPT_BLOCK_NONE) continue; + if (f->blocks[preheader].nsucc != 1 || + f->blocks[preheader].succ[0] != header) + continue; + + u8* body = arena_array(f->arena, u8, f->nblocks ? f->nblocks : 1u); + memcpy(body, scratch, f->nblocks * sizeof body[0]); + loops[nloops].header = header; + loops[nloops].preheader = preheader; + loops[nloops].body = body; + ++nloops; + } + + *out = loops; + return nloops; +} + +static int licm_safe_binop(BinOp op) { + switch (op) { + case BO_SDIV: + case BO_UDIV: + case BO_SREM: + case BO_UREM: + case BO_FDIV: + return 0; + default: + return 1; + } +} + +static int licm_candidate(Func* f, const Inst* in) { + if (!in || in->def == VAL_NONE || in->ndefs || in->flags) return 0; + if (in->def >= f->nvals || opt_inst_has_side_effect(f, in)) return 0; + switch ((IROp)in->op) { + case IR_CONST_I: + case IR_CONST_BYTES: + case IR_LOAD_IMM: + case IR_LOAD_CONST: + case IR_LOAD_LABEL_ADDR: + case IR_COPY: + case IR_ADDR_OF: + case IR_UNOP: + case IR_CMP: + case IR_CONVERT: + return 1; + case IR_BINOP: + return licm_safe_binop((BinOp)in->extra.imm); + default: + return 0; + } +} + +static int licm_operand_invariant(Func* f, const u8* body, const Operand* op) { + if (!op) return 1; + if (op->kind == OPK_REG) { + Val v = (Val)op->v.reg; + if (v == VAL_NONE || v >= f->nvals) return 0; + u32 def_block = f->val_def_block[v]; + return def_block < f->nblocks && !body[def_block]; + } + if (op->kind == OPK_INDIRECT) { + Val v = (Val)op->v.ind.base; + if (v == VAL_NONE || v >= f->nvals) return 0; + u32 def_block = f->val_def_block[v]; + return def_block < f->nblocks && !body[def_block]; + } + return 1; +} + +static int licm_inst_invariant(Func* f, const LicmLoop* loop, const Inst* in) { + if (!licm_candidate(f, in)) return 0; + for (u32 i = 0; i < in->nopnds; ++i) { + const Operand* op = &in->opnds[i]; + if (op->kind == OPK_REG && opt_val_in_inst_defs(in, (Val)op->v.reg)) + continue; + if (!licm_operand_invariant(f, loop->body, op)) return 0; + } + return 1; +} + +static void licm_insert_inst(Func* f, u32 block, u32 at, Inst in) { + Block* bl = &f->blocks[block]; + if (at > bl->ninsts) at = bl->ninsts; + if (bl->ninsts == bl->cap) { + u32 ncap = bl->cap ? bl->cap * 2u : 8u; + while (ncap < bl->ninsts + 1u) ncap *= 2u; + Inst* insts = arena_zarray(f->arena, Inst, ncap); + if (at) memcpy(insts, bl->insts, sizeof(insts[0]) * at); + if (bl->ninsts > at) + memcpy(&insts[at + 1u], &bl->insts[at], + sizeof(insts[0]) * (bl->ninsts - at)); + bl->insts = insts; + bl->cap = ncap; + } else { + for (u32 i = bl->ninsts; i > at; --i) bl->insts[i] = bl->insts[i - 1u]; + } + bl->insts[at] = in; + ++bl->ninsts; +} + +static void licm_remove_inst(Func* f, u32 block, u32 at) { + Block* bl = &f->blocks[block]; + if (at >= bl->ninsts) return; + for (u32 i = at + 1u; i < bl->ninsts; ++i) bl->insts[i - 1u] = bl->insts[i]; + --bl->ninsts; +} + +static u32 licm_preheader_insert_pos(Func* f, u32 preheader) { + Block* bl = &f->blocks[preheader]; + if (bl->ninsts && o2_is_terminator(&bl->insts[bl->ninsts - 1u])) + return bl->ninsts - 1u; + return bl->ninsts; +} + +static void licm_hoist_inst(Func* f, const LicmLoop* loop, u32 block, u32 inst) { + Inst moved = f->blocks[block].insts[inst]; + u32 at = licm_preheader_insert_pos(f, loop->preheader); + licm_insert_inst(f, loop->preheader, at, moved); + licm_remove_inst(f, block, inst); + if (moved.def != VAL_NONE && moved.def < f->nvals) { + f->val_def_block[moved.def] = loop->preheader; + f->val_def_inst[moved.def] = at; + } +} + +void opt_licm(Func* f) { + if (!f || f->opt_rewritten) return; + if (!f->opt_reg_ssa && f->npregs > 1) { + opt_rebuild_def_use(f); + return; + } + + OptAnalysis a; + memset(&a, 0, sizeof a); + opt_analysis_build_dominators(f, &a); + LicmLoop* loops = NULL; + u32 nloops = licm_collect_loops(f, &a, &loops); + int changed = 0; + + for (u32 l = 0; l < nloops; ++l) { + LicmLoop* loop = &loops[l]; + int again = 1; + while (again) { + again = 0; + for (u32 b = 0; b < f->nblocks; ++b) { + if (!loop->body[b] || b == loop->preheader) continue; + Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts;) { + Inst* in = &bl->insts[i]; + if (!licm_inst_invariant(f, loop, in)) { + ++i; + continue; + } + licm_hoist_inst(f, loop, b, i); + changed = 1; + again = 1; + bl = &f->blocks[b]; + } + } + } + } + + if (changed) { + dse_refresh_def_locations(f); + opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE); + } + opt_rebuild_def_use(f); +} + void opt_cleanup(Func* f) { if (!f) return; opt_ssa_dce(f); diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c @@ -3593,6 +3593,92 @@ static void opt_loop_tree_does_not_mutate_cfg(void) { tc_fini(&tc); } +static void opt_licm_hoists_safe_invariant_to_preheader(void) { + TestCtx tc; + tc_init(&tc); + Func* f = new_func(&tc); + FrameSlot fs = add_frame_slot(f, tc.i32, FS_LOCAL, 4, FSF_ADDR_TAKEN); + u32 entry = f->entry; + u32 header = ir_block_new(f); + u32 body = ir_block_new(f); + u32 exit = ir_block_new(f); + ir_note_emit(f, header); + ir_note_emit(f, body); + ir_note_emit(f, exit); + + Val cond = add_val(f, tc.i32); + Val a = add_val(f, tc.i32); + Val b = add_val(f, tc.i32); + Val inv = add_val(f, tc.i32); + Val loaded = add_val(f, tc.i32); + Val use = add_val(f, tc.i32); + emit_scalar_input(f, entry, cond, tc.i32); + emit_scalar_input(f, entry, a, tc.i32); + emit_scalar_input(f, entry, b, tc.i32); + emit_br_to(f, entry, header); + emit_cond_branch(f, header, cond, body, exit, tc.i32); + Inst* mul = emit_binop(f, body, inv, a, b, tc.i32); + mul->extra.imm = BO_IMUL; + emit_load_local(f, body, loaded, fs, tc.i32, 0); + emit_binop(f, body, use, inv, loaded, tc.i32); + emit_br_to(f, body, header); + emit_ret_val(f, exit, a, tc.i32); + + opt_build_cfg(f); + opt_build_loop_tree(f); + opt_licm(f); + opt_verify(f, "test-licm-hoist"); + + EXPECT(f->val_def_block[inv] == entry, + "LICM should hoist invariant multiply to preheader"); + EXPECT(f->val_def_inst[inv] + 1u < f->blocks[entry].ninsts, + "hoisted multiply should be inserted before preheader branch"); + EXPECT(f->val_def_block[use] == body, + "LICM should leave loop-variant dependent use in loop body"); + tc_fini(&tc); +} + +static void opt_licm_preserves_trapping_and_memory_ops(void) { + TestCtx tc; + tc_init(&tc); + Func* f = new_func(&tc); + FrameSlot fs = add_frame_slot(f, tc.i32, FS_LOCAL, 4, FSF_ADDR_TAKEN); + u32 entry = f->entry; + u32 header = ir_block_new(f); + u32 body = ir_block_new(f); + u32 exit = ir_block_new(f); + ir_note_emit(f, header); + ir_note_emit(f, body); + ir_note_emit(f, exit); + + Val cond = add_val(f, tc.i32); + Val num = add_val(f, tc.i32); + Val den = add_val(f, tc.i32); + Val loaded = add_val(f, tc.i32); + Val div = add_val(f, tc.i32); + emit_scalar_input(f, entry, cond, tc.i32); + emit_scalar_input(f, entry, num, tc.i32); + emit_scalar_input(f, entry, den, tc.i32); + emit_br_to(f, entry, header); + emit_cond_branch(f, header, cond, body, exit, tc.i32); + emit_load_local(f, body, loaded, fs, tc.i32, 0); + Inst* div_inst = emit_binop(f, body, div, num, den, tc.i32); + div_inst->extra.imm = BO_SDIV; + emit_br_to(f, body, header); + emit_ret_val(f, exit, num, tc.i32); + + opt_build_cfg(f); + opt_build_loop_tree(f); + opt_licm(f); + opt_verify(f, "test-licm-preserve-unsafe"); + + EXPECT(f->val_def_block[loaded] == body, + "LICM must not hoist memory loads"); + EXPECT(f->val_def_block[div] == body, + "LICM must not hoist potentially trapping division"); + tc_fini(&tc); +} + static void opt_regalloc_priority(void) { TestCtx tc; tc_init(&tc); @@ -5708,6 +5794,8 @@ int main(void) { opt_loop_tree_excludes_side_exit(); opt_loop_tree_nested_depths(); opt_loop_tree_does_not_mutate_cfg(); + opt_licm_hoists_safe_invariant_to_preheader(); + opt_licm_preserves_trapping_and_memory_ops(); opt_liveness_branch(); opt_block_liveness_phase1(); opt_live_ranges_phase2();