commit c72fb5b7d435448fad6f4baf766a7d5670809fc8
parent df0362afce63cd628b21bd4bad1a7ea2f09fe35e
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 21 May 2026 17:51:48 -0700
Implement LICM optimization pass
Diffstat:
4 files changed, 324 insertions(+), 1 deletion(-)
diff --git a/doc/OPT.md b/doc/OPT.md
@@ -801,7 +801,7 @@ Implement in order:
`opt_simplify` as described in Section 4.3
9. [x] extended memory tracker for joins/path state
10. [x] `opt_dse`
-11. [ ] `opt_licm`
+11. [x] `opt_licm`
12. [ ] `opt_pressure_relief`
13. [x] `opt_make_conventional_ssa`
14. [ ] `opt_ssa_combine`
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -1443,6 +1443,9 @@ static void opt_run_o2_pipeline(OptImpl* o) {
opt_verify(o->f, "o2-dse-ssa");
opt_ssa_dce(o->f);
opt_verify(o->f, "o2-dse-dce");
+ opt_build_loop_tree(o->f);
+ opt_licm(o->f);
+ opt_verify(o->f, "o2-licm-ssa");
opt_make_conventional_ssa(o->f);
opt_verify(o->f, "o2-conventional-ssa");
opt_undo_ssa(o->f);
diff --git a/src/opt/pass_o2.c b/src/opt/pass_o2.c
@@ -155,6 +155,12 @@ typedef struct DseCtx {
int changed;
} DseCtx;
+typedef struct LicmLoop {
+ u32 header;
+ u32 preheader;
+ u8* body;
+} LicmLoop;
+
static u32 opt_inst_count(Func* f) {
u32 n = 0;
for (u32 b = 0; b < f->nblocks; ++b) n += f->blocks[b].ninsts;
@@ -2179,6 +2185,232 @@ void opt_dse(Func* f) {
opt_rebuild_def_use(f);
}
+static void licm_mark_body(Func* f, const u8* reachable, u32 header, u32 latch,
+ u8* body, u32* stack) {
+ u32 sp = 0;
+ if (!body[header]) body[header] = 1;
+ if (!body[latch]) {
+ body[latch] = 1;
+ stack[sp++] = latch;
+ }
+
+ while (sp) {
+ u32 b = stack[--sp];
+ if (b == header) continue;
+ Block* bl = &f->blocks[b];
+ for (u32 p = 0; p < bl->npreds; ++p) {
+ u32 pred = bl->preds[p];
+ if (pred >= f->nblocks || !reachable[pred] || body[pred]) continue;
+ body[pred] = 1;
+ stack[sp++] = pred;
+ }
+ }
+}
+
+static u32 licm_collect_loops(Func* f, OptAnalysis* a, LicmLoop** out) {
+ u32 nloops = 0;
+ LicmLoop* loops = arena_zarray(f->arena, LicmLoop, f->nblocks ? f->nblocks
+ : 1u);
+ u8* scratch = arena_zarray(f->arena, u8, f->nblocks ? f->nblocks : 1u);
+ u32* stack = arena_array(f->arena, u32, f->nblocks ? f->nblocks : 1u);
+
+ for (u32 header = 0; header < f->nblocks; ++header) {
+ if (!a->reachable || !a->reachable[header]) continue;
+ memset(scratch, 0, f->nblocks * sizeof scratch[0]);
+ int has_loop = 0;
+
+ for (u32 latch = 0; latch < f->nblocks; ++latch) {
+ if (!a->reachable[latch]) continue;
+ Block* lb = &f->blocks[latch];
+ for (u32 s = 0; s < lb->nsucc; ++s) {
+ if (lb->succ[s] != header) continue;
+ if (!opt_analysis_dominates(a, header, latch)) continue;
+ has_loop = 1;
+ licm_mark_body(f, a->reachable, header, latch, scratch, stack);
+ }
+ }
+ if (!has_loop) continue;
+
+ u32 preheader = OPT_BLOCK_NONE;
+ Block* hb = &f->blocks[header];
+ for (u32 p = 0; p < hb->npreds; ++p) {
+ u32 pred = hb->preds[p];
+ if (pred >= f->nblocks || scratch[pred]) continue;
+ if (preheader != OPT_BLOCK_NONE) {
+ preheader = OPT_BLOCK_NONE;
+ break;
+ }
+ preheader = pred;
+ }
+ if (preheader == OPT_BLOCK_NONE) continue;
+ if (f->blocks[preheader].nsucc != 1 ||
+ f->blocks[preheader].succ[0] != header)
+ continue;
+
+ u8* body = arena_array(f->arena, u8, f->nblocks ? f->nblocks : 1u);
+ memcpy(body, scratch, f->nblocks * sizeof body[0]);
+ loops[nloops].header = header;
+ loops[nloops].preheader = preheader;
+ loops[nloops].body = body;
+ ++nloops;
+ }
+
+ *out = loops;
+ return nloops;
+}
+
+static int licm_safe_binop(BinOp op) {
+ switch (op) {
+ case BO_SDIV:
+ case BO_UDIV:
+ case BO_SREM:
+ case BO_UREM:
+ case BO_FDIV:
+ return 0;
+ default:
+ return 1;
+ }
+}
+
+static int licm_candidate(Func* f, const Inst* in) {
+ if (!in || in->def == VAL_NONE || in->ndefs || in->flags) return 0;
+ if (in->def >= f->nvals || opt_inst_has_side_effect(f, in)) return 0;
+ switch ((IROp)in->op) {
+ case IR_CONST_I:
+ case IR_CONST_BYTES:
+ case IR_LOAD_IMM:
+ case IR_LOAD_CONST:
+ case IR_LOAD_LABEL_ADDR:
+ case IR_COPY:
+ case IR_ADDR_OF:
+ case IR_UNOP:
+ case IR_CMP:
+ case IR_CONVERT:
+ return 1;
+ case IR_BINOP:
+ return licm_safe_binop((BinOp)in->extra.imm);
+ default:
+ return 0;
+ }
+}
+
+static int licm_operand_invariant(Func* f, const u8* body, const Operand* op) {
+ if (!op) return 1;
+ if (op->kind == OPK_REG) {
+ Val v = (Val)op->v.reg;
+ if (v == VAL_NONE || v >= f->nvals) return 0;
+ u32 def_block = f->val_def_block[v];
+ return def_block < f->nblocks && !body[def_block];
+ }
+ if (op->kind == OPK_INDIRECT) {
+ Val v = (Val)op->v.ind.base;
+ if (v == VAL_NONE || v >= f->nvals) return 0;
+ u32 def_block = f->val_def_block[v];
+ return def_block < f->nblocks && !body[def_block];
+ }
+ return 1;
+}
+
+static int licm_inst_invariant(Func* f, const LicmLoop* loop, const Inst* in) {
+ if (!licm_candidate(f, in)) return 0;
+ for (u32 i = 0; i < in->nopnds; ++i) {
+ const Operand* op = &in->opnds[i];
+ if (op->kind == OPK_REG && opt_val_in_inst_defs(in, (Val)op->v.reg))
+ continue;
+ if (!licm_operand_invariant(f, loop->body, op)) return 0;
+ }
+ return 1;
+}
+
+static void licm_insert_inst(Func* f, u32 block, u32 at, Inst in) {
+ Block* bl = &f->blocks[block];
+ if (at > bl->ninsts) at = bl->ninsts;
+ if (bl->ninsts == bl->cap) {
+ u32 ncap = bl->cap ? bl->cap * 2u : 8u;
+ while (ncap < bl->ninsts + 1u) ncap *= 2u;
+ Inst* insts = arena_zarray(f->arena, Inst, ncap);
+ if (at) memcpy(insts, bl->insts, sizeof(insts[0]) * at);
+ if (bl->ninsts > at)
+ memcpy(&insts[at + 1u], &bl->insts[at],
+ sizeof(insts[0]) * (bl->ninsts - at));
+ bl->insts = insts;
+ bl->cap = ncap;
+ } else {
+ for (u32 i = bl->ninsts; i > at; --i) bl->insts[i] = bl->insts[i - 1u];
+ }
+ bl->insts[at] = in;
+ ++bl->ninsts;
+}
+
+static void licm_remove_inst(Func* f, u32 block, u32 at) {
+ Block* bl = &f->blocks[block];
+ if (at >= bl->ninsts) return;
+ for (u32 i = at + 1u; i < bl->ninsts; ++i) bl->insts[i - 1u] = bl->insts[i];
+ --bl->ninsts;
+}
+
+static u32 licm_preheader_insert_pos(Func* f, u32 preheader) {
+ Block* bl = &f->blocks[preheader];
+ if (bl->ninsts && o2_is_terminator(&bl->insts[bl->ninsts - 1u]))
+ return bl->ninsts - 1u;
+ return bl->ninsts;
+}
+
+static void licm_hoist_inst(Func* f, const LicmLoop* loop, u32 block, u32 inst) {
+ Inst moved = f->blocks[block].insts[inst];
+ u32 at = licm_preheader_insert_pos(f, loop->preheader);
+ licm_insert_inst(f, loop->preheader, at, moved);
+ licm_remove_inst(f, block, inst);
+ if (moved.def != VAL_NONE && moved.def < f->nvals) {
+ f->val_def_block[moved.def] = loop->preheader;
+ f->val_def_inst[moved.def] = at;
+ }
+}
+
+void opt_licm(Func* f) {
+ if (!f || f->opt_rewritten) return;
+ if (!f->opt_reg_ssa && f->npregs > 1) {
+ opt_rebuild_def_use(f);
+ return;
+ }
+
+ OptAnalysis a;
+ memset(&a, 0, sizeof a);
+ opt_analysis_build_dominators(f, &a);
+ LicmLoop* loops = NULL;
+ u32 nloops = licm_collect_loops(f, &a, &loops);
+ int changed = 0;
+
+ for (u32 l = 0; l < nloops; ++l) {
+ LicmLoop* loop = &loops[l];
+ int again = 1;
+ while (again) {
+ again = 0;
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ if (!loop->body[b] || b == loop->preheader) continue;
+ Block* bl = &f->blocks[b];
+ for (u32 i = 0; i < bl->ninsts;) {
+ Inst* in = &bl->insts[i];
+ if (!licm_inst_invariant(f, loop, in)) {
+ ++i;
+ continue;
+ }
+ licm_hoist_inst(f, loop, b, i);
+ changed = 1;
+ again = 1;
+ bl = &f->blocks[b];
+ }
+ }
+ }
+ }
+
+ if (changed) {
+ dse_refresh_def_locations(f);
+ opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE);
+ }
+ opt_rebuild_def_use(f);
+}
+
void opt_cleanup(Func* f) {
if (!f) return;
opt_ssa_dce(f);
diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c
@@ -3593,6 +3593,92 @@ static void opt_loop_tree_does_not_mutate_cfg(void) {
tc_fini(&tc);
}
+static void opt_licm_hoists_safe_invariant_to_preheader(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ Func* f = new_func(&tc);
+ FrameSlot fs = add_frame_slot(f, tc.i32, FS_LOCAL, 4, FSF_ADDR_TAKEN);
+ u32 entry = f->entry;
+ u32 header = ir_block_new(f);
+ u32 body = ir_block_new(f);
+ u32 exit = ir_block_new(f);
+ ir_note_emit(f, header);
+ ir_note_emit(f, body);
+ ir_note_emit(f, exit);
+
+ Val cond = add_val(f, tc.i32);
+ Val a = add_val(f, tc.i32);
+ Val b = add_val(f, tc.i32);
+ Val inv = add_val(f, tc.i32);
+ Val loaded = add_val(f, tc.i32);
+ Val use = add_val(f, tc.i32);
+ emit_scalar_input(f, entry, cond, tc.i32);
+ emit_scalar_input(f, entry, a, tc.i32);
+ emit_scalar_input(f, entry, b, tc.i32);
+ emit_br_to(f, entry, header);
+ emit_cond_branch(f, header, cond, body, exit, tc.i32);
+ Inst* mul = emit_binop(f, body, inv, a, b, tc.i32);
+ mul->extra.imm = BO_IMUL;
+ emit_load_local(f, body, loaded, fs, tc.i32, 0);
+ emit_binop(f, body, use, inv, loaded, tc.i32);
+ emit_br_to(f, body, header);
+ emit_ret_val(f, exit, a, tc.i32);
+
+ opt_build_cfg(f);
+ opt_build_loop_tree(f);
+ opt_licm(f);
+ opt_verify(f, "test-licm-hoist");
+
+ EXPECT(f->val_def_block[inv] == entry,
+ "LICM should hoist invariant multiply to preheader");
+ EXPECT(f->val_def_inst[inv] + 1u < f->blocks[entry].ninsts,
+ "hoisted multiply should be inserted before preheader branch");
+ EXPECT(f->val_def_block[use] == body,
+ "LICM should leave loop-variant dependent use in loop body");
+ tc_fini(&tc);
+}
+
+static void opt_licm_preserves_trapping_and_memory_ops(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ Func* f = new_func(&tc);
+ FrameSlot fs = add_frame_slot(f, tc.i32, FS_LOCAL, 4, FSF_ADDR_TAKEN);
+ u32 entry = f->entry;
+ u32 header = ir_block_new(f);
+ u32 body = ir_block_new(f);
+ u32 exit = ir_block_new(f);
+ ir_note_emit(f, header);
+ ir_note_emit(f, body);
+ ir_note_emit(f, exit);
+
+ Val cond = add_val(f, tc.i32);
+ Val num = add_val(f, tc.i32);
+ Val den = add_val(f, tc.i32);
+ Val loaded = add_val(f, tc.i32);
+ Val div = add_val(f, tc.i32);
+ emit_scalar_input(f, entry, cond, tc.i32);
+ emit_scalar_input(f, entry, num, tc.i32);
+ emit_scalar_input(f, entry, den, tc.i32);
+ emit_br_to(f, entry, header);
+ emit_cond_branch(f, header, cond, body, exit, tc.i32);
+ emit_load_local(f, body, loaded, fs, tc.i32, 0);
+ Inst* div_inst = emit_binop(f, body, div, num, den, tc.i32);
+ div_inst->extra.imm = BO_SDIV;
+ emit_br_to(f, body, header);
+ emit_ret_val(f, exit, num, tc.i32);
+
+ opt_build_cfg(f);
+ opt_build_loop_tree(f);
+ opt_licm(f);
+ opt_verify(f, "test-licm-preserve-unsafe");
+
+ EXPECT(f->val_def_block[loaded] == body,
+ "LICM must not hoist memory loads");
+ EXPECT(f->val_def_block[div] == body,
+ "LICM must not hoist potentially trapping division");
+ tc_fini(&tc);
+}
+
static void opt_regalloc_priority(void) {
TestCtx tc;
tc_init(&tc);
@@ -5708,6 +5794,8 @@ int main(void) {
opt_loop_tree_excludes_side_exit();
opt_loop_tree_nested_depths();
opt_loop_tree_does_not_mutate_cfg();
+ opt_licm_hoists_safe_invariant_to_preheader();
+ opt_licm_preserves_trapping_and_memory_ops();
opt_liveness_branch();
opt_block_liveness_phase1();
opt_live_ranges_phase2();