kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 619a06df5b377adca4306a10108923d8cb3365ef
parent b83a60ab03163c763a4e01b3d03435f784131628
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 14 May 2026 12:13:35 -0700

opt: add natural loop tree for O1

Diffstat:
Msrc/opt/opt.c | 1+
Msrc/opt/opt.h | 1+
Msrc/opt/pass_lower.c | 148+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
Mtest/opt/opt_test.c | 182+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 322 insertions(+), 10 deletions(-)

diff --git a/src/opt/opt.c b/src/opt/opt.c @@ -1333,6 +1333,7 @@ static void w_func_end(CGTarget* t) { if (o->level == 1) { opt_build_cfg(o->f); opt_machinize(o->f, o->target); + opt_build_loop_tree(o->f); opt_live_info(o->f); opt_dead_def_elim(o->f); o->f->val_info = NULL; /* force opt_regalloc to recompute liveness */ diff --git a/src/opt/opt.h b/src/opt/opt.h @@ -66,6 +66,7 @@ void opt_cleanup(Func*); /* Machine-dependent ABI lowering, 2-op insns, etc. Implemented per-arch and * per-OS, so it takes the full Target. */ void opt_machinize(Func*, CGTarget* target); +void opt_build_loop_tree(Func*); void opt_live_info(Func*); void opt_coalesce(Func*); void opt_regalloc(Func*, int allow_live_range_split); diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c @@ -253,22 +253,151 @@ static int is_caller_saved(Func* f, u8 cls, Reg r) { return (f->opt_caller_saved[cls] & (1u << r)) != 0; } -static void build_loop_depth(Func* f) { +#define OPT_BLK_NONE 0xffffffffu + +typedef struct LoopPostorderCtx { + Func* f; + u32* po; + u32* po_idx; + u8* visited; + u32 count; +} LoopPostorderCtx; + +static void loop_postorder_dfs(LoopPostorderCtx* ctx, u32 b) { + if (b >= ctx->f->nblocks || ctx->visited[b]) return; + ctx->visited[b] = 1; + Block* bl = &ctx->f->blocks[b]; + for (u32 s = 0; s < bl->nsucc; ++s) { + u32 t = bl->succ[s]; + if (t < ctx->f->nblocks) loop_postorder_dfs(ctx, t); + } + ctx->po[ctx->count] = b; + ctx->po_idx[b] = ctx->count; + ctx->count++; +} + +static u32 loop_dom_intersect(u32 b1, u32 b2, const u32* idom, + const u32* po_idx) { + while (b1 != b2) { + while (po_idx[b1] < po_idx[b2]) b1 = idom[b1]; + while (po_idx[b2] < po_idx[b1]) b2 = idom[b2]; + } + return b1; +} + +static u32* loop_compute_idom(Func* f, const u8* visited, const u32* po, + const u32* po_idx, u32 npo, u32 entry) { + u32* idom = arena_array(f->arena, u32, f->nblocks ? f->nblocks : 1u); + for (u32 b = 0; b < f->nblocks; ++b) idom[b] = OPT_BLK_NONE; + idom[entry] = entry; + + int changed = 1; + while (changed) { + changed = 0; + for (u32 ri = npo; ri > 0; --ri) { + u32 b = po[ri - 1u]; + if (b == entry) continue; + Block* bl = &f->blocks[b]; + u32 new_idom = OPT_BLK_NONE; + for (u32 p = 0; p < bl->npreds; ++p) { + u32 pp = bl->preds[p]; + if (pp >= f->nblocks || !visited[pp]) continue; + if (idom[pp] == OPT_BLK_NONE) continue; + new_idom = (new_idom == OPT_BLK_NONE) + ? pp + : loop_dom_intersect(pp, new_idom, idom, po_idx); + } + if (new_idom != OPT_BLK_NONE && idom[b] != new_idom) { + idom[b] = new_idom; + changed = 1; + } + } + } + return idom; +} + +static int loop_dominates(const u32* idom, u32 entry, u32 dom, u32 node) { + u32 cur = node; + while (cur != OPT_BLK_NONE) { + if (cur == dom) return 1; + if (cur == entry) break; + cur = idom[cur]; + } + return 0; +} + +static void loop_mark_body(Func* f, const u8* visited, u32 header, u32 latch, + u8* body, u32* stack) { + u32 sp = 0; + if (!body[header]) body[header] = 1; + if (!body[latch]) { + body[latch] = 1; + stack[sp++] = latch; + } + + while (sp) { + u32 b = stack[--sp]; + if (b == header) continue; + Block* bl = &f->blocks[b]; + for (u32 p = 0; p < bl->npreds; ++p) { + u32 pred = bl->preds[p]; + if (pred >= f->nblocks || !visited[pred] || body[pred]) continue; + body[pred] = 1; + stack[sp++] = pred; + } + } +} + +static u32 loop_frequency(u8 depth) { + u8 capped = depth > 10 ? 10 : depth; + return 1u << capped; +} + +void opt_build_loop_tree(Func* f) { for (u32 b = 0; b < f->nblocks; ++b) { f->blocks[b].loop_depth = 0; f->blocks[b].frequency = 1; } - for (u32 b = 0; b < f->nblocks; ++b) { - Block* bl = &f->blocks[b]; - for (u32 s = 0; s < bl->nsucc; ++s) { - u32 h = bl->succ[s]; - if (h > b || h >= f->nblocks) continue; - for (u32 k = h; k <= b; ++k) - if (f->blocks[k].loop_depth < 31) ++f->blocks[k].loop_depth; + if (f->nblocks == 0 || f->entry >= f->nblocks) return; + + LoopPostorderCtx pctx; + memset(&pctx, 0, sizeof pctx); + pctx.f = f; + pctx.po = arena_array(f->arena, u32, f->nblocks); + pctx.po_idx = arena_array(f->arena, u32, f->nblocks); + pctx.visited = arena_zarray(f->arena, u8, f->nblocks); + loop_postorder_dfs(&pctx, f->entry); + if (pctx.count == 0) return; + + u32* idom = loop_compute_idom(f, pctx.visited, pctx.po, pctx.po_idx, + pctx.count, f->entry); + u8* body = arena_zarray(f->arena, u8, f->nblocks); + u32* stack = arena_array(f->arena, u32, f->nblocks); + + for (u32 header = 0; header < f->nblocks; ++header) { + if (!pctx.visited[header]) continue; + memset(body, 0, f->nblocks * sizeof body[0]); + int has_loop = 0; + + for (u32 latch = 0; latch < f->nblocks; ++latch) { + if (!pctx.visited[latch]) continue; + Block* lb = &f->blocks[latch]; + for (u32 s = 0; s < lb->nsucc; ++s) { + if (lb->succ[s] != header) continue; + if (!loop_dominates(idom, f->entry, header, latch)) continue; + has_loop = 1; + loop_mark_body(f, pctx.visited, header, latch, body, stack); + } } + + if (!has_loop) continue; + for (u32 b = 0; b < f->nblocks; ++b) + if (body[b] && f->blocks[b].loop_depth < 31) + ++f->blocks[b].loop_depth; } + for (u32 b = 0; b < f->nblocks; ++b) - f->blocks[b].frequency = 1u << f->blocks[b].loop_depth; + f->blocks[b].frequency = loop_frequency(f->blocks[b].loop_depth); } void opt_live_info(Func* f) { @@ -287,7 +416,6 @@ void opt_live_info(Func* f) { f->val_info[v].alloc_kind = OPT_ALLOC_NONE; f->val_info[v].cls = f->val_cls[v]; } - build_loop_depth(f); for (u32 b = 0; b < f->nblocks; ++b) { Block* bl = &f->blocks[b]; bl->live_in = arena_zarray(f->arena, u64, words); diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c @@ -187,6 +187,26 @@ static void emit_ret_val(Func* f, u32 b, Val v, CfreeCgTypeId ty) { in->extra.aux = aux; } +static void emit_br_to(Func* f, u32 b, u32 target) { + Inst* in = ir_emit(f, b, IR_BR); + (void)in; + f->blocks[b].succ[0] = target; + f->blocks[b].nsucc = 1; +} + +static void emit_test_branch(Func* f, u32 b, u32 taken, u32 fallthrough, + CfreeCgTypeId ty) { + Inst* in = ir_emit(f, b, IR_CMP_BRANCH); + in->opnds = arena_array(f->arena, Operand, 2); + in->opnds[0] = op_imm_(1, ty); + in->opnds[1] = op_imm_(0, ty); + in->nopnds = 2; + in->extra.imm = CMP_NE; + f->blocks[b].succ[0] = taken; + f->blocks[b].succ[1] = fallthrough; + f->blocks[b].nsucc = 2; +} + static int bit_has(const u64* bits, Val v) { return (bits[v / 64u] & (1ull << (v % 64u))) != 0; } @@ -353,6 +373,7 @@ static void opt_liveness_branch(void) { ir_emit(f, b2, IR_RET); opt_build_cfg(f); + opt_build_loop_tree(f); opt_live_info(f); EXPECT(bit_has(f->blocks[b0].live_out, v), "v%u live_out of branch block", @@ -385,6 +406,7 @@ static void opt_cfg_prunes_unreachable(void) { emit_ret_val(f, dead, dead_val, tc.i32); opt_build_cfg(f); + opt_build_loop_tree(f); opt_live_info(f); EXPECT(f->blocks[dead].ninsts == 0, @@ -445,6 +467,7 @@ static void opt_cfg_preserves_scope_edges(void) { emit_ret_val(f, b_join, v, tc.i32); opt_build_cfg(f); + opt_build_loop_tree(f); opt_live_info(f); EXPECT(f->blocks[b0].nsucc == 2, @@ -460,6 +483,154 @@ static void opt_cfg_preserves_scope_edges(void) { tc_fini(&tc); } +static void opt_loop_tree_excludes_side_exit(void) { + TestCtx tc; + tc_init(&tc); + Func* f = new_func(&tc); + u32 entry = f->entry; + u32 header = ir_block_new(f); + u32 exit = ir_block_new(f); /* Numerically between header and latch. */ + u32 body = ir_block_new(f); + u32 latch = ir_block_new(f); + ir_note_emit(f, header); + ir_note_emit(f, exit); + ir_note_emit(f, body); + ir_note_emit(f, latch); + + emit_br_to(f, entry, header); + emit_test_branch(f, header, body, exit, tc.i32); + emit_br_to(f, body, latch); + emit_br_to(f, latch, header); + ir_emit(f, exit, IR_RET); + + opt_build_cfg(f); + opt_build_loop_tree(f); + + EXPECT(f->blocks[entry].loop_depth == 0, + "entry should not be in loop, got depth %u", + (unsigned)f->blocks[entry].loop_depth); + EXPECT(f->blocks[header].loop_depth == 1, + "header should have loop depth 1, got %u", + (unsigned)f->blocks[header].loop_depth); + EXPECT(f->blocks[body].loop_depth == 1, + "body should have loop depth 1, got %u", + (unsigned)f->blocks[body].loop_depth); + EXPECT(f->blocks[latch].loop_depth == 1, + "latch should have loop depth 1, got %u", + (unsigned)f->blocks[latch].loop_depth); + EXPECT(f->blocks[exit].loop_depth == 0, + "side exit should not be in loop, got depth %u", + (unsigned)f->blocks[exit].loop_depth); + EXPECT(f->blocks[exit].frequency == 1, + "side exit should keep frequency 1, got %u", + (unsigned)f->blocks[exit].frequency); + EXPECT(f->blocks[header].frequency > f->blocks[exit].frequency, + "loop header should be hotter than side exit"); + tc_fini(&tc); +} + +static void opt_loop_tree_nested_depths(void) { + TestCtx tc; + tc_init(&tc); + Func* f = new_func(&tc); + u32 entry = f->entry; + u32 outer_header = ir_block_new(f); + u32 inner_header = ir_block_new(f); + u32 inner_body = ir_block_new(f); + u32 outer_latch = ir_block_new(f); + u32 exit = ir_block_new(f); + ir_note_emit(f, outer_header); + ir_note_emit(f, inner_header); + ir_note_emit(f, inner_body); + ir_note_emit(f, outer_latch); + ir_note_emit(f, exit); + + emit_br_to(f, entry, outer_header); + emit_test_branch(f, outer_header, inner_header, exit, tc.i32); + emit_test_branch(f, inner_header, inner_body, outer_latch, tc.i32); + emit_br_to(f, inner_body, inner_header); + emit_br_to(f, outer_latch, outer_header); + ir_emit(f, exit, IR_RET); + + opt_build_cfg(f); + opt_build_loop_tree(f); + + EXPECT(f->blocks[entry].loop_depth == 0, + "entry should not be in nested loops, got depth %u", + (unsigned)f->blocks[entry].loop_depth); + EXPECT(f->blocks[outer_header].loop_depth == 1, + "outer header should have depth 1, got %u", + (unsigned)f->blocks[outer_header].loop_depth); + EXPECT(f->blocks[outer_latch].loop_depth == 1, + "outer latch should have depth 1, got %u", + (unsigned)f->blocks[outer_latch].loop_depth); + EXPECT(f->blocks[inner_header].loop_depth == 2, + "inner header should have depth 2, got %u", + (unsigned)f->blocks[inner_header].loop_depth); + EXPECT(f->blocks[inner_body].loop_depth == 2, + "inner body should have depth 2, got %u", + (unsigned)f->blocks[inner_body].loop_depth); + EXPECT(f->blocks[exit].loop_depth == 0, + "exit should not be in nested loops, got depth %u", + (unsigned)f->blocks[exit].loop_depth); + EXPECT(f->blocks[inner_header].frequency > + f->blocks[outer_header].frequency, + "inner loop should be hotter than outer-only blocks"); + tc_fini(&tc); +} + +static void opt_loop_tree_does_not_mutate_cfg(void) { + TestCtx tc; + tc_init(&tc); + Func* f = new_func(&tc); + u32 entry = f->entry; + u32 header = ir_block_new(f); + u32 body = ir_block_new(f); + u32 exit = ir_block_new(f); + ir_note_emit(f, header); + ir_note_emit(f, body); + ir_note_emit(f, exit); + + emit_br_to(f, entry, header); + emit_test_branch(f, header, body, exit, tc.i32); + emit_br_to(f, body, header); + ir_emit(f, exit, IR_RET); + + opt_build_cfg(f); + + u32 nblocks = f->nblocks; + u8 nsucc[8]; + u32 succ0[8]; + u32 succ1[8]; + u32 npreds[8]; + EXPECT(nblocks <= 8, "test CFG grew past fixed snapshot size"); + for (u32 b = 0; b < nblocks && b < 8; ++b) { + nsucc[b] = f->blocks[b].nsucc; + succ0[b] = f->blocks[b].succ[0]; + succ1[b] = f->blocks[b].succ[1]; + npreds[b] = f->blocks[b].npreds; + } + + opt_build_loop_tree(f); + + EXPECT(f->nblocks == nblocks, "loop tree must not change block count"); + for (u32 b = 0; b < nblocks && b < 8; ++b) { + EXPECT(f->blocks[b].nsucc == nsucc[b], + "loop tree changed b%u nsucc from %u to %u", (unsigned)b, + (unsigned)nsucc[b], (unsigned)f->blocks[b].nsucc); + EXPECT(f->blocks[b].succ[0] == succ0[b], + "loop tree changed b%u succ0 from %u to %u", (unsigned)b, + (unsigned)succ0[b], (unsigned)f->blocks[b].succ[0]); + EXPECT(f->blocks[b].succ[1] == succ1[b], + "loop tree changed b%u succ1 from %u to %u", (unsigned)b, + (unsigned)succ1[b], (unsigned)f->blocks[b].succ[1]); + EXPECT(f->blocks[b].npreds == npreds[b], + "loop tree changed b%u npreds from %u to %u", (unsigned)b, + (unsigned)npreds[b], (unsigned)f->blocks[b].npreds); + } + tc_fini(&tc); +} + static void opt_regalloc_priority(void) { TestCtx tc; tc_init(&tc); @@ -479,6 +650,7 @@ static void opt_regalloc_priority(void) { emit_binop(f, f->entry, out, pinned, hot, tc.i32); emit_ret_val(f, f->entry, out, tc.i32); opt_build_cfg(f); + opt_build_loop_tree(f); opt_live_info(f); f->val_info[pinned].tied_hard_reg = (i32)f->opt_hard_regs[RC_INT][0]; f->val_info[hot].frequency += 1000; @@ -513,6 +685,7 @@ static void opt_rewrite_spill_use_def(void) { emit_binop(f, f->entry, c, a, b, tc.i32); emit_ret_val(f, f->entry, c, tc.i32); opt_build_cfg(f); + opt_build_loop_tree(f); opt_live_info(f); opt_regalloc(f, 0); EXPECT(f->opt_rewritten, "regalloc should rewrite pseudos"); @@ -541,6 +714,7 @@ static void opt_call_clobber_preservation(void) { emit_call_void(f, f->entry); emit_ret_val(f, f->entry, live, tc.i32); opt_build_cfg(f); + opt_build_loop_tree(f); opt_live_info(f); opt_regalloc(f, 0); @@ -574,6 +748,7 @@ static void opt_call_clobber_caller_saved(void) { emit_call_void(f, f->entry); emit_ret_val(f, f->entry, live, tc.i32); opt_build_cfg(f); + opt_build_loop_tree(f); opt_live_info(f); opt_regalloc(f, 0); @@ -613,6 +788,7 @@ static void opt_spill_pressure(void) { emit_binop(f, f->entry, d, d, c, tc.i32); emit_ret_val(f, f->entry, d, tc.i32); opt_build_cfg(f); + opt_build_loop_tree(f); opt_live_info(f); opt_regalloc(f, 0); @@ -663,6 +839,7 @@ static void opt_inline_asm_tied_fixed_regs(void) { emit_ret_val(f, f->entry, tied, tc.i32); opt_build_cfg(f); + opt_build_loop_tree(f); opt_live_info(f); f->val_info[tied].tied_hard_reg = (i32)f->opt_hard_regs[RC_INT][0]; opt_regalloc(f, 0); @@ -694,6 +871,7 @@ static void opt_post_rewrite_dce(void) { ir_emit(f, f->entry, IR_NOP); emit_ret_val(f, f->entry, a, tc.i32); opt_build_cfg(f); + opt_build_loop_tree(f); opt_live_info(f); opt_regalloc(f, 0); opt_combine(f); @@ -720,6 +898,7 @@ static void opt_dead_def_elim_test(void) { emit_copy(f, f->entry, b, a, tc.i32); /* b is dead after this */ emit_ret_val(f, f->entry, a, tc.i32); /* a stays live */ opt_build_cfg(f); + opt_build_loop_tree(f); opt_live_info(f); opt_dead_def_elim(f); EXPECT(count_op(f, IR_COPY) == 0, @@ -777,6 +956,9 @@ static void opt_emit_no_virtual_alloc(void) { int main(void) { opt_cfg_prunes_unreachable(); opt_cfg_preserves_scope_edges(); + opt_loop_tree_excludes_side_exit(); + opt_loop_tree_nested_depths(); + opt_loop_tree_does_not_mutate_cfg(); opt_liveness_branch(); opt_regalloc_priority(); opt_rewrite_spill_use_def();