commit 619a06df5b377adca4306a10108923d8cb3365ef
parent b83a60ab03163c763a4e01b3d03435f784131628
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 14 May 2026 12:13:35 -0700
opt: add natural loop tree for O1
Diffstat:
4 files changed, 322 insertions(+), 10 deletions(-)
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -1333,6 +1333,7 @@ static void w_func_end(CGTarget* t) {
if (o->level == 1) {
opt_build_cfg(o->f);
opt_machinize(o->f, o->target);
+ opt_build_loop_tree(o->f);
opt_live_info(o->f);
opt_dead_def_elim(o->f);
o->f->val_info = NULL; /* force opt_regalloc to recompute liveness */
diff --git a/src/opt/opt.h b/src/opt/opt.h
@@ -66,6 +66,7 @@ void opt_cleanup(Func*);
/* Machine-dependent ABI lowering, 2-op insns, etc. Implemented per-arch and
* per-OS, so it takes the full Target. */
void opt_machinize(Func*, CGTarget* target);
+void opt_build_loop_tree(Func*);
void opt_live_info(Func*);
void opt_coalesce(Func*);
void opt_regalloc(Func*, int allow_live_range_split);
diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c
@@ -253,22 +253,151 @@ static int is_caller_saved(Func* f, u8 cls, Reg r) {
return (f->opt_caller_saved[cls] & (1u << r)) != 0;
}
-static void build_loop_depth(Func* f) {
+#define OPT_BLK_NONE 0xffffffffu
+
+typedef struct LoopPostorderCtx {
+ Func* f;
+ u32* po;
+ u32* po_idx;
+ u8* visited;
+ u32 count;
+} LoopPostorderCtx;
+
+static void loop_postorder_dfs(LoopPostorderCtx* ctx, u32 b) {
+ if (b >= ctx->f->nblocks || ctx->visited[b]) return;
+ ctx->visited[b] = 1;
+ Block* bl = &ctx->f->blocks[b];
+ for (u32 s = 0; s < bl->nsucc; ++s) {
+ u32 t = bl->succ[s];
+ if (t < ctx->f->nblocks) loop_postorder_dfs(ctx, t);
+ }
+ ctx->po[ctx->count] = b;
+ ctx->po_idx[b] = ctx->count;
+ ctx->count++;
+}
+
+static u32 loop_dom_intersect(u32 b1, u32 b2, const u32* idom,
+ const u32* po_idx) {
+ while (b1 != b2) {
+ while (po_idx[b1] < po_idx[b2]) b1 = idom[b1];
+ while (po_idx[b2] < po_idx[b1]) b2 = idom[b2];
+ }
+ return b1;
+}
+
+static u32* loop_compute_idom(Func* f, const u8* visited, const u32* po,
+ const u32* po_idx, u32 npo, u32 entry) {
+ u32* idom = arena_array(f->arena, u32, f->nblocks ? f->nblocks : 1u);
+ for (u32 b = 0; b < f->nblocks; ++b) idom[b] = OPT_BLK_NONE;
+ idom[entry] = entry;
+
+ int changed = 1;
+ while (changed) {
+ changed = 0;
+ for (u32 ri = npo; ri > 0; --ri) {
+ u32 b = po[ri - 1u];
+ if (b == entry) continue;
+ Block* bl = &f->blocks[b];
+ u32 new_idom = OPT_BLK_NONE;
+ for (u32 p = 0; p < bl->npreds; ++p) {
+ u32 pp = bl->preds[p];
+ if (pp >= f->nblocks || !visited[pp]) continue;
+ if (idom[pp] == OPT_BLK_NONE) continue;
+ new_idom = (new_idom == OPT_BLK_NONE)
+ ? pp
+ : loop_dom_intersect(pp, new_idom, idom, po_idx);
+ }
+ if (new_idom != OPT_BLK_NONE && idom[b] != new_idom) {
+ idom[b] = new_idom;
+ changed = 1;
+ }
+ }
+ }
+ return idom;
+}
+
+static int loop_dominates(const u32* idom, u32 entry, u32 dom, u32 node) {
+ u32 cur = node;
+ while (cur != OPT_BLK_NONE) {
+ if (cur == dom) return 1;
+ if (cur == entry) break;
+ cur = idom[cur];
+ }
+ return 0;
+}
+
+static void loop_mark_body(Func* f, const u8* visited, u32 header, u32 latch,
+ u8* body, u32* stack) {
+ u32 sp = 0;
+ if (!body[header]) body[header] = 1;
+ if (!body[latch]) {
+ body[latch] = 1;
+ stack[sp++] = latch;
+ }
+
+ while (sp) {
+ u32 b = stack[--sp];
+ if (b == header) continue;
+ Block* bl = &f->blocks[b];
+ for (u32 p = 0; p < bl->npreds; ++p) {
+ u32 pred = bl->preds[p];
+ if (pred >= f->nblocks || !visited[pred] || body[pred]) continue;
+ body[pred] = 1;
+ stack[sp++] = pred;
+ }
+ }
+}
+
+static u32 loop_frequency(u8 depth) {
+ u8 capped = depth > 10 ? 10 : depth;
+ return 1u << capped;
+}
+
+void opt_build_loop_tree(Func* f) {
for (u32 b = 0; b < f->nblocks; ++b) {
f->blocks[b].loop_depth = 0;
f->blocks[b].frequency = 1;
}
- for (u32 b = 0; b < f->nblocks; ++b) {
- Block* bl = &f->blocks[b];
- for (u32 s = 0; s < bl->nsucc; ++s) {
- u32 h = bl->succ[s];
- if (h > b || h >= f->nblocks) continue;
- for (u32 k = h; k <= b; ++k)
- if (f->blocks[k].loop_depth < 31) ++f->blocks[k].loop_depth;
+ if (f->nblocks == 0 || f->entry >= f->nblocks) return;
+
+ LoopPostorderCtx pctx;
+ memset(&pctx, 0, sizeof pctx);
+ pctx.f = f;
+ pctx.po = arena_array(f->arena, u32, f->nblocks);
+ pctx.po_idx = arena_array(f->arena, u32, f->nblocks);
+ pctx.visited = arena_zarray(f->arena, u8, f->nblocks);
+ loop_postorder_dfs(&pctx, f->entry);
+ if (pctx.count == 0) return;
+
+ u32* idom = loop_compute_idom(f, pctx.visited, pctx.po, pctx.po_idx,
+ pctx.count, f->entry);
+ u8* body = arena_zarray(f->arena, u8, f->nblocks);
+ u32* stack = arena_array(f->arena, u32, f->nblocks);
+
+ for (u32 header = 0; header < f->nblocks; ++header) {
+ if (!pctx.visited[header]) continue;
+ memset(body, 0, f->nblocks * sizeof body[0]);
+ int has_loop = 0;
+
+ for (u32 latch = 0; latch < f->nblocks; ++latch) {
+ if (!pctx.visited[latch]) continue;
+ Block* lb = &f->blocks[latch];
+ for (u32 s = 0; s < lb->nsucc; ++s) {
+ if (lb->succ[s] != header) continue;
+ if (!loop_dominates(idom, f->entry, header, latch)) continue;
+ has_loop = 1;
+ loop_mark_body(f, pctx.visited, header, latch, body, stack);
+ }
}
+
+ if (!has_loop) continue;
+ for (u32 b = 0; b < f->nblocks; ++b)
+ if (body[b] && f->blocks[b].loop_depth < 31)
+ ++f->blocks[b].loop_depth;
}
+
for (u32 b = 0; b < f->nblocks; ++b)
- f->blocks[b].frequency = 1u << f->blocks[b].loop_depth;
+ f->blocks[b].frequency = loop_frequency(f->blocks[b].loop_depth);
}
void opt_live_info(Func* f) {
@@ -287,7 +416,6 @@ void opt_live_info(Func* f) {
f->val_info[v].alloc_kind = OPT_ALLOC_NONE;
f->val_info[v].cls = f->val_cls[v];
}
- build_loop_depth(f);
for (u32 b = 0; b < f->nblocks; ++b) {
Block* bl = &f->blocks[b];
bl->live_in = arena_zarray(f->arena, u64, words);
diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c
@@ -187,6 +187,26 @@ static void emit_ret_val(Func* f, u32 b, Val v, CfreeCgTypeId ty) {
in->extra.aux = aux;
}
+static void emit_br_to(Func* f, u32 b, u32 target) {
+ Inst* in = ir_emit(f, b, IR_BR);
+ (void)in;
+ f->blocks[b].succ[0] = target;
+ f->blocks[b].nsucc = 1;
+}
+
+static void emit_test_branch(Func* f, u32 b, u32 taken, u32 fallthrough,
+ CfreeCgTypeId ty) {
+ Inst* in = ir_emit(f, b, IR_CMP_BRANCH);
+ in->opnds = arena_array(f->arena, Operand, 2);
+ in->opnds[0] = op_imm_(1, ty);
+ in->opnds[1] = op_imm_(0, ty);
+ in->nopnds = 2;
+ in->extra.imm = CMP_NE;
+ f->blocks[b].succ[0] = taken;
+ f->blocks[b].succ[1] = fallthrough;
+ f->blocks[b].nsucc = 2;
+}
+
static int bit_has(const u64* bits, Val v) {
return (bits[v / 64u] & (1ull << (v % 64u))) != 0;
}
@@ -353,6 +373,7 @@ static void opt_liveness_branch(void) {
ir_emit(f, b2, IR_RET);
opt_build_cfg(f);
+ opt_build_loop_tree(f);
opt_live_info(f);
EXPECT(bit_has(f->blocks[b0].live_out, v), "v%u live_out of branch block",
@@ -385,6 +406,7 @@ static void opt_cfg_prunes_unreachable(void) {
emit_ret_val(f, dead, dead_val, tc.i32);
opt_build_cfg(f);
+ opt_build_loop_tree(f);
opt_live_info(f);
EXPECT(f->blocks[dead].ninsts == 0,
@@ -445,6 +467,7 @@ static void opt_cfg_preserves_scope_edges(void) {
emit_ret_val(f, b_join, v, tc.i32);
opt_build_cfg(f);
+ opt_build_loop_tree(f);
opt_live_info(f);
EXPECT(f->blocks[b0].nsucc == 2,
@@ -460,6 +483,154 @@ static void opt_cfg_preserves_scope_edges(void) {
tc_fini(&tc);
}
+static void opt_loop_tree_excludes_side_exit(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ Func* f = new_func(&tc);
+ u32 entry = f->entry;
+ u32 header = ir_block_new(f);
+ u32 exit = ir_block_new(f); /* Numerically between header and latch. */
+ u32 body = ir_block_new(f);
+ u32 latch = ir_block_new(f);
+ ir_note_emit(f, header);
+ ir_note_emit(f, exit);
+ ir_note_emit(f, body);
+ ir_note_emit(f, latch);
+
+ emit_br_to(f, entry, header);
+ emit_test_branch(f, header, body, exit, tc.i32);
+ emit_br_to(f, body, latch);
+ emit_br_to(f, latch, header);
+ ir_emit(f, exit, IR_RET);
+
+ opt_build_cfg(f);
+ opt_build_loop_tree(f);
+
+ EXPECT(f->blocks[entry].loop_depth == 0,
+ "entry should not be in loop, got depth %u",
+ (unsigned)f->blocks[entry].loop_depth);
+ EXPECT(f->blocks[header].loop_depth == 1,
+ "header should have loop depth 1, got %u",
+ (unsigned)f->blocks[header].loop_depth);
+ EXPECT(f->blocks[body].loop_depth == 1,
+ "body should have loop depth 1, got %u",
+ (unsigned)f->blocks[body].loop_depth);
+ EXPECT(f->blocks[latch].loop_depth == 1,
+ "latch should have loop depth 1, got %u",
+ (unsigned)f->blocks[latch].loop_depth);
+ EXPECT(f->blocks[exit].loop_depth == 0,
+ "side exit should not be in loop, got depth %u",
+ (unsigned)f->blocks[exit].loop_depth);
+ EXPECT(f->blocks[exit].frequency == 1,
+ "side exit should keep frequency 1, got %u",
+ (unsigned)f->blocks[exit].frequency);
+ EXPECT(f->blocks[header].frequency > f->blocks[exit].frequency,
+ "loop header should be hotter than side exit");
+ tc_fini(&tc);
+}
+
+static void opt_loop_tree_nested_depths(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ Func* f = new_func(&tc);
+ u32 entry = f->entry;
+ u32 outer_header = ir_block_new(f);
+ u32 inner_header = ir_block_new(f);
+ u32 inner_body = ir_block_new(f);
+ u32 outer_latch = ir_block_new(f);
+ u32 exit = ir_block_new(f);
+ ir_note_emit(f, outer_header);
+ ir_note_emit(f, inner_header);
+ ir_note_emit(f, inner_body);
+ ir_note_emit(f, outer_latch);
+ ir_note_emit(f, exit);
+
+ emit_br_to(f, entry, outer_header);
+ emit_test_branch(f, outer_header, inner_header, exit, tc.i32);
+ emit_test_branch(f, inner_header, inner_body, outer_latch, tc.i32);
+ emit_br_to(f, inner_body, inner_header);
+ emit_br_to(f, outer_latch, outer_header);
+ ir_emit(f, exit, IR_RET);
+
+ opt_build_cfg(f);
+ opt_build_loop_tree(f);
+
+ EXPECT(f->blocks[entry].loop_depth == 0,
+ "entry should not be in nested loops, got depth %u",
+ (unsigned)f->blocks[entry].loop_depth);
+ EXPECT(f->blocks[outer_header].loop_depth == 1,
+ "outer header should have depth 1, got %u",
+ (unsigned)f->blocks[outer_header].loop_depth);
+ EXPECT(f->blocks[outer_latch].loop_depth == 1,
+ "outer latch should have depth 1, got %u",
+ (unsigned)f->blocks[outer_latch].loop_depth);
+ EXPECT(f->blocks[inner_header].loop_depth == 2,
+ "inner header should have depth 2, got %u",
+ (unsigned)f->blocks[inner_header].loop_depth);
+ EXPECT(f->blocks[inner_body].loop_depth == 2,
+ "inner body should have depth 2, got %u",
+ (unsigned)f->blocks[inner_body].loop_depth);
+ EXPECT(f->blocks[exit].loop_depth == 0,
+ "exit should not be in nested loops, got depth %u",
+ (unsigned)f->blocks[exit].loop_depth);
+ EXPECT(f->blocks[inner_header].frequency >
+ f->blocks[outer_header].frequency,
+ "inner loop should be hotter than outer-only blocks");
+ tc_fini(&tc);
+}
+
+static void opt_loop_tree_does_not_mutate_cfg(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ Func* f = new_func(&tc);
+ u32 entry = f->entry;
+ u32 header = ir_block_new(f);
+ u32 body = ir_block_new(f);
+ u32 exit = ir_block_new(f);
+ ir_note_emit(f, header);
+ ir_note_emit(f, body);
+ ir_note_emit(f, exit);
+
+ emit_br_to(f, entry, header);
+ emit_test_branch(f, header, body, exit, tc.i32);
+ emit_br_to(f, body, header);
+ ir_emit(f, exit, IR_RET);
+
+ opt_build_cfg(f);
+
+ u32 nblocks = f->nblocks;
+ u8 nsucc[8];
+ u32 succ0[8];
+ u32 succ1[8];
+ u32 npreds[8];
+ EXPECT(nblocks <= 8, "test CFG grew past fixed snapshot size");
+ for (u32 b = 0; b < nblocks && b < 8; ++b) {
+ nsucc[b] = f->blocks[b].nsucc;
+ succ0[b] = f->blocks[b].succ[0];
+ succ1[b] = f->blocks[b].succ[1];
+ npreds[b] = f->blocks[b].npreds;
+ }
+
+ opt_build_loop_tree(f);
+
+ EXPECT(f->nblocks == nblocks, "loop tree must not change block count");
+ for (u32 b = 0; b < nblocks && b < 8; ++b) {
+ EXPECT(f->blocks[b].nsucc == nsucc[b],
+ "loop tree changed b%u nsucc from %u to %u", (unsigned)b,
+ (unsigned)nsucc[b], (unsigned)f->blocks[b].nsucc);
+ EXPECT(f->blocks[b].succ[0] == succ0[b],
+ "loop tree changed b%u succ0 from %u to %u", (unsigned)b,
+ (unsigned)succ0[b], (unsigned)f->blocks[b].succ[0]);
+ EXPECT(f->blocks[b].succ[1] == succ1[b],
+ "loop tree changed b%u succ1 from %u to %u", (unsigned)b,
+ (unsigned)succ1[b], (unsigned)f->blocks[b].succ[1]);
+ EXPECT(f->blocks[b].npreds == npreds[b],
+ "loop tree changed b%u npreds from %u to %u", (unsigned)b,
+ (unsigned)npreds[b], (unsigned)f->blocks[b].npreds);
+ }
+ tc_fini(&tc);
+}
+
static void opt_regalloc_priority(void) {
TestCtx tc;
tc_init(&tc);
@@ -479,6 +650,7 @@ static void opt_regalloc_priority(void) {
emit_binop(f, f->entry, out, pinned, hot, tc.i32);
emit_ret_val(f, f->entry, out, tc.i32);
opt_build_cfg(f);
+ opt_build_loop_tree(f);
opt_live_info(f);
f->val_info[pinned].tied_hard_reg = (i32)f->opt_hard_regs[RC_INT][0];
f->val_info[hot].frequency += 1000;
@@ -513,6 +685,7 @@ static void opt_rewrite_spill_use_def(void) {
emit_binop(f, f->entry, c, a, b, tc.i32);
emit_ret_val(f, f->entry, c, tc.i32);
opt_build_cfg(f);
+ opt_build_loop_tree(f);
opt_live_info(f);
opt_regalloc(f, 0);
EXPECT(f->opt_rewritten, "regalloc should rewrite pseudos");
@@ -541,6 +714,7 @@ static void opt_call_clobber_preservation(void) {
emit_call_void(f, f->entry);
emit_ret_val(f, f->entry, live, tc.i32);
opt_build_cfg(f);
+ opt_build_loop_tree(f);
opt_live_info(f);
opt_regalloc(f, 0);
@@ -574,6 +748,7 @@ static void opt_call_clobber_caller_saved(void) {
emit_call_void(f, f->entry);
emit_ret_val(f, f->entry, live, tc.i32);
opt_build_cfg(f);
+ opt_build_loop_tree(f);
opt_live_info(f);
opt_regalloc(f, 0);
@@ -613,6 +788,7 @@ static void opt_spill_pressure(void) {
emit_binop(f, f->entry, d, d, c, tc.i32);
emit_ret_val(f, f->entry, d, tc.i32);
opt_build_cfg(f);
+ opt_build_loop_tree(f);
opt_live_info(f);
opt_regalloc(f, 0);
@@ -663,6 +839,7 @@ static void opt_inline_asm_tied_fixed_regs(void) {
emit_ret_val(f, f->entry, tied, tc.i32);
opt_build_cfg(f);
+ opt_build_loop_tree(f);
opt_live_info(f);
f->val_info[tied].tied_hard_reg = (i32)f->opt_hard_regs[RC_INT][0];
opt_regalloc(f, 0);
@@ -694,6 +871,7 @@ static void opt_post_rewrite_dce(void) {
ir_emit(f, f->entry, IR_NOP);
emit_ret_val(f, f->entry, a, tc.i32);
opt_build_cfg(f);
+ opt_build_loop_tree(f);
opt_live_info(f);
opt_regalloc(f, 0);
opt_combine(f);
@@ -720,6 +898,7 @@ static void opt_dead_def_elim_test(void) {
emit_copy(f, f->entry, b, a, tc.i32); /* b is dead after this */
emit_ret_val(f, f->entry, a, tc.i32); /* a stays live */
opt_build_cfg(f);
+ opt_build_loop_tree(f);
opt_live_info(f);
opt_dead_def_elim(f);
EXPECT(count_op(f, IR_COPY) == 0,
@@ -777,6 +956,9 @@ static void opt_emit_no_virtual_alloc(void) {
int main(void) {
opt_cfg_prunes_unreachable();
opt_cfg_preserves_scope_edges();
+ opt_loop_tree_excludes_side_exit();
+ opt_loop_tree_nested_depths();
+ opt_loop_tree_does_not_mutate_cfg();
opt_liveness_branch();
opt_regalloc_priority();
opt_rewrite_spill_use_def();