kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 627d98f8eb8716464ededb979972b695f94a4579
parent 7d47b9fc98c761d38925c34d547866bcf426d908
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Fri, 22 May 2026 03:08:42 -0700

test O2 jump and inline cleanup quality

Diffstat:
M doc/OPT.md | 17 +++++++++--------
M src/opt/pass_emit.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M src/opt/pass_inline.c | 2 +-
M test/opt/opt_test.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M test/opt/run.sh | 26 ++++++++++++++++++++++++++
A test/toy/cases/135_inline_cleanup_quality.expected | 1 +
A test/toy/cases/135_inline_cleanup_quality.toy | 9 +++++++++
7 files changed, 234 insertions(+), 11 deletions(-)

diff --git a/doc/OPT.md b/doc/OPT.md @@ -970,11 +970,13 @@ Remaining Phase D exit criteria: default O2 schedule. - [x] `opt_ssa_combine` has focused red-green tests before it enters the default O2 schedule. -- [ ] Full O2 jump optimization has focused red-green tests before it enters - the default O2 schedule. -- [ ] Post-inline cleanup quality: after Phase E inlines a simple Toy wrapper, +- [x] Full O2 jump optimization has focused red-green tests before it enters + the default O2 schedule, covering switch target forwarding, empty + fallthrough-chain forwarding, repeated branch forwarding, and same-target + conditional collapse. +- [x] Post-inline cleanup quality: after Phase E inlines a simple Toy wrapper, the existing Phase D value passes should collapse the exposed constant/copy - chain and remove frame traffic. Current example: + chain and remove frame traffic. Covered example: ``` fn add1(x: i32): i32 { @@ -990,10 +992,9 @@ Remaining Phase D exit criteria: } ``` - `-O2` now removes the `bl _add1`, but AArch64 still shows materialized - parameter/local frame traffic and a runtime compare/branch in `main`. The - desired Phase D follow-up is for inlining plus cleanup to fold the call - result and branch to the constant return path. + `-O2` removes the wrapper call leftovers and AArch64 now lowers the focused + quality fixture's `main` to a constant return with no call, prologue, spill, + or reload traffic. 
### Phase E - Inlining and Cleanup diff --git a/src/opt/pass_emit.c b/src/opt/pass_emit.c @@ -942,10 +942,86 @@ static u32 collect_replayed_hard_regs(Func* f, CGTarget* w, RegClass cls, return nused; } +static int replay_operand_uses_frame_slot(const Operand* op) { + return op && op->kind == OPK_LOCAL && op->v.frame_slot != FRAME_SLOT_NONE; +} + +static int replay_abivalue_uses_frame_slot(const CGABIValue* v) { + if (!v) return 0; + if (replay_operand_uses_frame_slot(&v->storage)) return 1; + for (u32 i = 0; i < v->nparts; ++i) + if (replay_operand_uses_frame_slot(&v->parts[i].op)) return 1; + return 0; +} + +static int replay_inst_uses_frame_slot(const Inst* in) { + for (u32 i = 0; i < in->nopnds; ++i) + if (replay_operand_uses_frame_slot(&in->opnds[i])) return 1; + switch ((IROp)in->op) { + case IR_CALL: { + IRCallAux* aux = (IRCallAux*)in->extra.aux; + if (!aux) return 0; + if (aux->use_plan_replay) { + if (replay_operand_uses_frame_slot(&aux->plan.callee)) return 1; + for (u32 i = 0; i < aux->plan.nargs; ++i) + if (replay_operand_uses_frame_slot(&aux->plan.args[i].src)) return 1; + for (u32 i = 0; i < aux->plan.nrets; ++i) + if (replay_operand_uses_frame_slot(&aux->plan.rets[i].dst)) return 1; + } else { + if (replay_operand_uses_frame_slot(&aux->desc.callee)) return 1; + for (u32 i = 0; i < aux->desc.nargs; ++i) + if (replay_abivalue_uses_frame_slot(&aux->desc.args[i])) return 1; + if (replay_abivalue_uses_frame_slot(&aux->desc.ret)) return 1; + } + return 0; + } + case IR_RET: { + IRRetAux* aux = (IRRetAux*)in->extra.aux; + return aux && aux->present && replay_abivalue_uses_frame_slot(&aux->val); + } + case IR_SCOPE_BEGIN: { + IRScopeAux* aux = (IRScopeAux*)in->extra.aux; + return aux && replay_operand_uses_frame_slot(&aux->desc.cond); + } + case IR_ASM_BLOCK: { + IRAsmAux* aux = (IRAsmAux*)in->extra.aux; + if (!aux) return 0; + for (u32 i = 0; i < aux->nin; ++i) + if (replay_operand_uses_frame_slot(&aux->in_ops[i])) return 1; + for (u32 i = 0; i < aux->nout; 
++i) + if (replay_operand_uses_frame_slot(&aux->out_ops[i])) return 1; + return 0; + } + case IR_INTRINSIC: { + IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; + if (!aux) return 0; + for (u32 i = 0; i < aux->narg; ++i) + if (replay_operand_uses_frame_slot(&aux->args[i])) return 1; + for (u32 i = 0; i < aux->ndst; ++i) + if (replay_operand_uses_frame_slot(&aux->dsts[i])) return 1; + return 0; + } + default: + return 0; + } +} + +static int replay_func_uses_frame_slot(Func* f) { + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) + if (replay_inst_uses_frame_slot(&bl->insts[i])) return 1; + } + for (u32 i = 0; i < f->nframe_slots; ++i) + if (f->frame_slots[i].flags & (FSF_ADDR_TAKEN | FSF_VOLATILE)) return 1; + return 0; +} + static void collect_known_frame(Func* f, CGTarget* w, CGKnownFrameDesc* out) { memset(out, 0, sizeof(*out)); FrameSlotDesc* slots = NULL; - if (f->nframe_slots) { + int uses_frame_slot = replay_func_uses_frame_slot(f); + if (uses_frame_slot && f->nframe_slots) { slots = arena_zarray(f->arena, FrameSlotDesc, f->nframe_slots); for (u32 i = 0; i < f->nframe_slots; ++i) { IRFrameSlot* s = &f->frame_slots[i]; @@ -959,7 +1035,7 @@ static void collect_known_frame(Func* f, CGTarget* w, CGKnownFrameDesc* out) { } } out->slots = slots; - out->nslots = f->nframe_slots; + out->nslots = uses_frame_slot ? f->nframe_slots : 0; for (u32 b = 0; b < f->nblocks; ++b) { Block* bl = &f->blocks[b]; diff --git a/src/opt/pass_inline.c b/src/opt/pass_inline.c @@ -243,7 +243,7 @@ static int build_inline_map(InlineMap* m, Func* caller, Func* callee) { d.loc = old->loc; d.size = old->size; d.align = old->align; - d.kind = old->kind; + d.kind = old->kind == FS_PARAM ? 
FS_LOCAL : old->kind; d.flags = old->flags; m->slot[s] = ir_frame_slot_new(caller, &d); } diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c @@ -629,6 +629,14 @@ static int val_is_load_imm(Func* f, Val v, i64 imm) { return in && (IROp)in->op == IR_LOAD_IMM && in->extra.imm == imm; } +static int any_ret_load_imm(Func* f, i64 imm) { + for (u32 b = 0; b < f->nblocks; ++b) { + Val v = ret_val(f, b); + if (v != VAL_NONE && val_is_load_imm(f, v, imm)) return 1; + } + return 0; +} + static u32 count_uses_of(Func* f, Val v) { opt_rebuild_def_use(f); u32 n = 0; @@ -1222,6 +1230,20 @@ static void add_reg_param(Func* f, PReg r, CfreeCgTypeId ty) { in->type = ty; } +static FrameSlot add_frame_param(Func* f, CfreeCgTypeId ty) { + FrameSlot fs = add_frame_slot(f, ty, FS_PARAM, 4, 0); + CGParamDesc d; + memset(&d, 0, sizeof d); + d.index = f->nparams; + d.type = ty; + d.size = 4; + d.align = 4; + d.storage.kind = CG_LOCAL_STORAGE_FRAME; + d.storage.v.frame_slot = fs; + ir_param_add(f, &d); + return fs; +} + static Inst* emit_preg_load_imm(Func* f, u32 b, PReg dst, CfreeCgTypeId ty, i64 imm) { Inst* in = ir_emit(f, b, IR_LOAD_IMM); @@ -1234,6 +1256,19 @@ static Inst* emit_preg_load_imm(Func* f, u32 b, PReg dst, CfreeCgTypeId ty, return in; } +static Inst* emit_preg_load_local(Func* f, u32 b, PReg dst, FrameSlot fs, + CfreeCgTypeId ty, u16 flags) { + Inst* in = ir_emit(f, b, IR_LOAD); + in->opnds = arena_array(f->arena, Operand, 2); + in->opnds[0] = op_reg_(dst, ty); + in->opnds[1] = op_local_(fs, ty); + in->nopnds = 2; + in->def = dst; + in->type = ty; + in->extra.mem = mem_local_(fs, ty, 4, flags); + return in; +} + static Inst* emit_preg_binop(Func* f, u32 b, PReg dst, PReg a, PReg c, CfreeCgTypeId ty) { Inst* in = ir_emit(f, b, IR_BINOP); @@ -3711,6 +3746,42 @@ static void opt_jump_opt_forwards_empty_fallthrough_chain(void) { tc_fini(&tc); } +static void opt_jump_opt_repeatedly_forwards_branch_chain(void) { + TestCtx tc; + tc_init(&tc); + Func* f = new_func(&tc); + u32 b0 = 
f->entry; + u32 hop0 = ir_block_new(f); + u32 hop1 = ir_block_new(f); + u32 ret = ir_block_new(f); + u32 other = ir_block_new(f); + ir_note_emit(f, hop0); + ir_note_emit(f, hop1); + ir_note_emit(f, ret); + ir_note_emit(f, other); + + Val cond = add_val(f, tc.i32); + Val a = add_val(f, tc.i32); + Val b = add_val(f, tc.i32); + emit_scalar_input(f, b0, cond, tc.i32); + emit_cond_branch(f, b0, cond, hop0, other, tc.i32); + emit_br_to(f, hop0, hop1); + emit_br_to(f, hop1, ret); + emit_load_imm(f, ret, a, tc.i32, 7); + emit_ret_val(f, ret, a, tc.i32); + emit_load_imm(f, other, b, tc.i32, 9); + emit_ret_val(f, other, b, tc.i32); + + opt_jump_opt(f); + opt_verify(f, "test-jump-opt-repeated-branch-forward"); + + EXPECT(f->blocks[b0].succ[0] == ret, + "jump opt should forward repeated branch-only chain"); + EXPECT(f->blocks[hop0].ninsts == 0 && f->blocks[hop1].ninsts == 0, + "repeated branch trampolines should be pruned"); + tc_fini(&tc); +} + static void opt_jump_opt_collapses_same_target_cond_branch(void) { TestCtx tc; tc_init(&tc); @@ -6389,6 +6460,43 @@ static void opt_inline_caller_growth_cap(void) { tc_fini(&tc); } +static void opt_inline_cleanup_promotes_cloned_param_frame(void) { + TestCtx tc; + tc_init(&tc); + CfreeCgTypeId ps[1] = {tc.i32}; + Func* callee = new_named_func(&tc, (ObjSymId)16, tc.i32, ps, 1, 0); + FrameSlot param = add_frame_param(callee, tc.i32); + PReg x = add_preg(callee, tc.i32); + PReg one = add_preg(callee, tc.i32); + PReg sum = add_preg(callee, tc.i32); + emit_preg_load_local(callee, callee->entry, x, param, tc.i32, 0); + emit_preg_load_imm(callee, callee->entry, one, tc.i32, 1); + emit_preg_binop(callee, callee->entry, sum, x, one, tc.i32); + emit_preg_ret(callee, callee->entry, sum, tc.i32); + + Func* caller = new_named_func(&tc, (ObjSymId)17, tc.i32, NULL, 0, 0); + PReg arg = add_preg(caller, tc.i32); + PReg ret = add_preg(caller, tc.i32); + emit_preg_load_imm(caller, caller->entry, arg, tc.i32, 41); + Operand arg_op = op_reg_(arg, tc.i32); 
+ emit_direct_call(&tc, caller, caller->entry, (ObjSymId)16, callee->type, + &arg_op, 1, op_reg_(ret, tc.i32)); + emit_preg_ret(caller, caller->entry, ret, tc.i32); + + Func* funcs[2] = {caller, callee}; + FuncSet fs = {tc.c, tc.c->tu, funcs, 2, 2}; + opt_inline(&fs, 1); + EXPECT(count_op(caller, IR_CALL) == 0, "wrapper call should inline"); + opt_cleanup(caller); + opt_verify(caller, "test-inline-cleanup-param-frame"); + + EXPECT(count_op(caller, IR_LOAD) == 0 && count_op(caller, IR_STORE) == 0, + "cleanup should promote cloned parameter frame traffic"); + EXPECT(any_ret_load_imm(caller, 42), + "cleanup should fold inlined wrapper result to constant 42"); + tc_fini(&tc); +} + int main(void) { opt_machinize_uses_phys_reg_metadata(); opt_machinize_keeps_abi_regs_without_legacy_call_fallback(); @@ -6445,6 +6553,7 @@ int main(void) { opt_jump_cleanup_layout_deletes_fallthrough_branch(); opt_jump_opt_forwards_switch_targets(); opt_jump_opt_forwards_empty_fallthrough_chain(); + opt_jump_opt_repeatedly_forwards_branch_chain(); opt_jump_opt_collapses_same_target_cond_branch(); opt_loop_tree_excludes_side_exit(); opt_loop_tree_nested_depths(); @@ -6515,6 +6624,7 @@ int main(void) { opt_inline_bottom_up_chain_single_iter(); opt_inline_refuses_recursive_and_unsupported(); opt_inline_caller_growth_cap(); + opt_inline_cleanup_promotes_cloned_param_frame(); simple_regalloc_reports_exact_used_regs(); if (g_fails) { fprintf(stderr, "opt tests: %d failed (%d checks)\n", g_fails, g_checks); diff --git a/test/opt/run.sh b/test/opt/run.sh @@ -8,3 +8,29 @@ mkdir -p "$ROOT/build/test/opt" "$ROOT/build/cfree" cc -target aarch64-linux-gnu \ -O2 -c "$ROOT/test/opt/o2_many_values.c" \ -o "$ROOT/build/test/opt/o2_many_values.o" + +INLINE_WORK="$ROOT/build/test/opt/inline_cleanup_quality" +mkdir -p "$INLINE_WORK" +"$ROOT/build/cfree" cc -target aarch64-linux-gnu -O2 -c \ + "$ROOT/test/toy/cases/135_inline_cleanup_quality.toy" \ + -o "$INLINE_WORK/inline_cleanup_quality.o" \ + > 
"$INLINE_WORK/cc.out" 2> "$INLINE_WORK/cc.err" +"$ROOT/build/cfree" objdump -d "$INLINE_WORK/inline_cleanup_quality.o" \ + > "$INLINE_WORK/objdump.out" 2> "$INLINE_WORK/objdump.err" +awk ' + /^000000000000[0-9a-f]+ <main>:/ { in_main = 1; print; next } + /^000000000000[0-9a-f]+ </ { in_main = 0 } + in_main { print } +' "$INLINE_WORK/objdump.out" > "$INLINE_WORK/main.dis" +if grep -Eq '\bbl\b|stur|ldur|stp|ldp|sub sp|add sp' \ + "$INLINE_WORK/main.dis"; then + printf 'inline cleanup quality check failed; unexpected call/frame traffic:\n' >&2 + sed 's/^/ | /' "$INLINE_WORK/main.dis" >&2 + exit 1 +fi +if ! grep -Eq 'movz[[:space:]]+w0, 0x2a|mov[[:space:]]+w0, #42|li[[:space:]]+a0, 42' \ + "$INLINE_WORK/main.dis"; then + printf 'inline cleanup quality check failed; main is not a constant 42 return:\n' >&2 + sed 's/^/ | /' "$INLINE_WORK/main.dis" >&2 + exit 1 +fi diff --git a/test/toy/cases/135_inline_cleanup_quality.expected b/test/toy/cases/135_inline_cleanup_quality.expected @@ -0,0 +1 @@ +42 diff --git a/test/toy/cases/135_inline_cleanup_quality.toy b/test/toy/cases/135_inline_cleanup_quality.toy @@ -0,0 +1,9 @@ +fn add1(x: i64): i64 { + return x + 1; +} + +fn __user_main(): i64 { + return add1(41); +} + +fn main(): i32 { return __user_main() as i32; }