kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 15697ccc61d1100c553d0f8b7c631d3695377565
parent 7617fbc06d101d6345d167594ce106aca90ac979
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Fri, 15 May 2026 15:49:29 -0700

Tighten O1 post-rewrite cleanup

Diffstat:
Mdoc/OPT1.md | 21++++++---------------
Msrc/opt/pass_lower.c | 77+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtest/opt/opt_test.c | 188++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
3 files changed, 270 insertions(+), 16 deletions(-)

diff --git a/doc/OPT1.md b/doc/OPT1.md @@ -263,6 +263,12 @@ mov w0, #0x2a b ... ``` +- Post-rewrite DCE and copy cleanup now model hard-register call arguments, + ABI argument parts, return registers, and caller-saved call clobbers. The + local combine pass also retargets safe single-use arithmetic producers to a + following physical-copy destination, including commutative operand swaps for + x64-style two-operand overlap cases. + Remaining O1 shape issues visible in the current dumps: - Parameter and entry-slot promotion is incomplete. The trivial AArch64 @@ -278,15 +284,6 @@ mov w0, w19 - O1 still saves/restores more callee-saved registers than the body appears to need in small functions. The AArch64 while-loop probe saves `x19-x22`, and the x64 direct-call probe saves `rbx/r12/r13/r14` in tiny functions. -- Post-RA copy cleanup still leaves avoidable moves such as: - -```asm -add w21, w20, w19 -mov w20, w21 -add w21, w19, #1 -mov w19, w21 -``` - - Direct-call tiny functions are still heavy at O1. The x64 `callee(x) + 2` probe emitted 167 bytes and 47 instructions across two small functions, mostly frame setup, callee-save traffic, copies, and branch-to-epilogue @@ -311,9 +308,3 @@ fast tier: cleanup, and consider caller-saved registers for values that are not live across calls. This would make small leaf functions much closer to expected O1 output without requiring global optimization. - -3. Continue tightening post-rewrite DCE and copy cleanup. - Model hard-register call arguments and call clobbers precisely enough to - delete dead caller-saved defs before calls without removing required ABI - traffic. Also fold single-use arithmetic temporaries into their destination - when target constraints allow it. diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c @@ -1769,6 +1769,58 @@ static int identical_convert_pair(const Inst* a, const Inst* b) { a->opnds[0].type == b->opnds[0].type; } +static int binop_is_commutative(BinOp op) { + switch (op) { + case BO_IADD: + case BO_IMUL: + case BO_FADD: + case BO_FMUL: + case BO_AND: + case BO_OR: + case BO_XOR: + return 1; + default: + return 0; + } +} + +static int no_intervening_phys_access(Block* bl, u32 first, u32 last, + const Operand* r) { + for (u32 i = first; i < last; ++i) { + Inst* in = &bl->insts[i]; + if (inst_uses_phys_reg(in, r) || inst_defines_phys_reg(in, r)) return 0; + } + return 1; +} + +static int retarget_producer_legal(Inst* producer, const Operand* copy_dst, + int* swap_binop) { + *swap_binop = 0; + if (!copy_dst || copy_dst->kind != OPK_REG) return 0; + if (producer->nopnds < 2 || producer->opnds[0].kind != OPK_REG) return 0; + if (producer->opnds[0].cls != copy_dst->cls) return 0; + if (producer->opnds[0].type != copy_dst->type) return 0; + + switch ((IROp)producer->op) { + case IR_UNOP: + return 1; + case IR_BINOP: { + if (producer->nopnds < 3) return 0; + int dst_is_lhs = operand_uses_phys_reg(&producer->opnds[1], copy_dst); + int dst_is_rhs = operand_uses_phys_reg(&producer->opnds[2], copy_dst); + if (!dst_is_lhs && !dst_is_rhs) return 1; + if (dst_is_lhs) return 1; + if (binop_is_commutative((BinOp)producer->extra.imm)) { + *swap_binop = 1; + return 1; + } + return 0; + } + default: + return 0; + } +} + static int find_single_direct_use(Func* f, Block* bl, const HardBlockLive* hard_live, u32 def_i, const Operand* def, const Operand* src, @@ -1829,6 +1881,31 @@ static void opt_combine_fold_block(Func* f, Block* bl, Inst* in = &bl->insts[i]; u32 use_i = 0; u32 op_i = 0; + + if (f->opt_rewritten && + ((IROp)in->op == IR_BINOP || (IROp)in->op == IR_UNOP) && + in->nopnds >= 1 && in->opnds[0].kind == OPK_REG && + find_single_direct_use(f, bl, hard_live, i, &in->opnds[0], NULL, 0, 0, + 0, &use_i, &op_i)) { + Inst* copy = &bl->insts[use_i]; + int swap_binop = 0; + if ((IROp)copy->op == IR_COPY && op_i == 1 && copy->nopnds >= 2 && + copy->opnds[0].kind == OPK_REG && + same_phys_reg(&copy->opnds[1], &in->opnds[0]) && + !same_phys_reg(&copy->opnds[0], &in->opnds[0]) && + no_intervening_phys_access(bl, i + 1u, use_i, &copy->opnds[0]) && + retarget_producer_legal(in, &copy->opnds[0], &swap_binop)) { + if (swap_binop) { + Operand tmp = in->opnds[1]; + in->opnds[1] = in->opnds[2]; + in->opnds[2] = tmp; + } + in->opnds[0] = copy->opnds[0]; + copy->opnds[1] = copy->opnds[0]; + continue; + } + } + if ((IROp)in->op == IR_COPY && in->nopnds >= 2 && in->opnds[0].kind == OPK_REG && in->opnds[1].kind == OPK_REG && !same_phys_reg(&in->opnds[0], &in->opnds[1]) && diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c @@ -242,6 +242,28 @@ static Inst* emit_binop(Func* f, u32 b, Val dst, Val a, Val c, return in; } +static Inst* emit_phys_copy(Func* f, u32 b, Reg dst, Reg src, + CfreeCgTypeId ty) { + Inst* in = ir_emit(f, b, IR_COPY); + in->opnds = arena_array(f->arena, Operand, 2); + in->opnds[0] = op_reg_(dst, ty); + in->opnds[1] = op_reg_(src, ty); + in->nopnds = 2; + return in; +} + +static Inst* emit_phys_binop(Func* f, u32 b, Reg dst, Reg a, Reg c, + CfreeCgTypeId ty, BinOp op) { + Inst* in = ir_emit(f, b, IR_BINOP); + in->opnds = arena_array(f->arena, Operand, 3); + in->opnds[0] = op_reg_(dst, ty); + in->opnds[1] = op_reg_(a, ty); + in->opnds[2] = op_reg_(c, ty); + in->nopnds = 3; + in->extra.imm = op; + return in; +} + static Inst* emit_convert(Func* f, u32 b, Val dst, Val src, CfreeCgTypeId ty, ConvKind k) { Inst* in = ir_emit(f, b, IR_CONVERT); @@ -1771,7 +1793,67 @@ static void opt_dce_call_clobbers_hard_regs(void) { opt_dce(g); EXPECT(count_op(g, IR_LOAD_IMM) == 1, - "hard-reg call argument should keep its reaching def"); + "hard-reg direct call argument should keep its reaching def"); + + Func* h = new_func(&tc); + opt_machinize(h, &mock.base); + h->opt_rewritten = 1; + li = ir_emit(h, h->entry, IR_LOAD_IMM); + li->opnds = arena_array(h->arena, Operand, 1); + li->opnds[0] = op_reg_(10, tc.i32); + li->nopnds = 1; + li->extra.imm = 11; + call = emit_call_void(h, h->entry); + caux = (IRCallAux*)call->extra.aux; + args = arena_zarray(h->arena, CGABIValue, 1); + CGABIPart* parts = arena_zarray(h->arena, CGABIPart, 1); + parts[0].op = op_reg_(10, tc.i32); + args[0].nparts = 1; + args[0].parts = parts; + caux->desc.nargs = 1; + caux->desc.args = args; + ir_emit(h, h->entry, IR_RET); + + opt_dce(h); + EXPECT(count_op(h, IR_LOAD_IMM) == 1, + "hard-reg ABI argument part should keep its reaching def"); + + Func* callee_f = new_func(&tc); + opt_machinize(callee_f, &mock.base); + callee_f->opt_rewritten = 1; + li = ir_emit(callee_f, callee_f->entry, IR_LOAD_IMM); + li->opnds = arena_array(callee_f->arena, Operand, 1); + li->opnds[0] = op_reg_(10, tc.i32); + li->nopnds = 1; + li->extra.imm = 13; + call = emit_call_void(callee_f, callee_f->entry); + caux = (IRCallAux*)call->extra.aux; + caux->desc.callee = op_indirect_(10, tc.i32); + ir_emit(callee_f, callee_f->entry, IR_RET); + + opt_dce(callee_f); + EXPECT(count_op(callee_f, IR_LOAD_IMM) == 1, + "indirect call callee base should keep its reaching def"); + + MockCGTarget ret_mock; + mock_init(&ret_mock, tc.c); + mock_set_pool(&ret_mock, RC_INT, pool, 1, scratch, 1, 0); + Func* ret_f = new_func(&tc); + opt_machinize(ret_f, &ret_mock.base); + ret_f->opt_rewritten = 1; + li = ir_emit(ret_f, ret_f->entry, IR_LOAD_IMM); + li->opnds = arena_array(ret_f->arena, Operand, 1); + li->opnds[0] = op_reg_(10, tc.i32); + li->nopnds = 1; + li->extra.imm = 15; + call = emit_call_void(ret_f, ret_f->entry); + caux = (IRCallAux*)call->extra.aux; + caux->desc.ret.storage = op_reg_(10, tc.i32); + emit_ret_val(ret_f, ret_f->entry, 10, tc.i32); + + opt_dce(ret_f); + EXPECT(count_op(ret_f, IR_LOAD_IMM) == 0 && count_op(ret_f, IR_CALL) == 1, + "call return hard-reg def should feed ret and kill pre-call def"); tc_fini(&tc); } @@ -1953,6 +2035,109 @@ static void opt_combine_single_use_copy_and_imm(void) { tc_fini(&tc); } +static void opt_combine_retargets_single_use_producer_copy(void) { + TestCtx tc; + tc_init(&tc); + + Func* f = new_func(&tc); + f->opt_rewritten = 1; + emit_phys_binop(f, f->entry, 21, 20, 19, tc.i32, BO_IADD); + emit_phys_copy(f, f->entry, 22, 21, tc.i32); + emit_ret_val(f, f->entry, 22, tc.i32); + + opt_combine(f); + EXPECT(count_op(f, IR_BINOP) == 1 && count_op(f, IR_COPY) == 0, + "single-use binop-copy should retarget producer and remove copy"); + Inst* add = &f->blocks[f->entry].insts[0]; + EXPECT(add->opnds[0].v.reg == 22, + "retargeted binop should define the copy destination"); + + Func* lhs = new_func(&tc); + lhs->opt_rewritten = 1; + emit_phys_binop(lhs, lhs->entry, 21, 20, 19, tc.i32, BO_IADD); + emit_phys_copy(lhs, lhs->entry, 20, 21, tc.i32); + emit_ret_val(lhs, lhs->entry, 20, tc.i32); + + opt_combine(lhs); + add = &lhs->blocks[lhs->entry].insts[0]; + EXPECT(count_op(lhs, IR_COPY) == 0 && add->opnds[0].v.reg == 20 && + add->opnds[1].v.reg == 20, + "producer may retarget to its lhs operand"); + + Func* rhs = new_func(&tc); + rhs->opt_rewritten = 1; + emit_phys_binop(rhs, rhs->entry, 21, 19, 20, tc.i32, BO_IADD); + emit_phys_copy(rhs, rhs->entry, 20, 21, tc.i32); + emit_ret_val(rhs, rhs->entry, 20, tc.i32); + + opt_combine(rhs); + add = &rhs->blocks[rhs->entry].insts[0]; + EXPECT(count_op(rhs, IR_COPY) == 0 && add->opnds[0].v.reg == 20 && + add->opnds[1].v.reg == 20 && add->opnds[2].v.reg == 19, + "commutative rhs overlap should swap operands before retargeting"); + + Func* sub = new_func(&tc); + sub->opt_rewritten = 1; + emit_phys_binop(sub, sub->entry, 21, 19, 20, tc.i32, BO_ISUB); + emit_phys_copy(sub, sub->entry, 20, 21, tc.i32); + emit_ret_val(sub, sub->entry, 20, tc.i32); + + opt_combine(sub); + EXPECT(count_op(sub, IR_COPY) == 1, + "noncommutative rhs overlap should not retarget producer"); + + Func* multi = new_func(&tc); + multi->opt_rewritten = 1; + emit_phys_binop(multi, multi->entry, 21, 20, 19, tc.i32, BO_IADD); + emit_phys_copy(multi, multi->entry, 22, 21, tc.i32); + emit_phys_binop(multi, multi->entry, 23, 21, 18, tc.i32, BO_IADD); + emit_ret_val(multi, multi->entry, 23, tc.i32); + + opt_combine(multi); + EXPECT(count_op(multi, IR_COPY) == 1, + "producer with two same-block uses should not retarget"); + + Func* call_f = new_func(&tc); + call_f->opt_rewritten = 1; + emit_phys_binop(call_f, call_f->entry, 21, 20, 19, tc.i32, BO_IADD); + emit_call_void(call_f, call_f->entry); + emit_phys_copy(call_f, call_f->entry, 22, 21, tc.i32); + emit_ret_val(call_f, call_f->entry, 22, tc.i32); + + opt_combine(call_f); + EXPECT(count_op(call_f, IR_COPY) == 1, + "producer-copy retargeting should not cross a call"); + + Func* clobber = new_func(&tc); + clobber->opt_rewritten = 1; + emit_phys_binop(clobber, clobber->entry, 21, 20, 19, tc.i32, BO_IADD); + Inst* li = ir_emit(clobber, clobber->entry, IR_LOAD_IMM); + li->opnds = arena_array(clobber->arena, Operand, 1); + li->opnds[0] = op_reg_(20, tc.i32); + li->nopnds = 1; + li->extra.imm = 7; + emit_phys_copy(clobber, clobber->entry, 20, 21, tc.i32); + emit_ret_val(clobber, clobber->entry, 20, tc.i32); + + opt_combine(clobber); + EXPECT(count_op(clobber, IR_COPY) == 1, + "producer retargeting should not cross a destination clobber"); + + Func* liveout = new_func(&tc); + liveout->opt_rewritten = 1; + u32 succ = ir_block_new(liveout); + emit_phys_binop(liveout, liveout->entry, 21, 20, 19, tc.i32, BO_IADD); + emit_phys_copy(liveout, liveout->entry, 22, 21, tc.i32); + emit_br_to(liveout, liveout->entry, succ); + emit_ret_val(liveout, succ, 21, tc.i32); + + opt_combine(liveout); + EXPECT(count_op(liveout, IR_COPY) == 1, + "producer live out of the block should not retarget to copy dst"); + + tc_fini(&tc); +} + static void opt_combine_keeps_unsafe_and_multiuse_defs(void) { TestCtx tc; tc_init(&tc); @@ -2491,6 +2676,7 @@ int main(void) { opt_regalloc_spill_requires_scratch(); opt_combine_spill_peeps(); opt_combine_single_use_copy_and_imm(); + opt_combine_retargets_single_use_producer_copy(); opt_combine_keeps_unsafe_and_multiuse_defs(); opt_combine_copy_chains_and_convert_pairs(); opt_dce_physical_dead_defs();