commit 15697ccc61d1100c553d0f8b7c631d3695377565
parent 7617fbc06d101d6345d167594ce106aca90ac979
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 15 May 2026 15:49:29 -0700
Tighten O1 post-rewrite cleanup
Diffstat:
3 files changed, 270 insertions(+), 16 deletions(-)
diff --git a/doc/OPT1.md b/doc/OPT1.md
@@ -263,6 +263,12 @@ mov w0, #0x2a
b ...
```
+- Post-rewrite DCE and copy cleanup now model hard-register call arguments,
+ ABI argument parts, return registers, and caller-saved call clobbers. The
+ local combine pass also retargets safe single-use arithmetic producers to a
+ following physical-copy destination, including commutative operand swaps for
+ x64-style two-operand overlap cases.
+
Remaining O1 shape issues visible in the current dumps:
- Parameter and entry-slot promotion is incomplete. The trivial AArch64
@@ -278,15 +284,6 @@ mov w0, w19
- O1 still saves/restores more callee-saved registers than the body appears to
need in small functions. The AArch64 while-loop probe saves `x19-x22`, and
the x64 direct-call probe saves `rbx/r12/r13/r14` in tiny functions.
-- Post-RA copy cleanup still leaves avoidable moves such as:
-
-```asm
-add w21, w20, w19
-mov w20, w21
-add w21, w19, #1
-mov w19, w21
-```
-
- Direct-call tiny functions are still heavy at O1. The x64 `callee(x) + 2`
probe emitted 167 bytes and 47 instructions across two small functions,
mostly frame setup, callee-save traffic, copies, and branch-to-epilogue
@@ -311,9 +308,3 @@ fast tier:
cleanup, and consider caller-saved registers for values that are not live
across calls. This would make small leaf functions much closer to expected
O1 output without requiring global optimization.
-
-3. Continue tightening post-rewrite DCE and copy cleanup.
- Model hard-register call arguments and call clobbers precisely enough to
- delete dead caller-saved defs before calls without removing required ABI
- traffic. Also fold single-use arithmetic temporaries into their destination
- when target constraints allow it.
diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c
@@ -1769,6 +1769,58 @@ static int identical_convert_pair(const Inst* a, const Inst* b) {
a->opnds[0].type == b->opnds[0].type;
}
+static int binop_is_commutative(BinOp op) {
+ switch (op) {
+ case BO_IADD:
+ case BO_IMUL:
+ case BO_FADD:
+ case BO_FMUL:
+ case BO_AND:
+ case BO_OR:
+ case BO_XOR:
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+static int no_intervening_phys_access(Block* bl, u32 first, u32 last,
+ const Operand* r) {
+ for (u32 i = first; i < last; ++i) {
+ Inst* in = &bl->insts[i];
+ if (inst_uses_phys_reg(in, r) || inst_defines_phys_reg(in, r)) return 0;
+ }
+ return 1;
+}
+
+static int retarget_producer_legal(Inst* producer, const Operand* copy_dst,
+ int* swap_binop) {
+ *swap_binop = 0;
+ if (!copy_dst || copy_dst->kind != OPK_REG) return 0;
+ if (producer->nopnds < 2 || producer->opnds[0].kind != OPK_REG) return 0;
+ if (producer->opnds[0].cls != copy_dst->cls) return 0;
+ if (producer->opnds[0].type != copy_dst->type) return 0;
+
+ switch ((IROp)producer->op) {
+ case IR_UNOP:
+ return 1;
+ case IR_BINOP: {
+ if (producer->nopnds < 3) return 0;
+ int dst_is_lhs = operand_uses_phys_reg(&producer->opnds[1], copy_dst);
+ int dst_is_rhs = operand_uses_phys_reg(&producer->opnds[2], copy_dst);
+ if (!dst_is_lhs && !dst_is_rhs) return 1;
+ if (dst_is_lhs) return 1;
+ if (binop_is_commutative((BinOp)producer->extra.imm)) {
+ *swap_binop = 1;
+ return 1;
+ }
+ return 0;
+ }
+ default:
+ return 0;
+ }
+}
+
static int find_single_direct_use(Func* f, Block* bl,
const HardBlockLive* hard_live, u32 def_i,
const Operand* def, const Operand* src,
@@ -1829,6 +1881,31 @@ static void opt_combine_fold_block(Func* f, Block* bl,
Inst* in = &bl->insts[i];
u32 use_i = 0;
u32 op_i = 0;
+
+ if (f->opt_rewritten &&
+ ((IROp)in->op == IR_BINOP || (IROp)in->op == IR_UNOP) &&
+ in->nopnds >= 1 && in->opnds[0].kind == OPK_REG &&
+ find_single_direct_use(f, bl, hard_live, i, &in->opnds[0], NULL, 0, 0,
+ 0, &use_i, &op_i)) {
+ Inst* copy = &bl->insts[use_i];
+ int swap_binop = 0;
+ if ((IROp)copy->op == IR_COPY && op_i == 1 && copy->nopnds >= 2 &&
+ copy->opnds[0].kind == OPK_REG &&
+ same_phys_reg(©->opnds[1], &in->opnds[0]) &&
+ !same_phys_reg(©->opnds[0], &in->opnds[0]) &&
+ no_intervening_phys_access(bl, i + 1u, use_i, ©->opnds[0]) &&
+ retarget_producer_legal(in, ©->opnds[0], &swap_binop)) {
+ if (swap_binop) {
+ Operand tmp = in->opnds[1];
+ in->opnds[1] = in->opnds[2];
+ in->opnds[2] = tmp;
+ }
+ in->opnds[0] = copy->opnds[0];
+ copy->opnds[1] = copy->opnds[0];
+ continue;
+ }
+ }
+
if ((IROp)in->op == IR_COPY && in->nopnds >= 2 &&
in->opnds[0].kind == OPK_REG && in->opnds[1].kind == OPK_REG &&
!same_phys_reg(&in->opnds[0], &in->opnds[1]) &&
diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c
@@ -242,6 +242,28 @@ static Inst* emit_binop(Func* f, u32 b, Val dst, Val a, Val c,
return in;
}
+static Inst* emit_phys_copy(Func* f, u32 b, Reg dst, Reg src,
+ CfreeCgTypeId ty) {
+ Inst* in = ir_emit(f, b, IR_COPY);
+ in->opnds = arena_array(f->arena, Operand, 2);
+ in->opnds[0] = op_reg_(dst, ty);
+ in->opnds[1] = op_reg_(src, ty);
+ in->nopnds = 2;
+ return in;
+}
+
+static Inst* emit_phys_binop(Func* f, u32 b, Reg dst, Reg a, Reg c,
+ CfreeCgTypeId ty, BinOp op) {
+ Inst* in = ir_emit(f, b, IR_BINOP);
+ in->opnds = arena_array(f->arena, Operand, 3);
+ in->opnds[0] = op_reg_(dst, ty);
+ in->opnds[1] = op_reg_(a, ty);
+ in->opnds[2] = op_reg_(c, ty);
+ in->nopnds = 3;
+ in->extra.imm = op;
+ return in;
+}
+
static Inst* emit_convert(Func* f, u32 b, Val dst, Val src, CfreeCgTypeId ty,
ConvKind k) {
Inst* in = ir_emit(f, b, IR_CONVERT);
@@ -1771,7 +1793,67 @@ static void opt_dce_call_clobbers_hard_regs(void) {
opt_dce(g);
EXPECT(count_op(g, IR_LOAD_IMM) == 1,
- "hard-reg call argument should keep its reaching def");
+ "hard-reg direct call argument should keep its reaching def");
+
+ Func* h = new_func(&tc);
+ opt_machinize(h, &mock.base);
+ h->opt_rewritten = 1;
+ li = ir_emit(h, h->entry, IR_LOAD_IMM);
+ li->opnds = arena_array(h->arena, Operand, 1);
+ li->opnds[0] = op_reg_(10, tc.i32);
+ li->nopnds = 1;
+ li->extra.imm = 11;
+ call = emit_call_void(h, h->entry);
+ caux = (IRCallAux*)call->extra.aux;
+ args = arena_zarray(h->arena, CGABIValue, 1);
+ CGABIPart* parts = arena_zarray(h->arena, CGABIPart, 1);
+ parts[0].op = op_reg_(10, tc.i32);
+ args[0].nparts = 1;
+ args[0].parts = parts;
+ caux->desc.nargs = 1;
+ caux->desc.args = args;
+ ir_emit(h, h->entry, IR_RET);
+
+ opt_dce(h);
+ EXPECT(count_op(h, IR_LOAD_IMM) == 1,
+ "hard-reg ABI argument part should keep its reaching def");
+
+ Func* callee_f = new_func(&tc);
+ opt_machinize(callee_f, &mock.base);
+ callee_f->opt_rewritten = 1;
+ li = ir_emit(callee_f, callee_f->entry, IR_LOAD_IMM);
+ li->opnds = arena_array(callee_f->arena, Operand, 1);
+ li->opnds[0] = op_reg_(10, tc.i32);
+ li->nopnds = 1;
+ li->extra.imm = 13;
+ call = emit_call_void(callee_f, callee_f->entry);
+ caux = (IRCallAux*)call->extra.aux;
+ caux->desc.callee = op_indirect_(10, tc.i32);
+ ir_emit(callee_f, callee_f->entry, IR_RET);
+
+ opt_dce(callee_f);
+ EXPECT(count_op(callee_f, IR_LOAD_IMM) == 1,
+ "indirect call callee base should keep its reaching def");
+
+ MockCGTarget ret_mock;
+ mock_init(&ret_mock, tc.c);
+ mock_set_pool(&ret_mock, RC_INT, pool, 1, scratch, 1, 0);
+ Func* ret_f = new_func(&tc);
+ opt_machinize(ret_f, &ret_mock.base);
+ ret_f->opt_rewritten = 1;
+ li = ir_emit(ret_f, ret_f->entry, IR_LOAD_IMM);
+ li->opnds = arena_array(ret_f->arena, Operand, 1);
+ li->opnds[0] = op_reg_(10, tc.i32);
+ li->nopnds = 1;
+ li->extra.imm = 15;
+ call = emit_call_void(ret_f, ret_f->entry);
+ caux = (IRCallAux*)call->extra.aux;
+ caux->desc.ret.storage = op_reg_(10, tc.i32);
+ emit_ret_val(ret_f, ret_f->entry, 10, tc.i32);
+
+ opt_dce(ret_f);
+ EXPECT(count_op(ret_f, IR_LOAD_IMM) == 0 && count_op(ret_f, IR_CALL) == 1,
+ "call return hard-reg def should feed ret and kill pre-call def");
tc_fini(&tc);
}
@@ -1953,6 +2035,109 @@ static void opt_combine_single_use_copy_and_imm(void) {
tc_fini(&tc);
}
+static void opt_combine_retargets_single_use_producer_copy(void) {
+ TestCtx tc;
+ tc_init(&tc);
+
+ Func* f = new_func(&tc);
+ f->opt_rewritten = 1;
+ emit_phys_binop(f, f->entry, 21, 20, 19, tc.i32, BO_IADD);
+ emit_phys_copy(f, f->entry, 22, 21, tc.i32);
+ emit_ret_val(f, f->entry, 22, tc.i32);
+
+ opt_combine(f);
+ EXPECT(count_op(f, IR_BINOP) == 1 && count_op(f, IR_COPY) == 0,
+ "single-use binop-copy should retarget producer and remove copy");
+ Inst* add = &f->blocks[f->entry].insts[0];
+ EXPECT(add->opnds[0].v.reg == 22,
+ "retargeted binop should define the copy destination");
+
+ Func* lhs = new_func(&tc);
+ lhs->opt_rewritten = 1;
+ emit_phys_binop(lhs, lhs->entry, 21, 20, 19, tc.i32, BO_IADD);
+ emit_phys_copy(lhs, lhs->entry, 20, 21, tc.i32);
+ emit_ret_val(lhs, lhs->entry, 20, tc.i32);
+
+ opt_combine(lhs);
+ add = &lhs->blocks[lhs->entry].insts[0];
+ EXPECT(count_op(lhs, IR_COPY) == 0 && add->opnds[0].v.reg == 20 &&
+ add->opnds[1].v.reg == 20,
+ "producer may retarget to its lhs operand");
+
+ Func* rhs = new_func(&tc);
+ rhs->opt_rewritten = 1;
+ emit_phys_binop(rhs, rhs->entry, 21, 19, 20, tc.i32, BO_IADD);
+ emit_phys_copy(rhs, rhs->entry, 20, 21, tc.i32);
+ emit_ret_val(rhs, rhs->entry, 20, tc.i32);
+
+ opt_combine(rhs);
+ add = &rhs->blocks[rhs->entry].insts[0];
+ EXPECT(count_op(rhs, IR_COPY) == 0 && add->opnds[0].v.reg == 20 &&
+ add->opnds[1].v.reg == 20 && add->opnds[2].v.reg == 19,
+ "commutative rhs overlap should swap operands before retargeting");
+
+ Func* sub = new_func(&tc);
+ sub->opt_rewritten = 1;
+ emit_phys_binop(sub, sub->entry, 21, 19, 20, tc.i32, BO_ISUB);
+ emit_phys_copy(sub, sub->entry, 20, 21, tc.i32);
+ emit_ret_val(sub, sub->entry, 20, tc.i32);
+
+ opt_combine(sub);
+ EXPECT(count_op(sub, IR_COPY) == 1,
+ "noncommutative rhs overlap should not retarget producer");
+
+ Func* multi = new_func(&tc);
+ multi->opt_rewritten = 1;
+ emit_phys_binop(multi, multi->entry, 21, 20, 19, tc.i32, BO_IADD);
+ emit_phys_copy(multi, multi->entry, 22, 21, tc.i32);
+ emit_phys_binop(multi, multi->entry, 23, 21, 18, tc.i32, BO_IADD);
+ emit_ret_val(multi, multi->entry, 23, tc.i32);
+
+ opt_combine(multi);
+ EXPECT(count_op(multi, IR_COPY) == 1,
+ "producer with two same-block uses should not retarget");
+
+ Func* call_f = new_func(&tc);
+ call_f->opt_rewritten = 1;
+ emit_phys_binop(call_f, call_f->entry, 21, 20, 19, tc.i32, BO_IADD);
+ emit_call_void(call_f, call_f->entry);
+ emit_phys_copy(call_f, call_f->entry, 22, 21, tc.i32);
+ emit_ret_val(call_f, call_f->entry, 22, tc.i32);
+
+ opt_combine(call_f);
+ EXPECT(count_op(call_f, IR_COPY) == 1,
+ "producer-copy retargeting should not cross a call");
+
+ Func* clobber = new_func(&tc);
+ clobber->opt_rewritten = 1;
+ emit_phys_binop(clobber, clobber->entry, 21, 20, 19, tc.i32, BO_IADD);
+ Inst* li = ir_emit(clobber, clobber->entry, IR_LOAD_IMM);
+ li->opnds = arena_array(clobber->arena, Operand, 1);
+ li->opnds[0] = op_reg_(20, tc.i32);
+ li->nopnds = 1;
+ li->extra.imm = 7;
+ emit_phys_copy(clobber, clobber->entry, 20, 21, tc.i32);
+ emit_ret_val(clobber, clobber->entry, 20, tc.i32);
+
+ opt_combine(clobber);
+ EXPECT(count_op(clobber, IR_COPY) == 1,
+ "producer retargeting should not cross a destination clobber");
+
+ Func* liveout = new_func(&tc);
+ liveout->opt_rewritten = 1;
+ u32 succ = ir_block_new(liveout);
+ emit_phys_binop(liveout, liveout->entry, 21, 20, 19, tc.i32, BO_IADD);
+ emit_phys_copy(liveout, liveout->entry, 22, 21, tc.i32);
+ emit_br_to(liveout, liveout->entry, succ);
+ emit_ret_val(liveout, succ, 21, tc.i32);
+
+ opt_combine(liveout);
+ EXPECT(count_op(liveout, IR_COPY) == 1,
+ "producer live out of the block should not retarget to copy dst");
+
+ tc_fini(&tc);
+}
+
static void opt_combine_keeps_unsafe_and_multiuse_defs(void) {
TestCtx tc;
tc_init(&tc);
@@ -2491,6 +2676,7 @@ int main(void) {
opt_regalloc_spill_requires_scratch();
opt_combine_spill_peeps();
opt_combine_single_use_copy_and_imm();
+ opt_combine_retargets_single_use_producer_copy();
opt_combine_keeps_unsafe_and_multiuse_defs();
opt_combine_copy_chains_and_convert_pairs();
opt_dce_physical_dead_defs();