commit 104b3914e3d72c60eef8824d69b03f6ec726aa30
parent 283f35cc35b3d8f28773a7820d1513081ea28862
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Wed, 27 May 2026 11:37:32 -0700
opt: fold constant operands as immediates at O1; drop copy/return round-trips
Keep integer constants out of registers on the optimizer path. Three related
changes turn the common "materialize a constant, then use it" shape into a
single immediate-form instruction:
- aa64 backend (native.c): add an imm_legal hook reporting which constants the
ISA can encode (add/sub/cmp 12-bit immediates, optionally <<12; any value for
a plain move) and wire it. aa_binop and a shared aa_emit_cmp_to_flags
(factored out of aa_cmp / aa_cmp_branch) now emit add/sub/subs immediate forms
when the rhs is a NATIVE_LOC_IMM.
- emit (pass_native_emit.c): operand_imm_or_reg leaves a binop/cmp/cmp_branch
constant operand as an immediate when imm_legal accepts it, instead of always
materializing it into a register. write_loc sends an immediate straight to its
destination register (one load_imm) rather than through a scratch + move.
emit_ret hands plan_ret the return value's location directly, dropping the
per-return spill-to-temp + reload of scalar results.
- combine (pass_combine.c): on the O1 (no-coalesce) path, fold a load_imm into
a consuming IR_COPY so "load_imm rT,k; copy rD,rT" collapses to "copy rD,#k",
which the emit path lowers to a single load_imm rD,k. O2 is unchanged (it
leaves copies register-to-register for its coalescer + self-copy removal).
On a representative loop this removes the per-iteration movz for +7/+1/+100,
the constant-initializer copy chains, and the return-value round trip,
shrinking the function from ~20 to ~12 instructions and halving its frame.
Full toy suite (R/L/C/W x O0/O1/O2) green.
Diffstat:
3 files changed, 114 insertions(+), 44 deletions(-)
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -460,6 +460,8 @@ static u32 aa_subs_reg(u32 sf, u32 rd, u32 rn, u32 rm) {
(AA64AddSubSR){.sf = sf, .op = 1, .S = 1, .Rm = rm, .Rn = rn, .Rd = rd});
}
+static void aa_emit_cmp_to_flags(NativeTarget* t, NativeLoc lhs, NativeLoc rhs);
+
static u32 aa_add_lsl(u32 rd, u32 rn, u32 rm, u32 shift) {
return aa64_addsubsr_pack((AA64AddSubSR){.sf = 1,
.op = 0,
@@ -671,6 +673,32 @@ static int aa_addr_legal(NativeTarget* t, const NativeAddr* addr,
return addr->log2_scale == sz;
}
+/* Reports whether the constant `imm` can be folded directly into an
+ * instruction for the given `use`, so the optimizer can leave it as an
+ * immediate instead of materializing a register.
+ *
+ *   NATIVE_IMM_BINOP        integer add/sub only (`op` is a BinOp): legal when
+ *                           |imm| passes aa64_addsub_imm_fits (12-bit
+ *                           immediate, optionally <<12); aa_binop flips
+ *                           add<->sub for a negative constant.
+ *   NATIVE_IMM_CMP          legal for a non-negative imm that passes
+ *                           aa64_addsub_imm_fits (cmp lowers to subs #imm12;
+ *                           cmn for negatives is not wired).
+ *   NATIVE_IMM_ADDR_OFFSET  legal when |imm| passes aa64_addsub_imm_fits.
+ *   NATIVE_IMM_MOVE         always legal (movz/movk synthesizes any value).
+ *
+ * `t` and `type` are unused. Returns nonzero when legal, 0 otherwise. */
+static int aa_imm_legal(NativeTarget* t, NativeImmUse use, u32 op,
+                        CfreeCgTypeId type, i64 imm) {
+  u32 imm12, sh;
+  (void)t;
+  (void)type;
+  /* The most negative i64 has no positive counterpart: negating it below (and
+   * again in aa_binop) would be signed overflow, which is undefined. It is far
+   * outside any add/sub immediate anyway, so only a plain move accepts it. */
+  if (imm < -0x7fffffffffffffffLL) return use == NATIVE_IMM_MOVE;
+  switch (use) {
+    case NATIVE_IMM_BINOP:
+      if ((BinOp)op == BO_IADD || (BinOp)op == BO_ISUB)
+        return aa64_addsub_imm_fits(imm < 0 ? -imm : imm, &imm12, &sh);
+      return 0;
+    case NATIVE_IMM_CMP:
+      /* cmp lowers to subs #imm12; cmn (negative) is not wired, so require a
+       * non-negative immediate. */
+      return imm >= 0 && aa64_addsub_imm_fits(imm, &imm12, &sh);
+    case NATIVE_IMM_ADDR_OFFSET:
+      return aa64_addsub_imm_fits(imm < 0 ? -imm : imm, &imm12, &sh);
+    case NATIVE_IMM_MOVE:
+      return 1;
+  }
+  return 0;
+}
+
static void aa_apply_index(AANativeTarget* a, u32 rd, const NativeAddr* addr) {
if (addr->index_kind == NATIVE_ADDR_INDEX_NONE) return;
if (addr->index_kind != NATIVE_ADDR_INDEX_REG)
@@ -1145,13 +1173,7 @@ static void aa_jump(NativeTarget* t, MCLabel label) {
static void aa_cmp_branch(NativeTarget* t, CmpOp op, NativeLoc lhs,
NativeLoc rhs, MCLabel label) {
- if (loc_is_fp(lhs)) {
- aa_emit32(t->mc, aa_fcmp(type_size32(t, lhs.type) == 8u, loc_reg(lhs),
- loc_reg(rhs)));
- } else {
- u32 sf = loc_is_64(t, lhs) ? 1u : 0u;
- aa_emit32(t->mc, aa_subs_reg(sf, AA64_ZR, loc_reg(lhs), loc_reg(rhs)));
- }
+ aa_emit_cmp_to_flags(t, lhs, rhs);
aa_emit32(t->mc, aa64_brcond_pack((AA64BrCond){.cond = cmp_cond(op)}));
t->mc->emit_label_ref(t->mc, label, R_AARCH64_CONDBR19, 4, 0);
}
@@ -1412,6 +1434,20 @@ static void aa_binop(NativeTarget* t, BinOp op, NativeLoc dst, NativeLoc lhs,
aa_panic(aa_of(t), "unsupported floating binary op");
}
}
+ if (rhs.kind == NATIVE_LOC_IMM && (op == BO_IADD || op == BO_ISUB)) {
+ i64 imm = rhs.v.imm;
+ int is_add = (op == BO_IADD);
+ u32 imm12, sh;
+ if (imm < 0) {
+ is_add = !is_add;
+ imm = -imm;
+ }
+ if (!aa64_addsub_imm_fits(imm, &imm12, &sh))
+ aa_panic(aa_of(t), "binop immediate not encodable");
+ aa_emit32(t->mc, is_add ? aa64_add_imm(sf, rd, rn, imm12, sh)
+ : aa64_sub_imm(sf, rd, rn, imm12, sh));
+ return;
+ }
switch (op) {
case BO_IADD:
aa_emit32(t->mc, aa64_add(sf, rd, rn, rm));
@@ -1490,15 +1526,29 @@ static void aa_unop(NativeTarget* t, UnOp op, NativeLoc dst, NativeLoc src) {
}
}
-static void aa_cmp(NativeTarget* t, CmpOp op, NativeLoc dst, NativeLoc lhs,
- NativeLoc rhs) {
+static void aa_emit_cmp_to_flags(NativeTarget* t, NativeLoc lhs,
+ NativeLoc rhs) {
if (loc_is_fp(lhs)) {
aa_emit32(t->mc, aa_fcmp(type_size32(t, lhs.type) == 8u, loc_reg(lhs),
loc_reg(rhs)));
- } else {
+ return;
+ }
+ {
u32 sf = loc_is_64(t, lhs) ? 1u : 0u;
+ if (rhs.kind == NATIVE_LOC_IMM) {
+ u32 imm12 = 0, sh = 0;
+ if (rhs.v.imm < 0 || !aa64_addsub_imm_fits(rhs.v.imm, &imm12, &sh))
+ aa_panic(aa_of(t), "cmp immediate not encodable");
+ aa_emit32(t->mc, aa64_subs_imm12(sf, AA64_ZR, loc_reg(lhs), imm12, sh));
+ return;
+ }
aa_emit32(t->mc, aa_subs_reg(sf, AA64_ZR, loc_reg(lhs), loc_reg(rhs)));
}
+}
+
+static void aa_cmp(NativeTarget* t, CmpOp op, NativeLoc dst, NativeLoc lhs,
+ NativeLoc rhs) {
+ aa_emit_cmp_to_flags(t, lhs, rhs);
aa_emit32(t->mc, aa_cset(loc_is_64(t, dst), loc_reg(dst), cmp_cond(op)));
}
@@ -2707,6 +2757,7 @@ NativeTarget* aa64_native_target_new(Compiler* c, ObjBuilder* obj,
t->mc = mc;
t->regs = &aa_reg_info;
t->class_for_type = aa_class_for_type;
+ t->imm_legal = aa_imm_legal;
t->addr_legal = aa_addr_legal;
t->func_begin = aa_func_begin;
t->func_begin_known_frame = aa_func_begin_known_frame;
diff --git a/src/opt/pass_combine.c b/src/opt/pass_combine.c
@@ -338,13 +338,16 @@ typedef enum SubstKind {
/* Returns 1 if the given operand-index `idx` of `in` is foldable for `kind`.
* SK_REG / SK_CV: register substitution slots. SK_IMM: immediate substitution
* slots. */
-static int combine_subst_slot(const Inst* in, u32 idx, SubstKind kind) {
+static int combine_subst_slot(const Inst* in, u32 idx, SubstKind kind,
+ int copy_imm_ok) {
switch ((IROp)in->op) {
case IR_COPY:
- /* IR_COPY src is register-to-register by definition; folding an
- * immediate would change its shape and defeat the self-copy detection
- * that fires after coalescing assigns matching hard regs. */
- return kind != SK_IMM && idx == 1;
+ /* Normally IR_COPY stays register-to-register so that, after coalescing
+ * assigns src and dst the same hard reg, it becomes a self-copy combine
+ * removes. The O1 path never coalesces, so folding the immediate
+ * (copy_imm_ok) collapses `load_imm rT,k; copy rD,rT` into `copy rD,#k`,
+ * which the emit path turns into a single `load_imm rD,k`. */
+ return (kind != SK_IMM || copy_imm_ok) && idx == 1;
case IR_UNOP:
return kind != SK_IMM && idx == 1;
case IR_CONVERT:
@@ -687,13 +690,14 @@ static void set_indirect_field(Operand* ind, Reg old_reg, Reg new_reg) {
* only valid for OPK_REG `src`. Returns the number of operands actually
* rewritten. */
static int subst_consumer_operands(Inst* in, const Operand* def,
- const Operand* src, SubstKind kind) {
+ const Operand* src, SubstKind kind,
+ int copy_imm_ok) {
int n = 0;
for (u32 oi = 0; oi < in->nopnds; ++oi) {
Operand* op = &in->opnds[oi];
/* Direct OPK_REG substitution: requires the slot to be on the whitelist. */
if (op->kind == OPK_REG && same_phys_reg(op, def) &&
- combine_subst_slot(in, oi, kind)) {
+ combine_subst_slot(in, oi, kind, copy_imm_ok)) {
*op = *src;
++n;
continue;
@@ -771,7 +775,10 @@ static int try_substitute_for_reg(CombineCtx* ctx, Inst* in, i32 i, u8 cls,
ctx->bl->id, &def))
return 0;
- int n = subst_consumer_operands(in, &def, &src_op, kind);
+ /* O1 (no coalescing) folds immediates into IR_COPY; O2 leaves the copy
+ * register-to-register so coalescing + self-copy removal handles it. */
+ int copy_imm_ok = ctx->f && !ctx->f->opt_coalesce_parent;
+ int n = subst_consumer_operands(in, &def, &src_op, kind, copy_imm_ok);
if (n > 0) {
ctx->block_change_p = 1;
return 1;
diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c
@@ -450,6 +450,12 @@ static void write_loc(NativeEmitCtx* e, NativeLoc dst, NativeLoc src,
e->target->move(e->target, dst, src);
return;
}
+ /* An immediate goes straight into the destination register; routing it
+ * through a scratch and then moving would cost an extra instruction. */
+ if (src.kind == NATIVE_LOC_IMM) {
+ e->target->load_imm(e->target, dst, src.v.imm);
+ return;
+ }
tmp = materialize(e, src, (NativeAllocClass)dst.cls, dst.type, dst.v.reg,
REG_NONE, loc);
if (tmp.v.reg != dst.v.reg || tmp.cls != dst.cls)
@@ -463,6 +469,23 @@ static void write_loc(NativeEmitCtx* e, NativeLoc dst, NativeLoc src,
e->target->store(e->target, addr, src, mem);
}
+/* Decides how an arithmetic / compare source operand is handed to the target.
+ * A constant operand that the target's imm_legal hook accepts for `use` / `sub`
+ * is returned as an immediate location, so no register is spent on it. Every
+ * other case (non-constant operand, no hook installed, constant rejected) is
+ * materialized into a register, with `avoid_a` / `avoid_b` passed through to
+ * materialize. */
+static NativeLoc operand_imm_or_reg(NativeEmitCtx* e, const OptOperand* op,
+                                    NativeImmUse use, u32 sub, Reg avoid_a,
+                                    Reg avoid_b, SrcLoc loc) {
+  int foldable = 0;
+  if (op->kind == OPK_IMM && e->target->imm_legal)
+    foldable = e->target->imm_legal(e->target, use, sub, op->type, op->v.imm);
+  if (foldable) return loc_imm(op->type, op->v.imm);
+  return materialize(e, loc_from_operand(e, op, loc),
+                     class_for_type(e, op->type), op->type, avoid_a, avoid_b,
+                     loc);
+}
+
+/* Register to steer clear of when picking a scratch alongside `l`: the
+ * register `l` occupies, or REG_NONE when `l` is not a register location
+ * (for instance an immediate, which presumably has no valid v.reg). */
+static Reg loc_avoid_reg(NativeLoc l) {
+  if (l.kind != NATIVE_LOC_REG) return REG_NONE;
+  return l.v.reg;
+}
+
static int type_is_aggregate_or_large(NativeEmitCtx* e, CfreeCgTypeId type) {
return type &&
(cg_type_is_aggregate(e->c, type) || type_size_or(e->c, type, 8u) > 8u);
@@ -656,21 +679,13 @@ static void emit_ret(NativeEmitCtx* e, Inst* in, const CGFuncDesc* fd) {
NativeCallPlanRet* rets = NULL;
u32 nrets = 0;
if (aux && aux->present) {
- NativeLoc final = abi_storage_loc(e, &aux->val, in->loc);
- CfreeCgTypeId vty = aux->val.type;
- if (cg_type_is_aggregate(e->c, vty) || type_size_or(e->c, vty, 8u) > 8u) {
- /* Aggregate / oversized return: hand plan_ret the value's memory
- * location directly. It copies to the sret pointer (indirect) or reads
- * the parts into the return registers (direct); a scalar copy through a
- * temp would exceed the single-register width. */
- value = final;
- } else {
- NativeFrameSlot slot =
- temp_slot(e, vty, in->loc, NATIVE_FRAME_SLOT_SPILL);
- NativeLoc frame = loc_frame(vty, class_for_type(e, vty), slot);
- write_loc(e, frame, final, mem_for_type(e->c, vty), in->loc);
- value = frame;
- }
+ /* Hand plan_ret the value's location directly. For an aggregate / oversized
+ * result it is a memory location (plan_ret copies to the sret pointer or
+ * reads parts into the return registers); for a scalar it is the value's
+ * register or slot, which plan_ret moves into the return register. The old
+ * code spilled scalars to a fresh slot and reloaded them, a pure round
+ * trip on every return. */
+ value = abi_storage_loc(e, &aux->val, in->loc);
values = &value;
}
e->target->plan_ret(e->target, fd, values, values ? 1u : 0u, &rets, &nrets);
@@ -851,13 +866,12 @@ static void emit_inst(NativeEmitCtx* e, u32 block, u32 order_index, Inst* in,
a = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc),
class_for_type(e, in->opnds[1].type), in->opnds[1].type,
dst_reg, REG_NONE, in->loc);
- b = materialize(e, loc_from_operand(e, &in->opnds[2], in->loc),
- class_for_type(e, in->opnds[2].type), in->opnds[2].type,
- a.v.reg, dst_reg, in->loc);
+ b = operand_imm_or_reg(e, &in->opnds[2], NATIVE_IMM_BINOP,
+ (u32)in->extra.imm, a.v.reg, dst_reg, in->loc);
if (dst.kind != NATIVE_LOC_REG)
dst = scratch_loc(e, in->opnds[0].type,
class_for_type(e, in->opnds[0].type), a.v.reg,
- b.v.reg, in->loc);
+ loc_avoid_reg(b), in->loc);
e->target->binop(e->target, (BinOp)in->extra.imm, dst, a, b);
if (in->opnds[0].kind != OPK_REG)
write_loc(e, loc_from_operand(e, &in->opnds[0], in->loc), dst,
@@ -884,13 +898,12 @@ static void emit_inst(NativeEmitCtx* e, u32 block, u32 order_index, Inst* in,
a = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc),
class_for_type(e, in->opnds[1].type), in->opnds[1].type,
dst_reg, REG_NONE, in->loc);
- b = materialize(e, loc_from_operand(e, &in->opnds[2], in->loc),
- class_for_type(e, in->opnds[2].type), in->opnds[2].type,
- a.v.reg, dst_reg, in->loc);
+ b = operand_imm_or_reg(e, &in->opnds[2], NATIVE_IMM_CMP,
+ (u32)in->extra.imm, a.v.reg, dst_reg, in->loc);
if (dst.kind != NATIVE_LOC_REG)
dst = scratch_loc(e, in->opnds[0].type,
class_for_type(e, in->opnds[0].type), a.v.reg,
- b.v.reg, in->loc);
+ loc_avoid_reg(b), in->loc);
e->target->cmp(e->target, (CmpOp)in->extra.imm, dst, a, b);
if (in->opnds[0].kind != OPK_REG)
write_loc(e, loc_from_operand(e, &in->opnds[0], in->loc), dst,
@@ -925,9 +938,8 @@ static void emit_inst(NativeEmitCtx* e, u32 block, u32 order_index, Inst* in,
a = materialize(e, loc_from_operand(e, &in->opnds[0], in->loc),
class_for_type(e, in->opnds[0].type), in->opnds[0].type,
REG_NONE, REG_NONE, in->loc);
- b = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc),
- class_for_type(e, in->opnds[1].type), in->opnds[1].type,
- a.v.reg, REG_NONE, in->loc);
+ b = operand_imm_or_reg(e, &in->opnds[1], NATIVE_IMM_CMP,
+ (u32)in->extra.imm, a.v.reg, REG_NONE, in->loc);
e->target->cmp_branch(
e->target, (CmpOp)in->extra.imm, a, b,
ensure_label(e, e->f->blocks[block].succ[0], in->loc));