kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 104b3914e3d72c60eef8824d69b03f6ec726aa30
parent 283f35cc35b3d8f28773a7820d1513081ea28862
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Wed, 27 May 2026 11:37:32 -0700

opt: fold constant operands as immediates at O1; drop copy/return round-trips

Keep integer constants out of registers on the optimizer path. Three related
changes turn the common "materialize a constant, then use it" shape into a
single immediate-form instruction:

- aa64 backend (native.c): add an imm_legal hook reporting which constants the
  ISA can encode (add/sub/cmp 12-bit immediates, optionally <<12; any value for
  a plain move) and wire it. aa_binop and a shared aa_emit_cmp_to_flags
  (factored out of aa_cmp / aa_cmp_branch) now emit add/sub/subs immediate forms
  when the rhs is a NATIVE_LOC_IMM.

- emit (pass_native_emit.c): operand_imm_or_reg leaves a binop/cmp/cmp_branch
  constant operand as an immediate when imm_legal accepts it, instead of always
  materializing it into a register. write_loc sends an immediate straight to its
  destination register (one load_imm) rather than through a scratch + move.
  emit_ret hands plan_ret the return value's location directly, dropping the
  per-return spill-to-temp + reload of scalar results.

- combine (pass_combine.c): on the O1 (no-coalesce) path, fold a load_imm into
  a consuming IR_COPY so "load_imm rT,k; copy rD,rT" collapses to "copy rD,#k",
  which the emit path lowers to a single load_imm rD,k. O2 is unchanged (it
  leaves copies register-to-register for its coalescer + self-copy removal).

On a representative loop this removes the per-iteration movz for +7/+1/+100,
the constant-initializer copy chains, and the return-value round trip,
shrinking the function from ~20 to ~12 instructions and halving its frame.
Full toy suite (R/L/C/W x O0/O1/O2) green.

Diffstat:
M src/arch/aa64/native.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
M src/opt/pass_combine.c | 23 +++++++++++++++--------
M src/opt/pass_native_emit.c | 64 ++++++++++++++++++++++++++++++++++++++--------------------------
3 files changed, 114 insertions(+), 44 deletions(-)

diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c @@ -460,6 +460,8 @@ static u32 aa_subs_reg(u32 sf, u32 rd, u32 rn, u32 rm) { (AA64AddSubSR){.sf = sf, .op = 1, .S = 1, .Rm = rm, .Rn = rn, .Rd = rd}); } +static void aa_emit_cmp_to_flags(NativeTarget* t, NativeLoc lhs, NativeLoc rhs); + static u32 aa_add_lsl(u32 rd, u32 rn, u32 rm, u32 shift) { return aa64_addsubsr_pack((AA64AddSubSR){.sf = 1, .op = 0, @@ -671,6 +673,32 @@ static int aa_addr_legal(NativeTarget* t, const NativeAddr* addr, return addr->log2_scale == sz; } +/* Which constant operands the backend can fold directly into an instruction + * (so the optimizer can leave them as immediates instead of materializing a + * register). Currently: add/sub/cmp 12-bit immediates (optionally <<12), and + * any value for a plain register move (movz/movk synthesizes it). */ +static int aa_imm_legal(NativeTarget* t, NativeImmUse use, u32 op, + CfreeCgTypeId type, i64 imm) { + u32 imm12, sh; + (void)t; + (void)type; + switch (use) { + case NATIVE_IMM_BINOP: + if ((BinOp)op == BO_IADD || (BinOp)op == BO_ISUB) + return aa64_addsub_imm_fits(imm < 0 ? -imm : imm, &imm12, &sh); + return 0; + case NATIVE_IMM_CMP: + /* cmp lowers to subs #imm12; cmn (negative) is not wired, so require a + * non-negative immediate. */ + return imm >= 0 && aa64_addsub_imm_fits(imm, &imm12, &sh); + case NATIVE_IMM_ADDR_OFFSET: + return aa64_addsub_imm_fits(imm < 0 ? 
-imm : imm, &imm12, &sh); + case NATIVE_IMM_MOVE: + return 1; + } + return 0; +} + static void aa_apply_index(AANativeTarget* a, u32 rd, const NativeAddr* addr) { if (addr->index_kind == NATIVE_ADDR_INDEX_NONE) return; if (addr->index_kind != NATIVE_ADDR_INDEX_REG) @@ -1145,13 +1173,7 @@ static void aa_jump(NativeTarget* t, MCLabel label) { static void aa_cmp_branch(NativeTarget* t, CmpOp op, NativeLoc lhs, NativeLoc rhs, MCLabel label) { - if (loc_is_fp(lhs)) { - aa_emit32(t->mc, aa_fcmp(type_size32(t, lhs.type) == 8u, loc_reg(lhs), - loc_reg(rhs))); - } else { - u32 sf = loc_is_64(t, lhs) ? 1u : 0u; - aa_emit32(t->mc, aa_subs_reg(sf, AA64_ZR, loc_reg(lhs), loc_reg(rhs))); - } + aa_emit_cmp_to_flags(t, lhs, rhs); aa_emit32(t->mc, aa64_brcond_pack((AA64BrCond){.cond = cmp_cond(op)})); t->mc->emit_label_ref(t->mc, label, R_AARCH64_CONDBR19, 4, 0); } @@ -1412,6 +1434,20 @@ static void aa_binop(NativeTarget* t, BinOp op, NativeLoc dst, NativeLoc lhs, aa_panic(aa_of(t), "unsupported floating binary op"); } } + if (rhs.kind == NATIVE_LOC_IMM && (op == BO_IADD || op == BO_ISUB)) { + i64 imm = rhs.v.imm; + int is_add = (op == BO_IADD); + u32 imm12, sh; + if (imm < 0) { + is_add = !is_add; + imm = -imm; + } + if (!aa64_addsub_imm_fits(imm, &imm12, &sh)) + aa_panic(aa_of(t), "binop immediate not encodable"); + aa_emit32(t->mc, is_add ? aa64_add_imm(sf, rd, rn, imm12, sh) + : aa64_sub_imm(sf, rd, rn, imm12, sh)); + return; + } switch (op) { case BO_IADD: aa_emit32(t->mc, aa64_add(sf, rd, rn, rm)); @@ -1490,15 +1526,29 @@ static void aa_unop(NativeTarget* t, UnOp op, NativeLoc dst, NativeLoc src) { } } -static void aa_cmp(NativeTarget* t, CmpOp op, NativeLoc dst, NativeLoc lhs, - NativeLoc rhs) { +static void aa_emit_cmp_to_flags(NativeTarget* t, NativeLoc lhs, + NativeLoc rhs) { if (loc_is_fp(lhs)) { aa_emit32(t->mc, aa_fcmp(type_size32(t, lhs.type) == 8u, loc_reg(lhs), loc_reg(rhs))); - } else { + return; + } + { u32 sf = loc_is_64(t, lhs) ? 
1u : 0u; + if (rhs.kind == NATIVE_LOC_IMM) { + u32 imm12 = 0, sh = 0; + if (rhs.v.imm < 0 || !aa64_addsub_imm_fits(rhs.v.imm, &imm12, &sh)) + aa_panic(aa_of(t), "cmp immediate not encodable"); + aa_emit32(t->mc, aa64_subs_imm12(sf, AA64_ZR, loc_reg(lhs), imm12, sh)); + return; + } aa_emit32(t->mc, aa_subs_reg(sf, AA64_ZR, loc_reg(lhs), loc_reg(rhs))); } +} + +static void aa_cmp(NativeTarget* t, CmpOp op, NativeLoc dst, NativeLoc lhs, + NativeLoc rhs) { + aa_emit_cmp_to_flags(t, lhs, rhs); aa_emit32(t->mc, aa_cset(loc_is_64(t, dst), loc_reg(dst), cmp_cond(op))); } @@ -2707,6 +2757,7 @@ NativeTarget* aa64_native_target_new(Compiler* c, ObjBuilder* obj, t->mc = mc; t->regs = &aa_reg_info; t->class_for_type = aa_class_for_type; + t->imm_legal = aa_imm_legal; t->addr_legal = aa_addr_legal; t->func_begin = aa_func_begin; t->func_begin_known_frame = aa_func_begin_known_frame; diff --git a/src/opt/pass_combine.c b/src/opt/pass_combine.c @@ -338,13 +338,16 @@ typedef enum SubstKind { /* Returns 1 if the given operand-index `idx` of `in` is foldable for `kind`. * SK_REG / SK_CV: register substitution slots. SK_IMM: immediate substitution * slots. */ -static int combine_subst_slot(const Inst* in, u32 idx, SubstKind kind) { +static int combine_subst_slot(const Inst* in, u32 idx, SubstKind kind, + int copy_imm_ok) { switch ((IROp)in->op) { case IR_COPY: - /* IR_COPY src is register-to-register by definition; folding an - * immediate would change its shape and defeat the self-copy detection - * that fires after coalescing assigns matching hard regs. */ - return kind != SK_IMM && idx == 1; + /* Normally IR_COPY stays register-to-register so that, after coalescing + * assigns src and dst the same hard reg, it becomes a self-copy combine + * removes. The O1 path never coalesces, so folding the immediate + * (copy_imm_ok) collapses `load_imm rT,k; copy rD,rT` into `copy rD,#k`, + * which the emit path turns into a single `load_imm rD,k`. 
*/ + return (kind != SK_IMM || copy_imm_ok) && idx == 1; case IR_UNOP: return kind != SK_IMM && idx == 1; case IR_CONVERT: @@ -687,13 +690,14 @@ static void set_indirect_field(Operand* ind, Reg old_reg, Reg new_reg) { * only valid for OPK_REG `src`. Returns the number of operands actually * rewritten. */ static int subst_consumer_operands(Inst* in, const Operand* def, - const Operand* src, SubstKind kind) { + const Operand* src, SubstKind kind, + int copy_imm_ok) { int n = 0; for (u32 oi = 0; oi < in->nopnds; ++oi) { Operand* op = &in->opnds[oi]; /* Direct OPK_REG substitution: requires the slot to be on the whitelist. */ if (op->kind == OPK_REG && same_phys_reg(op, def) && - combine_subst_slot(in, oi, kind)) { + combine_subst_slot(in, oi, kind, copy_imm_ok)) { *op = *src; ++n; continue; @@ -771,7 +775,10 @@ static int try_substitute_for_reg(CombineCtx* ctx, Inst* in, i32 i, u8 cls, ctx->bl->id, &def)) return 0; - int n = subst_consumer_operands(in, &def, &src_op, kind); + /* O1 (no coalescing) folds immediates into IR_COPY; O2 leaves the copy + * register-to-register so coalescing + self-copy removal handles it. */ + int copy_imm_ok = ctx->f && !ctx->f->opt_coalesce_parent; + int n = subst_consumer_operands(in, &def, &src_op, kind, copy_imm_ok); if (n > 0) { ctx->block_change_p = 1; return 1; diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c @@ -450,6 +450,12 @@ static void write_loc(NativeEmitCtx* e, NativeLoc dst, NativeLoc src, e->target->move(e->target, dst, src); return; } + /* An immediate goes straight into the destination register; routing it + * through a scratch and then moving would cost an extra instruction. 
*/ + if (src.kind == NATIVE_LOC_IMM) { + e->target->load_imm(e->target, dst, src.v.imm); + return; + } tmp = materialize(e, src, (NativeAllocClass)dst.cls, dst.type, dst.v.reg, REG_NONE, loc); if (tmp.v.reg != dst.v.reg || tmp.cls != dst.cls) @@ -463,6 +469,23 @@ static void write_loc(NativeEmitCtx* e, NativeLoc dst, NativeLoc src, e->target->store(e->target, addr, src, mem); } +/* For an arithmetic / compare source operand: keep it as an immediate when it + * is a constant the target can encode for `use` (so no register is wasted + * materializing it); otherwise materialize into a register. */ +static NativeLoc operand_imm_or_reg(NativeEmitCtx* e, const OptOperand* op, + NativeImmUse use, u32 sub, Reg avoid_a, + Reg avoid_b, SrcLoc loc) { + if (op->kind == OPK_IMM && e->target->imm_legal && + e->target->imm_legal(e->target, use, sub, op->type, op->v.imm)) + return loc_imm(op->type, op->v.imm); + return materialize(e, loc_from_operand(e, op, loc), class_for_type(e, op->type), + op->type, avoid_a, avoid_b, loc); +} + +static Reg loc_avoid_reg(NativeLoc l) { + return l.kind == NATIVE_LOC_REG ? l.v.reg : REG_NONE; +} + static int type_is_aggregate_or_large(NativeEmitCtx* e, CfreeCgTypeId type) { return type && (cg_type_is_aggregate(e->c, type) || type_size_or(e->c, type, 8u) > 8u); @@ -656,21 +679,13 @@ static void emit_ret(NativeEmitCtx* e, Inst* in, const CGFuncDesc* fd) { NativeCallPlanRet* rets = NULL; u32 nrets = 0; if (aux && aux->present) { - NativeLoc final = abi_storage_loc(e, &aux->val, in->loc); - CfreeCgTypeId vty = aux->val.type; - if (cg_type_is_aggregate(e->c, vty) || type_size_or(e->c, vty, 8u) > 8u) { - /* Aggregate / oversized return: hand plan_ret the value's memory - * location directly. It copies to the sret pointer (indirect) or reads - * the parts into the return registers (direct); a scalar copy through a - * temp would exceed the single-register width. 
*/ - value = final; - } else { - NativeFrameSlot slot = - temp_slot(e, vty, in->loc, NATIVE_FRAME_SLOT_SPILL); - NativeLoc frame = loc_frame(vty, class_for_type(e, vty), slot); - write_loc(e, frame, final, mem_for_type(e->c, vty), in->loc); - value = frame; - } + /* Hand plan_ret the value's location directly. For an aggregate / oversized + * result it is a memory location (plan_ret copies to the sret pointer or + * reads parts into the return registers); for a scalar it is the value's + * register or slot, which plan_ret moves into the return register. The old + * code spilled scalars to a fresh slot and reloaded them, a pure round + * trip on every return. */ + value = abi_storage_loc(e, &aux->val, in->loc); values = &value; } e->target->plan_ret(e->target, fd, values, values ? 1u : 0u, &rets, &nrets); @@ -851,13 +866,12 @@ static void emit_inst(NativeEmitCtx* e, u32 block, u32 order_index, Inst* in, a = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc), class_for_type(e, in->opnds[1].type), in->opnds[1].type, dst_reg, REG_NONE, in->loc); - b = materialize(e, loc_from_operand(e, &in->opnds[2], in->loc), - class_for_type(e, in->opnds[2].type), in->opnds[2].type, - a.v.reg, dst_reg, in->loc); + b = operand_imm_or_reg(e, &in->opnds[2], NATIVE_IMM_BINOP, + (u32)in->extra.imm, a.v.reg, dst_reg, in->loc); if (dst.kind != NATIVE_LOC_REG) dst = scratch_loc(e, in->opnds[0].type, class_for_type(e, in->opnds[0].type), a.v.reg, - b.v.reg, in->loc); + loc_avoid_reg(b), in->loc); e->target->binop(e->target, (BinOp)in->extra.imm, dst, a, b); if (in->opnds[0].kind != OPK_REG) write_loc(e, loc_from_operand(e, &in->opnds[0], in->loc), dst, @@ -884,13 +898,12 @@ static void emit_inst(NativeEmitCtx* e, u32 block, u32 order_index, Inst* in, a = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc), class_for_type(e, in->opnds[1].type), in->opnds[1].type, dst_reg, REG_NONE, in->loc); - b = materialize(e, loc_from_operand(e, &in->opnds[2], in->loc), - class_for_type(e, 
in->opnds[2].type), in->opnds[2].type, - a.v.reg, dst_reg, in->loc); + b = operand_imm_or_reg(e, &in->opnds[2], NATIVE_IMM_CMP, + (u32)in->extra.imm, a.v.reg, dst_reg, in->loc); if (dst.kind != NATIVE_LOC_REG) dst = scratch_loc(e, in->opnds[0].type, class_for_type(e, in->opnds[0].type), a.v.reg, - b.v.reg, in->loc); + loc_avoid_reg(b), in->loc); e->target->cmp(e->target, (CmpOp)in->extra.imm, dst, a, b); if (in->opnds[0].kind != OPK_REG) write_loc(e, loc_from_operand(e, &in->opnds[0], in->loc), dst, @@ -925,9 +938,8 @@ static void emit_inst(NativeEmitCtx* e, u32 block, u32 order_index, Inst* in, a = materialize(e, loc_from_operand(e, &in->opnds[0], in->loc), class_for_type(e, in->opnds[0].type), in->opnds[0].type, REG_NONE, REG_NONE, in->loc); - b = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc), - class_for_type(e, in->opnds[1].type), in->opnds[1].type, - a.v.reg, REG_NONE, in->loc); + b = operand_imm_or_reg(e, &in->opnds[1], NATIVE_IMM_CMP, + (u32)in->extra.imm, a.v.reg, REG_NONE, in->loc); e->target->cmp_branch( e->target, (CmpOp)in->extra.imm, a, b, ensure_label(e, e->f->blocks[block].succ[0], in->loc));