opt: fold constant operands as immediates at O1; drop copy/return round-trips - kit

commit 104b3914e3d72c60eef8824d69b03f6ec726aa30
parent 283f35cc35b3d8f28773a7820d1513081ea28862
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Wed, 27 May 2026 11:37:32 -0700

opt: fold constant operands as immediates at O1; drop copy/return round-trips

Keep integer constants out of registers on the optimizer path. Three related
changes turn the common "materialize a constant, then use it" shape into a
single immediate-form instruction:

- aa64 backend (native.c): add an imm_legal hook reporting which constants the
  ISA can encode (12-bit immediates, optionally <<12, for add/sub and address
  offsets; the same for cmp but non-negative only; any value for a plain move)
  and install it on the target in aa64_native_target_new. aa_binop and a shared
  aa_emit_cmp_to_flags (factored out of aa_cmp / aa_cmp_branch) now emit
  add/sub/subs immediate forms when the rhs is a NATIVE_LOC_IMM.

- emit (pass_native_emit.c): operand_imm_or_reg leaves a binop/cmp/cmp_branch
  constant operand as an immediate when imm_legal accepts it, instead of always
  materializing it into a register. write_loc sends an immediate straight to its
  destination register (one load_imm) rather than through a scratch + move.
  emit_ret hands plan_ret the return value's location directly, dropping the
  per-return spill-to-temp + reload of scalar results.

- combine (pass_combine.c): on the O1 (no-coalesce) path, fold a load_imm into
  a consuming IR_COPY so "load_imm rT,k; copy rD,rT" collapses to "copy rD,#k",
  which the emit path lowers to a single load_imm rD,k. O2 is unchanged (it
  leaves copies register-to-register for its coalescer + self-copy removal).

On a representative loop this removes the per-iteration movz for +7/+1/+100,
the constant-initializer copy chains, and the return-value round trip,
shrinking the function from ~20 to ~12 instructions and halving its frame.
Full toy suite (R/L/C/W x O0/O1/O2) green.

Diffstat:
M src/arch/aa64/native.c  | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
M src/opt/pass_combine.c  | 23 +++++++++++++++--------
M src/opt/pass_native_emit.c  | 64 ++++++++++++++++++++++++++++++++++++++--------------------------

3 files changed, 114 insertions(+), 44 deletions(-)
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -460,6 +460,8 @@ static u32 aa_subs_reg(u32 sf, u32 rd, u32 rn, u32 rm) {
       (AA64AddSubSR){.sf = sf, .op = 1, .S = 1, .Rm = rm, .Rn = rn, .Rd = rd});
 }
 
+static void aa_emit_cmp_to_flags(NativeTarget* t, NativeLoc lhs, NativeLoc rhs);
+
 static u32 aa_add_lsl(u32 rd, u32 rn, u32 rm, u32 shift) {
   return aa64_addsubsr_pack((AA64AddSubSR){.sf = 1,
                                            .op = 0,
@@ -671,6 +673,32 @@ static int aa_addr_legal(NativeTarget* t, const NativeAddr* addr,
   return addr->log2_scale == sz;
 }
 
+/* Returns 1 if `imm` can be folded directly into the instruction selected by
+ * `use`/`op` (so no register need be materialized), else 0. Add/sub and
+ * address offsets take an imm12 (optionally <<12) of either sign, cmp only a
+ * non-negative one, and a plain move any value. `t` and `type` are unused. */
+static int aa_imm_legal(NativeTarget* t, NativeImmUse use, u32 op,
+                        CfreeCgTypeId type, i64 imm) {
+  u32 imm12, sh;
+  (void)t;
+  (void)type;
+  if (use == NATIVE_IMM_MOVE) return 1;
+  /* Nothing above 0xFFF << 12 fits; this also keeps `-imm` from overflowing. */
+  if (imm < -(i64)0xFFF000 || imm > (i64)0xFFF000) return 0;
+  switch (use) {
+    case NATIVE_IMM_BINOP:
+      if ((BinOp)op != BO_IADD && (BinOp)op != BO_ISUB) return 0;
+      return aa64_addsub_imm_fits(imm < 0 ? -imm : imm, &imm12, &sh);
+    case NATIVE_IMM_ADDR_OFFSET:
+      return aa64_addsub_imm_fits(imm < 0 ? -imm : imm, &imm12, &sh);
+    case NATIVE_IMM_CMP:
+      /* cmp lowers to subs #imm12; cmn is not wired, so negatives fail. */
+      return imm >= 0 && aa64_addsub_imm_fits(imm, &imm12, &sh);
+    default:
+      return 0;
+  }
+}
+
 static void aa_apply_index(AANativeTarget* a, u32 rd, const NativeAddr* addr) {
   if (addr->index_kind == NATIVE_ADDR_INDEX_NONE) return;
   if (addr->index_kind != NATIVE_ADDR_INDEX_REG)
@@ -1145,13 +1173,7 @@ static void aa_jump(NativeTarget* t, MCLabel label) {
 
 static void aa_cmp_branch(NativeTarget* t, CmpOp op, NativeLoc lhs,
                           NativeLoc rhs, MCLabel label) {
-  if (loc_is_fp(lhs)) {
-    aa_emit32(t->mc, aa_fcmp(type_size32(t, lhs.type) == 8u, loc_reg(lhs),
-                             loc_reg(rhs)));
-  } else {
-    u32 sf = loc_is_64(t, lhs) ? 1u : 0u;
-    aa_emit32(t->mc, aa_subs_reg(sf, AA64_ZR, loc_reg(lhs), loc_reg(rhs)));
-  }
+  aa_emit_cmp_to_flags(t, lhs, rhs);
   aa_emit32(t->mc, aa64_brcond_pack((AA64BrCond){.cond = cmp_cond(op)}));
   t->mc->emit_label_ref(t->mc, label, R_AARCH64_CONDBR19, 4, 0);
 }
@@ -1412,6 +1434,20 @@ static void aa_binop(NativeTarget* t, BinOp op, NativeLoc dst, NativeLoc lhs,
         aa_panic(aa_of(t), "unsupported floating binary op");
     }
   }
+  if (rhs.kind == NATIVE_LOC_IMM && (op == BO_IADD || op == BO_ISUB)) {
+    i64 imm = rhs.v.imm;
+    int is_add = (op == BO_IADD);
+    u32 imm12, sh;
+    if (imm < 0) {
+      is_add = !is_add;
+      imm = -imm;
+    }
+    if (!aa64_addsub_imm_fits(imm, &imm12, &sh))
+      aa_panic(aa_of(t), "binop immediate not encodable");
+    aa_emit32(t->mc, is_add ? aa64_add_imm(sf, rd, rn, imm12, sh)
+                            : aa64_sub_imm(sf, rd, rn, imm12, sh));
+    return;
+  }
   switch (op) {
     case BO_IADD:
       aa_emit32(t->mc, aa64_add(sf, rd, rn, rm));
@@ -1490,15 +1526,29 @@ static void aa_unop(NativeTarget* t, UnOp op, NativeLoc dst, NativeLoc src) {
   }
 }
 
-static void aa_cmp(NativeTarget* t, CmpOp op, NativeLoc dst, NativeLoc lhs,
-                   NativeLoc rhs) {
+static void aa_emit_cmp_to_flags(NativeTarget* t, NativeLoc lhs,
+                                 NativeLoc rhs) {
   if (loc_is_fp(lhs)) {
     aa_emit32(t->mc, aa_fcmp(type_size32(t, lhs.type) == 8u, loc_reg(lhs),
                              loc_reg(rhs)));
-  } else {
+    return;
+  }
+  {
     u32 sf = loc_is_64(t, lhs) ? 1u : 0u;
+    if (rhs.kind == NATIVE_LOC_IMM) {
+      u32 imm12 = 0, sh = 0;
+      if (rhs.v.imm < 0 || !aa64_addsub_imm_fits(rhs.v.imm, &imm12, &sh))
+        aa_panic(aa_of(t), "cmp immediate not encodable");
+      aa_emit32(t->mc, aa64_subs_imm12(sf, AA64_ZR, loc_reg(lhs), imm12, sh));
+      return;
+    }
     aa_emit32(t->mc, aa_subs_reg(sf, AA64_ZR, loc_reg(lhs), loc_reg(rhs)));
   }
+}
+
+static void aa_cmp(NativeTarget* t, CmpOp op, NativeLoc dst, NativeLoc lhs,
+                   NativeLoc rhs) {
+  aa_emit_cmp_to_flags(t, lhs, rhs);
   aa_emit32(t->mc, aa_cset(loc_is_64(t, dst), loc_reg(dst), cmp_cond(op)));
 }
 
@@ -2707,6 +2757,7 @@ NativeTarget* aa64_native_target_new(Compiler* c, ObjBuilder* obj,
   t->mc = mc;
   t->regs = &aa_reg_info;
   t->class_for_type = aa_class_for_type;
+  t->imm_legal = aa_imm_legal;
   t->addr_legal = aa_addr_legal;
   t->func_begin = aa_func_begin;
   t->func_begin_known_frame = aa_func_begin_known_frame;
diff --git a/src/opt/pass_combine.c b/src/opt/pass_combine.c
@@ -338,13 +338,16 @@ typedef enum SubstKind {
 /* Returns 1 if the given operand-index `idx` of `in` is foldable for `kind`.
  * SK_REG / SK_CV: register substitution slots. SK_IMM: immediate substitution
  * slots. */
-static int combine_subst_slot(const Inst* in, u32 idx, SubstKind kind) {
+static int combine_subst_slot(const Inst* in, u32 idx, SubstKind kind,
+                              int copy_imm_ok) {
   switch ((IROp)in->op) {
     case IR_COPY:
-      /* IR_COPY src is register-to-register by definition; folding an
-       * immediate would change its shape and defeat the self-copy detection
-       * that fires after coalescing assigns matching hard regs. */
-      return kind != SK_IMM && idx == 1;
+      /* Normally IR_COPY stays register-to-register so that, after coalescing
+       * assigns src and dst the same hard reg, it becomes a self-copy combine
+       * removes. The O1 path never coalesces, so folding the immediate
+       * (copy_imm_ok) collapses `load_imm rT,k; copy rD,rT` into `copy rD,#k`,
+       * which the emit path turns into a single `load_imm rD,k`. */
+      return (kind != SK_IMM || copy_imm_ok) && idx == 1;
     case IR_UNOP:
       return kind != SK_IMM && idx == 1;
     case IR_CONVERT:
@@ -687,13 +690,14 @@ static void set_indirect_field(Operand* ind, Reg old_reg, Reg new_reg) {
  * only valid for OPK_REG `src`. Returns the number of operands actually
  * rewritten. */
 static int subst_consumer_operands(Inst* in, const Operand* def,
-                                   const Operand* src, SubstKind kind) {
+                                   const Operand* src, SubstKind kind,
+                                   int copy_imm_ok) {
   int n = 0;
   for (u32 oi = 0; oi < in->nopnds; ++oi) {
     Operand* op = &in->opnds[oi];
     /* Direct OPK_REG substitution: requires the slot to be on the whitelist. */
     if (op->kind == OPK_REG && same_phys_reg(op, def) &&
-        combine_subst_slot(in, oi, kind)) {
+        combine_subst_slot(in, oi, kind, copy_imm_ok)) {
       *op = *src;
       ++n;
       continue;
@@ -771,7 +775,10 @@ static int try_substitute_for_reg(CombineCtx* ctx, Inst* in, i32 i, u8 cls,
                                                  ctx->bl->id, &def))
     return 0;
 
-  int n = subst_consumer_operands(in, &def, &src_op, kind);
+  /* O1 (no coalescing) folds immediates into IR_COPY; O2 leaves the copy
+   * register-to-register so coalescing + self-copy removal handles it. */
+  int copy_imm_ok = ctx->f && !ctx->f->opt_coalesce_parent;
+  int n = subst_consumer_operands(in, &def, &src_op, kind, copy_imm_ok);
   if (n > 0) {
     ctx->block_change_p = 1;
     return 1;
diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c
@@ -450,6 +450,12 @@ static void write_loc(NativeEmitCtx* e, NativeLoc dst, NativeLoc src,
         e->target->move(e->target, dst, src);
       return;
     }
+    /* An immediate goes straight into the destination register; routing it
+     * through a scratch and then moving would cost an extra instruction. */
+    if (src.kind == NATIVE_LOC_IMM) {
+      e->target->load_imm(e->target, dst, src.v.imm);
+      return;
+    }
     tmp = materialize(e, src, (NativeAllocClass)dst.cls, dst.type, dst.v.reg,
                       REG_NONE, loc);
     if (tmp.v.reg != dst.v.reg || tmp.cls != dst.cls)
@@ -463,6 +469,23 @@ static void write_loc(NativeEmitCtx* e, NativeLoc dst, NativeLoc src,
   e->target->store(e->target, addr, src, mem);
 }
 
+/* Returns `op` as an immediate when it is a constant that the target's
+ * imm_legal hook accepts for `use`/`sub`; otherwise materializes it. */
+static NativeLoc operand_imm_or_reg(NativeEmitCtx* e, const OptOperand* op,
+                                    NativeImmUse use, u32 sub, Reg avoid_a,
+                                    Reg avoid_b, SrcLoc loc) {
+  int keep_imm = op->kind == OPK_IMM && e->target->imm_legal &&
+                 e->target->imm_legal(e->target, use, sub, op->type, op->v.imm);
+  if (keep_imm) return loc_imm(op->type, op->v.imm);
+  NativeLoc src = loc_from_operand(e, op, loc);
+  return materialize(e, src, class_for_type(e, op->type), op->type, avoid_a,
+                     avoid_b, loc);
+}
+
+/* Yields the register named by `l` when it is a register location, and
+ * REG_NONE for every other location kind (e.g. an immediate). */
+static Reg loc_avoid_reg(NativeLoc l) {
+  if (l.kind != NATIVE_LOC_REG) return REG_NONE;
+  return l.v.reg;
+}
+
 static int type_is_aggregate_or_large(NativeEmitCtx* e, CfreeCgTypeId type) {
   return type &&
          (cg_type_is_aggregate(e->c, type) || type_size_or(e->c, type, 8u) > 8u);
@@ -656,21 +679,13 @@ static void emit_ret(NativeEmitCtx* e, Inst* in, const CGFuncDesc* fd) {
   NativeCallPlanRet* rets = NULL;
   u32 nrets = 0;
   if (aux && aux->present) {
-    NativeLoc final = abi_storage_loc(e, &aux->val, in->loc);
-    CfreeCgTypeId vty = aux->val.type;
-    if (cg_type_is_aggregate(e->c, vty) || type_size_or(e->c, vty, 8u) > 8u) {
-      /* Aggregate / oversized return: hand plan_ret the value's memory
-       * location directly. It copies to the sret pointer (indirect) or reads
-       * the parts into the return registers (direct); a scalar copy through a
-       * temp would exceed the single-register width. */
-      value = final;
-    } else {
-      NativeFrameSlot slot =
-          temp_slot(e, vty, in->loc, NATIVE_FRAME_SLOT_SPILL);
-      NativeLoc frame = loc_frame(vty, class_for_type(e, vty), slot);
-      write_loc(e, frame, final, mem_for_type(e->c, vty), in->loc);
-      value = frame;
-    }
+    /* Hand plan_ret the value's location directly. For an aggregate / oversized
+     * result it is a memory location (plan_ret copies to the sret pointer or
+     * reads parts into the return registers); for a scalar it is the value's
+     * register or slot, which plan_ret moves into the return register. The old
+     * code spilled scalars to a fresh slot and reloaded them, a pure round
+     * trip on every return. */
+    value = abi_storage_loc(e, &aux->val, in->loc);
     values = &value;
   }
   e->target->plan_ret(e->target, fd, values, values ? 1u : 0u, &rets, &nrets);
@@ -851,13 +866,12 @@ static void emit_inst(NativeEmitCtx* e, u32 block, u32 order_index, Inst* in,
       a = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc),
                       class_for_type(e, in->opnds[1].type), in->opnds[1].type,
                       dst_reg, REG_NONE, in->loc);
-      b = materialize(e, loc_from_operand(e, &in->opnds[2], in->loc),
-                      class_for_type(e, in->opnds[2].type), in->opnds[2].type,
-                      a.v.reg, dst_reg, in->loc);
+      b = operand_imm_or_reg(e, &in->opnds[2], NATIVE_IMM_BINOP,
+                             (u32)in->extra.imm, a.v.reg, dst_reg, in->loc);
       if (dst.kind != NATIVE_LOC_REG)
         dst = scratch_loc(e, in->opnds[0].type,
                           class_for_type(e, in->opnds[0].type), a.v.reg,
-                          b.v.reg, in->loc);
+                          loc_avoid_reg(b), in->loc);
       e->target->binop(e->target, (BinOp)in->extra.imm, dst, a, b);
       if (in->opnds[0].kind != OPK_REG)
         write_loc(e, loc_from_operand(e, &in->opnds[0], in->loc), dst,
@@ -884,13 +898,12 @@ static void emit_inst(NativeEmitCtx* e, u32 block, u32 order_index, Inst* in,
       a = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc),
                       class_for_type(e, in->opnds[1].type), in->opnds[1].type,
                       dst_reg, REG_NONE, in->loc);
-      b = materialize(e, loc_from_operand(e, &in->opnds[2], in->loc),
-                      class_for_type(e, in->opnds[2].type), in->opnds[2].type,
-                      a.v.reg, dst_reg, in->loc);
+      b = operand_imm_or_reg(e, &in->opnds[2], NATIVE_IMM_CMP,
+                             (u32)in->extra.imm, a.v.reg, dst_reg, in->loc);
       if (dst.kind != NATIVE_LOC_REG)
         dst = scratch_loc(e, in->opnds[0].type,
                           class_for_type(e, in->opnds[0].type), a.v.reg,
-                          b.v.reg, in->loc);
+                          loc_avoid_reg(b), in->loc);
       e->target->cmp(e->target, (CmpOp)in->extra.imm, dst, a, b);
       if (in->opnds[0].kind != OPK_REG)
         write_loc(e, loc_from_operand(e, &in->opnds[0], in->loc), dst,
@@ -925,9 +938,8 @@ static void emit_inst(NativeEmitCtx* e, u32 block, u32 order_index, Inst* in,
       a = materialize(e, loc_from_operand(e, &in->opnds[0], in->loc),
                       class_for_type(e, in->opnds[0].type), in->opnds[0].type,
                       REG_NONE, REG_NONE, in->loc);
-      b = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc),
-                      class_for_type(e, in->opnds[1].type), in->opnds[1].type,
-                      a.v.reg, REG_NONE, in->loc);
+      b = operand_imm_or_reg(e, &in->opnds[1], NATIVE_IMM_CMP,
+                             (u32)in->extra.imm, a.v.reg, REG_NONE, in->loc);
       e->target->cmp_branch(
           e->target, (CmpOp)in->extra.imm, a, b,
           ensure_label(e, e->f->blocks[block].succ[0], in->loc));

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log | Files | Refs | README

M	src/arch/aa64/native.c	|	71	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
M	src/opt/pass_combine.c	|	23	+++++++++++++++--------
M	src/opt/pass_native_emit.c	|	64	++++++++++++++++++++++++++++++++++++++--------------------------