kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

pass_combine.c (54369B)


      1 #include <kit/cg.h>
      2 #include <stdint.h>
      3 #include <string.h>
      4 
      5 #include "cg/ir_eval.h"
      6 #include "core/arena.h"
      7 #include "opt/opt_internal.h"
      8 
      9 enum {
     10   COMBINE_CG_TYPE_SEG_SHIFT = 6,
     11   COMBINE_CG_TYPE_SEG_MASK = (1u << COMBINE_CG_TYPE_SEG_SHIFT) - 1u,
     12   COMBINE_CG_TYPE_BUILTIN_SEG = 1u,
     13 };
     14 
     15 /* O1 combine, MIR-shaped (see mir-gen.c:8808-9146). One forward pass per BB
     16  * maintains last-def / last-mem-def tracking; per-BB fixpoint loop iterates
     17  * until no fold fires. Rewrites in this file:
     18  *
     19  *   1. Substitute producer source into consumer operand
     20  *      (IR_COPY / IR_LOAD_IMM / IR_CONVERT producer; into register slot,
     21  *       indirect base, or indirect index).
     22  *   2. Address-mode synthesis (IR_BINOP IADD/ISHL producer; into OPK_INDIRECT
     23  *       base/index/scale/ofs).
     24  *   3. Sink producer into single-use IR_COPY destination.
     25  *   4. combine_exts: fold ext-of-ext chains with size+signedness rules.
     26  *
     27  * Spill compaction (store-then-reload, etc.) and self-copy removal continue
     28  * to run in opt_combine_compact_block, unchanged. */
     29 
     30 /* ---- shared helpers (operand inspection / counting) ---- */
     31 
     32 static int same_reg_operand(const Operand* a, const Operand* b) {
     33   return a->kind == OPK_REG && b->kind == OPK_REG && a->cls == b->cls &&
     34          a->v.reg == b->v.reg;
     35 }
     36 
     37 static int frame_slot_is_spill(Func* f, FrameSlot fs) {
     38   if (fs == FRAME_SLOT_NONE || fs > f->nframe_slots) return 0;
     39   return f->frame_slots[fs - 1u].kind == FS_SPILL;
     40 }
     41 
     42 static int spill_local_slot(Func* f, const Operand* addr, const MemAccess* mem,
     43                             FrameSlot* out) {
     44   if (!addr || addr->kind != OPK_LOCAL) return 0;
     45   if (opt_mem_observable(mem)) return 0;
     46   if (mem->alias.kind != ALIAS_LOCAL) return 0;
     47   if (mem->alias.v.local_id != (i32)addr->v.frame_slot) return 0;
     48   if (!frame_slot_is_spill(f, addr->v.frame_slot)) return 0;
     49   *out = addr->v.frame_slot;
     50   return 1;
     51 }
     52 
     53 static int same_spill_access(Func* f, const Inst* a, const Inst* b,
     54                              FrameSlot* slot_out) {
     55   FrameSlot as = FRAME_SLOT_NONE;
     56   FrameSlot bs = FRAME_SLOT_NONE;
     57   if (!spill_local_slot(f, &a->opnds[0], &a->extra.mem, &as)) return 0;
     58   if (!spill_local_slot(f, &b->opnds[0], &b->extra.mem, &bs)) return 0;
     59   if (as != bs) return 0;
     60   if (a->extra.mem.size != b->extra.mem.size) return 0;
     61   if (a->extra.mem.addr_space != b->extra.mem.addr_space) return 0;
     62   if (slot_out) *slot_out = as;
     63   return 1;
     64 }
     65 
     66 static int load_spill_slot(Func* f, const Inst* in, FrameSlot* slot_out) {
     67   if ((IROp)in->op != IR_LOAD || in->nopnds < 2) return 0;
     68   return spill_local_slot(f, &in->opnds[1], &in->extra.mem, slot_out);
     69 }
     70 
     71 static int store_spill_slot(Func* f, const Inst* in, FrameSlot* slot_out) {
     72   if ((IROp)in->op != IR_STORE || in->nopnds < 2) return 0;
     73   return spill_local_slot(f, &in->opnds[0], &in->extra.mem, slot_out);
     74 }
     75 
     76 static int same_spill_slot_and_size(Func* f, const Inst* a, const Inst* b) {
     77   FrameSlot as = FRAME_SLOT_NONE;
     78   FrameSlot bs = FRAME_SLOT_NONE;
     79   if ((IROp)a->op == IR_LOAD) {
     80     if (!load_spill_slot(f, a, &as)) return 0;
     81   } else if (!store_spill_slot(f, a, &as)) {
     82     return 0;
     83   }
     84   if ((IROp)b->op == IR_LOAD) {
     85     if (!load_spill_slot(f, b, &bs)) return 0;
     86   } else if (!store_spill_slot(f, b, &bs)) {
     87     return 0;
     88   }
     89   return as == bs && a->extra.mem.size == b->extra.mem.size &&
     90          a->extra.mem.addr_space == b->extra.mem.addr_space;
     91 }
     92 
     93 static int same_phys_reg(const Operand* a, const Operand* b) {
     94   return a && b && a->kind == OPK_REG && b->kind == OPK_REG &&
     95          a->cls == b->cls && a->v.reg == b->v.reg;
     96 }
     97 
     98 /* Count register references inside a single operand. An OPK_INDIRECT can
     99  * reference the queried register twice (e.g. `[r4 + r4 * 4]`); we return the
    100  * true count so single-use accounting is exact. */
    101 static int count_operand_phys_uses(const Operand* op, const Operand* r) {
    102   if (!op || !r || r->kind != OPK_REG) return 0;
    103   if (op->kind == OPK_REG)
    104     return op->cls == r->cls && op->v.reg == r->v.reg ? 1 : 0;
    105   if (op->kind == OPK_INDIRECT) {
    106     if (r->cls != RC_INT) return 0;
    107     int n = 0;
    108     if (op->v.ind.base == r->v.reg) ++n;
    109     if (op->v.ind.index != (Reg)REG_NONE && op->v.ind.index == r->v.reg) ++n;
    110     return n;
    111   }
    112   return 0;
    113 }
    114 
    115 static int operand_uses_phys_reg(const Operand* op, const Operand* r) {
    116   return count_operand_phys_uses(op, r) > 0;
    117 }
    118 
    119 static int abi_uses_phys_reg(const CGABIValue* v, const Operand* r) {
    120   int n = 0;
    121   if (!v) return 0;
    122   n += count_operand_phys_uses(&v->storage, r);
    123   for (u32 i = 0; i < v->nparts; ++i)
    124     n += count_operand_phys_uses(&v->parts[i].op, r);
    125   return n;
    126 }
    127 
    128 static int inst_uses_phys_reg(const Inst* in, const Operand* r) {
    129   int n = 0;
    130   switch ((IROp)in->op) {
    131     case IR_COPY:
    132     case IR_CONVERT:
    133     case IR_UNOP:
    134     case IR_VA_ARG:
    135       if (in->nopnds >= 2) n += count_operand_phys_uses(&in->opnds[1], r);
    136       break;
    137     case IR_LOAD:
    138     case IR_ADDR_OF:
    139     case IR_BITFIELD_LOAD:
    140     case IR_ATOMIC_LOAD:
    141       if (in->nopnds >= 2) n += count_operand_phys_uses(&in->opnds[1], r);
    142       break;
    143     case IR_BINOP:
    144     case IR_CMP:
    145       if (in->nopnds >= 2) n += count_operand_phys_uses(&in->opnds[1], r);
    146       if (in->nopnds >= 3) n += count_operand_phys_uses(&in->opnds[2], r);
    147       break;
    148     case IR_STORE:
    149     case IR_AGG_COPY:
    150     case IR_AGG_SET:
    151     case IR_BITFIELD_STORE:
    152     case IR_VA_COPY:
    153       if (in->nopnds >= 1) n += count_operand_phys_uses(&in->opnds[0], r);
    154       if (in->nopnds >= 2) n += count_operand_phys_uses(&in->opnds[1], r);
    155       break;
    156     case IR_CALL: {
    157       IRCallAux* aux = (IRCallAux*)in->extra.aux;
    158       if (!aux) break;
    159       if (aux->use_plan_replay) {
    160         n += count_operand_phys_uses(&aux->plan.callee, r);
    161         for (u32 i = 0; i < aux->plan.nargs; ++i)
    162           n += count_operand_phys_uses(&aux->plan.args[i].src, r);
    163       } else {
    164         n += count_operand_phys_uses(&aux->desc.callee, r);
    165         for (u32 i = 0; i < aux->desc.nargs; ++i)
    166           n += abi_uses_phys_reg(&aux->desc.args[i], r);
    167       }
    168       break;
    169     }
    170     case IR_CMP_BRANCH:
    171     case IR_CONDBR:
    172     case IR_SWITCH:
    173     case IR_INDIRECT_BRANCH:
    174       for (u32 i = 0; i < in->nopnds; ++i)
    175         n += count_operand_phys_uses(&in->opnds[i], r);
    176       break;
    177     case IR_RET: {
    178       IRRetAux* aux = (IRRetAux*)in->extra.aux;
    179       if (aux && aux->present) n += abi_uses_phys_reg(&aux->val, r);
    180       break;
    181     }
    182     case IR_SCOPE_BEGIN:
    183       break;
    184     case IR_ALLOCA:
    185       if (in->nopnds >= 2) n += count_operand_phys_uses(&in->opnds[1], r);
    186       break;
    187     case IR_VA_START:
    188     case IR_VA_END:
    189       if (in->nopnds >= 1) n += count_operand_phys_uses(&in->opnds[0], r);
    190       break;
    191     case IR_ATOMIC_STORE:
    192       if (in->nopnds >= 1) n += count_operand_phys_uses(&in->opnds[0], r);
    193       if (in->nopnds >= 2) n += count_operand_phys_uses(&in->opnds[1], r);
    194       break;
    195     case IR_ATOMIC_RMW:
    196       if (in->nopnds >= 2) n += count_operand_phys_uses(&in->opnds[1], r);
    197       if (in->nopnds >= 3) n += count_operand_phys_uses(&in->opnds[2], r);
    198       break;
    199     case IR_ATOMIC_CAS:
    200       if (in->nopnds >= 3) n += count_operand_phys_uses(&in->opnds[2], r);
    201       if (in->nopnds >= 4) n += count_operand_phys_uses(&in->opnds[3], r);
    202       if (in->nopnds >= 5) n += count_operand_phys_uses(&in->opnds[4], r);
    203       break;
    204     case IR_ASM_BLOCK: {
    205       IRAsmAux* aux = (IRAsmAux*)in->extra.aux;
    206       if (!aux) break;
    207       for (u32 i = 0; i < aux->nin; ++i)
    208         n += count_operand_phys_uses(&aux->in_ops[i], r);
    209       break;
    210     }
    211     case IR_INTRINSIC: {
    212       IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux;
    213       if (!aux) break;
    214       for (u32 i = 0; i < aux->narg; ++i)
    215         n += count_operand_phys_uses(&aux->args[i], r);
    216       break;
    217     }
    218     default:
    219       break;
    220   }
    221   return n;
    222 }
    223 
    224 static int abi_defines_phys_reg(const CGABIValue* v, const Operand* r) {
    225   int n = 0;
    226   if (!v) return 0;
    227   if (same_phys_reg(&v->storage, r)) ++n;
    228   for (u32 i = 0; i < v->nparts; ++i)
    229     if (same_phys_reg(&v->parts[i].op, r)) ++n;
    230   return n;
    231 }
    232 
    233 static int inst_defines_phys_reg(const Inst* in, const Operand* r) {
    234   if (!r || r->kind != OPK_REG) return 0;
    235   switch ((IROp)in->op) {
    236     case IR_LOAD_IMM:
    237     case IR_LOAD_CONST:
    238     case IR_LOAD_LABEL_ADDR:
    239     case IR_COPY:
    240     case IR_LOAD:
    241     case IR_ADDR_OF:
    242     case IR_TLS_ADDR_OF:
    243     case IR_BITFIELD_LOAD:
    244     case IR_BINOP:
    245     case IR_UNOP:
    246     case IR_CMP:
    247     case IR_CONVERT:
    248     case IR_ALLOCA:
    249     case IR_VA_ARG:
    250     case IR_ATOMIC_LOAD:
    251     case IR_ATOMIC_RMW:
    252       return in->nopnds >= 1 && same_phys_reg(&in->opnds[0], r);
    253     case IR_CALL: {
    254       IRCallAux* aux = (IRCallAux*)in->extra.aux;
    255       if (!aux) return 0;
    256       if (aux->use_plan_replay) {
    257         for (u32 i = 0; i < aux->plan.nargs; ++i)
    258           if (aux->plan.args[i].dst_kind == CG_CALL_PLAN_REG &&
    259               r->cls == aux->plan.args[i].cls &&
    260               r->v.reg == aux->plan.args[i].dst_reg)
    261             return 1;
    262         for (u32 i = 0; i < aux->plan.nrets; ++i)
    263           if ((r->cls == aux->plan.rets[i].cls &&
    264                r->v.reg == aux->plan.rets[i].src_reg) ||
    265               same_phys_reg(&aux->plan.rets[i].dst, r))
    266             return 1;
    267         return 0;
    268       }
    269       return abi_defines_phys_reg(&aux->desc.ret, r);
    270     }
    271     case IR_ATOMIC_CAS:
    272       return (in->nopnds >= 1 && same_phys_reg(&in->opnds[0], r)) ||
    273              (in->nopnds >= 2 && same_phys_reg(&in->opnds[1], r));
    274     case IR_ASM_BLOCK: {
    275       IRAsmAux* aux = (IRAsmAux*)in->extra.aux;
    276       if (!aux) return 0;
    277       for (u32 i = 0; i < aux->nout; ++i)
    278         if (same_phys_reg(&aux->out_ops[i], r)) return 1;
    279       if (r->cls < OPT_REG_CLASSES && r->v.reg < 32 &&
    280           (aux->clobber_mask[r->cls] & (1u << r->v.reg)))
    281         return 1;
    282       return 0;
    283     }
    284     case IR_INTRINSIC: {
    285       IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux;
    286       if (!aux) return 0;
    287       for (u32 i = 0; i < aux->ndst; ++i)
    288         if (same_phys_reg(&aux->dsts[i], r)) return 1;
    289       return 0;
    290     }
    291     default:
    292       return 0;
    293   }
    294 }
    295 
    296 /* True if `in` may write to memory. Used to invalidate memory-reading
    297  * producers (IR_LOAD) when the consumer is past an intervening write. */
    298 static int inst_writes_memory(const Inst* in) {
    299   switch ((IROp)in->op) {
    300     case IR_STORE:
    301     case IR_AGG_COPY:
    302     case IR_AGG_SET:
    303     case IR_BITFIELD_STORE:
    304     case IR_ATOMIC_STORE:
    305     case IR_ATOMIC_RMW:
    306     case IR_ATOMIC_CAS:
    307     case IR_CALL:
    308     case IR_ASM_BLOCK:
    309     case IR_INTRINSIC:
    310       return 1;
    311     default:
    312       return 0;
    313   }
    314 }
    315 
    316 static int inst_reads_memory(const Inst* in) {
    317   switch ((IROp)in->op) {
    318     case IR_LOAD:
    319     case IR_BITFIELD_LOAD:
    320     case IR_ATOMIC_LOAD:
    321       return 1;
    322     default:
    323       return 0;
    324   }
    325 }
    326 
    327 /* ---- substitution-slot whitelist (operand positions that accept a register
    328  * substitute, an immediate substitute, or are valid for ext-of-ext folding)
    329  * ---- */
    330 
    331 typedef enum SubstKind {
    332   SK_REG, /* register-to-register (IR_COPY producer) */
    333   SK_IMM, /* register-to-immediate (IR_LOAD_IMM producer) */
    334   SK_CV,  /* register-to-register through identical convert (IR_CONVERT prod) */
    335 } SubstKind;
    336 
    337 /* Returns 1 if the given operand-index `idx` of `in` is foldable for `kind`.
    338  * SK_REG / SK_CV: register substitution slots. SK_IMM: immediate substitution
    339  * slots. */
    340 static int combine_subst_slot(const Inst* in, u32 idx, SubstKind kind,
    341                               int copy_imm_ok) {
    342   switch ((IROp)in->op) {
    343     case IR_COPY:
    344       /* Normally IR_COPY stays register-to-register so that, after coalescing
    345        * assigns src and dst the same hard reg, it becomes a self-copy combine
    346        * removes. The O1 path never coalesces, so folding the immediate
    347        * (copy_imm_ok) collapses `load_imm rT,k; copy rD,rT` into `copy rD,#k`,
    348        * which the emit path turns into a single `load_imm rD,k`. */
    349       return (kind != SK_IMM || copy_imm_ok) && idx == 1;
    350     case IR_UNOP:
    351       return kind != SK_IMM && idx == 1;
    352     case IR_CONVERT:
    353       return kind != SK_IMM && idx == 1;
    354     case IR_BINOP:
    355     case IR_CMP:
    356       return idx == 1 || idx == 2;
    357     case IR_CMP_BRANCH:
    358       return idx == 0 || idx == 1;
    359     case IR_CONDBR:
    360       return kind != SK_IMM && idx == 0;
    361     case IR_STORE:
    362       /* opnds[0] is the address; register subst into the indirect base/index
    363        * is handled separately. The data slot at idx==1 accepts reg or imm. */
    364       return idx == 1;
    365     case IR_LOAD:
    366     case IR_ATOMIC_LOAD:
    367     case IR_ATOMIC_RMW:
    368     case IR_BITFIELD_LOAD:
    369     case IR_BITFIELD_STORE:
    370     case IR_AGG_COPY:
    371     case IR_AGG_SET:
    372       /* OPK_INDIRECT base/index substitution is handled inside
    373        * subst_consumer_operands; these slots take no direct OPK_REG. */
    374       return 0;
    375     case IR_ALLOCA:
    376       return kind != SK_IMM && idx == 1;
    377     default:
    378       return 0;
    379   }
    380 }
    381 
    382 /* ---- per-BB tracking context ---- */
    383 
    384 typedef struct CombineCtx {
    385   Func* f;
    386   Block* bl;
    387   const OptHardBlockLive* hard_live;
    388   /* Index of the most recent inst in this BB that defined (cls, reg);
    389    * -1 means no definition seen this BB. */
    390   i32 last_def[OPT_REG_CLASSES][OPT_MAX_HARD_REGS];
    391   i32 last_mem_def;
    392   int block_change_p;
    393 } CombineCtx;
    394 
    395 static void ctx_reset(CombineCtx* ctx) {
    396   memset(ctx->last_def, 0xff, sizeof ctx->last_def); /* all -1 */
    397   ctx->last_mem_def = -1;
    398   ctx->block_change_p = 0;
    399 }
    400 
    401 /* True if `in` is a barrier that conservatively invalidates all prior
    402  * producers in this BB (modelled after the old find_single_direct_use, which
    403  * treats every IR_CALL as a clobber regardless of explicit ABI describes). */
    404 static int inst_is_clobber_barrier(const Inst* in) {
    405   switch ((IROp)in->op) {
    406     case IR_CALL:
    407     case IR_ASM_BLOCK:
    408     case IR_INTRINSIC:
    409       return 1;
    410     default:
    411       return 0;
    412   }
    413 }
    414 
    415 static void ctx_record_reg_def(CombineCtx* ctx, const Operand* op, i32 i) {
    416   if (op->kind != OPK_REG) return;
    417   if (op->cls >= OPT_REG_CLASSES || op->v.reg >= OPT_MAX_HARD_REGS) return;
    418   ctx->last_def[op->cls][op->v.reg] = i;
    419 }
    420 
    421 /* After processing inst at index `i`, mark each register it defines (incl.
    422  * implicit clobbers from CALL / ASM / INTRINSIC) as defined here. Mark
    423  * memory if this inst writes.
    424  *
    425  * Walks the inst's destination operand(s) directly rather than probing every
    426  * (cls, reg) pair. The barrier case (CALL/ASM/INTRINSIC) keeps a bulk mark
    427  * because explicit defs/clobbers aren't always populated in IRCallAux at
    428  * this point in the pipeline. */
    429 static void ctx_record(CombineCtx* ctx, const Inst* in, i32 i) {
    430   if (inst_writes_memory(in)) ctx->last_mem_def = i;
    431   if (inst_is_clobber_barrier(in)) {
    432     for (u32 c = 0; c < OPT_REG_CLASSES; ++c)
    433       for (u32 r = 0; r < OPT_MAX_HARD_REGS; ++r) ctx->last_def[c][r] = i;
    434     return;
    435   }
    436   switch ((IROp)in->op) {
    437     case IR_LOAD_IMM:
    438     case IR_LOAD_CONST:
    439     case IR_LOAD_LABEL_ADDR:
    440     case IR_COPY:
    441     case IR_LOAD:
    442     case IR_ADDR_OF:
    443     case IR_TLS_ADDR_OF:
    444     case IR_BITFIELD_LOAD:
    445     case IR_BINOP:
    446     case IR_UNOP:
    447     case IR_CMP:
    448     case IR_CONVERT:
    449     case IR_ALLOCA:
    450     case IR_VA_ARG:
    451     case IR_ATOMIC_LOAD:
    452     case IR_ATOMIC_RMW:
    453       if (in->nopnds >= 1) ctx_record_reg_def(ctx, &in->opnds[0], i);
    454       break;
    455     case IR_ATOMIC_CAS:
    456       if (in->nopnds >= 1) ctx_record_reg_def(ctx, &in->opnds[0], i);
    457       if (in->nopnds >= 2) ctx_record_reg_def(ctx, &in->opnds[1], i);
    458       break;
    459     default:
    460       break;
    461   }
    462 }
    463 
    464 /* Lookup the producer of (cls, reg) in this BB, if any. Returns -1 if no
    465  * definer has been seen yet in this BB. */
    466 static i32 ctx_producer_of(const CombineCtx* ctx, u8 cls, Reg reg) {
    467   if (reg >= OPT_MAX_HARD_REGS || cls >= OPT_REG_CLASSES) return -1;
    468   return ctx->last_def[cls][reg];
    469 }
    470 
    471 /* Does (cls, reg) get redefined strictly after `since_idx` (exclusive) in this
    472  * BB, given current ctx state? We use last_def: if last_def[cls][reg] >
    473  * since_idx, then yes. (since_idx is typically the producer's index.) */
    474 static int ctx_def_changed_since(const CombineCtx* ctx, u8 cls, Reg reg,
    475                                  i32 since_idx) {
    476   if (reg >= OPT_MAX_HARD_REGS || cls >= OPT_REG_CLASSES) return 0;
    477   return ctx->last_def[cls][reg] > since_idx;
    478 }
    479 
    480 static i32 ctx_prev_def_before(const CombineCtx* ctx, const Operand* reg,
    481                                i32 before_idx) {
    482   if (!reg || reg->kind != OPK_REG) return -1;
    483   for (i32 j = before_idx - 1; j >= 0; --j) {
    484     const Inst* prev = &ctx->bl->insts[j];
    485     if (inst_is_clobber_barrier(prev) || inst_defines_phys_reg(prev, reg))
    486       return j;
    487   }
    488   return -1;
    489 }
    490 
    491 static void ctx_restore_removed_def(CombineCtx* ctx, const Operand* reg,
    492                                     i32 removed_idx) {
    493   if (!reg || reg->kind != OPK_REG) return;
    494   if (reg->cls >= OPT_REG_CLASSES || reg->v.reg >= OPT_MAX_HARD_REGS) return;
    495   if (ctx->last_def[reg->cls][reg->v.reg] == removed_idx)
    496     ctx->last_def[reg->cls][reg->v.reg] =
    497         ctx_prev_def_before(ctx, reg, removed_idx);
    498 }
    499 
    500 /* ---- forward-scan use accounting (used for single-use checks) ---- */
    501 
    502 /* Count phys uses of `def` within the live range starting just after
    503  * `prod_idx`. The live range ends at the first inst that redefines `def`'s
    504  * physical register or that is a clobber barrier (CALL/ASM/INTRINSIC); uses
    505  * after that point belong to a different live range of the same physreg and
    506  * are irrelevant to this producer.
    507  *
    508  * `*killed_in_block_out` is set non-zero when the live range terminates
    509  * inside the block (caller may then skip the cross-block live-out check).
    510  *
    511  * Without this live-range scoping, reuse of a scratch physreg later in the
    512  * block (`sxtw x12, ...; mov x13, x12; mov x12, ...`) makes every fold look
    513  * multi-use and combine rejects almost everything. */
    514 /* True if `in` redefines or clobbers `def`'s physical register. Consults the
    515  * instruction's precise def/clobber set (the same one liveness uses) rather
    516  * than a blanket clobber-barrier test: a call clobbers only its caller-saved
    517  * set, so a value held in a callee-saved register survives it. Treating every
    518  * call as killing every register made `killed` spuriously true and let combine
    519  * skip the cross-block live-out check, deleting a still-live def. */
    520 static int inst_kills_phys_reg(Func* f, const Inst* in, const Operand* def) {
    521   OptHardRegSet use, d;
    522   if (!def || def->kind != OPK_REG || def->cls >= OPT_REG_CLASSES ||
    523       def->v.reg >= 32)
    524     return 0;
    525   opt_hard_inst_use_def(f, in, &use, &d);
    526   return (d.cls[def->cls] & (1u << def->v.reg)) != 0;
    527 }
    528 
    529 static int count_uses_in_live_range(Func* f, const Block* bl, i32 prod_idx,
    530                                     const Operand* def,
    531                                     int* killed_in_block_out) {
    532   int n = 0;
    533   int killed = 0;
    534   for (i32 i = prod_idx + 1; i < (i32)bl->ninsts; ++i) {
    535     const Inst* in = &bl->insts[i];
    536     n += inst_uses_phys_reg(in, def);
    537     if (inst_kills_phys_reg(f, in, def)) {
    538       killed = 1;
    539       break;
    540     }
    541   }
    542   if (killed_in_block_out) *killed_in_block_out = killed;
    543   return n;
    544 }
    545 
    546 static int use_after_clobber_before_redef(const Block* bl, i32 prod_idx,
    547                                           const Operand* def) {
    548   int saw_clobber = 0;
    549   for (i32 i = prod_idx + 1; i < (i32)bl->ninsts; ++i) {
    550     const Inst* in = &bl->insts[i];
    551     if (saw_clobber && inst_uses_phys_reg(in, def)) return 1;
    552     if (inst_defines_phys_reg(in, def)) return 0;
    553     if (inst_is_clobber_barrier(in)) saw_clobber = 1;
    554   }
    555   return 0;
    556 }
    557 
    558 /* ---- ConvKind helpers (for combine_exts) ---- */
    559 
    560 static u32 builtin_int_bytes(KitCgTypeId t) {
    561   if ((t >> COMBINE_CG_TYPE_SEG_SHIFT) != COMBINE_CG_TYPE_BUILTIN_SEG) return 0;
    562   KitCgBuiltinType b = (KitCgBuiltinType)(t & COMBINE_CG_TYPE_SEG_MASK);
    563   switch (b) {
    564     case KIT_CG_BUILTIN_BOOL:
    565     case KIT_CG_BUILTIN_I8:
    566       return 1;
    567     case KIT_CG_BUILTIN_I16:
    568       return 2;
    569     case KIT_CG_BUILTIN_I32:
    570       return 4;
    571     case KIT_CG_BUILTIN_I64:
    572       return 8;
    573     case KIT_CG_BUILTIN_I128:
    574       return 16;
    575     default:
    576       return 0;
    577   }
    578 }
    579 
    580 /* For an IR_CONVERT inst, decode (src_bytes, dst_bytes, sign_p). Returns 0
    581  * if this isn't an integer ext convert (CV_SEXT or CV_ZEXT). */
    582 static int ext_params(const Inst* in, u32* src_bytes_out, u32* dst_bytes_out,
    583                       int* sign_p_out) {
    584   if ((IROp)in->op != IR_CONVERT || in->nopnds < 2) return 0;
    585   ConvKind k = (ConvKind)in->extra.imm;
    586   if (k != CV_SEXT && k != CV_ZEXT) return 0;
    587   u32 sb = builtin_int_bytes(in->opnds[1].type);
    588   u32 db = builtin_int_bytes(in->opnds[0].type);
    589   if (!sb || !db || db < sb) return 0;
    590   *src_bytes_out = sb;
    591   *dst_bytes_out = db;
    592   *sign_p_out = (k == CV_SEXT);
    593   return 1;
    594 }
    595 
    596 /* Width in bytes of a scalar integer or pointer type (1..8), else 0. Mirrors
    597  * pass_simplify's simplify_width: builtin ints decode without the compiler,
    598  * pointers fall back to the type-size query. */
    599 static u32 combine_scalar_width_bytes(Func* f, KitCgTypeId t) {
    600   u32 b = builtin_int_bytes(t);
    601   if (b) return b > 8u ? 0u : b;
    602   if (f->c && kit_cg_type_kind((KitCompiler*)f->c, t) == KIT_CG_TYPE_PTR) {
    603     u64 sz = kit_cg_type_size((KitCompiler*)f->c, t);
    604     if (sz && sz <= 8u) return (u32)sz;
    605   }
    606   return 0;
    607 }
    608 
    609 /* Compute the constant produced by applying an integer/pointer convert `k`
    610  * (src width `sb`, dst width `db`, both BYTES) to immediate `imm`. Returns 0
    611  * for kinds that aren't a bit-preserving integer/pointer move (the float
    612  * conversions reinterpret the value and must not be folded this way). The
    613  * materialized-constant model: a load_imm puts (u64)imm into a register, so a
    614  * widening move/zext keeps the low `sb` bits and a trunc/narrowing keeps the
    615  * low `db` bits.
    616  *
    617  * The arithmetic is the shared cg/ir_eval convert core (in BITS); combine's
    618  * byte widths convert at the boundary. combine's BITCAST is the bit-preserving
    619  * register move (== ZEXT here, no equal-width requirement), so it maps to
    620  * CV_ZEXT rather than the shared eval's stricter equal-width BITCAST. */
    621 static int const_convert_value(ConvKind k, i64 imm, u32 sb, u32 db, i64* out) {
    622   if (!sb || !db) return 0;
    623   if (k == CV_BITCAST) k = CV_ZEXT;
    624   return kit_ir_eval_convert(k, sb * 8u, db * 8u, imm, out);
    625 }
    626 
    627 /* ---- producer-retarget legality (for sink rewrite) ---- */
    628 
    629 /* combine retargets the destination of a commutative producer, so it includes
    630  * the FP commutatives (FADD/FMUL) on top of the shared integer set. */
    631 static int binop_is_commutative(BinOp op) {
    632   return kit_ir_binop_is_commutative_int(op) || op == BO_FADD || op == BO_FMUL;
    633 }
    634 
    635 /* Is `producer` an op whose destination register can be safely retargeted
    636  * without backend consultation? Excludes ops with implicit destination
    637  * constraints, ABI pinning, multi-def outputs, or aux-driven dst layout. */
    638 static int producer_retargetable_op(IROp op) {
    639   switch (op) {
    640     case IR_BINOP:
    641     case IR_UNOP:
    642     case IR_LOAD_IMM:
    643     case IR_LOAD_CONST:
    644     case IR_LOAD:
    645     case IR_CONVERT:
    646     case IR_CMP:
    647     case IR_ADDR_OF:
    648     case IR_LOAD_LABEL_ADDR:
    649       return 1;
    650     default:
    651       return 0;
    652   }
    653 }
    654 
    655 /* Can `producer`'s destination be retargeted to `new_dst` without violating
    656  * IR shape? For binop, may signal that a commutative-swap is needed. */
    657 static int retarget_producer_legal(Inst* producer, const Operand* new_dst,
    658                                    int* swap_binop) {
    659   *swap_binop = 0;
    660   if (!new_dst || new_dst->kind != OPK_REG) return 0;
    661   if (producer->nopnds < 1 || producer->opnds[0].kind != OPK_REG) return 0;
    662   if (producer->opnds[0].cls != new_dst->cls) return 0;
    663   if (producer->opnds[0].type != new_dst->type) return 0;
    664 
    665   switch ((IROp)producer->op) {
    666     case IR_LOAD_IMM:
    667     case IR_LOAD_CONST:
    668     case IR_LOAD_LABEL_ADDR:
    669     case IR_ADDR_OF:
    670       /* Single-defining ops with no register source to alias against. */
    671       return 1;
    672     case IR_UNOP:
    673     case IR_CONVERT:
    674     case IR_LOAD:
    675       return 1;
    676     case IR_CMP:
    677       return 1;
    678     case IR_BINOP: {
    679       if (producer->nopnds < 3) return 0;
    680       int dst_is_lhs = operand_uses_phys_reg(&producer->opnds[1], new_dst);
    681       int dst_is_rhs = operand_uses_phys_reg(&producer->opnds[2], new_dst);
    682       if (!dst_is_lhs && !dst_is_rhs) return 1;
    683       if (dst_is_lhs) return 1;
    684       if (binop_is_commutative((BinOp)producer->extra.imm)) {
    685         *swap_binop = 1;
    686         return 1;
    687       }
    688       return 0;
    689     }
    690     default:
    691       return 0;
    692   }
    693 }
    694 
    695 static int first_return_reg(Func* f, u8 cls, Reg* out) {
    696   if (!f || cls >= OPT_REG_CLASSES) return 0;
    697   u32 mask = f->opt_ret_regs[cls];
    698   for (Reg r = 0; r < 32; ++r) {
    699     if (mask & (1u << r)) {
    700       *out = r;
    701       return 1;
    702     }
    703   }
    704   return 0;
    705 }
    706 
    707 static int ret_scalar_storage(CGABIValue* v, Operand** out) {
    708   if (!v || v->storage.kind != OPK_REG) return 0;
    709   if (v->nparts > 1) return 0;
    710   *out = &v->storage;
    711   return 1;
    712 }
    713 
    714 static int is_target_scratch_reg(Func* f, u8 cls, Reg reg) {
    715   if (!f || cls >= OPT_REG_CLASSES) return 0;
    716   for (u32 i = 0; i < f->opt_scratch_reg_count[cls]; ++i) {
    717     if (f->opt_scratch_regs[cls][i] == reg) return 1;
    718   }
    719   return 0;
    720 }
    721 
    722 /* ---- Rewrite 1: substitute producer source into uses ---- */
    723 
    724 /* Set the indirect's base or index to `new_reg`. */
    725 static void set_indirect_field(Operand* ind, Reg old_reg, Reg new_reg) {
    726   if (ind->v.ind.base == old_reg) ind->v.ind.base = new_reg;
    727   if (ind->v.ind.index != (Reg)REG_NONE && ind->v.ind.index == old_reg)
    728     ind->v.ind.index = new_reg;
    729 }
    730 
    731 /* Substitute `def` -> `src` in a single aux register slot (no slot-index
    732  * whitelist; caller has already restricted to SK_REG and ensured src is a
    733  * register). Returns 1 if rewritten. */
    734 static int subst_one_aux_operand(Operand* op, const Operand* def,
    735                                  const Operand* src) {
    736   if (op->kind == OPK_REG && same_phys_reg(op, def)) {
    737     *op = *src;
    738     return 1;
    739   }
    740   if (op->kind == OPK_INDIRECT && src->kind == OPK_REG && def->cls == RC_INT &&
    741       src->cls == RC_INT &&
    742       (op->v.ind.base == def->v.reg ||
    743        (op->v.ind.index != (Reg)REG_NONE && op->v.ind.index == def->v.reg))) {
    744     set_indirect_field(op, def->v.reg, src->v.reg);
    745     return 1;
    746   }
    747   return 0;
    748 }
    749 
    750 static int subst_abivalue_uses(CGABIValue* v, const Operand* def,
    751                                const Operand* src) {
    752   int n = 0;
    753   n += subst_one_aux_operand(&v->storage, def, src);
    754   for (u32 i = 0; i < v->nparts; ++i)
    755     n += subst_one_aux_operand((Operand*)&v->parts[i].op, def, src);
    756   return n;
    757 }
    758 
    759 /* Walk the operands of consumer `in` and try to substitute uses of `def`
    760  * (producer's destination, OPK_REG) with `src` (producer's source, OPK_REG
    761  * or OPK_IMM). For OPK_INDIRECT operands, substitution into base/index is
    762  * only valid for OPK_REG `src`. Returns the number of operands actually
    763  * rewritten. */
    764 static int subst_consumer_operands(Inst* in, const Operand* def,
    765                                    const Operand* src, SubstKind kind,
    766                                    int copy_imm_ok) {
    767   int n = 0;
    768   for (u32 oi = 0; oi < in->nopnds; ++oi) {
    769     Operand* op = &in->opnds[oi];
    770     /* Direct OPK_REG substitution: requires the slot to be on the whitelist. */
    771     if (op->kind == OPK_REG && same_phys_reg(op, def) &&
    772         combine_subst_slot(in, oi, kind, copy_imm_ok)) {
    773       *op = *src;
    774       ++n;
    775       continue;
    776     }
    777     /* Indirect base/index substitution: only OPK_REG src may land here. The
    778      * substitution only changes which register computes the address, not the
    779      * value being stored — safe for IR_STORE / IR_ATOMIC_STORE /
    780      * IR_BITFIELD_STORE / IR_AGG_COPY / IR_AGG_SET as well. */
    781     if (op->kind == OPK_INDIRECT && kind == SK_REG && src->kind == OPK_REG &&
    782         def->kind == OPK_REG && def->cls == RC_INT && src->cls == RC_INT &&
    783         (op->v.ind.base == def->v.reg ||
    784          (op->v.ind.index != (Reg)REG_NONE && op->v.ind.index == def->v.reg))) {
    785       set_indirect_field(op, def->v.reg, src->v.reg);
    786       ++n;
    787     }
    788   }
    789 
    790   /* Aux register uses: IR_CALL plan/desc args + callee. SK_REG only —
    791    * substituting an immediate or convert into an ABI-bound use would break
    792    * the call lowering. IR_RET is intentionally excluded: the return register
    793    * is live-out by ABI, so a producer copying into it stays alive even after
    794    * substituting val.storage, and emit_ret would emit a redundant second
    795    * move. */
    796   if (kind == SK_REG) {
    797     if ((IROp)in->op == IR_CALL) {
    798       IRCallAux* aux = (IRCallAux*)in->extra.aux;
    799       if (aux) {
    800         if (aux->use_plan_replay) {
    801           n += subst_one_aux_operand(&aux->plan.callee, def, src);
    802           for (u32 k = 0; k < aux->plan.nargs; ++k)
    803             n += subst_one_aux_operand(&aux->plan.args[k].src, def, src);
    804         } else {
    805           n += subst_one_aux_operand(&aux->desc.callee, def, src);
    806           for (u32 k = 0; k < aux->desc.nargs; ++k)
    807             n += subst_abivalue_uses((CGABIValue*)&aux->desc.args[k], def, src);
    808         }
    809       }
    810     }
    811   }
    812   return n;
    813 }
    814 
    815 /* Try to substitute the producer of (cls, reg) into uses of that register
    816  * in `in`. Returns 1 if at least one operand was rewritten. */
    817 static int try_substitute_for_reg(CombineCtx* ctx, Inst* in, i32 i, u8 cls,
    818                                   Reg reg) {
    819   i32 prod_idx = ctx_producer_of(ctx, cls, reg);
    820   if (prod_idx < 0 || prod_idx >= i) return 0;
    821   Inst* prod = &ctx->bl->insts[prod_idx];
    822   IROp pop = (IROp)prod->op;
    823   if (pop != IR_COPY && pop != IR_LOAD_IMM && pop != IR_CONVERT) return 0;
    824   if (prod->nopnds < 1 || prod->opnds[0].kind != OPK_REG) return 0;
    825   if (prod->opnds[0].cls != cls || prod->opnds[0].v.reg != reg) return 0;
    826 
    827   Operand def = prod->opnds[0];
    828   SubstKind kind;
    829   Operand src_op;
    830   memset(&src_op, 0, sizeof src_op);
    831   if (pop == IR_COPY) {
    832     if (prod->nopnds < 2 || prod->opnds[1].kind != OPK_REG) return 0;
    833     if (ctx_def_changed_since(ctx, prod->opnds[1].cls, prod->opnds[1].v.reg,
    834                               prod_idx))
    835       return 0;
    836     if (is_target_scratch_reg(ctx->f, prod->opnds[1].cls, prod->opnds[1].v.reg))
    837       return 0;
    838     kind = SK_REG;
    839     src_op = prod->opnds[1];
    840   } else if (pop == IR_LOAD_IMM) {
    841     kind = SK_IMM;
    842     src_op.kind = OPK_IMM;
    843     src_op.cls = def.cls;
    844     src_op.type = def.type;
    845     src_op.v.imm = prod->extra.imm;
    846   } else { /* IR_CONVERT */
    847     if (prod->nopnds < 2 || prod->opnds[1].kind != OPK_REG) return 0;
    848     /* Fold only when consumer is also a convert with matching shape (the
    849      * broader ext-of-ext rules run in try_combine_exts). */
    850     if ((IROp)in->op != IR_CONVERT) return 0;
    851     if (in->nopnds < 2 || in->opnds[1].kind != OPK_REG) return 0;
    852     if (in->extra.imm != prod->extra.imm) return 0;
    853     if (in->opnds[1].type != prod->opnds[0].type) return 0;
    854     if (in->opnds[0].type != prod->opnds[0].type) return 0;
    855     if (ctx_def_changed_since(ctx, prod->opnds[1].cls, prod->opnds[1].v.reg,
    856                               prod_idx))
    857       return 0;
    858     kind = SK_CV;
    859     src_op = prod->opnds[1];
    860   }
    861 
    862   /* For a register-source IR_COPY this is local copy propagation: rewriting
    863    * one use of the copy's dest to its source is value-safe whenever the source
    864    * is unchanged between the copy and this consumer — already verified by the
    865    * ctx_def_changed_since check above (which treats CALL/ASM/INTRINSIC as a
    866    * clobber of every register). The copy itself is not removed here; if all its
    867    * uses fold away and it is not live-out, post-combine DCE deletes it. So no
    868    * single-use or cross-block live-out restriction is required, which lets the
    869    * O1 path collapse the multi-use reg->reg copy chains it would otherwise
    870    * leave behind (it never runs the O2 coalescer).
    871    *
    872    * The immediate / convert producers don't propagate a register value, so
    873    * they keep the stricter single-use + live-out gate: substituting them into
    874    * every use would duplicate a (possibly multi-instruction) materialization
    875    * rather than shorten a copy chain. */
    876   if (kind != SK_REG) {
    877     int killed = 0;
    878     int uses_total =
    879         count_uses_in_live_range(ctx->f, ctx->bl, prod_idx, &def, &killed);
    880     if (uses_total != 1) return 0;
    881     if (!killed && opt_block_live_out_has_phys_reg(ctx->f, ctx->hard_live,
    882                                                    ctx->bl->id, &def))
    883       return 0;
    884   }
    885 
    886   /* O1 (no coalescing) folds immediates into IR_COPY; O2 leaves the copy
    887    * register-to-register so coalescing + self-copy removal handles it. */
    888   int copy_imm_ok = ctx->f && !ctx->f->opt_coalesce_parent;
    889   int n = subst_consumer_operands(in, &def, &src_op, kind, copy_imm_ok);
    890   if (n > 0) {
    891     ctx->block_change_p = 1;
    892     return 1;
    893   }
    894   return 0;
    895 }
    896 
    897 /* Mark (cls, reg) as already-attempted for this consumer so the same producer
    898  * is not looked up twice when a register appears in multiple operand slots
    899  * (e.g. `[r4 + r4*4]`, or `add r1, r4, r4`). */
    900 static int seen_mark(u32 seen[OPT_REG_CLASSES], u8 cls, Reg reg) {
    901   if (cls >= OPT_REG_CLASSES || reg >= 32) return 1;
    902   u32 bit = 1u << reg;
    903   if (seen[cls] & bit) return 1;
    904   seen[cls] |= bit;
    905   return 0;
    906 }
    907 
    908 static void try_substitute_aux_operand(CombineCtx* ctx, Inst* in, i32 i,
    909                                        const Operand* op,
    910                                        u32 seen[OPT_REG_CLASSES], int* any) {
    911   if (op->kind == OPK_REG) {
    912     if (!seen_mark(seen, op->cls, op->v.reg))
    913       *any |= try_substitute_for_reg(ctx, in, i, op->cls, op->v.reg);
    914   } else if (op->kind == OPK_INDIRECT) {
    915     if (op->v.ind.base != (Reg)REG_NONE &&
    916         !seen_mark(seen, RC_INT, op->v.ind.base))
    917       *any |= try_substitute_for_reg(ctx, in, i, RC_INT, op->v.ind.base);
    918     if (op->v.ind.index != (Reg)REG_NONE &&
    919         !seen_mark(seen, RC_INT, op->v.ind.index))
    920       *any |= try_substitute_for_reg(ctx, in, i, RC_INT, op->v.ind.index);
    921   }
    922 }
    923 
    924 static void try_substitute_aux_abivalue(CombineCtx* ctx, Inst* in, i32 i,
    925                                         const CGABIValue* v,
    926                                         u32 seen[OPT_REG_CLASSES], int* any) {
    927   try_substitute_aux_operand(ctx, in, i, &v->storage, seen, any);
    928   for (u32 k = 0; k < v->nparts; ++k)
    929     try_substitute_aux_operand(ctx, in, i, &v->parts[k].op, seen, any);
    930 }
    931 
    932 /* Try to substitute producers into operand slots of `in`. Walks the direct
    933  * operands of `in` and looks up only the producers of registers actually
    934  * referenced (typically 2-3 per inst). Also walks IR_CALL aux register uses
    935  * (args + callee) so reg->reg copies feeding call args or the callee get
    936  * propagated. See subst_consumer_operands for why IR_RET is excluded. */
    937 static int try_substitute(CombineCtx* ctx, Inst* in, i32 i) {
    938   int any = 0;
    939   u32 seen[OPT_REG_CLASSES] = {0};
    940   for (u32 oi = 0; oi < in->nopnds; ++oi) {
    941     const Operand* op = &in->opnds[oi];
    942     try_substitute_aux_operand(ctx, in, i, op, seen, &any);
    943   }
    944   if ((IROp)in->op == IR_CALL) {
    945     IRCallAux* aux = (IRCallAux*)in->extra.aux;
    946     if (aux) {
    947       if (aux->use_plan_replay) {
    948         try_substitute_aux_operand(ctx, in, i, &aux->plan.callee, seen, &any);
    949         for (u32 k = 0; k < aux->plan.nargs; ++k)
    950           try_substitute_aux_operand(ctx, in, i, &aux->plan.args[k].src, seen,
    951                                      &any);
    952       } else {
    953         try_substitute_aux_operand(ctx, in, i, &aux->desc.callee, seen, &any);
    954         for (u32 k = 0; k < aux->desc.nargs; ++k)
    955           try_substitute_aux_abivalue(
    956               ctx, in, i, (const CGABIValue*)&aux->desc.args[k], seen, &any);
    957       }
    958     }
    959   }
    960   return any;
    961 }
    962 
    963 /* ---- Rewrite 2: address-mode synthesis ---- */
    964 
    965 /* Try to fold add/shl producers into one OPK_INDIRECT operand of `in`. */
    966 static int try_addr_synth_one_op(CombineCtx* ctx, Inst* in, i32 i,
    967                                  Operand* op) {
    968   (void)in;
    969   if (op->kind != OPK_INDIRECT) return 0;
    970   int any = 0;
    971 
    972   /* (a/c) base producer is IR_BINOP IADD. reg+reg synthesizes a base+index
    973    * pair (requires no existing index, sub-rule a); reg+imm folds the immediate
    974    * into ofs (sub-rule c, ok with or without an existing index). */
    975   if (op->v.ind.base != (Reg)REG_NONE) {
    976     Reg b = op->v.ind.base;
    977     i32 prod_idx = ctx_producer_of(ctx, RC_INT, b);
    978     if (prod_idx >= 0 && prod_idx < i) {
    979       Inst* prod = &ctx->bl->insts[prod_idx];
    980       if ((IROp)prod->op == IR_BINOP && prod->nopnds == 3 &&
    981           (BinOp)prod->extra.imm == BO_IADD && prod->opnds[0].kind == OPK_REG &&
    982           prod->opnds[0].cls == RC_INT && prod->opnds[0].v.reg == b) {
    983         Operand prod_def = prod->opnds[0];
    984         /* Single-use within producer's live range (the only further use
    985          * before next redef/barrier). */
    986         int killed = 0;
    987         int uses_after = count_uses_in_live_range(ctx->f, ctx->bl, prod_idx,
    988                                                   &prod_def, &killed);
    989         if (uses_after == 1 &&
    990             (killed || !opt_block_live_out_has_phys_reg(
    991                            ctx->f, ctx->hard_live, ctx->bl->id, &prod_def))) {
    992           Operand lhs = prod->opnds[1];
    993           Operand rhs = prod->opnds[2];
    994           int has_no_index = (op->v.ind.index == (Reg)REG_NONE);
    995           /* reg + reg: base = lhs, index = rhs, scale=0. Needs an empty
    996            * index slot — cannot stack two indices. */
    997           if (has_no_index && lhs.kind == OPK_REG && rhs.kind == OPK_REG &&
    998               lhs.cls == RC_INT && rhs.cls == RC_INT &&
    999               !ctx_def_changed_since(ctx, RC_INT, lhs.v.reg, prod_idx) &&
   1000               !ctx_def_changed_since(ctx, RC_INT, rhs.v.reg, prod_idx)) {
   1001             op->v.ind.base = lhs.v.reg;
   1002             op->v.ind.index = rhs.v.reg;
   1003             op->v.ind.log2_scale = 0;
   1004             any = 1;
   1005           }
   1006           /* reg + imm: base = lhs, fold imm into ofs. Safe with or without
   1007            * an existing index — only the offset is mutated. */
   1008           else if (lhs.kind == OPK_REG && rhs.kind == OPK_IMM &&
   1009                    lhs.cls == RC_INT &&
   1010                    !ctx_def_changed_since(ctx, RC_INT, lhs.v.reg, prod_idx)) {
   1011             i64 sum = (i64)op->v.ind.ofs + rhs.v.imm;
   1012             if (sum >= INT32_MIN && sum <= INT32_MAX) {
   1013               op->v.ind.base = lhs.v.reg;
   1014               op->v.ind.ofs = (i32)sum;
   1015               any = 1;
   1016             }
   1017           }
   1018           /* imm + reg: base = rhs, fold imm into ofs (commutative IADD). */
   1019           else if (lhs.kind == OPK_IMM && rhs.kind == OPK_REG &&
   1020                    rhs.cls == RC_INT &&
   1021                    !ctx_def_changed_since(ctx, RC_INT, rhs.v.reg, prod_idx)) {
   1022             i64 sum = (i64)op->v.ind.ofs + lhs.v.imm;
   1023             if (sum >= INT32_MIN && sum <= INT32_MAX) {
   1024               op->v.ind.base = rhs.v.reg;
   1025               op->v.ind.ofs = (i32)sum;
   1026               any = 1;
   1027             }
   1028           }
   1029         }
   1030       }
   1031     }
   1032   }
   1033 
   1034   /* (b) index producer is IR_BINOP ISHL reg, imm with scale=0. */
   1035   if (op->v.ind.index != (Reg)REG_NONE && op->v.ind.log2_scale == 0) {
   1036     Reg idx = op->v.ind.index;
   1037     i32 prod_idx = ctx_producer_of(ctx, RC_INT, idx);
   1038     if (prod_idx >= 0 && prod_idx < i) {
   1039       Inst* prod = &ctx->bl->insts[prod_idx];
   1040       if ((IROp)prod->op == IR_BINOP && prod->nopnds == 3 &&
   1041           (BinOp)prod->extra.imm == BO_SHL && prod->opnds[0].kind == OPK_REG &&
   1042           prod->opnds[0].cls == RC_INT && prod->opnds[0].v.reg == idx &&
   1043           prod->opnds[1].kind == OPK_REG && prod->opnds[1].cls == RC_INT &&
   1044           prod->opnds[2].kind == OPK_IMM) {
   1045         i64 sh = prod->opnds[2].v.imm;
   1046         if (sh >= 1 && sh <= 3) {
   1047           Operand prod_def = prod->opnds[0];
   1048           int killed = 0;
   1049           int uses_after = count_uses_in_live_range(ctx->f, ctx->bl, prod_idx,
   1050                                                     &prod_def, &killed);
   1051           /* `<= 2` allows the degenerate [base == index] case where the SHL
   1052            * dst appears twice in the same indirect; rewriting only the index
   1053            * still yields equivalent arithmetic when base also held the SHL
   1054            * dst (base unchanged: r*4; index rewritten: r_src*4; r*4 == r*4). */
   1055           if (uses_after >= 1 && uses_after <= 2 &&
   1056               (killed || !opt_block_live_out_has_phys_reg(
   1057                              ctx->f, ctx->hard_live, ctx->bl->id, &prod_def)) &&
   1058               !ctx_def_changed_since(ctx, RC_INT, prod->opnds[1].v.reg,
   1059                                      prod_idx)) {
   1060             op->v.ind.index = prod->opnds[1].v.reg;
   1061             op->v.ind.log2_scale = (u8)sh;
   1062             any = 1;
   1063           }
   1064         }
   1065       }
   1066     }
   1067   }
   1068 
   1069   return any;
   1070 }
   1071 
   1072 static int try_addr_synth(CombineCtx* ctx, Inst* in, i32 i) {
   1073   int any = 0;
   1074   for (u32 oi = 0; oi < in->nopnds; ++oi) {
   1075     if (in->opnds[oi].kind == OPK_INDIRECT) {
   1076       if (try_addr_synth_one_op(ctx, in, i, &in->opnds[oi])) {
   1077         any = 1;
   1078         ctx->block_change_p = 1;
   1079       }
   1080     }
   1081   }
   1082   return any;
   1083 }
   1084 
   1085 /* ---- Rewrite 3: sink producer into single-use IR_COPY destination ---- */
   1086 
   1087 static int try_sink(CombineCtx* ctx, Inst* in, i32 i) {
   1088   if ((IROp)in->op != IR_COPY || in->nopnds < 2) return 0;
   1089   if (in->opnds[0].kind != OPK_REG || in->opnds[1].kind != OPK_REG) return 0;
   1090   if (same_phys_reg(&in->opnds[0], &in->opnds[1])) return 0;
   1091 
   1092   Operand src = in->opnds[1];
   1093   Operand dst = in->opnds[0];
   1094   i32 prod_idx = ctx_producer_of(ctx, src.cls, src.v.reg);
   1095   if (prod_idx < 0 || prod_idx >= i) return 0;
   1096   Inst* prod = &ctx->bl->insts[prod_idx];
   1097   if (!producer_retargetable_op((IROp)prod->op)) return 0;
   1098   if (prod->nopnds < 1 || prod->opnds[0].kind != OPK_REG) return 0;
   1099   if (!same_phys_reg(&prod->opnds[0], &src)) return 0;
   1100 
   1101   /* Producer's source operands must not have been redefined since. */
   1102   for (u32 oi = 1; oi < prod->nopnds; ++oi) {
   1103     const Operand* p = &prod->opnds[oi];
   1104     if (p->kind == OPK_REG) {
   1105       if (ctx_def_changed_since(ctx, p->cls, p->v.reg, prod_idx)) return 0;
   1106     } else if (p->kind == OPK_INDIRECT) {
   1107       if (ctx_def_changed_since(ctx, RC_INT, p->v.ind.base, prod_idx)) return 0;
   1108       if (p->v.ind.index != (Reg)REG_NONE &&
   1109           ctx_def_changed_since(ctx, RC_INT, p->v.ind.index, prod_idx))
   1110         return 0;
   1111     }
   1112   }
   1113   /* If producer reads memory, no intervening memory write. */
   1114   if (inst_reads_memory(prod) && ctx->last_mem_def > prod_idx) return 0;
   1115   /* Retargeting moves the producer's write to `dst` earlier. That is only
   1116    * legal if the hard register is dead throughout the interval before the
   1117    * copy that originally wrote it. */
   1118   for (i32 j = prod_idx + 1; j < i; ++j) {
   1119     Inst* mid = &ctx->bl->insts[j];
   1120     if (inst_uses_phys_reg(mid, &dst) || inst_defines_phys_reg(mid, &dst))
   1121       return 0;
   1122   }
   1123 
   1124   /* Producer dst must have exactly one use within its live range (the copy
   1125    * at i). If the live range terminates inside the block, the cross-block
   1126    * live-out check is moot. */
   1127   int killed = 0;
   1128   int uses_total =
   1129       count_uses_in_live_range(ctx->f, ctx->bl, prod_idx, &src, &killed);
   1130   if (uses_total != 1) return 0;
   1131   if (killed && use_after_clobber_before_redef(ctx->bl, prod_idx, &src))
   1132     return 0;
   1133   if (!killed && opt_block_live_out_has_phys_reg(ctx->f, ctx->hard_live,
   1134                                                  ctx->bl->id, &src))
   1135     return 0;
   1136 
   1137   int swap_binop = 0;
   1138   if (!retarget_producer_legal(prod, &dst, &swap_binop)) return 0;
   1139 
   1140   if (swap_binop) {
   1141     Operand tmp = prod->opnds[1];
   1142     prod->opnds[1] = prod->opnds[2];
   1143     prod->opnds[2] = tmp;
   1144   }
   1145   prod->opnds[0] = dst;
   1146   if (prod->def != VAL_NONE) prod->def = (Val)dst.v.reg;
   1147 
   1148   /* Update last-def: producer no longer defines src, now defines dst. If src
   1149    * had an earlier reaching definition in the block, keep it visible for
   1150    * later source-availability checks. */
   1151   ctx_restore_removed_def(ctx, &src, prod_idx);
   1152   ctx->last_def[dst.cls][dst.v.reg] = prod_idx;
   1153 
   1154   /* Mark the copy NOP'd so compact removes it. */
   1155   in->op = IR_NOP;
   1156   in->def = VAL_NONE;
   1157   in->ndefs = 0;
   1158   in->defs = NULL;
   1159   in->nopnds = 0;
   1160   in->opnds = NULL;
   1161   ctx->block_change_p = 1;
   1162   return 1;
   1163 }
   1164 
   1165 /* ---- Rewrite 4: combine_exts (ext-of-ext chains) ---- */
   1166 
   1167 static int try_combine_exts(CombineCtx* ctx, Inst* in, i32 i) {
   1168   if ((IROp)in->op != IR_CONVERT || in->nopnds < 2) return 0;
   1169   if (in->opnds[1].kind != OPK_REG) return 0;
   1170   u32 sb, db;
   1171   int outer_sign;
   1172   if (!ext_params(in, &sb, &db, &outer_sign)) return 0;
   1173 
   1174   Reg src_reg = in->opnds[1].v.reg;
   1175   u8 src_cls = in->opnds[1].cls;
   1176   i32 prod_idx = ctx_producer_of(ctx, src_cls, src_reg);
   1177   if (prod_idx < 0 || prod_idx >= i) return 0;
   1178   Inst* prod = &ctx->bl->insts[prod_idx];
   1179   u32 isb, idb;
   1180   int inner_sign;
   1181   if (!ext_params(prod, &isb, &idb, &inner_sign)) return 0;
   1182   if (prod->opnds[1].kind != OPK_REG) return 0;
   1183   if (ctx_def_changed_since(ctx, prod->opnds[1].cls, prod->opnds[1].v.reg,
   1184                             prod_idx))
   1185     return 0;
   1186 
   1187   /* Single-use of inner ext dst within its live range (only this insn). */
   1188   Operand prod_def = prod->opnds[0];
   1189   int killed = 0;
   1190   int uses_total =
   1191       count_uses_in_live_range(ctx->f, ctx->bl, prod_idx, &prod_def, &killed);
   1192   if (uses_total != 1) return 0;
   1193   if (!killed && opt_block_live_out_has_phys_reg(ctx->f, ctx->hard_live,
   1194                                                  ctx->bl->id, &prod_def))
   1195     return 0;
   1196 
   1197   /* Outer width `db`, inner width `idb`. Inner reads `isb` bytes. */
   1198   u32 outer_in = sb;                  /* outer's source width */
   1199   u32 inner_out = idb;                /* inner's output width */
   1200   if (outer_in > inner_out) return 0; /* sanity: inner feeds outer */
   1201 
   1202   /* If inner is an identity-shaped convert that reads back its own def
   1203    * register (e.g. `convert rB, rB`), the rewrite would set in.opnds[1] to
   1204    * the same register it already holds. Reporting a change spins the
   1205    * per-BB fixpoint forever. Skip. */
   1206   if (prod->opnds[1].kind == OPK_REG && in->opnds[1].kind == OPK_REG &&
   1207       prod->opnds[1].cls == in->opnds[1].cls &&
   1208       prod->opnds[1].v.reg == in->opnds[1].v.reg)
   1209     return 0;
   1210 
   1211   /* Pattern A: outer width <= inner-source width.  Outer can reach past the
   1212    * inner ext to the inner source directly, keeping outer's signedness.
   1213    * MIR allows any signedness combination here. */
   1214   if (db <= isb) {
   1215     in->opnds[1] = prod->opnds[1];
   1216     in->opnds[1].type = prod->opnds[1].type;
   1217     ctx->block_change_p = 1;
   1218     return 1;
   1219   }
   1220 
   1221   /* Pattern B: inner-source width < outer width. Allowed iff outer is signed
   1222    * OR inner is unsigned; excludes the unsafe uext(sext) combination. */
   1223   if (isb < db && (outer_sign || !inner_sign)) {
   1224     in->opnds[1] = prod->opnds[1];
   1225     in->opnds[1].type = prod->opnds[1].type;
   1226     /* outer's ConvKind already encodes the outer signedness; keep it. */
   1227     ctx->block_change_p = 1;
   1228     return 1;
   1229   }
   1230 
   1231   return 0;
   1232 }
   1233 
   1234 /* ---- Existing IR_RET retarget (kept; runs in the forward pass) ---- */
   1235 
   1236 static int try_ret_retarget(Func* f, Block* bl, i32 i) {
   1237   if (!f->opt_rewritten || i <= 0) return 0;
   1238   Inst* in = &bl->insts[i];
   1239   if ((IROp)in->op != IR_RET) return 0;
   1240   IRRetAux* aux = (IRRetAux*)in->extra.aux;
   1241   Operand* ret_op = NULL;
   1242   Reg ret_reg = REG_NONE;
   1243   if (!aux || !aux->present || !ret_scalar_storage(&aux->val, &ret_op) ||
   1244       !first_return_reg(f, ret_op->cls, &ret_reg) || ret_reg == (Reg)REG_NONE ||
   1245       ret_reg == ret_op->v.reg)
   1246     return 0;
   1247   Inst* producer = &bl->insts[i - 1u];
   1248   Operand ret_dst = *ret_op;
   1249   ret_dst.v.reg = ret_reg;
   1250   int swap_binop = 0;
   1251   if (producer->nopnds < 1 || !same_phys_reg(&producer->opnds[0], ret_op) ||
   1252       !retarget_producer_legal(producer, &ret_dst, &swap_binop))
   1253     return 0;
   1254   if (swap_binop) {
   1255     Operand tmp = producer->opnds[1];
   1256     producer->opnds[1] = producer->opnds[2];
   1257     producer->opnds[2] = tmp;
   1258   }
   1259   producer->opnds[0] = ret_dst;
   1260   *ret_op = ret_dst;
   1261   return 1;
   1262 }
   1263 
   1264 /* ---- Rewrite 5: constant-fold a convert of a load_imm into a load_imm ----
   1265  *
   1266  * `load_imm rT,k ; convert rD,rT` where the convert is a bit-preserving
   1267  * integer/pointer move collapses to `load_imm rD,k'` (k' = convert applied to
   1268  * k). The original load_imm, now dead, is removed by post-combine DCE.
   1269  *
   1270  * This is the convert-shaped sibling of the load_imm-into-copy fold (see
   1271  * try_substitute / combine_subst_slot). It matters for pointer-typed constant
   1272  * call args (e.g. NewTreeNode((void*)0, ...)): the arg-register hint reaches
   1273  * the convert's def but not its load_imm source, so without this the source
   1274  * lands in a scratch and the convert emits a `mov rD, scratch`. Folding lets
   1275  * the load_imm inherit the convert's (hinted) dst directly — `movz x0, 0`. */
   1276 static int try_fold_const_convert(CombineCtx* ctx, Inst* in, i32 i) {
   1277   if ((IROp)in->op != IR_CONVERT || in->nopnds < 2) return 0;
   1278   if (in->opnds[0].kind != OPK_REG || in->opnds[1].kind != OPK_REG) return 0;
   1279   /* Integer/pointer domain only: a load_imm produces an int-class value, and
   1280    * the bit-preserving convert kinds we fold (BITCAST/ZEXT/SEXT/TRUNC) keep it
   1281    * int-class. A class change (RC_INT->RC_FP, e.g. an int->float bitcast) is
   1282    * not bit-preserving and must not collapse to a load_imm. */
   1283   if (in->opnds[0].cls != RC_INT || in->opnds[1].cls != RC_INT) return 0;
   1284 
   1285   i32 prod_idx = ctx_producer_of(ctx, in->opnds[1].cls, in->opnds[1].v.reg);
   1286   if (prod_idx < 0 || prod_idx >= i) return 0;
   1287   Inst* prod = &ctx->bl->insts[prod_idx];
   1288   if ((IROp)prod->op != IR_LOAD_IMM || prod->nopnds < 1 ||
   1289       prod->opnds[0].kind != OPK_REG ||
   1290       !same_phys_reg(&prod->opnds[0], &in->opnds[1]))
   1291     return 0;
   1292 
   1293   u32 sb = combine_scalar_width_bytes(ctx->f, in->opnds[1].type);
   1294   u32 db = combine_scalar_width_bytes(ctx->f, in->opnds[0].type);
   1295   i64 value = 0;
   1296   if (!const_convert_value((ConvKind)in->extra.imm, prod->extra.imm, sb, db,
   1297                            &value))
   1298     return 0;
   1299 
   1300   /* Single-use + not-live-out gate on the load_imm dst, mirroring the SK_IMM
   1301    * gate in try_substitute_for_reg: only fold when this convert is the sole
   1302    * use, so we replace rather than duplicate the materialization. */
   1303   Operand src_def = prod->opnds[0];
   1304   int killed = 0;
   1305   if (count_uses_in_live_range(ctx->f, ctx->bl, prod_idx, &src_def, &killed) !=
   1306       1)
   1307     return 0;
   1308   if (!killed && opt_block_live_out_has_phys_reg(ctx->f, ctx->hard_live,
   1309                                                  ctx->bl->id, &src_def))
   1310     return 0;
   1311 
   1312   /* Rewrite the convert in place into a load_imm of the converted constant.
   1313    * The dst operand (opnds[0]) is the hinted arg/return reg and is kept; the
   1314    * source operand is dropped. The dead load_imm at prod_idx is left for DCE.
   1315    */
   1316   in->op = (u16)IR_LOAD_IMM;
   1317   in->type = in->opnds[0].type;
   1318   in->nopnds = 1;
   1319   in->extra.imm = value;
   1320   ctx->block_change_p = 1;
   1321   return 1;
   1322 }
   1323 
   1324 /* ---- per-BB driver ---- */
   1325 
   1326 static int opt_combine_fold_block(Func* f, Block* bl,
   1327                                   const OptHardBlockLive* hard_live) {
   1328   enum { enable_o1_combine_rewrites = 1 };
   1329   enum { enable_o1_sink_rewrites = 1 };
   1330   CombineCtx ctx;
   1331   ctx.f = f;
   1332   ctx.bl = bl;
   1333   ctx.hard_live = hard_live;
   1334   ctx_reset(&ctx);
   1335 
   1336   for (i32 i = 0; i < (i32)bl->ninsts; ++i) {
   1337     Inst* in = &bl->insts[i];
   1338 
   1339     if (enable_o1_combine_rewrites && try_ret_retarget(f, bl, i)) {
   1340       ctx.block_change_p = 1;
   1341       /* The producer's def changed; update ctx for the producer at i-1. */
   1342       Inst* prev = &bl->insts[i - 1];
   1343       Operand probe;
   1344       memset(&probe, 0, sizeof probe);
   1345       probe.kind = OPK_REG;
   1346       for (u8 c = 0; c < OPT_REG_CLASSES; ++c) {
   1347         probe.cls = c;
   1348         for (Reg r = 0; r < OPT_MAX_HARD_REGS; ++r) {
   1349           probe.v.reg = r;
   1350           if (ctx.last_def[c][r] == i - 1 &&
   1351               !inst_defines_phys_reg(prev, &probe))
   1352             ctx_restore_removed_def(&ctx, &probe, i - 1);
   1353         }
   1354       }
   1355       ctx_record(&ctx, prev, i - 1);
   1356     }
   1357 
   1358     /* Skip NOPs left by prior sink rewrites. */
   1359     if ((IROp)in->op == IR_NOP) continue;
   1360 
   1361     if (enable_o1_sink_rewrites && try_sink(&ctx, in, i)) {
   1362       /* sink NOP'd the copy and updated ctx. */
   1363       continue;
   1364     }
   1365 
   1366     if (enable_o1_combine_rewrites) {
   1367       try_fold_const_convert(&ctx, in, i);
   1368       try_combine_exts(&ctx, in, i);
   1369       try_substitute(&ctx, in, i);
   1370       try_addr_synth(&ctx, in, i);
   1371     }
   1372 
   1373     ctx_record(&ctx, in, i);
   1374   }
   1375   return ctx.block_change_p;
   1376 }
   1377 
   1378 static int opt_combine_compact_block(Func* f, Block* bl) {
   1379   u32 w = 0;
   1380   int changed = 0;
   1381   for (u32 i = 0; i < bl->ninsts; ++i) {
   1382     Inst* in = &bl->insts[i];
   1383 
   1384     /* Drop NOPs (e.g. copies sunk by try_sink). */
   1385     if ((IROp)in->op == IR_NOP) {
   1386       changed = 1;
   1387       continue;
   1388     }
   1389 
   1390     if ((IROp)in->op == IR_COPY && in->nopnds == 2 &&
   1391         same_reg_operand(&in->opnds[0], &in->opnds[1])) {
   1392       changed = 1;
   1393       continue;
   1394     }
   1395 
   1396     if (w) {
   1397       Inst* prev = &bl->insts[w - 1u];
   1398       if ((IROp)prev->op == IR_STORE && (IROp)in->op == IR_LOAD &&
   1399           same_spill_slot_and_size(f, prev, in) &&
   1400           same_reg_operand(&prev->opnds[1], &in->opnds[0])) {
   1401         changed = 1;
   1402         continue;
   1403       }
   1404       if ((IROp)prev->op == IR_LOAD && (IROp)in->op == IR_STORE &&
   1405           same_spill_slot_and_size(f, prev, in) &&
   1406           same_reg_operand(&prev->opnds[0], &in->opnds[1])) {
   1407         changed = 1;
   1408         continue;
   1409       }
   1410       if ((IROp)prev->op == IR_LOAD && (IROp)in->op == IR_LOAD &&
   1411           same_spill_slot_and_size(f, prev, in) &&
   1412           same_reg_operand(&prev->opnds[0], &in->opnds[0])) {
   1413         changed = 1;
   1414         continue;
   1415       }
   1416       if ((IROp)prev->op == IR_STORE && (IROp)in->op == IR_STORE &&
   1417           same_spill_access(f, prev, in, NULL)) {
   1418         bl->insts[w - 1u] = *in;
   1419         changed = 1;
   1420         continue;
   1421       }
   1422     }
   1423 
   1424     bl->insts[w++] = *in;
   1425   }
   1426   bl->ninsts = w;
   1427   return changed;
   1428 }
   1429 
   1430 void opt_combine(Func* f) {
   1431   if (!f) return;
   1432   opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE);
   1433   OptHardBlockLive* hard_live = opt_maybe_build_hard_live(f);
   1434   /* Per-BB fixpoint, matching MIR's combine loop (mir-gen.c:9037-9038). */
   1435   /* Per-BB fixpoint with a defensive iteration cap. Each rewrite is supposed
   1436    * to be monotonic (it only fires when it strictly improves the IR), so we
   1437    * shouldn't hit the cap in practice — but a noisy abort beats a hang if a
   1438    * future rewrite accidentally oscillates. */
   1439   enum { MAX_COMBINE_ITERS = 64 };
   1440   for (u32 b = 0; b < f->nblocks; ++b) {
   1441     Block* bl = &f->blocks[b];
   1442     for (int iter = 0; iter < MAX_COMBINE_ITERS; ++iter) {
   1443       int folded = opt_combine_fold_block(f, bl, hard_live);
   1444       int compacted = opt_combine_compact_block(f, bl);
   1445       if (!folded && !compacted) break;
   1446     }
   1447   }
   1448 }