kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

pass_addr_fold.c (36623B)


      1 /* O1 HIR address-folding passes.
      2  *
      3  * Split out of pass_o2.c so the always-on O1 lowering pipeline does not depend
      4  * on the (currently disabled) O2 pass translation unit. These three passes run
      5  * at every opt_level >= 1:
      6  *
      7  *   - opt_addr_xform_pregs:    PReg-namespace addr-of-local folding
      8  *   - opt_promote_scalar_locals: promote non-escaped scalar locals to PRegs
      9  *   - opt_addr_of_global_cse:  hoist/CSE duplicate ADDR_OF(global)
     10  */
     11 #include <kit/cg.h>
     12 #include <string.h>
     13 
     14 #include "opt/opt_internal.h"
     15 
     16 /* Private copy of the tiny inst-removal helper shared with pass_o2.c's
     17  * opt_addr_xform. Both are file-local statics, so there is no link conflict. */
     18 static void addr_inst_remove(Inst* in) {
     19   in->op = IR_NOP;
     20   in->def = VAL_NONE;
     21   in->ndefs = 0;
     22   in->defs = NULL;
     23   in->nopnds = 0;
     24   in->opnds = NULL;
     25 }
     26 
     27 /* PReg-namespace variant of opt_addr_xform for the O1 pipeline (no SSA, no
     28  * Val-keyed def-use chains). Scans the whole function once per candidate
     29  * IR_ADDR_OF def to classify uses of its PReg result.
     30  *
     31  * Use classifications (see addr_xform_pregs_classify_use):
     32  *
     33  *   OPF_ESCAPE     The use is something other than a non-observable
     34  *                  IR_LOAD/IR_STORE base operand. The IR_ADDR_OF cannot
     35  *                  be folded; the local's address truly escapes.
     36  *   OPF_FOLD_LOCAL Zero-EA use: `OPK_INDIRECT(base=p, ofs=0, index=NONE)`
     37  *                  in load/store base position. Foldable to OPK_LOCAL.
     38  *   OPF_FOLD_EA    EA-shaped use: same load/store base position, but with
     39  *                  nonzero `ofs` or `index != REG_NONE`. The EA must stay
     40  *                  on the load/store (the operand layout for OPK_LOCAL
     41  *                  cannot carry the EA today), so the operand is left
     42  *                  alone and the IR_ADDR_OF def must stay alive to feed
     43  *                  the OPK_INDIRECT base. The use is still recognized as
     44  *                  "non-escape" for downstream analysis (e.g. scalar
     45  *                  promotion's non-escape check).
     46  *
     47  * After classification: if any use is OPF_ESCAPE, no rewrite happens. If
     48  * every use is OPF_FOLD_LOCAL, fold all uses to OPK_LOCAL and NOP the
     49  * IR_ADDR_OF. If a mix of OPF_FOLD_LOCAL and OPF_FOLD_EA, fold the
     50  * zero-EA uses but keep the IR_ADDR_OF alive for the EA-shaped uses. */
     51 
     52 typedef enum AddrXformUseClass {
     53   OPF_ESCAPE = 0,
     54   OPF_FOLD_LOCAL = 1,
     55   OPF_FOLD_EA = 2,
     56 } AddrXformUseClass;
     57 
     58 static int addr_xform_pregs_main_op_position_ok(Inst* in, u32 op_idx) {
     59   if ((IROp)in->op != IR_LOAD && (IROp)in->op != IR_STORE) return 0;
     60   if (opt_mem_observable(&in->extra.mem)) return 0;
     61   if ((IROp)in->op == IR_LOAD && op_idx != 1u) return 0;
     62   if ((IROp)in->op == IR_STORE && op_idx != 0u) return 0;
     63   return 1;
     64 }
     65 
     66 static AddrXformUseClass addr_xform_pregs_classify_use(Inst* in, Operand* op,
     67                                                        u32 op_idx) {
     68   if (op->kind != OPK_INDIRECT) return OPF_ESCAPE;
     69   if (!addr_xform_pregs_main_op_position_ok(in, op_idx)) return OPF_ESCAPE;
     70   if (op->v.ind.ofs == 0 && op->v.ind.index == (Reg)REG_NONE)
     71     return OPF_FOLD_LOCAL;
     72   return OPF_FOLD_EA;
     73 }
     74 
     75 static int addr_xform_pregs_op_uses(const Operand* op, PReg p) {
     76   if (!op) return 0;
     77   if (op->kind == OPK_REG && (PReg)op->v.reg == p) return 1;
     78   if (op->kind == OPK_INDIRECT) {
     79     if ((PReg)op->v.ind.base == p) return 1;
     80     if (op->v.ind.index != (Reg)REG_NONE && (PReg)op->v.ind.index == p)
     81       return 1;
     82   }
     83   return 0;
     84 }
     85 
     86 static int addr_xform_pregs_abivalue_uses(const CGABIValue* v, PReg p) {
     87   if (!v) return 0;
     88   if (addr_xform_pregs_op_uses(&v->storage, p)) return 1;
     89   for (u32 i = 0; i < v->nparts; ++i)
     90     if (addr_xform_pregs_op_uses((const Operand*)&v->parts[i].op, p)) return 1;
     91   return 0;
     92 }
     93 
     94 static int addr_xform_pregs_aux_uses(Inst* in, PReg p) {
     95   switch ((IROp)in->op) {
     96     case IR_CALL: {
     97       IRCallAux* aux = (IRCallAux*)in->extra.aux;
     98       if (!aux) return 0;
     99       if (aux->use_plan_replay) {
    100         if (addr_xform_pregs_op_uses(&aux->plan.callee, p)) return 1;
    101         for (u32 i = 0; i < aux->plan.nargs; ++i)
    102           if (addr_xform_pregs_op_uses(&aux->plan.args[i].src, p)) return 1;
    103         for (u32 i = 0; i < aux->plan.nrets; ++i)
    104           if (addr_xform_pregs_op_uses(&aux->plan.rets[i].dst, p)) return 1;
    105       } else {
    106         if (addr_xform_pregs_op_uses(&aux->desc.callee, p)) return 1;
    107         for (u32 i = 0; i < aux->desc.nargs; ++i)
    108           if (addr_xform_pregs_abivalue_uses(
    109                   (const CGABIValue*)&aux->desc.args[i], p))
    110             return 1;
    111         if (addr_xform_pregs_abivalue_uses(&aux->desc.ret, p)) return 1;
    112       }
    113       return 0;
    114     }
    115     case IR_RET: {
    116       IRRetAux* aux = (IRRetAux*)in->extra.aux;
    117       if (!aux || !aux->present) return 0;
    118       return addr_xform_pregs_abivalue_uses(&aux->val, p);
    119     }
    120     case IR_SCOPE_BEGIN:
    121       return 0;
    122     case IR_ASM_BLOCK: {
    123       IRAsmAux* aux = (IRAsmAux*)in->extra.aux;
    124       if (!aux) return 0;
    125       for (u32 i = 0; i < aux->nin; ++i)
    126         if (addr_xform_pregs_op_uses(&aux->in_ops[i], p)) return 1;
    127       for (u32 i = 0; i < aux->nout; ++i)
    128         if (addr_xform_pregs_op_uses(&aux->out_ops[i], p)) return 1;
    129       return 0;
    130     }
    131     case IR_INTRINSIC: {
    132       IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux;
    133       if (!aux) return 0;
    134       for (u32 i = 0; i < aux->narg; ++i)
    135         if (addr_xform_pregs_op_uses(&aux->args[i], p)) return 1;
    136       for (u32 i = 0; i < aux->ndst; ++i)
    137         if (addr_xform_pregs_op_uses(&aux->dsts[i], p)) return 1;
    138       return 0;
    139     }
    140     default:
    141       return 0;
    142   }
    143 }
    144 
    145 /* Returns nonzero if every use of `p` is foldable (OPF_FOLD_LOCAL or
    146  * OPF_FOLD_EA) and at least one use exists. *out_has_ea is set to 1 if any
    147  * use was OPF_FOLD_EA; in that case the rewrite must keep the IR_ADDR_OF
    148  * alive (the EA-shaped use still names p as the OPK_INDIRECT base). */
    149 static int addr_xform_pregs_classify(Func* f, PReg p, Inst* def_inst,
    150                                      int* out_has_ea) {
    151   int has_foldable_use = 0;
    152   int has_ea = 0;
    153   for (u32 b = 0; b < f->nblocks; ++b) {
    154     Block* bl = &f->blocks[b];
    155     for (u32 i = 0; i < bl->ninsts; ++i) {
    156       Inst* in = &bl->insts[i];
    157       if (in == def_inst) continue;
    158       for (u32 o = 0; o < in->nopnds; ++o) {
    159         Operand* op = &in->opnds[o];
    160         if (!addr_xform_pregs_op_uses(op, p)) continue;
    161         AddrXformUseClass uc = addr_xform_pregs_classify_use(in, op, o);
    162         if (uc == OPF_ESCAPE) return 0;
    163         has_foldable_use = 1;
    164         if (uc == OPF_FOLD_EA) has_ea = 1;
    165       }
    166       if (addr_xform_pregs_aux_uses(in, p)) return 0;
    167     }
    168   }
    169   if (out_has_ea) *out_has_ea = has_ea;
    170   return has_foldable_use;
    171 }
    172 
    173 void opt_addr_xform_pregs(Func* f) {
    174   if (!f || f->opt_reg_ssa || f->opt_rewritten) return;
    175   int changed = 0;
    176   for (u32 b = 0; b < f->nblocks; ++b) {
    177     Block* bl = &f->blocks[b];
    178     for (u32 i = 0; i < bl->ninsts; ++i) {
    179       Inst* in = &bl->insts[i];
    180       if ((IROp)in->op != IR_ADDR_OF) continue;
    181       if (in->nopnds < 2) continue;
    182       if (in->opnds[0].kind != OPK_REG) continue;
    183       if (in->opnds[1].kind != OPK_LOCAL) continue;
    184       PReg p = (PReg)in->opnds[0].v.reg;
    185       if (!opt_reg_valid(f, p)) continue;
    186       int has_ea = 0;
    187       if (!addr_xform_pregs_classify(f, p, in, &has_ea)) continue;
    188       Operand local = in->opnds[1];
    189       /* Fold every zero-EA use of p to OPK_LOCAL. EA-shaped uses are left
    190        * as OPK_INDIRECT(base=p, ofs, index, log2_scale) so the EA stays on
    191        * the load/store; the IR_ADDR_OF def must survive to feed them. */
    192       for (u32 bb = 0; bb < f->nblocks; ++bb) {
    193         Block* rb = &f->blocks[bb];
    194         for (u32 ii = 0; ii < rb->ninsts; ++ii) {
    195           Inst* use = &rb->insts[ii];
    196           if (use == in) continue;
    197           for (u32 o = 0; o < use->nopnds; ++o) {
    198             Operand* op = &use->opnds[o];
    199             if (op->kind != OPK_INDIRECT) continue;
    200             if ((PReg)op->v.ind.base != p) continue;
    201             if (op->v.ind.ofs != 0 || op->v.ind.index != (Reg)REG_NONE)
    202               continue; /* EA-shaped; leave alone */
    203             Operand folded = local;
    204             folded.type =
    205                 use->extra.mem.type ? use->extra.mem.type : local.type;
    206             *op = folded;
    207           }
    208         }
    209       }
    210       if (!has_ea) addr_inst_remove(in);
    211       changed = 1;
    212     }
    213   }
    214   /* After folding, walk all frame slots and clear FSF_ADDR_TAKEN on any
    215    * slot whose surviving IR_ADDR_OF defs (if any) have all been retired.
    216    * The frontend-set ADDR_TAKEN flag is conservative; if we proved the
    217    * address no longer escapes, downstream passes (opt_promote_scalar_locals)
    218    * can take advantage of the actual non-escape state. */
    219   if (changed) {
    220     u8* still_taken =
    221         arena_zarray(f->arena, u8, f->nframe_slots ? f->nframe_slots : 1u);
    222     for (u32 b = 0; b < f->nblocks; ++b) {
    223       Block* bl = &f->blocks[b];
    224       for (u32 i = 0; i < bl->ninsts; ++i) {
    225         Inst* in = &bl->insts[i];
    226         if ((IROp)in->op != IR_ADDR_OF) continue;
    227         if (in->nopnds < 2 || in->opnds[1].kind != OPK_LOCAL) continue;
    228         FrameSlot slot = in->opnds[1].v.frame_slot;
    229         if (slot && slot <= f->nframe_slots) still_taken[slot - 1u] = 1;
    230       }
    231     }
    232     for (u32 s = 0; s < f->nframe_slots; ++s) {
    233       if (!still_taken[s]) f->frame_slots[s].flags &= (u16)~FSF_ADDR_TAKEN;
    234     }
    235   }
    236   if (changed)
    237     opt_analysis_invalidate(
    238         f, OPT_ANALYSIS_DEF_USE | OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP);
    239 }
    240 
    241 /* Scalar local promotion for the O1 pipeline. Runs after
    242  * `opt_addr_xform_pregs` has folded zero-EA `OPK_INDIRECT(p)` uses to
    243  * `OPK_LOCAL(slot)` and retired non-escaping `IR_ADDR_OF` defs. For each
    244  * frame slot that is now only referenced as the base of matching-type,
    245  * non-observable `IR_LOAD`/`IR_STORE`, the slot is replaced by a fresh
    246  * mutable PReg: each store becomes `IR_COPY P_slot, src` (or `IR_LOAD_IMM`
    247  * for an immediate source), each load becomes `IR_COPY dst, P_slot`. The
    248  * slot becomes unreferenced and the backend drops it from the frame.
    249  *
    250  * A mutable PReg in `-O1` IR has the same data-flow semantics as a named
    251  * memory cell that does not escape (multiple defs, multiple uses, value at
    252  * a use comes from whichever def reaches it via CFG edges). No phis are
    253  * required because the IR model has no phis; PReg flow becomes hard-reg
    254  * flow after regalloc, and regalloc already handles it.
    255  *
    256  * Conditions for promotion (per slot):
    257  *
    258  *   1. Slot kind is FS_LOCAL (real locals, not spills, sret, alloca).
    259  *   2. Slot has no FSF_ADDR_TAKEN, FSF_VOLATILE flag (after
    260  *      `opt_addr_xform_pregs` has cleared the conservative ADDR_TAKEN
    261  *      flag for slots whose IR_ADDR_OF defs were all retired).
    262  *   3. Slot's declared type is scalar (int, float, bool, ptr, enum).
    263  *   4. Every appearance of `OPK_LOCAL(slot)` in any instruction operand is
    264  *      either:
    265  *        - `IR_LOAD.opnds[1]` with matching `access.type == slot.type`,
    266  *          no observable mem flags, dst is OPK_REG;
    267  *        - `IR_STORE.opnds[0]` with matching `access.type == slot.type`,
    268  *          no observable mem flags, src is OPK_REG or OPK_IMM.
    269  *   5. Slot does not appear in any aux operand position (calls, asm, etc.)
    270  *      or as an OPK_LOCAL anywhere else (e.g., a surviving IR_ADDR_OF).
    271  *
    272  * Param-slot case: FS_PARAM slots are excluded. The backend prologue is
    273  * responsible for moving the ABI-incoming hard reg into the slot, and that
    274  * move is not visible in the IR (there is no `IR_STORE OPK_LOCAL(slot)` to
    275  * rewrite). At O1 the wrapper already places scalar params in REG storage
    276  * when the frontend does not force a memory home, so the param's value
    277  * arrives in a PReg without needing this pass. If a future scheme records
    278  * the entry-move as a synthetic IR_STORE OPK_LOCAL(slot), this pass would
    279  * promote it the same way it promotes any other store-to-slot. */
    280 
    281 static int promote_local_type_is_scalar(Func* f, KitCgTypeId ty) {
    282   if (!ty) return 0;
    283   KitCgTypeKind kind = kit_cg_type_kind((KitCompiler*)f->c, ty);
    284   switch (kind) {
    285     case KIT_CG_TYPE_BOOL:
    286     case KIT_CG_TYPE_INT:
    287     case KIT_CG_TYPE_FLOAT:
    288     case KIT_CG_TYPE_PTR:
    289     case KIT_CG_TYPE_ENUM:
    290       return 1;
    291     default:
    292       return 0;
    293   }
    294 }
    295 
    296 static int promote_op_uses_slot(const Operand* op, FrameSlot slot) {
    297   return op && op->kind == OPK_LOCAL && op->v.frame_slot == slot;
    298 }
    299 
    300 static int promote_abivalue_uses_slot(const CGABIValue* v, FrameSlot slot) {
    301   if (!v) return 0;
    302   if (promote_op_uses_slot(&v->storage, slot)) return 1;
    303   for (u32 i = 0; i < v->nparts; ++i)
    304     if (promote_op_uses_slot((const Operand*)&v->parts[i].op, slot)) return 1;
    305   return 0;
    306 }
    307 
    308 static int promote_aux_uses_slot(const Inst* in, FrameSlot slot) {
    309   switch ((IROp)in->op) {
    310     case IR_CALL: {
    311       IRCallAux* aux = (IRCallAux*)in->extra.aux;
    312       if (!aux) return 0;
    313       if (aux->use_plan_replay) {
    314         if (promote_op_uses_slot(&aux->plan.callee, slot)) return 1;
    315         for (u32 i = 0; i < aux->plan.nargs; ++i)
    316           if (promote_op_uses_slot(&aux->plan.args[i].src, slot)) return 1;
    317         for (u32 i = 0; i < aux->plan.nrets; ++i)
    318           if (promote_op_uses_slot(&aux->plan.rets[i].dst, slot)) return 1;
    319       } else {
    320         if (promote_op_uses_slot(&aux->desc.callee, slot)) return 1;
    321         for (u32 i = 0; i < aux->desc.nargs; ++i)
    322           if (promote_abivalue_uses_slot((const CGABIValue*)&aux->desc.args[i],
    323                                          slot))
    324             return 1;
    325         if (promote_abivalue_uses_slot(&aux->desc.ret, slot)) return 1;
    326       }
    327       return 0;
    328     }
    329     case IR_RET: {
    330       IRRetAux* aux = (IRRetAux*)in->extra.aux;
    331       if (!aux || !aux->present) return 0;
    332       return promote_abivalue_uses_slot(&aux->val, slot);
    333     }
    334     case IR_SCOPE_BEGIN:
    335       return 0;
    336     case IR_ASM_BLOCK: {
    337       IRAsmAux* aux = (IRAsmAux*)in->extra.aux;
    338       if (!aux) return 0;
    339       for (u32 i = 0; i < aux->nin; ++i)
    340         if (promote_op_uses_slot(&aux->in_ops[i], slot)) return 1;
    341       for (u32 i = 0; i < aux->nout; ++i)
    342         if (promote_op_uses_slot(&aux->out_ops[i], slot)) return 1;
    343       return 0;
    344     }
    345     case IR_INTRINSIC: {
    346       IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux;
    347       if (!aux) return 0;
    348       for (u32 i = 0; i < aux->narg; ++i)
    349         if (promote_op_uses_slot(&aux->args[i], slot)) return 1;
    350       for (u32 i = 0; i < aux->ndst; ++i)
    351         if (promote_op_uses_slot(&aux->dsts[i], slot)) return 1;
    352       return 0;
    353     }
    354     default:
    355       return 0;
    356   }
    357 }
    358 
    359 /* Per-inst check. Returns:
    360  *  1  = "instruction touches slot in a promotable position" (load/store base).
    361  *  0  = "instruction does not touch slot at all".
    362  * -1  = "instruction touches slot in a non-promotable way" (e.g., wrong
    363  *       operand position, type mismatch, observable flags, aux use). */
    364 static int promote_inst_classify(const Inst* in, FrameSlot slot,
    365                                  KitCgTypeId slot_ty) {
    366   int touched = 0;
    367   /* IR_LOAD: opnds[0]=dst REG, opnds[1]=addr (allowed: OPK_LOCAL slot). */
    368   if ((IROp)in->op == IR_LOAD) {
    369     if (in->nopnds >= 2 && promote_op_uses_slot(&in->opnds[1], slot)) {
    370       if (opt_mem_observable(&in->extra.mem)) return -1;
    371       if (in->opnds[0].kind != OPK_REG) return -1;
    372       KitCgTypeId at = in->extra.mem.type;
    373       if (at && at != slot_ty) return -1;
    374       touched = 1;
    375     }
    376     /* opnds[0] is the dst REG — never OPK_LOCAL by construction. */
    377     if (in->nopnds >= 1 && promote_op_uses_slot(&in->opnds[0], slot)) return -1;
    378   } else if ((IROp)in->op == IR_STORE) {
    379     if (in->nopnds >= 1 && promote_op_uses_slot(&in->opnds[0], slot)) {
    380       if (opt_mem_observable(&in->extra.mem)) return -1;
    381       if (in->nopnds < 2) return -1;
    382       Operand* src = &in->opnds[1];
    383       if (src->kind != OPK_REG && src->kind != OPK_IMM) return -1;
    384       KitCgTypeId at = in->extra.mem.type;
    385       if (at && at != slot_ty) return -1;
    386       touched = 1;
    387     }
    388     /* opnds[1] is the src value — should never be OPK_LOCAL for a scalar. */
    389     if (in->nopnds >= 2 && promote_op_uses_slot(&in->opnds[1], slot)) return -1;
    390   } else {
    391     /* Any other instruction with an OPK_LOCAL(slot) operand blocks promotion.
    392      */
    393     for (u32 o = 0; o < in->nopnds; ++o)
    394       if (promote_op_uses_slot(&in->opnds[o], slot)) return -1;
    395   }
    396   if (promote_aux_uses_slot(in, slot)) return -1;
    397   return touched;
    398 }
    399 
    400 /* Rewrite an `IR_STORE OPK_LOCAL(slot), src` into a PReg def. If src is
    401  * OPK_IMM, emit IR_LOAD_IMM into preg; otherwise emit IR_COPY. */
    402 static void promote_rewrite_store(Func* f, Inst* in, PReg preg, KitCgTypeId ty,
    403                                   u8 cls) {
    404   Operand src = in->opnds[1];
    405   Operand* opnds = arena_array(f->arena, Operand, 2);
    406   memset(&opnds[0], 0, sizeof opnds[0]);
    407   opnds[0].kind = OPK_REG;
    408   opnds[0].type = ty;
    409   opnds[0].cls = cls;
    410   opnds[0].v.reg = (Reg)preg;
    411   in->type = ty;
    412   in->def = (Val)preg;
    413   if (src.kind == OPK_IMM) {
    414     in->op = IR_LOAD_IMM;
    415     in->nopnds = 1;
    416     in->opnds = opnds;
    417     in->extra.imm = src.v.imm;
    418   } else {
    419     opnds[1] = src;
    420     opnds[1].type = ty;
    421     opnds[1].cls = cls;
    422     in->op = IR_COPY;
    423     in->nopnds = 2;
    424     in->opnds = opnds;
    425     memset(&in->extra, 0, sizeof in->extra);
    426   }
    427 }
    428 
    429 /* Rewrite an `IR_LOAD dst, OPK_LOCAL(slot)` into `IR_COPY dst, preg`. */
    430 static void promote_rewrite_load(Func* f, Inst* in, PReg preg, KitCgTypeId ty,
    431                                  u8 cls) {
    432   Operand dst = in->opnds[0];
    433   Operand* opnds = arena_array(f->arena, Operand, 2);
    434   opnds[0] = dst;
    435   opnds[0].type = ty;
    436   opnds[0].cls = cls;
    437   memset(&opnds[1], 0, sizeof opnds[1]);
    438   opnds[1].kind = OPK_REG;
    439   opnds[1].type = ty;
    440   opnds[1].cls = cls;
    441   opnds[1].v.reg = (Reg)preg;
    442   in->op = IR_COPY;
    443   in->type = ty;
    444   in->nopnds = 2;
    445   in->opnds = opnds;
    446   memset(&in->extra, 0, sizeof in->extra);
    447 }
    448 
    449 void opt_promote_scalar_locals(Func* f) {
    450   if (!f || f->opt_reg_ssa || f->opt_rewritten) return;
    451   if (!f->nframe_slots) return;
    452   int changed = 0;
    453   for (u32 sidx = 0; sidx < f->nframe_slots; ++sidx) {
    454     IRFrameSlot* slot = &f->frame_slots[sidx];
    455     FrameSlot id = slot->id;
    456     /* FS_PARAM slots are owned by the backend prologue (which copies the
    457      * ABI-incoming hard reg into the slot before any user IR runs); there
    458      * is no IR-level store to rewrite. At O1, the wrapper already places
    459      * scalar params in REG storage when the frontend does not force a
    460      * memory home, so the FS_PARAM promotion path is normally a no-op.
    461      * Only promote FS_LOCAL slots. */
    462     if (slot->kind != FS_LOCAL) continue;
    463     if (slot->flags & (FSF_ADDR_TAKEN | FSF_VOLATILE)) continue;
    464     if (!promote_local_type_is_scalar(f, slot->type)) continue;
    465     int touched_count = 0;
    466     int rejected = 0;
    467     for (u32 b = 0; b < f->nblocks && !rejected; ++b) {
    468       Block* bl = &f->blocks[b];
    469       for (u32 i = 0; i < bl->ninsts; ++i) {
    470         Inst* in = &bl->insts[i];
    471         int r = promote_inst_classify(in, id, slot->type);
    472         if (r < 0) {
    473           rejected = 1;
    474           break;
    475         }
    476         touched_count += r;
    477       }
    478     }
    479     if (rejected || !touched_count) continue;
    480     u8 cls = opt_value_reg_class(f->c, slot->type);
    481     PReg preg = ir_alloc_preg(f, slot->type, cls);
    482     for (u32 b = 0; b < f->nblocks; ++b) {
    483       Block* bl = &f->blocks[b];
    484       for (u32 i = 0; i < bl->ninsts; ++i) {
    485         Inst* in = &bl->insts[i];
    486         if ((IROp)in->op == IR_LOAD && in->nopnds >= 2 &&
    487             promote_op_uses_slot(&in->opnds[1], id)) {
    488           promote_rewrite_load(f, in, preg, slot->type, cls);
    489         } else if ((IROp)in->op == IR_STORE && in->nopnds >= 2 &&
    490                    promote_op_uses_slot(&in->opnds[0], id)) {
    491           promote_rewrite_store(f, in, preg, slot->type, cls);
    492         }
    493       }
    494     }
    495     /* The frame slot is now unreferenced. Leave the slot table entry in
    496      * place (compaction would require remapping every other slot id);
    497      * the backend's frame layout pass simply omits unreferenced slots. */
    498     changed = 1;
    499   }
    500   if (changed)
    501     opt_analysis_invalidate(
    502         f, OPT_ANALYSIS_DEF_USE | OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP);
    503 }
    504 
    505 /* CSE-style hoist of `IR_ADDR_OF(OPK_GLOBAL{sym, addend})` defs that appear
    506  * more than once in the same function. The address is a link-time constant
    507  * (TLS and IFUNC live on separate IROps), so all occurrences compute the
    508  * same value; consolidating to a single entry-block def shrinks each loop
    509  * body by the per-iter `adrp`/`add` pair the backend would otherwise re-emit.
    510  *
    511  * Implementation:
    512  *   - Walk all insts, group ADDR_OF defs by (sym, addend).
    513  *   - For each key with >= 2 defs: allocate a fresh PReg, materialize one
    514  *     IR_ADDR_OF in block 0 (after any IR_PARAM_DECL prologue), build a
    515  *     preg-remap from each original def-PReg to the new PReg, and NOP each
    516  *     original def.
    517  *   - One IR walk applies the remap to every operand `v.reg` /
    518  *     `v.ind.base`.
    519  *
    520  * Runs after opt_addr_xform_pregs so local addr-of has already been folded
    521  * out; the remaining IR_ADDR_OF defs are global. */
    522 
    523 typedef struct AddrCseEntry {
    524   ObjSymId sym;
    525   i64 addend;
    526   PReg canonical;        /* freshly allocated PReg, def in block 0 */
    527   KitCgTypeId addr_type; /* operand[0].type from the first def */
    528   u8 cls;                /* operand[0].cls from the first def */
    529   u32 count;             /* number of original ADDR_OF defs seen */
    530 } AddrCseEntry;
    531 
    532 static u32 addr_cse_find_or_add(AddrCseEntry** entries, u32* n, u32* cap,
    533                                 Arena* arena, ObjSymId sym, i64 addend) {
    534   for (u32 i = 0; i < *n; ++i) {
    535     if ((*entries)[i].sym == sym && (*entries)[i].addend == addend) return i;
    536   }
    537   if (*n == *cap) {
    538     u32 ncap = *cap ? *cap * 2u : 16u;
    539     AddrCseEntry* nv = arena_array(arena, AddrCseEntry, ncap);
    540     if (*entries) memcpy(nv, *entries, sizeof(AddrCseEntry) * (*n));
    541     *entries = nv;
    542     *cap = ncap;
    543   }
    544   u32 idx = (*n)++;
    545   AddrCseEntry* e = &(*entries)[idx];
    546   memset(e, 0, sizeof *e);
    547   e->sym = sym;
    548   e->addend = addend;
    549   e->canonical = PREG_NONE;
    550   e->count = 0;
    551   return idx;
    552 }
    553 
    554 static void addr_cse_apply_to_operand(Operand* op, const PReg* remap) {
    555   /* remap is zero-initialized; 0 means "no remap" (preg 0 is reserved as
    556    * unused). PREG_NONE = 0xffffffff and would be a valid remap target but
    557    * we never produce that. */
    558   if (!op) return;
    559   if (op->kind == OPK_REG) {
    560     PReg p = (PReg)op->v.reg;
    561     if (p != PREG_NONE && p != 0 && remap[p] != 0) op->v.reg = remap[p];
    562   } else if (op->kind == OPK_INDIRECT) {
    563     PReg p = (PReg)op->v.ind.base;
    564     if (p != PREG_NONE && p != 0 && remap[p] != 0) op->v.ind.base = remap[p];
    565     if (op->v.ind.index != (Reg)REG_NONE) {
    566       PReg pi = (PReg)op->v.ind.index;
    567       if (pi != PREG_NONE && pi != 0 && remap[pi] != 0)
    568         op->v.ind.index = remap[pi];
    569     }
    570   }
    571 }
    572 
    573 static void addr_cse_apply_to_inst(Inst* in, const PReg* remap) {
    574   for (u32 o = 0; o < in->nopnds; ++o)
    575     addr_cse_apply_to_operand(&in->opnds[o], remap);
    576   /* IR_CALL aux carries operands too; rewrite both replay variants. */
    577   if ((IROp)in->op == IR_CALL) {
    578     IRCallAux* aux = (IRCallAux*)in->extra.aux;
    579     if (!aux) return;
    580     if (aux->use_plan_replay) {
    581       addr_cse_apply_to_operand(&aux->plan.callee, remap);
    582       for (u32 i = 0; i < aux->plan.nargs; ++i)
    583         addr_cse_apply_to_operand(&aux->plan.args[i].src, remap);
    584       for (u32 i = 0; i < aux->plan.nrets; ++i)
    585         addr_cse_apply_to_operand(&aux->plan.rets[i].dst, remap);
    586     } else {
    587       addr_cse_apply_to_operand(&aux->desc.callee, remap);
    588       for (u32 i = 0; i < aux->desc.nargs; ++i) {
    589         CGABIValue* v = (CGABIValue*)&aux->desc.args[i];
    590         addr_cse_apply_to_operand(&v->storage, remap);
    591         for (u32 k = 0; k < v->nparts; ++k)
    592           addr_cse_apply_to_operand((Operand*)&v->parts[k].op, remap);
    593       }
    594       addr_cse_apply_to_operand(&aux->desc.ret.storage, remap);
    595       for (u32 k = 0; k < aux->desc.ret.nparts; ++k)
    596         addr_cse_apply_to_operand((Operand*)&aux->desc.ret.parts[k].op, remap);
    597     }
    598   } else if ((IROp)in->op == IR_RET) {
    599     IRRetAux* aux = (IRRetAux*)in->extra.aux;
    600     if (aux && aux->present) {
    601       addr_cse_apply_to_operand(&aux->val.storage, remap);
    602       for (u32 k = 0; k < aux->val.nparts; ++k)
    603         addr_cse_apply_to_operand((Operand*)&aux->val.parts[k].op, remap);
    604     }
    605   } else if ((IROp)in->op == IR_ASM_BLOCK) {
    606     IRAsmAux* aux = (IRAsmAux*)in->extra.aux;
    607     if (!aux) return;
    608     for (u32 i = 0; i < aux->nin; ++i)
    609       addr_cse_apply_to_operand(&aux->in_ops[i], remap);
    610     for (u32 i = 0; i < aux->nout; ++i)
    611       addr_cse_apply_to_operand(&aux->out_ops[i], remap);
    612   } else if ((IROp)in->op == IR_INTRINSIC) {
    613     IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux;
    614     if (!aux) return;
    615     for (u32 i = 0; i < aux->narg; ++i)
    616       addr_cse_apply_to_operand(&aux->args[i], remap);
    617     for (u32 i = 0; i < aux->ndst; ++i)
    618       addr_cse_apply_to_operand(&aux->dsts[i], remap);
    619   }
    620 }
    621 
    622 Inst* opt_block_insert_at(Func* f, Block* bl, u32 at, u32 k) {
    623   if (at > bl->ninsts) at = bl->ninsts;
    624   if (bl->ninsts + k > bl->cap) {
    625     u32 ncap = bl->cap ? bl->cap : 8u;
    626     while (ncap < bl->ninsts + k) ncap *= 2u;
    627     Inst* nb = arena_zarray(f->arena, Inst, ncap);
    628     if (bl->insts && at) memcpy(nb, bl->insts, sizeof(Inst) * at);
    629     if (bl->insts && bl->ninsts > at)
    630       memcpy(nb + at + k, bl->insts + at, sizeof(Inst) * (bl->ninsts - at));
    631     bl->insts = nb;
    632     bl->cap = ncap;
    633   } else {
    634     if (bl->ninsts > at)
    635       memmove(bl->insts + at + k, bl->insts + at,
    636               sizeof(Inst) * (bl->ninsts - at));
    637   }
    638   for (u32 i = 0; i < k; ++i) memset(&bl->insts[at + i], 0, sizeof(Inst));
    639   bl->ninsts += k;
    640   return &bl->insts[at];
    641 }
    642 
    643 void opt_addr_of_global_cse(Func* f) {
    644   if (!f || f->opt_reg_ssa || f->opt_rewritten) return;
    645   if (f->nblocks == 0) return;
    646 
    647   /* Pass 1: index ADDR_OF(global) defs by (sym, addend). */
    648   AddrCseEntry* entries = NULL;
    649   u32 n_entries = 0;
    650   u32 cap_entries = 0;
    651   for (u32 b = 0; b < f->nblocks; ++b) {
    652     Block* bl = &f->blocks[b];
    653     for (u32 i = 0; i < bl->ninsts; ++i) {
    654       Inst* in = &bl->insts[i];
    655       if ((IROp)in->op != IR_ADDR_OF) continue;
    656       if (in->nopnds < 2) continue;
    657       if (in->opnds[0].kind != OPK_REG) continue;
    658       if (in->opnds[1].kind != OPK_GLOBAL) continue;
    659       u32 idx = addr_cse_find_or_add(&entries, &n_entries, &cap_entries,
    660                                      f->arena, in->opnds[1].v.global.sym,
    661                                      in->opnds[1].v.global.addend);
    662       AddrCseEntry* e = &entries[idx];
    663       if (e->count == 0) {
    664         e->addr_type = in->opnds[0].type;
    665         e->cls = in->opnds[0].cls;
    666       }
    667       ++e->count;
    668     }
    669   }
    670   if (!n_entries) return;
    671 
    672   /* Pass 2: for each duplicate key, allocate a canonical PReg. */
    673   u32 dup_count = 0;
    674   for (u32 i = 0; i < n_entries; ++i) {
    675     if (entries[i].count >= 2) {
    676       entries[i].canonical =
    677           ir_alloc_preg(f, entries[i].addr_type, entries[i].cls);
    678       ++dup_count;
    679     }
    680   }
    681   if (!dup_count) return;
    682 
    683   /* Pass 3: walk again, build per-old-PReg remap and NOP duplicate defs. */
    684   PReg* remap = arena_zarray(f->arena, PReg, opt_reg_count(f));
    685   for (u32 b = 0; b < f->nblocks; ++b) {
    686     Block* bl = &f->blocks[b];
    687     for (u32 i = 0; i < bl->ninsts; ++i) {
    688       Inst* in = &bl->insts[i];
    689       if ((IROp)in->op != IR_ADDR_OF) continue;
    690       if (in->nopnds < 2) continue;
    691       if (in->opnds[0].kind != OPK_REG) continue;
    692       if (in->opnds[1].kind != OPK_GLOBAL) continue;
    693       u32 idx = addr_cse_find_or_add(&entries, &n_entries, &cap_entries,
    694                                      f->arena, in->opnds[1].v.global.sym,
    695                                      in->opnds[1].v.global.addend);
    696       if (entries[idx].canonical == PREG_NONE) continue; /* singleton */
    697       PReg old = (PReg)in->opnds[0].v.reg;
    698       if (opt_reg_valid(f, old)) remap[old] = entries[idx].canonical;
    699       /* NOP the original def. */
    700       in->op = IR_NOP;
    701       in->def = VAL_NONE;
    702       in->ndefs = 0;
    703       in->defs = NULL;
    704       in->nopnds = 0;
    705       in->opnds = NULL;
    706     }
    707   }
    708 
    709   /* Pass 4: hoist a single ADDR_OF for each duplicated key to the entry
    710    * block, inserted after any leading IR_PARAM_DECL instructions. */
    711   if (f->entry >= f->nblocks) return;
    712   Block* entry = &f->blocks[f->entry];
    713   u32 insert_at = 0;
    714   while (insert_at < entry->ninsts &&
    715          (IROp)entry->insts[insert_at].op == IR_PARAM_DECL)
    716     ++insert_at;
    717   Inst* slot = opt_block_insert_at(f, entry, insert_at, dup_count);
    718   u32 w = 0;
    719   for (u32 i = 0; i < n_entries; ++i) {
    720     if (entries[i].canonical == PREG_NONE) continue;
    721     Inst* in = &slot[w++];
    722     in->op = (u16)IR_ADDR_OF;
    723     in->def = (Val)entries[i].canonical;
    724     in->type = entries[i].addr_type;
    725     in->nopnds = 2;
    726     in->opnds = arena_array(f->arena, Operand, 2);
    727     memset(&in->opnds[0], 0, sizeof(Operand));
    728     in->opnds[0].kind = OPK_REG;
    729     in->opnds[0].cls = entries[i].cls;
    730     in->opnds[0].type = entries[i].addr_type;
    731     in->opnds[0].v.reg = entries[i].canonical;
    732     memset(&in->opnds[1], 0, sizeof(Operand));
    733     in->opnds[1].kind = OPK_GLOBAL;
    734     in->opnds[1].cls = entries[i].cls;
    735     in->opnds[1].type = entries[i].addr_type;
    736     in->opnds[1].v.global.sym = entries[i].sym;
    737     in->opnds[1].v.global.addend = entries[i].addend;
    738     ir_assign_inst_id(f, in);
    739   }
    740 
    741   /* Pass 5: apply remap to all operand uses in the function. */
    742   for (u32 b = 0; b < f->nblocks; ++b) {
    743     Block* bl = &f->blocks[b];
    744     for (u32 i = 0; i < bl->ninsts; ++i) {
    745       addr_cse_apply_to_inst(&bl->insts[i], remap);
    746     }
    747   }
    748 
    749   opt_analysis_invalidate(
    750       f, OPT_ANALYSIS_DEF_USE | OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP);
    751 }
    752 
    753 /* LICM-lite for constant materialization. A LOAD_IMM has no inputs, so its
    754  * result is loop-invariant and valid anywhere the entry block dominates (i.e.
    755  * everywhere). When such a def lives inside a loop, the backend would otherwise
    756  * re-materialize the constant (movz/movk) on every iteration; hoisting one copy
    757  * to the entry block and reusing it removes that per-iteration work, mirroring
    758  * opt_addr_of_global_cse for the `adrp`/`add` case.
    759  *
    760  * Scope/cost guards:
    761  *   - Only hoist constants that actually appear inside a loop (loop_depth > 0);
    762  *     non-loop constants gain nothing and would only lengthen live ranges.
    763  *   - Skip imm 0: the emit path stores a constant 0 from the hardware zero
    764  *     register (no materialization at all), which beats holding 0 in an
    765  *     allocated register across the loop.
    766  *
    767  * Must run after opt_build_loop_tree (reads block loop_depth) and before
    768  * register allocation. */
    769 typedef struct ConstHoistEntry {
    770   i64 imm;
    771   KitCgTypeId type;
    772   PReg canonical;
    773   u8 cls;
    774   u8 in_loop;
    775 } ConstHoistEntry;
    776 
    777 static u32 const_hoist_find_or_add(ConstHoistEntry** entries, u32* n, u32* cap,
    778                                    Arena* arena, i64 imm, KitCgTypeId type,
    779                                    u8 cls) {
    780   for (u32 i = 0; i < *n; ++i) {
    781     if ((*entries)[i].imm == imm && (*entries)[i].type == type &&
    782         (*entries)[i].cls == cls)
    783       return i;
    784   }
    785   if (*n == *cap) {
    786     u32 ncap = *cap ? *cap * 2u : 16u;
    787     ConstHoistEntry* nv = arena_array(arena, ConstHoistEntry, ncap);
    788     if (*entries) memcpy(nv, *entries, sizeof(ConstHoistEntry) * (*n));
    789     *entries = nv;
    790     *cap = ncap;
    791   }
    792   u32 idx = (*n)++;
    793   ConstHoistEntry* e = &(*entries)[idx];
    794   memset(e, 0, sizeof *e);
    795   e->imm = imm;
    796   e->type = type;
    797   e->cls = cls;
    798   e->canonical = PREG_NONE;
    799   return idx;
    800 }
    801 
    802 static void const_hoist_count_def(Func* f, Inst* in, Operand* op, int is_def,
    803                                   void* ud) {
    804   u32* counts = (u32*)ud;
    805   (void)in;
    806   if (!is_def || op->kind != OPK_REG) return;
    807   PReg p = (PReg)op->v.reg;
    808   if (opt_reg_valid(f, p)) ++counts[p];
    809 }
    810 
    811 /* True for a LOAD_IMM whose constant is a hoist candidate: a non-zero immediate
    812  * defining a register that has exactly one def in the function. The single-def
    813  * requirement is the safety condition for remapping every use of that register
    814  * to one canonical entry def — the PReg form here is non-SSA, so a mutable
    815  * local promoted by opt_promote_scalar_locals (e.g. a loop counter initialized
    816  * to a constant) is multiply defined; remapping it would discard the other
    817  * defs (the increment) and break the loop. */
    818 static int const_hoist_candidate(const Func* f, const Inst* in,
    819                                  const u32* def_counts) {
    820   if ((IROp)in->op != IR_LOAD_IMM || in->nopnds < 1) return 0;
    821   if (in->opnds[0].kind != OPK_REG || in->extra.imm == 0) return 0;
    822   PReg p = (PReg)in->opnds[0].v.reg;
    823   return opt_reg_valid(f, p) && def_counts[p] == 1;
    824 }
    825 
    826 void opt_hoist_loop_consts(Func* f) {
    827   if (!f || f->opt_reg_ssa || f->opt_rewritten) return;
    828   if (f->nblocks == 0 || f->entry >= f->nblocks) return;
    829 
    830   /* Count defs per PReg so const_hoist_candidate can reject multiply-defined
    831    * registers (mutable promoted locals) — see its comment. */
    832   u32* def_counts = arena_zarray(f->arena, u32, opt_reg_count(f));
    833   for (u32 b = 0; b < f->nblocks; ++b) {
    834     Block* bl = &f->blocks[b];
    835     for (u32 i = 0; i < bl->ninsts; ++i)
    836       opt_walk_inst_operands(f, &bl->insts[i], const_hoist_count_def,
    837                              def_counts);
    838   }
    839 
    840   /* Pass 1: index LOAD_IMM defs by (imm, type, cls); flag keys seen in a loop.
    841    */
    842   ConstHoistEntry* entries = NULL;
    843   u32 n_entries = 0, cap_entries = 0;
    844   for (u32 b = 0; b < f->nblocks; ++b) {
    845     Block* bl = &f->blocks[b];
    846     int in_loop = bl->loop_depth > 0;
    847     for (u32 i = 0; i < bl->ninsts; ++i) {
    848       Inst* in = &bl->insts[i];
    849       if (!const_hoist_candidate(f, in, def_counts)) continue;
    850       u32 idx = const_hoist_find_or_add(&entries, &n_entries, &cap_entries,
    851                                         f->arena, in->extra.imm,
    852                                         in->opnds[0].type, in->opnds[0].cls);
    853       if (in_loop) entries[idx].in_loop = 1;
    854     }
    855   }
    856 
    857   /* Pass 2: allocate a canonical PReg for each constant that appears in a loop.
    858    */
    859   u32 dup_count = 0;
    860   for (u32 i = 0; i < n_entries; ++i) {
    861     if (!entries[i].in_loop) continue;
    862     entries[i].canonical = ir_alloc_preg(f, entries[i].type, entries[i].cls);
    863     ++dup_count;
    864   }
    865   if (!dup_count) return;
    866 
    867   /* Pass 3: NOP every def of a hoisted constant, mapping its PReg to the
    868    * canonical one (consolidates loop and non-loop occurrences alike). */
    869   PReg* remap = arena_zarray(f->arena, PReg, opt_reg_count(f));
    870   for (u32 b = 0; b < f->nblocks; ++b) {
    871     Block* bl = &f->blocks[b];
    872     for (u32 i = 0; i < bl->ninsts; ++i) {
    873       Inst* in = &bl->insts[i];
    874       if (!const_hoist_candidate(f, in, def_counts)) continue;
    875       u32 idx = const_hoist_find_or_add(&entries, &n_entries, &cap_entries,
    876                                         f->arena, in->extra.imm,
    877                                         in->opnds[0].type, in->opnds[0].cls);
    878       if (entries[idx].canonical == PREG_NONE) continue;
    879       PReg old = (PReg)in->opnds[0].v.reg;
    880       if (opt_reg_valid(f, old)) remap[old] = entries[idx].canonical;
    881       in->op = IR_NOP;
    882       in->def = VAL_NONE;
    883       in->ndefs = 0;
    884       in->defs = NULL;
    885       in->nopnds = 0;
    886       in->opnds = NULL;
    887     }
    888   }
    889 
    890   /* Pass 4: emit one LOAD_IMM per hoisted constant in the entry block, after
    891    * any leading IR_PARAM_DECL prologue. */
    892   Block* entry = &f->blocks[f->entry];
    893   u32 insert_at = 0;
    894   while (insert_at < entry->ninsts &&
    895          (IROp)entry->insts[insert_at].op == IR_PARAM_DECL)
    896     ++insert_at;
    897   Inst* slot = opt_block_insert_at(f, entry, insert_at, dup_count);
    898   u32 w = 0;
    899   for (u32 i = 0; i < n_entries; ++i) {
    900     if (entries[i].canonical == PREG_NONE) continue;
    901     Inst* in = &slot[w++];
    902     in->op = (u16)IR_LOAD_IMM;
    903     in->def = (Val)entries[i].canonical;
    904     in->type = entries[i].type;
    905     in->extra.imm = entries[i].imm;
    906     in->nopnds = 1;
    907     in->opnds = arena_array(f->arena, Operand, 1);
    908     memset(&in->opnds[0], 0, sizeof(Operand));
    909     in->opnds[0].kind = OPK_REG;
    910     in->opnds[0].cls = entries[i].cls;
    911     in->opnds[0].type = entries[i].type;
    912     in->opnds[0].v.reg = entries[i].canonical;
    913     ir_assign_inst_id(f, in);
    914   }
    915 
    916   /* Pass 5: apply remap to all operand uses (reuses the ADDR_OF-CSE walker,
    917    * which also covers IR_CALL aux operands). */
    918   for (u32 b = 0; b < f->nblocks; ++b) {
    919     Block* bl = &f->blocks[b];
    920     for (u32 i = 0; i < bl->ninsts; ++i)
    921       addr_cse_apply_to_inst(&bl->insts[i], remap);
    922   }
    923 
    924   opt_analysis_invalidate(
    925       f, OPT_ANALYSIS_DEF_USE | OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP);
    926 }