kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

pass_inline.c (25369B)


      1 #include <string.h>
      2 
      3 #include "core/arena.h"
      4 #include "core/core.h"
      5 #include "core/metrics.h"
      6 #include "opt/opt_internal.h"
      7 
      8 typedef struct InlineMap {
      9   Func* caller;
     10   Func* callee;
     11   PReg* preg;
     12   u32 npreg;
     13   FrameSlot* slot;
     14   u32 nslot;
     15   u32* block;
     16   u32 nblock;
     17 } InlineMap;
     18 
     19 typedef struct InstVec {
     20   Func* f;
     21   Inst* v;
     22   u32 n;
     23   u32 cap;
     24 } InstVec;
     25 
     26 typedef struct InlineOrderCtx {
     27   FuncSet* fs;
     28   u8* temp;
     29   u8* done;
     30   Func** order;
     31   u32 norder;
     32 } InlineOrderCtx;
     33 
     34 #define INLINE_NORMAL_COST_LIMIT 20u
     35 #define INLINE_HINT_COST_LIMIT 40u
     36 #define INLINE_ABS_GROWTH_LIMIT 64u
     37 #define INLINE_HINT_ABS_GROWTH_LIMIT 128u
     38 
     39 /* Streaming O1 tiny-inline: a much smaller budget than the whole-program
     40  * inliner. DEFAULT/HINT callees must fit under this cost; ALWAYS bypasses it.
     41  * Bounded by max passes since an inlined straightline body never introduces a
     42  * new call site. */
     43 #define INLINE_TINY_COST_LIMIT 8u
     44 #define TINY_INLINE_MAX_PASSES 4u
     45 
     46 static u32 func_inline_cost(Func* f) {
     47   u32 cost = 0;
     48   if (!f) return 0;
     49   for (u32 b = 0; b < f->nblocks; ++b) {
     50     Block* bl = &f->blocks[b];
     51     for (u32 i = 0; i < bl->ninsts; ++i) {
     52       switch ((IROp)bl->insts[i].op) {
     53         case IR_NOP:
     54         case IR_PARAM_DECL:
     55         case IR_BR:
     56         case IR_RET:
     57           break;
     58         default:
     59           ++cost;
     60           break;
     61       }
     62     }
     63   }
     64   return cost;
     65 }
     66 
     67 static void instvec_push(InstVec* iv, const Inst* in) {
     68   if (iv->n == iv->cap) {
     69     u32 ncap = iv->cap ? iv->cap * 2u : 16u;
     70     Inst* nv = arena_array(iv->f->arena, Inst, ncap);
     71     if (iv->v) memcpy(nv, iv->v, sizeof(Inst) * iv->n);
     72     iv->v = nv;
     73     iv->cap = ncap;
     74   }
     75   iv->v[iv->n++] = *in;
     76 }
     77 
     78 static Func* funcset_find(FuncSet* fs, ObjSymId sym) {
     79   if (!fs || sym == OBJ_SYM_NONE) return NULL;
     80   for (u32 i = 0; i < fs->nfuncs; ++i)
     81     if (fs->funcs[i] && fs->funcs[i]->name == sym) return fs->funcs[i];
     82   return NULL;
     83 }
     84 
     85 static int funcset_index(FuncSet* fs, Func* f) {
     86   if (!fs || !f) return -1;
     87   for (u32 i = 0; i < fs->nfuncs; ++i)
     88     if (fs->funcs[i] == f) return (int)i;
     89   return -1;
     90 }
     91 
     92 static Func* direct_callee(FuncSet* fs, const Inst* in) {
     93   if (!in || (IROp)in->op != IR_CALL) return NULL;
     94   IRCallAux* aux = (IRCallAux*)in->extra.aux;
     95   if (!aux || aux->use_plan_replay) return NULL;
     96   if (aux->desc.flags & CG_CALL_TAIL) return NULL;
     97   if (aux->desc.callee.kind != OPK_GLOBAL) return NULL;
     98   if (aux->desc.callee.v.global.addend != 0) return NULL;
     99   return funcset_find(fs, aux->desc.callee.v.global.sym);
    100 }
    101 
    102 static KitCgInlinePolicy call_inline_policy(const Inst* in) {
    103   if (!in || (IROp)in->op != IR_CALL) return KIT_CG_INLINE_DEFAULT;
    104   IRCallAux* aux = (IRCallAux*)in->extra.aux;
    105   return aux ? aux->desc.inline_policy : KIT_CG_INLINE_DEFAULT;
    106 }
    107 
    108 static KitCgInlinePolicy effective_inline_policy(const Inst* in,
    109                                                  const Func* callee) {
    110   KitCgInlinePolicy call_policy = call_inline_policy(in);
    111   KitCgInlinePolicy callee_policy =
    112       callee ? callee->desc.inline_policy : KIT_CG_INLINE_DEFAULT;
    113   if (call_policy == KIT_CG_INLINE_NEVER ||
    114       callee_policy == KIT_CG_INLINE_NEVER)
    115     return KIT_CG_INLINE_NEVER;
    116   if (call_policy == KIT_CG_INLINE_ALWAYS ||
    117       callee_policy == KIT_CG_INLINE_ALWAYS)
    118     return KIT_CG_INLINE_ALWAYS;
    119   if (call_policy == KIT_CG_INLINE_HINT || callee_policy == KIT_CG_INLINE_HINT)
    120     return KIT_CG_INLINE_HINT;
    121   return KIT_CG_INLINE_DEFAULT;
    122 }
    123 
    124 static int func_reaches(FuncSet* fs, Func* from, Func* target, u8* seen) {
    125   int idx = funcset_index(fs, from);
    126   if (idx < 0) return 0;
    127   if (seen[idx]) return 0;
    128   seen[idx] = 1;
    129   for (u32 b = 0; b < from->nblocks; ++b) {
    130     Block* bl = &from->blocks[b];
    131     for (u32 i = 0; i < bl->ninsts; ++i) {
    132       Func* callee = direct_callee(fs, &bl->insts[i]);
    133       if (!callee) continue;
    134       if (callee == target) return 1;
    135       if (func_reaches(fs, callee, target, seen)) return 1;
    136     }
    137   }
    138   return 0;
    139 }
    140 
    141 static int recursive_or_scc(FuncSet* fs, Func* caller, Func* callee) {
    142   if (caller == callee) return 1;
    143   u8* seen = arena_zarray(fs->arena, u8, fs->nfuncs ? fs->nfuncs : 1u);
    144   return func_reaches(fs, callee, caller, seen);
    145 }
    146 
    147 static int op_supported_in_straightline_inline(IROp op) {
    148   switch (op) {
    149     case IR_NOP:
    150     case IR_PARAM_DECL:
    151     case IR_LOAD_IMM:
    152     case IR_LOAD_CONST:
    153     case IR_COPY:
    154     case IR_LOAD:
    155     case IR_STORE:
    156     case IR_BINOP:
    157     case IR_UNOP:
    158     case IR_CMP:
    159     case IR_CONVERT:
    160     case IR_BR:
    161     case IR_CONDBR:
    162     case IR_CMP_BRANCH:
    163     case IR_RET:
    164       return 1;
    165     default:
    166       return 0;
    167   }
    168 }
    169 
    170 static int callee_inline_shape(Func* callee, KitCgInlinePolicy policy,
    171                                u32* cost_out) {
    172   if (!callee || callee->opt_reg_ssa || callee->opt_rewritten) return 0;
    173   if (kit_cg_type_func_is_variadic((KitCompiler*)callee->c, callee->type)) {
    174     metrics_count(callee->c, "opt.inline.refuse_shape_variadic", 1);
    175     return 0;
    176   }
    177   if (callee->entry >= callee->nblocks) {
    178     metrics_count(callee->c, "opt.inline.refuse_shape_entry", 1);
    179     return 0;
    180   }
    181 
    182   u32 nret = 0;
    183   u32 cost = 0;
    184   for (u32 b = 0; b < callee->nblocks; ++b) {
    185     Block* bl = &callee->blocks[b];
    186     for (u32 i = 0; i < bl->ninsts; ++i) {
    187       Inst* in = &bl->insts[i];
    188       IROp op = (IROp)in->op;
    189       if (!op_supported_in_straightline_inline(op)) {
    190         metrics_count(callee->c, "opt.inline.refuse_shape_op", 1);
    191         return 0;
    192       }
    193       if (op == IR_RET) {
    194         ++nret;
    195         if (i + 1u != bl->ninsts) {
    196           metrics_count(callee->c, "opt.inline.refuse_shape_ret_pos", 1);
    197           return 0;
    198         }
    199         continue;
    200       }
    201       if (op != IR_NOP && op != IR_PARAM_DECL && op != IR_BR) ++cost;
    202     }
    203   }
    204   if (nret == 0) {
    205     metrics_count(callee->c, "opt.inline.refuse_shape_ret_count", 1);
    206     return 0;
    207   }
    208   if (nret > 1) cost += (nret - 1u) * 4u;
    209   u32 cost_limit = policy == KIT_CG_INLINE_ALWAYS ? 0xffffffffu
    210                    : policy == KIT_CG_INLINE_HINT ? INLINE_HINT_COST_LIMIT
    211                                                   : INLINE_NORMAL_COST_LIMIT;
    212   if (cost > cost_limit) {
    213     metrics_count(callee->c, "opt.inline.refuse_shape_budget", 1);
    214     return 0;
    215   }
    216   if (cost_out) *cost_out = cost;
    217   return 1;
    218 }
    219 
    220 static PReg map_preg(InlineMap* m, PReg r) {
    221   if (r == PREG_NONE || r == 0 || r >= m->npreg) return r;
    222   return m->preg[r] ? m->preg[r] : r;
    223 }
    224 
    225 static FrameSlot map_slot(InlineMap* m, FrameSlot s) {
    226   if (s == FRAME_SLOT_NONE || s >= m->nslot) return s;
    227   return m->slot[s] ? m->slot[s] : s;
    228 }
    229 
    230 static u32 map_block(InlineMap* m, u32 b) {
    231   if (b >= m->nblock) return b;
    232   return m->block[b] != 0xffffffffu ? m->block[b] : b;
    233 }
    234 
    235 static void map_mem(InlineMap* m, MemAccess* mem) {
    236   if (!mem) return;
    237   if (mem->alias.kind == ALIAS_LOCAL && mem->alias.v.local_id > 0) {
    238     FrameSlot old = (FrameSlot)mem->alias.v.local_id;
    239     mem->alias.v.local_id = (i32)map_slot(m, old);
    240   }
    241 }
    242 
    243 static Operand map_operand(InlineMap* m, Operand op) {
    244   switch ((OptOperandKind)op.kind) {
    245     case OPK_REG:
    246       op.v.reg = map_preg(m, (PReg)op.v.reg);
    247       break;
    248     case OPK_LOCAL:
    249       op.v.frame_slot = map_slot(m, op.v.frame_slot);
    250       break;
    251     case OPK_INDIRECT:
    252       op.v.ind.base = map_preg(m, (PReg)op.v.ind.base);
    253       if (op.v.ind.index != (Reg)REG_NONE)
    254         op.v.ind.index = map_preg(m, (PReg)op.v.ind.index);
    255       break;
    256     default:
    257       break;
    258   }
    259   return op;
    260 }
    261 
    262 static int build_inline_map(InlineMap* m, Func* caller, Func* callee) {
    263   memset(m, 0, sizeof *m);
    264   m->caller = caller;
    265   m->callee = callee;
    266   m->npreg = callee->npregs;
    267   m->nslot = callee->nframe_slots + 1u;
    268   m->nblock = callee->nblocks;
    269   m->preg = arena_zarray(caller->arena, PReg, m->npreg ? m->npreg : 1u);
    270   m->slot = arena_zarray(caller->arena, FrameSlot, m->nslot ? m->nslot : 1u);
    271   m->block = arena_array(caller->arena, u32, m->nblock ? m->nblock : 1u);
    272   for (u32 b = 0; b < m->nblock; ++b) m->block[b] = 0xffffffffu;
    273 
    274   for (FrameSlot s = 1; s <= callee->nframe_slots; ++s) {
    275     IRFrameSlot* old = &callee->frame_slots[s - 1u];
    276     FrameSlotDesc d;
    277     memset(&d, 0, sizeof d);
    278     d.type = old->type;
    279     d.name = old->name;
    280     d.loc = old->loc;
    281     d.size = old->size;
    282     d.align = old->align;
    283     d.kind = old->kind == FS_PARAM ? FS_LOCAL : old->kind;
    284     d.flags = old->flags;
    285     m->slot[s] = ir_frame_slot_new(caller, &d);
    286   }
    287 
    288   for (PReg r = 1; r < callee->npregs; ++r) {
    289     m->preg[r] =
    290         ir_alloc_preg(caller, callee->preg_type[r], callee->preg_cls[r]);
    291   }
    292   for (u32 b = 0; b < callee->nblocks; ++b) m->block[b] = ir_block_new(caller);
    293   return 1;
    294 }
    295 
    296 static Inst make_inst(Func* f, IROp op, SrcLoc loc) {
    297   Inst in;
    298   memset(&in, 0, sizeof in);
    299   in.op = (u16)op;
    300   in.id = ir_inst_id_new(f);
    301   in.loc = loc;
    302   return in;
    303 }
    304 
    305 static Operand local_operand(FrameSlot s, KitCgTypeId ty) {
    306   Operand op;
    307   memset(&op, 0, sizeof op);
    308   op.kind = OPK_LOCAL;
    309   op.cls = RC_INT;
    310   op.type = ty;
    311   op.v.frame_slot = s;
    312   return op;
    313 }
    314 
    315 static MemAccess param_mem(const IRParam* p, FrameSlot s) {
    316   MemAccess m;
    317   memset(&m, 0, sizeof m);
    318   m.type = p->type;
    319   m.size = p->size;
    320   m.align = p->align;
    321   m.alias.kind = ALIAS_LOCAL;
    322   m.alias.v.local_id = (i32)s;
    323   return m;
    324 }
    325 
    326 static int append_param_materialization(InstVec* out, InlineMap* m,
    327                                         const Inst* call) {
    328   IRCallAux* aux = (IRCallAux*)call->extra.aux;
    329   Func* caller = m->caller;
    330   Func* callee = m->callee;
    331   if (!aux || aux->desc.nargs != callee->nparams) return 0;
    332 
    333   for (u32 i = 0; i < callee->nparams; ++i) {
    334     IRParam* p = &callee->params[i];
    335     const CGABIValue* av = &aux->desc.args[i];
    336     if (av->nparts != 0) return 0;
    337     Operand src = av->storage;
    338     if (src.kind != OPK_REG && src.kind != OPK_IMM) return 0;
    339     if (p->storage.kind == CG_LOCAL_STORAGE_REG) {
    340       PReg dst_r = map_preg(m, (PReg)p->storage.v.reg);
    341       Operand dst;
    342       memset(&dst, 0, sizeof dst);
    343       dst.kind = OPK_REG;
    344       dst.cls = opt_reg_cls(caller, dst_r);
    345       dst.type = p->type;
    346       dst.v.reg = dst_r;
    347       Inst in = make_inst(caller, src.kind == OPK_IMM ? IR_LOAD_IMM : IR_COPY,
    348                           call->loc);
    349       in.type = p->type;
    350       in.def = (Val)dst_r;
    351       in.opnds =
    352           arena_array(caller->arena, Operand, src.kind == OPK_IMM ? 1u : 2u);
    353       in.opnds[0] = dst;
    354       in.nopnds = src.kind == OPK_IMM ? 1u : 2u;
    355       if (src.kind == OPK_IMM) {
    356         in.extra.imm = src.v.imm;
    357       } else {
    358         in.opnds[1] = src;
    359       }
    360       instvec_push(out, &in);
    361     } else {
    362       FrameSlot dst_s = map_slot(m, p->storage.v.frame_slot);
    363       Inst in = make_inst(caller, IR_STORE, call->loc);
    364       in.opnds = arena_array(caller->arena, Operand, 2);
    365       in.opnds[0] = local_operand(dst_s, p->type);
    366       in.opnds[1] = src;
    367       in.nopnds = 2;
    368       in.extra.mem = param_mem(p, dst_s);
    369       instvec_push(out, &in);
    370     }
    371   }
    372   return 1;
    373 }
    374 
    375 static Inst clone_inst(InlineMap* m, const Inst* src) {
    376   Func* caller = m->caller;
    377   Inst dst = *src;
    378   dst.id = ir_inst_id_new(caller);
    379   dst.def = src->def != VAL_NONE ? (Val)map_preg(m, (PReg)src->def) : VAL_NONE;
    380   if (src->ndefs) {
    381     dst.defs = arena_array(caller->arena, Val, src->ndefs);
    382     for (u32 i = 0; i < src->ndefs; ++i) {
    383       dst.defs[i] = src->defs[i] != VAL_NONE
    384                         ? (Val)map_preg(m, (PReg)src->defs[i])
    385                         : VAL_NONE;
    386     }
    387   }
    388   if (src->nopnds) {
    389     dst.opnds = arena_array(caller->arena, Operand, src->nopnds);
    390     for (u32 i = 0; i < src->nopnds; ++i)
    391       dst.opnds[i] = map_operand(m, src->opnds[i]);
    392   }
    393   if ((IROp)dst.op == IR_LOAD || (IROp)dst.op == IR_STORE) {
    394     map_mem(m, &dst.extra.mem);
    395   }
    396   return dst;
    397 }
    398 
    399 static void clone_block_succs(InlineMap* m, Block* dst, const Block* src) {
    400   ir_block_set_nsucc(m->caller, dst->id, src->nsucc);
    401   for (u32 s = 0; s < src->nsucc; ++s)
    402     dst->succ[s] = map_block(m, src->succ[s]);
    403 }
    404 
    405 static int append_return_materialization(InstVec* out, InlineMap* m,
    406                                          const Inst* call, const Inst* ret) {
    407   IRCallAux* call_aux = (IRCallAux*)call->extra.aux;
    408   IRRetAux* ret_aux = (IRRetAux*)ret->extra.aux;
    409   Func* caller = m->caller;
    410   if (!call_aux) return 0;
    411   if (!ret_aux || !ret_aux->present) return 1;
    412   if (ret_aux->val.nparts != 0 || call_aux->desc.ret.nparts != 0) return 0;
    413   Operand dst = call_aux->desc.ret.storage;
    414   Operand src = ret_aux->val.storage;
    415   if (dst.kind == OPK_IMM) return 1;
    416   if (dst.kind != OPK_REG) return 0;
    417   src = map_operand(m, src);
    418   if (src.kind != OPK_REG && src.kind != OPK_IMM) return 0;
    419 
    420   Inst in =
    421       make_inst(caller, src.kind == OPK_IMM ? IR_LOAD_IMM : IR_COPY, call->loc);
    422   in.type = dst.type;
    423   in.def = (Val)dst.v.reg;
    424   in.opnds = arena_array(caller->arena, Operand, src.kind == OPK_IMM ? 1u : 2u);
    425   in.opnds[0] = dst;
    426   in.nopnds = src.kind == OPK_IMM ? 1u : 2u;
    427   if (src.kind == OPK_IMM) {
    428     in.extra.imm = src.v.imm;
    429   } else {
    430     in.opnds[1] = src;
    431   }
    432   instvec_push(out, &in);
    433   return 1;
    434 }
    435 
    436 static Inst make_branch(Func* f, SrcLoc loc) {
    437   return make_inst(f, IR_BR, loc);
    438 }
    439 
    440 static void emit_order_push_unique(u32* out, u32* nout, u32 b) {
    441   for (u32 i = 0; i < *nout; ++i)
    442     if (out[i] == b) return;
    443   out[(*nout)++] = b;
    444 }
    445 
    446 static void inline_rebuild_emit_order(Func* caller, u32 anchor, InlineMap* m,
    447                                       u32 cont) {
    448   u32 cap =
    449       caller->emit_order_n + m->callee->emit_order_n + m->callee->nblocks + 2u;
    450   u32* order = arena_array(caller->arena, u32, cap ? cap : 1u);
    451   u32 norder = 0;
    452   int inserted = 0;
    453   for (u32 i = 0; i < caller->emit_order_n; ++i) {
    454     u32 b = caller->emit_order[i];
    455     emit_order_push_unique(order, &norder, b);
    456     if (b != anchor) continue;
    457     for (u32 j = 0; j < m->callee->emit_order_n; ++j)
    458       emit_order_push_unique(order, &norder,
    459                              map_block(m, m->callee->emit_order[j]));
    460     for (u32 cb = 0; cb < m->callee->nblocks; ++cb)
    461       emit_order_push_unique(order, &norder, map_block(m, cb));
    462     emit_order_push_unique(order, &norder, cont);
    463     inserted = 1;
    464   }
    465   if (!inserted) {
    466     emit_order_push_unique(order, &norder, anchor);
    467     for (u32 j = 0; j < m->callee->emit_order_n; ++j)
    468       emit_order_push_unique(order, &norder,
    469                              map_block(m, m->callee->emit_order[j]));
    470     for (u32 cb = 0; cb < m->callee->nblocks; ++cb)
    471       emit_order_push_unique(order, &norder, map_block(m, cb));
    472     emit_order_push_unique(order, &norder, cont);
    473   }
    474   caller->emit_order = order;
    475   caller->emit_order_n = norder;
    476   caller->emit_order_cap = cap;
    477 }
    478 
    479 static int inline_rewrite_supported(Func* callee, const Inst* call) {
    480   IRCallAux* call_aux = (IRCallAux*)call->extra.aux;
    481   if (!call_aux || call_aux->desc.nargs != callee->nparams) return 0;
    482   for (u32 i = 0; i < callee->nparams; ++i) {
    483     const CGABIValue* av = &call_aux->desc.args[i];
    484     if (av->nparts != 0) return 0;
    485     if (av->storage.kind != OPK_REG && av->storage.kind != OPK_IMM) return 0;
    486   }
    487 
    488   for (u32 b = 0; b < callee->nblocks; ++b) {
    489     Block* bl = &callee->blocks[b];
    490     for (u32 i = 0; i < bl->ninsts; ++i) {
    491       Inst* in = &bl->insts[i];
    492       if ((IROp)in->op != IR_RET) continue;
    493       IRRetAux* ret_aux = (IRRetAux*)in->extra.aux;
    494       if (!ret_aux || !ret_aux->present) continue;
    495       if (ret_aux->val.nparts != 0 || call_aux->desc.ret.nparts != 0) return 0;
    496       if (call_aux->desc.ret.storage.kind == OPK_IMM) continue;
    497       if (call_aux->desc.ret.storage.kind != OPK_REG) return 0;
    498       if (ret_aux->val.storage.kind != OPK_REG &&
    499           ret_aux->val.storage.kind != OPK_IMM)
    500         return 0;
    501     }
    502   }
    503   return 1;
    504 }
    505 
    506 static int inline_call_site(Func* caller, u32 block_idx, u32 inst_idx,
    507                             Func* callee) {
    508   Block* old_bl = &caller->blocks[block_idx];
    509   Inst* old_insts = old_bl->insts;
    510   u32 old_ninsts = old_bl->ninsts;
    511   u32 old_nsucc = old_bl->nsucc;
    512   u32* old_succ = arena_array(caller->arena, u32, old_nsucc ? old_nsucc : 1u);
    513   for (u32 s = 0; s < old_nsucc; ++s) old_succ[s] = old_bl->succ[s];
    514   Inst call = old_insts[inst_idx];
    515   InlineMap map;
    516   if (!build_inline_map(&map, caller, callee)) return 0;
    517   u32 cont = ir_block_new(caller);
    518 
    519   InstVec pre;
    520   memset(&pre, 0, sizeof pre);
    521   pre.f = caller;
    522   for (u32 i = 0; i < inst_idx; ++i) instvec_push(&pre, &old_insts[i]);
    523   if (!append_param_materialization(&pre, &map, &call)) return 0;
    524   Inst br = make_branch(caller, call.loc);
    525   instvec_push(&pre, &br);
    526 
    527   Block* pre_bl = &caller->blocks[block_idx];
    528   pre_bl->insts = pre.v;
    529   pre_bl->ninsts = pre.n;
    530   pre_bl->cap = pre.cap;
    531   ir_block_set_nsucc(caller, block_idx, 1);
    532   pre_bl = &caller->blocks[block_idx];
    533   pre_bl->succ[0] = map_block(&map, callee->entry);
    534 
    535   Block* cont_bl = &caller->blocks[cont];
    536   InstVec cont_out;
    537   memset(&cont_out, 0, sizeof cont_out);
    538   cont_out.f = caller;
    539   for (u32 i = inst_idx + 1u; i < old_ninsts; ++i)
    540     instvec_push(&cont_out, &old_insts[i]);
    541   cont_bl->insts = cont_out.v;
    542   cont_bl->ninsts = cont_out.n;
    543   cont_bl->cap = cont_out.cap;
    544   ir_block_set_nsucc(caller, cont, old_nsucc);
    545   cont_bl = &caller->blocks[cont];
    546   for (u32 s = 0; s < old_nsucc; ++s) cont_bl->succ[s] = old_succ[s];
    547 
    548   for (u32 b = 0; b < callee->nblocks; ++b) {
    549     Block* src_bl = &callee->blocks[b];
    550     u32 dst_b = map_block(&map, b);
    551     Block* dst_bl = &caller->blocks[dst_b];
    552     InstVec body;
    553     memset(&body, 0, sizeof body);
    554     body.f = caller;
    555     for (u32 i = 0; i < src_bl->ninsts; ++i) {
    556       Inst* src = &src_bl->insts[i];
    557       switch ((IROp)src->op) {
    558         case IR_NOP:
    559         case IR_PARAM_DECL:
    560           break;
    561         case IR_RET: {
    562           if (!append_return_materialization(&body, &map, &call, src)) return 0;
    563           Inst ret_br = make_branch(caller, src->loc);
    564           instvec_push(&body, &ret_br);
    565           break;
    566         }
    567         default: {
    568           Inst cloned = clone_inst(&map, src);
    569           instvec_push(&body, &cloned);
    570           break;
    571         }
    572       }
    573     }
    574     dst_bl = &caller->blocks[dst_b];
    575     dst_bl->insts = body.v;
    576     dst_bl->ninsts = body.n;
    577     dst_bl->cap = body.cap;
    578     if (body.n && (IROp)body.v[body.n - 1u].op == IR_BR && src_bl->ninsts &&
    579         (IROp)src_bl->insts[src_bl->ninsts - 1u].op == IR_RET) {
    580       ir_block_set_nsucc(caller, dst_b, 1);
    581       caller->blocks[dst_b].succ[0] = cont;
    582     } else {
    583       clone_block_succs(&map, &caller->blocks[dst_b], src_bl);
    584     }
    585   }
    586 
    587   inline_rebuild_emit_order(caller, block_idx, &map, cont);
    588   opt_analysis_invalidate(caller, OPT_ANALYSIS_CFG | OPT_ANALYSIS_DEF_USE |
    589                                       OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP);
    590   return 1;
    591 }
    592 
    593 static int caller_growth_ok(FuncSet* fs, Func* caller, const u32* base_cost,
    594                             u32 callee_cost, KitCgInlinePolicy policy) {
    595   if (policy == KIT_CG_INLINE_ALWAYS) return 1;
    596   int idx = funcset_index(fs, caller);
    597   u32 base = idx >= 0 ? base_cost[idx] : func_inline_cost(caller);
    598   u32 growth_limit = base / 4u;
    599   if (growth_limit < INLINE_NORMAL_COST_LIMIT)
    600     growth_limit = INLINE_NORMAL_COST_LIMIT;
    601   if (policy == KIT_CG_INLINE_HINT) {
    602     growth_limit *= 2u;
    603     if (growth_limit < INLINE_HINT_COST_LIMIT)
    604       growth_limit = INLINE_HINT_COST_LIMIT;
    605     if (growth_limit > INLINE_HINT_ABS_GROWTH_LIMIT)
    606       growth_limit = INLINE_HINT_ABS_GROWTH_LIMIT;
    607   } else if (growth_limit > INLINE_ABS_GROWTH_LIMIT) {
    608     growth_limit = INLINE_ABS_GROWTH_LIMIT;
    609   }
    610   return func_inline_cost(caller) + callee_cost <= base + growth_limit;
    611 }
    612 
    613 static int try_inline_call(FuncSet* fs, Func* caller, u32 b, u32 i,
    614                            const u32* base_cost) {
    615   Inst* in = &caller->blocks[b].insts[i];
    616   Func* callee = direct_callee(fs, in);
    617   u32 cost = 0;
    618   KitCgInlinePolicy policy;
    619   if (!callee) return 0;
    620   policy = effective_inline_policy(in, callee);
    621   metrics_count(caller->c, "opt.inline.candidates", 1);
    622   if (policy == KIT_CG_INLINE_NEVER) {
    623     metrics_count(caller->c, "opt.inline.refuse_policy", 1);
    624     return 0;
    625   }
    626   if (recursive_or_scc(fs, caller, callee)) {
    627     metrics_count(caller->c, "opt.inline.refuse_scc", 1);
    628     return 0;
    629   }
    630   if (!callee_inline_shape(callee, policy, &cost)) {
    631     metrics_count(caller->c, "opt.inline.refuse_shape", 1);
    632     return 0;
    633   }
    634   if (!caller_growth_ok(fs, caller, base_cost, cost, policy)) {
    635     metrics_count(caller->c, "opt.inline.refuse_growth", 1);
    636     return 0;
    637   }
    638   if (!inline_rewrite_supported(callee, in)) {
    639     metrics_count(caller->c, "opt.inline.refuse_rewrite_shape", 1);
    640     return 0;
    641   }
    642   if (!inline_call_site(caller, b, i, callee)) {
    643     metrics_count(caller->c, "opt.inline.refuse_rewrite", 1);
    644     return 0;
    645   }
    646   metrics_count(caller->c, "opt.inline.inlined", 1);
    647   return 1;
    648 }
    649 
    650 static void inline_order_visit(InlineOrderCtx* ctx, Func* f) {
    651   int idx = funcset_index(ctx->fs, f);
    652   if (idx < 0 || ctx->done[idx]) return;
    653   if (ctx->temp[idx]) return;
    654   ctx->temp[idx] = 1;
    655   for (u32 b = 0; b < f->nblocks; ++b) {
    656     Block* bl = &f->blocks[b];
    657     for (u32 i = 0; i < bl->ninsts; ++i) {
    658       Func* callee = direct_callee(ctx->fs, &bl->insts[i]);
    659       if (callee) inline_order_visit(ctx, callee);
    660     }
    661   }
    662   ctx->temp[idx] = 0;
    663   ctx->done[idx] = 1;
    664   ctx->order[ctx->norder++] = f;
    665 }
    666 
    667 void opt_inline(FuncSet* fs, int max_iters) {
    668   if (!fs || fs->nfuncs == 0 || max_iters <= 0) return;
    669   if (max_iters > 4) max_iters = 4;
    670   u32* base_cost = arena_array(fs->arena, u32, fs->nfuncs);
    671   for (u32 i = 0; i < fs->nfuncs; ++i)
    672     base_cost[i] = func_inline_cost(fs->funcs[i]);
    673   for (int iter = 0; iter < max_iters; ++iter) {
    674     int changed = 0;
    675     InlineOrderCtx ctx;
    676     memset(&ctx, 0, sizeof ctx);
    677     ctx.fs = fs;
    678     ctx.temp = arena_zarray(fs->arena, u8, fs->nfuncs);
    679     ctx.done = arena_zarray(fs->arena, u8, fs->nfuncs);
    680     ctx.order = arena_array(fs->arena, Func*, fs->nfuncs);
    681     for (u32 fidx = 0; fidx < fs->nfuncs; ++fidx)
    682       inline_order_visit(&ctx, fs->funcs[fidx]);
    683 
    684     for (u32 fidx = 0; fidx < ctx.norder; ++fidx) {
    685       Func* caller = ctx.order[fidx];
    686       if (!caller || caller->opt_reg_ssa || caller->opt_rewritten) continue;
    687       for (u32 b = 0; b < caller->nblocks; ++b) {
    688         Block* bl = &caller->blocks[b];
    689         for (u32 i = 0; i < bl->ninsts; ++i) {
    690           if ((IROp)bl->insts[i].op != IR_CALL) continue;
    691           if (try_inline_call(fs, caller, b, i, base_cost)) {
    692             changed = 1;
    693             bl = &caller->blocks[b];
    694           }
    695         }
    696       }
    697     }
    698     if (!changed) break;
    699   }
    700 }
    701 
    702 /* Streaming single-caller variant for the O1 pipeline. Same gates as
    703  * try_inline_call, minus the whole-program growth check (which needs a
    704  * base_cost array the streaming path lacks), plus the tiny cost cap. */
    705 static int try_tiny_inline_call(FuncSet* fs, Func* caller, u32 b, u32 i) {
    706   Inst* in = &caller->blocks[b].insts[i];
    707   Func* callee = direct_callee(fs, in);
    708   u32 cost = 0;
    709   KitCgInlinePolicy policy;
    710   if (!callee) return 0;
    711   policy = effective_inline_policy(in, callee);
    712   metrics_count(caller->c, "opt.tiny_inline.candidates", 1);
    713   if (policy == KIT_CG_INLINE_NEVER) {
    714     metrics_count(caller->c, "opt.tiny_inline.refuse_policy", 1);
    715     return 0;
    716   }
    717   if (recursive_or_scc(fs, caller, callee)) {
    718     metrics_count(caller->c, "opt.tiny_inline.refuse_scc", 1);
    719     return 0;
    720   }
    721   if (!callee_inline_shape(callee, policy, &cost)) {
    722     metrics_count(caller->c, "opt.tiny_inline.refuse_shape", 1);
    723     return 0;
    724   }
    725   if (policy != KIT_CG_INLINE_ALWAYS && cost > INLINE_TINY_COST_LIMIT) {
    726     metrics_count(caller->c, "opt.tiny_inline.refuse_budget", 1);
    727     return 0;
    728   }
    729   if (!inline_rewrite_supported(callee, in)) {
    730     metrics_count(caller->c, "opt.tiny_inline.refuse_rewrite_shape", 1);
    731     return 0;
    732   }
    733   if (!inline_call_site(caller, b, i, callee)) {
    734     metrics_count(caller->c, "opt.tiny_inline.refuse_rewrite", 1);
    735     return 0;
    736   }
    737   metrics_count(caller->c, "opt.tiny_inline.inlined", 1);
    738   return 1;
    739 }
    740 
    741 int opt_try_tiny_inline(Func* caller, OptInlineCalleeLookup lookup, void* ctx) {
    742   int total = 0;
    743   if (!caller || !lookup || caller->opt_reg_ssa || caller->opt_rewritten)
    744     return 0;
    745   for (u32 pass = 0; pass < TINY_INLINE_MAX_PASSES; ++pass) {
    746     int changed = 0;
    747     for (u32 b = 0; b < caller->nblocks && !changed; ++b) {
    748       Block* bl = &caller->blocks[b];
    749       for (u32 i = 0; i < bl->ninsts; ++i) {
    750         Inst* in = &bl->insts[i];
    751         if ((IROp)in->op != IR_CALL) continue;
    752         IRCallAux* aux = (IRCallAux*)in->extra.aux;
    753         if (!aux || aux->desc.callee.kind != OPK_GLOBAL) continue;
    754         Func* callee = lookup(ctx, aux->desc.callee.v.global.sym);
    755         if (!callee) continue;
    756         /* Ad-hoc 2-element FuncSet so the existing gates (direct_callee,
    757          * recursive_or_scc) resolve the callee by symbol. */
    758         Func* funcs[2] = {caller, callee};
    759         FuncSet fs;
    760         memset(&fs, 0, sizeof fs);
    761         fs.c = caller->c;
    762         fs.arena = caller->arena;
    763         fs.funcs = funcs;
    764         fs.nfuncs = (caller == callee) ? 1u : 2u;
    765         fs.cap = fs.nfuncs;
    766         if (try_tiny_inline_call(&fs, caller, b, i)) {
    767           ++total;
    768           changed = 1; /* indices invalidated by the split; restart scan */
    769           break;
    770         }
    771       }
    772     }
    773     if (!changed) break;
    774   }
    775   return total;
    776 }