kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

pass_lower.c (81403B)


      1 #include <stdlib.h>
      2 #include <string.h>
      3 
      4 #include "core/arena.h"
      5 #include "core/core.h"
      6 #include "core/metrics.h"
      7 #include "core/pool.h"
      8 #include "core/slice.h"
      9 #include "core/strbuf.h"
     10 #include "opt/opt_internal.h"
     11 
     12 enum {
     13   OPT_CG_TYPE_SEG_SHIFT = 6,
     14   OPT_CG_TYPE_SEG_MASK = (1u << OPT_CG_TYPE_SEG_SHIFT) - 1u,
     15   OPT_CG_TYPE_BUILTIN_SEG = 1u,
     16 };
     17 
     18 static u32 type_size_fallback(const Func* f, KitCgTypeId t) {
     19   KitCgBuiltinType b;
     20   if (!t) return f->opt_target.ptr_size ? f->opt_target.ptr_size : 8u;
     21   if ((t >> OPT_CG_TYPE_SEG_SHIFT) != OPT_CG_TYPE_BUILTIN_SEG) {
     22     return f->opt_target.ptr_size ? f->opt_target.ptr_size : 8u;
     23   }
     24   b = (KitCgBuiltinType)(t & OPT_CG_TYPE_SEG_MASK);
     25   switch (b) {
     26     case KIT_CG_BUILTIN_BOOL:
     27     case KIT_CG_BUILTIN_I8:
     28       return 1;
     29     case KIT_CG_BUILTIN_I16:
     30       return 2;
     31     case KIT_CG_BUILTIN_I32:
     32     case KIT_CG_BUILTIN_F32:
     33       return 4;
     34     case KIT_CG_BUILTIN_I64:
     35     case KIT_CG_BUILTIN_F64:
     36       return 8;
     37     case KIT_CG_BUILTIN_I128:
     38       return 16;
     39     case KIT_CG_BUILTIN_VOID:
     40       return 0;
     41     case KIT_CG_BUILTIN_VARARG_STATE:
     42     case KIT_CG_BUILTIN_COUNT:
     43     default:
     44       return f->opt_target.ptr_size ? f->opt_target.ptr_size : 8u;
     45   }
     46 }
     47 
     48 static u32 bit_words(u32 npregs) { return (npregs + 63u) / 64u; }
     49 
     50 static void bit_set(u64* bits, PReg v) {
     51   u32 w = v / 64u;
     52   u64 mask = 1ull << (v % 64u);
     53   u64 old = bits[w];
     54   bits[w] = old | mask;
     55 }
     56 static void bit_clear(u64* bits, PReg v) {
     57   u32 w = v / 64u;
     58   u64 mask = 1ull << (v % 64u);
     59   u64 old = bits[w];
     60   bits[w] = old & ~mask;
     61 }
     62 static int bit_has(const u64* bits, PReg v) {
     63   u32 w = v / 64u;
     64   u64 mask = 1ull << (v % 64u);
     65   u64 old = bits[w];
     66   return (old & mask) != 0;
     67 }
     68 
     69 typedef struct BitsCtx {
     70   u64* use;
     71   u64* def;
     72 } BitsCtx;
     73 
     74 typedef struct InstRefs {
     75   PReg* uses;
     76   PReg* defs;
     77   u32 nuses;
     78   u32 ndefs;
     79   u32 use_cap;
     80   u32 def_cap;
     81 } InstRefs;
     82 
     83 static void collect_bits(Func* f, Inst* in, Operand* op, int is_def,
     84                          void* arg) {
     85   (void)in;
     86   BitsCtx* c = (BitsCtx*)arg;
     87   if (op->kind != OPK_REG) return;
     88   PReg v = (PReg)op->v.reg;
     89   if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return;
     90   if (is_def)
     91     bit_set(c->def, v);
     92   else
     93     bit_set(c->use, v);
     94 }
     95 
     96 static void refs_reset(InstRefs* refs) {
     97   refs->nuses = 0;
     98   refs->ndefs = 0;
     99 }
    100 
    101 static void refs_push(Func* f, PReg** pregs, u32* npregs, u32* cap, PReg v) {
    102   for (u32 i = 0; i < *npregs; ++i)
    103     if ((*pregs)[i] == v) return;
    104   if (*npregs == *cap) {
    105     u32 ncap = *cap ? *cap * 2u : 8u;
    106     PReg* nv = arena_array(f->arena, PReg, ncap);
    107     if (*pregs) memcpy(nv, *pregs, sizeof((*pregs)[0]) * *npregs);
    108     *pregs = nv;
    109     *cap = ncap;
    110   }
    111   (*pregs)[(*npregs)++] = v;
    112 }
    113 
    114 static void refs_collect(Func* f, Inst* in, Operand* op, int is_def,
    115                          void* arg) {
    116   (void)in;
    117   InstRefs* refs = (InstRefs*)arg;
    118   if (op->kind != OPK_REG) return;
    119   PReg v = (PReg)op->v.reg;
    120   if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return;
    121   if (is_def)
    122     refs_push(f, &refs->defs, &refs->ndefs, &refs->def_cap, v);
    123   else
    124     refs_push(f, &refs->uses, &refs->nuses, &refs->use_cap, v);
    125 }
    126 
    127 static int refs_has_def(const InstRefs* refs, PReg v) {
    128   for (u32 i = 0; i < refs->ndefs; ++i)
    129     if (refs->defs[i] == v) return 1;
    130   return 0;
    131 }
    132 
    133 static void live_update_refs_before(u64* live, const InstRefs* refs) {
    134   for (u32 i = 0; i < refs->ndefs; ++i) bit_clear(live, refs->defs[i]);
    135   for (u32 i = 0; i < refs->nuses; ++i) bit_set(live, refs->uses[i]);
    136 }
    137 
    138 static u32 live_update_refs_before_active(u64* live, u32 active_words,
    139                                           u32 nwords, const InstRefs* refs) {
    140   for (u32 i = 0; i < refs->ndefs; ++i) {
    141     PReg v = refs->defs[i];
    142     if (v == PREG_NONE || v == 0) continue;
    143     u32 w = v / 64u;
    144     if (w < active_words) live[w] &= ~(1ull << (v % 64u));
    145   }
    146   while (active_words && live[active_words - 1u] == 0) --active_words;
    147   for (u32 i = 0; i < refs->nuses; ++i) {
    148     PReg v = refs->uses[i];
    149     if (v == PREG_NONE || v == 0) continue;
    150     u32 w = v / 64u;
    151     if (w >= nwords) continue;
    152     live[w] |= 1ull << (v % 64u);
    153     if (active_words <= w) active_words = w + 1u;
    154   }
    155   return active_words;
    156 }
    157 
    158 static void forbid_preg_reg(Func* f, PReg v, u8 cls, Reg r) {
    159   if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f) ||
    160       cls >= OPT_REG_CLASSES || r >= 32)
    161     return;
    162   if (opt_reg_cls(f, v) != cls) return;
    163   f->preg_info[v].forbidden_hard_regs |= 1u << r;
    164 }
    165 
    166 static void apply_fixed_asm_operand(Func* f, Operand* op, i32 fixed,
    167                                     u8 fixed_cls) {
    168   if (!op || op->kind != OPK_REG || fixed < 0) return;
    169   PReg v = (PReg)op->v.reg;
    170   if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return;
    171   if (fixed_cls >= OPT_REG_CLASSES || opt_reg_cls(f, v) != fixed_cls) {
    172     SrcLoc loc = {0, 0, 0};
    173     compiler_panic(f->c, loc, "opt asm: fixed register class mismatch");
    174   }
    175   f->preg_info[v].tied_hard_reg = fixed;
    176 }
    177 
    178 static void apply_allowed_asm_operand(Func* f, Operand* op, u32 allowed,
    179                                       u8 allowed_cls) {
    180   u32 hard_mask = 0;
    181   if (!op || op->kind != OPK_REG || !allowed) return;
    182   PReg v = (PReg)op->v.reg;
    183   if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return;
    184   if (allowed_cls >= OPT_REG_CLASSES || opt_reg_cls(f, v) != allowed_cls) {
    185     SrcLoc loc = {0, 0, 0};
    186     compiler_panic(f->c, loc, "opt asm: allowed register class mismatch");
    187   }
    188   for (u32 i = 0; i < f->opt_hard_reg_count[allowed_cls]; ++i) {
    189     Reg r = f->opt_hard_regs[allowed_cls][i];
    190     if (r < 32) hard_mask |= 1u << r;
    191   }
    192   if (f->preg_info[v].allowed_hard_regs) {
    193     u32 both = f->preg_info[v].allowed_hard_regs & allowed;
    194     if (!both) {
    195       SrcLoc loc = {0, 0, 0};
    196       compiler_panic(f->c, loc, "opt asm: conflicting allowed register sets");
    197     }
    198     f->preg_info[v].allowed_hard_regs = both;
    199   } else {
    200     f->preg_info[v].allowed_hard_regs = allowed;
    201   }
    202   f->preg_info[v].forbidden_hard_regs |= hard_mask & ~allowed;
    203 }
    204 
    205 static void apply_asm_register_constraints(Func* f, Inst* in, u64* use,
    206                                            u64* def, u64* live_after) {
    207   IRAsmAux* aux = (IRAsmAux*)in->extra.aux;
    208   if (!aux || !f->preg_info) return;
    209 
    210   for (u32 i = 0; i < aux->nout; ++i) {
    211     i32 fixed = aux->out_fixed_regs ? aux->out_fixed_regs[i] : -1;
    212     u8 cls = aux->out_fixed_cls ? aux->out_fixed_cls[i] : 0;
    213     u32 allowed = aux->out_allowed_masks ? aux->out_allowed_masks[i] : 0;
    214     u8 allowed_cls = aux->out_allowed_cls ? aux->out_allowed_cls[i] : 0;
    215     apply_allowed_asm_operand(f, &aux->out_ops[i], allowed, allowed_cls);
    216     apply_fixed_asm_operand(f, &aux->out_ops[i], fixed, cls);
    217   }
    218   for (u32 i = 0; i < aux->nin; ++i) {
    219     i32 fixed = aux->in_fixed_regs ? aux->in_fixed_regs[i] : -1;
    220     u8 cls = aux->in_fixed_cls ? aux->in_fixed_cls[i] : 0;
    221     u32 allowed = aux->in_allowed_masks ? aux->in_allowed_masks[i] : 0;
    222     u8 allowed_cls = aux->in_allowed_cls ? aux->in_allowed_cls[i] : 0;
    223     apply_allowed_asm_operand(f, &aux->in_ops[i], allowed, allowed_cls);
    224     apply_fixed_asm_operand(f, &aux->in_ops[i], fixed, cls);
    225   }
    226 
    227   for (u32 cls = 0; cls < OPT_REG_CLASSES; ++cls) {
    228     u32 mask = aux->clobber_mask[cls];
    229     if (!mask) continue;
    230     for (Reg r = 0; r < 32; ++r) {
    231       if ((mask & (1u << r)) == 0) continue;
    232       for (PReg v = 1; v < opt_reg_count(f); ++v) {
    233         if (!bit_has(use, v) && !bit_has(def, v) &&
    234             !(live_after && bit_has(live_after, v)))
    235           continue;
    236         forbid_preg_reg(f, v, (u8)cls, r);
    237       }
    238     }
    239   }
    240 }
    241 
    242 /* Apply the per-instruction fixed-register clobbers recorded in machinization
    243  * (Func.inst_clobbers). A register the instruction's encoding destroys must not
    244  * hold any value live AFTER the instruction unless that value is (re)defined
    245  * here — so forbid each clobbered register for every live-after, non-def value.
    246  * Values that merely die at the instruction (dying uses) need no constraint:
    247  * the backend stages them into/out of the fixed registers itself. */
    248 static void apply_machine_reg_clobbers(Func* f, Inst* in, u64* def,
    249                                        u64* live_after) {
    250   if (!f->preg_info || !f->inst_clobbers || in->id == INST_ID_NONE ||
    251       in->id >= f->inst_clobbers_cap)
    252     return;
    253   for (u32 cls = 0; cls < OPT_REG_CLASSES; ++cls) {
    254     u32 mask = f->inst_clobbers[in->id][cls];
    255     if (!mask) continue;
    256     for (Reg r = 0; r < 32; ++r) {
    257       if ((mask & (1u << r)) == 0) continue;
    258       for (PReg v = 1; v < opt_reg_count(f); ++v) {
    259         if (!(live_after && bit_has(live_after, v)) || bit_has(def, v))
    260           continue;
    261         if ((u8)opt_reg_cls(f, v) != (u8)cls) continue;
    262         f->preg_info[v].forbidden_hard_regs |= 1u << r;
    263         /* Record this as a hard clobber so the return-register hint won't later
    264          * clear the forbid and place the value in a register the instruction
    265          * destroys (see set_preg_pref_to_ret_reg). */
    266         f->preg_info[v].clobbered_hard_regs |= 1u << r;
    267       }
    268     }
    269   }
    270 }
    271 
    272 static int phys_arg_reg_for_index(Func* f, u8 cls, u32 abi_index, Reg* out) {
    273   if (!f || cls >= OPT_REG_CLASSES) return 0;
    274   for (u32 i = 0; i < f->opt_phys_reg_count[cls]; ++i) {
    275     const CGPhysRegInfo* pi = &f->opt_phys_regs[cls][i];
    276     if ((pi->flags & CG_REG_ARG) && pi->abi_index == abi_index) {
    277       if (out) *out = pi->reg;
    278       return 1;
    279     }
    280   }
    281   return 0;
    282 }
    283 
    284 static int hard_available(Func* f, u8 cls, Reg r);
    285 
    286 static int is_caller_saved(Func* f, u8 cls, Reg r) {
    287   if (cls >= OPT_REG_CLASSES || r >= 32) return 0;
    288   return (f->opt_caller_saved[cls] & (1u << r)) != 0;
    289 }
    290 
    291 static Reg first_ret_reg(Func* f, u8 cls) {
    292   if (!f || cls >= OPT_REG_CLASSES) return REG_NONE;
    293   u32 mask = f->opt_ret_regs[cls];
    294   for (Reg r = 0; r < 32; ++r)
    295     if (mask & (1u << r)) return r;
    296   return REG_NONE;
    297 }
    298 
    299 static void set_preg_pref_to_ret_reg(Func* f, const Operand* op) {
    300   if (!op || op->kind != OPK_REG) return;
    301   PReg v = (PReg)op->v.reg;
    302   if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return;
    303   u8 cls = f->preg_info[v].cls;
    304   if (cls >= OPT_REG_CLASSES) return;
    305   Reg hint = first_ret_reg(f, cls);
    306   if (hint == REG_NONE || hint >= 32) return;
    307   /* Don't override a real pin. */
    308   if (f->preg_info[v].tied_hard_reg >= 0) return;
    309   /* A value live across an instruction that clobbers the ret reg cannot live
    310    * there; skip the hint so the allocator places it elsewhere and the return
    311    * copy moves it into the ret reg (e.g. an accumulator returned past a va_arg,
    312    * which uses rax). */
    313   if (f->preg_info[v].clobbered_hard_regs & (1u << hint)) return;
    314   /* The hint reg may not be in opt_hard_regs (e.g. x0 on aa64 is reserved
    315    * as the ABI ret reg, outside aa_int_allocable); the allocator's
    316    * preferred-reg branch will still consider it via the unit-overlap
    317    * precision check. Clear any leftover soft forbid bit so the hint isn't
    318    * silently blocked. */
    319   f->preg_info[v].forbidden_hard_regs &= ~(1u << hint);
    320   f->preg_info[v].preferred_hard_reg = (i8)hint;
    321 }
    322 
    323 static void set_preg_pref_for_abivalue(Func* f, const CGABIValue* v) {
    324   if (!v) return;
    325   set_preg_pref_to_ret_reg(f, &v->storage);
    326   for (u32 i = 0; i < v->nparts; ++i)
    327     set_preg_pref_to_ret_reg(f, &v->parts[i].op);
    328 }
    329 
    330 /* Soft hint: prefer a specific ABI register for `op`'s PReg. Symmetric to
    331  * set_preg_pref_to_ret_reg but takes an arbitrary hint reg (the matching
    332  * arg reg for the i-th call argument). */
    333 static void set_preg_pref_to_arg_reg(Func* f, const Operand* op, Reg hint) {
    334   if (!op || op->kind != OPK_REG) return;
    335   if (hint == REG_NONE || hint >= 32) return;
    336   PReg v = (PReg)op->v.reg;
    337   if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return;
    338   u8 cls = f->preg_info[v].cls;
    339   if (cls >= OPT_REG_CLASSES) return;
    340   if (f->preg_info[v].tied_hard_reg >= 0) return;
    341   if (f->preg_info[v].preferred_hard_reg >= 0) return;
    342   f->preg_info[v].forbidden_hard_regs &= ~(1u << hint);
    343   f->preg_info[v].preferred_hard_reg = (i8)hint;
    344 }
    345 
    346 /* Hint each single-PReg-stored param toward its own incoming ABI reg. When
    347  * the allocator picks the incoming reg, bind_param sees src==dst and emits
    348  * no entry move (aa_bind_native_param checks at native.c:3227). Live-range
    349  * conflicts at body use sites still go through the normal allocator check,
    350  * so cross-call params that need a callee-save get one. */
    351 /* True iff `f` contains any IR_CALL flagged as a tail call. Tail-call arg
    352  * routing goes through the backend shuffle which can permute the caller's
    353  * incoming arg regs into different positions for the callee — pinning each
    354  * param PReg to its own incoming reg turns those permutations into multi-reg
    355  * cycles the shuffle can't break. Symmetric to the per-call tail skip in
    356  * set_preg_pref_for_call_args. */
    357 static int func_has_tail_call(const Func* f) {
    358   if (!f) return 0;
    359   for (u32 b = 0; b < f->nblocks; ++b) {
    360     const Block* bl = &f->blocks[b];
    361     for (u32 i = 0; i < bl->ninsts; ++i) {
    362       const Inst* in = &bl->insts[i];
    363       if ((IROp)in->op != IR_CALL) continue;
    364       const IRCallAux* aux = (const IRCallAux*)in->extra.aux;
    365       if (aux && (aux->desc.flags & CG_CALL_TAIL)) return 1;
    366     }
    367   }
    368   return 0;
    369 }
    370 
    371 static void set_preg_pref_for_params(Func* f) {
    372   if (!f || !f->preg_info || !f->nparams) return;
    373   if (func_has_tail_call(f)) return;
    374   /* Per-class ABI arg cursors. Drives from per-param ABI info rather than
    375    * f->desc.abi so this fires on paths where only f->params[i].abi is set. */
    376   u32 next_int = 0;
    377   u32 next_fp = 0;
    378   /* An sret pointer passed in the first integer argument register consumes
    379    * that slot (SysV-x64 rdi, Win64 rcx, RISC-V a0); ABIs that return it in a
    380    * dedicated register (AArch64 x8) do not. Driven by the ABI descriptor so no
    381    * arch identity is needed here. */
    382   if (f->desc.abi && f->desc.abi->sret_consumes_int_arg) next_int = 1;
    383   for (u32 i = 0; i < f->nparams; ++i) {
    384     IRParam* p = &f->params[i];
    385     const ABIArgInfo* ai = p->abi;
    386     if (!ai || ai->kind == ABI_ARG_IGNORE) continue;
    387     if (ai->kind == ABI_ARG_INDIRECT) {
    388       ++next_int;
    389       continue;
    390     }
    391     if (ai->kind != ABI_ARG_DIRECT) continue;
    392     /* Only hint single-part DIRECT params whose home is a single PReg.
    393      * Aggregate / split params take the bind_param frame-store path. */
    394     int single_part_to_preg =
    395         (ai->nparts == 1) && (p->storage.kind == CG_LOCAL_STORAGE_REG);
    396     if (single_part_to_preg) {
    397       const ABIArgPart* part = &ai->parts[0];
    398       u8 cls = (part->cls == ABI_CLASS_FP) ? RC_FP : RC_INT;
    399       u32* counter = (cls == RC_FP) ? &next_fp : &next_int;
    400       Reg hint = REG_NONE;
    401       if (*counter < 8u && phys_arg_reg_for_index(f, cls, *counter, &hint)) {
    402         PReg v = (PReg)p->storage.v.reg;
    403         if (v != PREG_NONE && v != 0 && v < opt_reg_count(f) &&
    404             f->preg_info[v].cls == cls && f->preg_info[v].tied_hard_reg < 0 &&
    405             f->preg_info[v].preferred_hard_reg < 0 && hint != REG_NONE &&
    406             hint < 32) {
    407           f->preg_info[v].forbidden_hard_regs &= ~(1u << hint);
    408           f->preg_info[v].preferred_hard_reg = (i8)hint;
    409         }
    410       }
    411     }
    412     /* Advance the ABI cursors for every part of this param's home, regardless
    413      * of whether we hinted, so subsequent params see the right slot. */
    414     for (u16 j = 0; j < ai->nparts; ++j) {
    415       u32* c = (ai->parts[j].cls == ABI_CLASS_FP) ? &next_fp : &next_int;
    416       *c += 1u;
    417     }
    418   }
    419 }
    420 
    421 /* For each IR_CALL arg whose source storage is a single OPK_REG, hint that
    422  * PReg to the matching ABI arg register. Sequential int/fp counters mirror
    423  * the per-class arg slot assignment used by set_preg_pref_for_params. Skips
    424  * variadic, has_sret, and indirect/aggregate args: they need per-target
    425  * counter logic that hasn't been factored out of plan_call. */
    426 static void set_preg_pref_for_call_args(Func* f, const CGCallDesc* desc) {
    427   if (!f || !desc) return;
    428   /* Tail calls handle arg routing through the tail-call shuffle in the
    429    * backend, which can resolve permutations (e.g. swap of caller's incoming
    430    * x0/x1 into tail-call x1/x0). Hinting the arg source PRegs across that
    431    * shuffle creates cycles bind_param / the entry-bind moves can't unbreak,
    432    * miscompiling permute / cycle cases (toy 24, 27, 28, ...). Skip. */
    433   if (desc->flags & CG_CALL_TAIL) return;
    434   const ABIFuncInfo* abi = desc->abi;
    435   if (!abi && f->c && f->c->abi)
    436     abi = abi_cg_func_info(f->c->abi, desc->fn_type);
    437   if (!abi || abi->variadic || abi->has_sret) return;
    438   u32 next_int = 0, next_fp = 0;
    439   for (u32 i = 0; i < desc->nargs; ++i) {
    440     if (i >= abi->nparams) break;
    441     const CGABIValue* a = &desc->args[i];
    442     const ABIArgInfo* ai = &abi->params[i];
    443     if (ai->kind == ABI_ARG_IGNORE) continue;
    444     if (ai->kind == ABI_ARG_INDIRECT) {
    445       ++next_int;
    446       continue;
    447     }
    448     if (ai->kind != ABI_ARG_DIRECT || ai->nparts == 0) continue;
    449     const ABIArgPart* part0 = &ai->parts[0];
    450     u8 cls = part0->cls == ABI_CLASS_FP ? RC_FP : RC_INT;
    451     u32* counter = cls == RC_FP ? &next_fp : &next_int;
    452     if (*counter < 8u && ai->nparts == 1) {
    453       Reg hint = REG_NONE;
    454       if (phys_arg_reg_for_index(f, cls, *counter, &hint))
    455         set_preg_pref_to_arg_reg(f, &a->storage, hint);
    456     }
    457     for (u16 p = 0; p < ai->nparts; ++p) {
    458       u32* c = (ai->parts[p].cls == ABI_CLASS_FP) ? &next_fp : &next_int;
    459       *c += 1u;
    460     }
    461   }
    462 }
    463 
    464 /* Propagate preferred_hard_reg backward across IR_COPY chains. When the
    465  * frontend emits `copy def=v_arg opnds=[v_arg, v_value]` to carry a value
    466  * into a call's arg slot, set_preg_pref_for_call_args hints v_arg to the
    467  * ABI dest (e.g. x0). But the underlying producer (v_value, e.g. the load
    468  * result) is unhinted and lands in a generic caller-save (e.g. x8); the
    469  * copy then emits `mov x0, x8`. Propagating the hint from v_arg to v_value
    470  * lets both land at x0 and turns the copy into an identity move that
    471  * combine elides. Walks insts; for each IR_COPY whose def has a hint and
    472  * whose single OPK_REG source operand has none, propagate (and clear the
    473  * source's forbid for the hint reg). Safe because the copy itself dies
    474  * once both sides share the reg. */
    475 static void propagate_hint_through_copies(Func* f) {
    476   if (!f || !f->preg_info) return;
    477   for (u32 b = 0; b < f->nblocks; ++b) {
    478     Block* bl = &f->blocks[b];
    479     for (u32 i = 0; i < bl->ninsts; ++i) {
    480       Inst* in = &bl->insts[i];
    481       if ((IROp)in->op != IR_COPY) continue;
    482       if (in->nopnds < 2 || in->opnds[0].kind != OPK_REG ||
    483           in->opnds[1].kind != OPK_REG)
    484         continue;
    485       PReg dst = (PReg)in->opnds[0].v.reg;
    486       PReg src = (PReg)in->opnds[1].v.reg;
    487       if (dst == PREG_NONE || dst == 0 || dst >= opt_reg_count(f)) continue;
    488       if (src == PREG_NONE || src == 0 || src >= opt_reg_count(f)) continue;
    489       i8 dst_pref = f->preg_info[dst].preferred_hard_reg;
    490       if (dst_pref < 0) continue;
    491       if (f->preg_info[src].tied_hard_reg >= 0) continue;
    492       if (f->preg_info[src].preferred_hard_reg >= 0) continue;
    493       if (f->preg_info[dst].cls != f->preg_info[src].cls) continue;
    494       /* Don't propagate the hint into a value clobbered out of that register by
    495        * a machine instruction live across it (e.g. a loop accumulator returned
    496        * past a va_arg/idiv): it cannot live there. Leave it allocated
    497        * elsewhere; the copy moves it into the hinted register. */
    498       if (f->preg_info[src].clobbered_hard_regs & (1u << (Reg)dst_pref))
    499         continue;
    500       f->preg_info[src].forbidden_hard_regs &= ~(1u << (Reg)dst_pref);
    501       f->preg_info[src].preferred_hard_reg = dst_pref;
    502     }
    503   }
    504 }
    505 
    506 /* Set a soft "prefer the ABI return reg" hint on:
    507  *   - IR_CALL result PRegs (so emit_call's `mov result, x0` is elided)
    508  *   - IR_RET value PRegs   (so emit_ret's `mov x0, value` is elided)
    509  *   - IR_CALL arg source PRegs (so emit_call's `mov x0, src` is elided);
    510  *     hints are propagated backward through IR_COPY so the actual producer
    511  *     of the value also prefers the ABI dest reg.
    512  *
    513  * The hint is a tie-breaker only — see hard_reg_alloc_score. The allocator's
    514  * existing conflict checks already exclude regs with real interference (e.g.
    515  * a result PReg live across another call cannot pick x0). */
    516 static void apply_abi_aliasing_hints(Func* f) {
    517   if (!f || !f->preg_info) return;
    518   set_preg_pref_for_params(f);
    519   for (u32 b = 0; b < f->nblocks; ++b) {
    520     Block* bl = &f->blocks[b];
    521     for (u32 i = 0; i < bl->ninsts; ++i) {
    522       Inst* in = &bl->insts[i];
    523       if ((IROp)in->op == IR_CALL) {
    524         IRCallAux* aux = (IRCallAux*)in->extra.aux;
    525         if (aux) {
    526           set_preg_pref_for_abivalue(f, &aux->desc.ret);
    527           set_preg_pref_for_call_args(f, &aux->desc);
    528         }
    529       } else if ((IROp)in->op == IR_RET) {
    530         IRRetAux* aux = (IRRetAux*)in->extra.aux;
    531         if (aux && aux->present) set_preg_pref_for_abivalue(f, &aux->val);
    532       }
    533     }
    534   }
    535   propagate_hint_through_copies(f);
    536 }
    537 
    538 /* ---------------------------------------------------------------------------
    539  * Register allocator, MIR-shaped.
    540  *
    541  * Data structures and assignment algorithm mirror MIR's reg_alloc/assign
    542  * (mir-gen.c:7551-7728, simplified_p branch). Conflict detection uses a
    543  * point-indexed bitmap of locations (hard regs + stack slots) instead of
    544  * a sorted interval vector per hard register.
    545  *
    546  *   used_locs[p * loc_words .. p * loc_words + loc_words)  -- one row per
    547  *                                                             compressed
    548  *                                                             program point
    549  *
    550  *   Bit indices:
    551  *     0 .. hard_loc_bits-1   -> hard registers (hard_loc_bit(cls, r))
    552  *     hard_loc_bits + k      -> stack slot index k (k < stack_slot_count)
    553  *
    554  * Live-range splitting (`get_hard_reg_with_split`, `lr_gap_t`, `split()`)
    555  * is deferred per doc/plan/OPTIMIZER.md.
    556  * ------------------------------------------------------------------------- */
    557 
    558 typedef struct OptAllocator OptAllocator;
    559 
    560 static int hard_available(Func* f, u8 cls, Reg r) {
    561   if (cls >= OPT_REG_CLASSES) return 0;
    562   for (u32 i = 0; i < f->opt_hard_reg_count[cls]; ++i)
    563     if (f->opt_hard_regs[cls][i] == r) return 1;
    564   return 0;
    565 }
    566 
    567 static const CGPhysRegInfo* phys_info_for(Func* f, u8 cls, Reg r) {
    568   if (cls >= OPT_REG_CLASSES) return NULL;
    569   for (u32 i = 0; i < f->opt_phys_reg_count[cls]; ++i)
    570     if (f->opt_phys_regs[cls][i].reg == r) return &f->opt_phys_regs[cls][i];
    571   return NULL;
    572 }
    573 
    574 static FrameSlot spill_slot_for(Func* f, PReg v) {
    575   FrameSlot existing = opt_preg_spill_slot(f, v);
    576   if (existing != FRAME_SLOT_NONE) return existing;
    577   FrameSlotDesc d;
    578   memset(&d, 0, sizeof d);
    579   d.type = opt_reg_type(f, v);
    580   d.size = type_size_fallback(f, opt_reg_type(f, v));
    581   d.align = d.size >= 8 ? 8 : d.size;
    582   d.kind = FS_SPILL;
    583   f->preg_info[v].spill_slot = ir_frame_slot_new(f, &d);
    584   return f->preg_info[v].spill_slot;
    585 }
    586 
    587 static u32 hard_loc_bit(u8 cls, Reg r) { return ((u32)cls * 32u) + (u32)r; }
    588 
    589 typedef struct OptAllocCandidate {
    590   PReg v; /* coalesce-root PReg with live ranges */
    591   u32 spill_cost;
    592   u32 live_length;
    593   u8 tied;
    594   u8 pad[3];
    595 } OptAllocCandidate;
    596 
    597 typedef struct OptAllocGroupInfo {
    598   PReg root;
    599   u32 spill_cost;
    600   u32 live_length;
    601   u32 first;
    602   u32 last;
    603   i32 tied_hard_reg;
    604   u32 forbidden_hard_regs;
    605   u32 allowed_hard_regs;
    606   u8 cls;
    607   u8 pad[3];
    608 } OptAllocGroupInfo;
    609 
    610 typedef struct OptAllocator {
    611   OptLoc* locs; /* per-PReg result (cls, hard_reg, spill_slot) */
    612 
    613   /* Per-point bitmap of locations. used_locs[p * loc_words + w] is word w
    614    * of the bitmap for compressed program point p. Bit indices:
    615    *   0 .. hard_loc_bits - 1            -> hard regs (hard_loc_bit)
    616    *   hard_loc_bits + stack_idx         -> stack slot indices */
    617   u64* used_locs;
    618   u32 point_count;
    619   u32 loc_words;     /* width of one row, in u64 words */
    620   u32 hard_loc_bits; /* OPT_REG_CLASSES * 32 */
    621 
    622   /* Stack slot table (parallel arrays). */
    623   FrameSlot* stack_slots;
    624   u32 stack_slot_count;
    625   u32 stack_slot_cap;
    626 
    627   /* hard_open[hard_loc_bit] is 1 if at least one PReg has been assigned to
    628    * this hard reg in the current function. Drives `hard_reg_alloc_score`'s
    629    * callee-save bias. */
    630   u8* hard_open;
    631 
    632   /* Scratch bitmap of loc_words. Reused per candidate. */
    633   u64* conflict_locs;
    634 
    635   /* Metrics. */
    636   u64 hard_point_visits;  /* points scanned during hard-reg conflict probe */
    637   u64 stack_point_visits; /* points scanned during stack-slot probe */
    638   u64 hard_word_ors;      /* word-OR operations into conflict_locs */
    639   u64 stack_word_ors;
    640   u64 hard_mark_points; /* points marked when assigning a hard reg */
    641   u64 stack_mark_points;
    642 } OptAllocator;
    643 
    644 /* Bitmap helpers over loc_words-wide rows of used_locs. */
    645 static u64* used_locs_row(OptAllocator* a, u32 p) {
    646   return &a->used_locs[(u64)p * a->loc_words];
    647 }
    648 
    649 static int loc_bit_in_conflict(const u64* conflict_locs, u32 bit) {
    650   return (conflict_locs[bit / 64u] & (1ull << (bit % 64u))) != 0;
    651 }
    652 
    653 static u32 alloc_loc_words_for_bits(u32 bits) { return (bits + 63u) / 64u; }
    654 
    655 static void alloc_grow_loc_words(Func* f, OptAllocator* a, u32 need_words) {
    656   if (need_words <= a->loc_words) return;
    657   u32 new_words = a->loc_words ? a->loc_words : 1u;
    658   while (new_words < need_words) new_words *= 2u;
    659   u64* nb = arena_zarray(f->arena, u64, (u64)a->point_count * new_words);
    660   if (a->used_locs && a->loc_words) {
    661     for (u32 p = 0; p < a->point_count; ++p)
    662       memcpy(&nb[(u64)p * new_words], &a->used_locs[(u64)p * a->loc_words],
    663              sizeof(u64) * a->loc_words);
    664   }
    665   a->used_locs = nb;
    666   a->loc_words = new_words;
    667   u64* nc = arena_zarray(f->arena, u64, new_words);
    668   a->conflict_locs = nc;
    669 }
    670 
    671 static u32 alloc_alloc_stack_slot(Func* f, OptAllocator* a, FrameSlot fs) {
    672   if (a->stack_slot_count == a->stack_slot_cap) {
    673     u32 ncap = a->stack_slot_cap ? a->stack_slot_cap * 2u : 16u;
    674     FrameSlot* ns = arena_array(f->arena, FrameSlot, ncap);
    675     if (a->stack_slots)
    676       memcpy(ns, a->stack_slots,
    677              sizeof(a->stack_slots[0]) * a->stack_slot_count);
    678     a->stack_slots = ns;
    679     a->stack_slot_cap = ncap;
    680   }
    681   u32 idx = a->stack_slot_count++;
    682   a->stack_slots[idx] = fs;
    683   u32 needed_bits = a->hard_loc_bits + a->stack_slot_count;
    684   u32 needed_words = alloc_loc_words_for_bits(needed_bits);
    685   alloc_grow_loc_words(f, a, needed_words);
    686   return idx;
    687 }
    688 
    689 static u32 hard_reg_alloc_score(Func* f, const OptAllocator* a,
    690                                 const OptPRegInfo* vi, Reg hr) {
    691   const CGPhysRegInfo* pi = phys_info_for(f, vi->cls, hr);
    692   u32 score = pi ? pi->spill_cost : 0;
    693   if (vi->live_across_call_freq) {
    694     if (is_caller_saved(f, vi->cls, hr))
    695       score += 1000u + vi->live_across_call_freq;
    696     else
    697       score += 20u;
    698   } else if (!is_caller_saved(f, vi->cls, hr)) {
    699     u32 bit = hard_loc_bit(vi->cls, hr);
    700     int already_open =
    701         a->hard_open && bit < a->hard_loc_bits && a->hard_open[bit];
    702     if (!already_open) score += pi ? pi->copy_cost : 50u;
    703   }
    704   /* Soft hint: tie-break toward a preferred hard reg by adding +1 to every
    705    * non-hinted choice. Used by apply_abi_aliasing_hints to put IR_CALL
    706    * results / IR_RET values into the ABI return register so emit_call /
    707    * emit_ret can elide the materialization move. Stays well below the
    708    * live-across-call (+1000) penalty and the callee-save (+20) gap so it
    709    * never overrides real costs. */
    710   if (vi->preferred_hard_reg >= 0 && (Reg)vi->preferred_hard_reg != hr)
    711     score += 1u;
    712   return score;
    713 }
    714 
    715 static int alloc_candidate_higher(const OptAllocCandidate* a,
    716                                   const OptAllocCandidate* b) {
    717   if (a->tied != b->tied) return a->tied > b->tied;
    718   if (a->spill_cost != b->spill_cost) return a->spill_cost > b->spill_cost;
    719   if (a->live_length != b->live_length) return a->live_length < b->live_length;
    720   return a->v < b->v;
    721 }
    722 
    723 static int alloc_candidate_cmp(const void* va, const void* vb) {
    724   const OptAllocCandidate* a = (const OptAllocCandidate*)va;
    725   const OptAllocCandidate* b = (const OptAllocCandidate*)vb;
    726   if (alloc_candidate_higher(a, b)) return -1;
    727   if (alloc_candidate_higher(b, a)) return 1;
    728   return 0;
    729 }
    730 
    731 static void alloc_sort_candidates(OptAllocCandidate* cands, u32 n) {
    732   if (n > 1) qsort(cands, n, sizeof(cands[0]), alloc_candidate_cmp);
    733 }
    734 
    735 static PReg alloc_coalesce_root(Func* f, PReg v) {
    736   if (!f->opt_coalesce_parent || v == PREG_NONE || v >= opt_reg_count(f))
    737     return v;
    738   PReg p = (PReg)f->opt_coalesce_parent[v];
    739   while (p != f->opt_coalesce_parent[p]) p = (PReg)f->opt_coalesce_parent[p];
    740   while (v != p) {
    741     PReg n = (PReg)f->opt_coalesce_parent[v];
    742     f->opt_coalesce_parent[v] = p;
    743     v = n;
    744   }
    745   return p;
    746 }
    747 
    748 static int alloc_group_member(Func* f, PReg root, PReg v) {
    749   return alloc_coalesce_root(f, v) == root;
    750 }
    751 
    752 static void alloc_group_info(Func* f, const OptLiveRangeSet* ranges, PReg root,
    753                              OptAllocGroupInfo* out) {
    754   memset(out, 0, sizeof *out);
    755   out->root = root;
    756   out->first = (u32)~0u;
    757   out->tied_hard_reg = -1;
    758   out->cls = f->preg_info[root].cls;
    759   for (PReg v = 1; v < opt_reg_count(f); ++v) {
    760     if (ranges->first_range_by_preg[v] == OPT_RANGE_NONE) continue;
    761     if (!alloc_group_member(f, root, v)) continue;
    762     OptPRegInfo* vi = &f->preg_info[v];
    763     out->spill_cost += vi->frequency ? vi->frequency : vi->spill_cost;
    764     out->live_length += vi->live_length;
    765     u32 first = vi->first_pos ? vi->first_pos - 1u : 0;
    766     if (first < out->first) out->first = first;
    767     if (vi->last_pos > out->last) out->last = vi->last_pos;
    768     out->forbidden_hard_regs |= vi->forbidden_hard_regs;
    769     if (vi->allowed_hard_regs) {
    770       if (out->allowed_hard_regs) {
    771         u32 both = out->allowed_hard_regs & vi->allowed_hard_regs;
    772         if (!both) {
    773           SrcLoc loc = {0, 0, 0};
    774           compiler_panic(f->c, loc,
    775                          "opt asm: conflicting allowed register sets");
    776         }
    777         out->allowed_hard_regs = both;
    778       } else {
    779         out->allowed_hard_regs = vi->allowed_hard_regs;
    780       }
    781     }
    782     if (vi->tied_hard_reg >= 0) out->tied_hard_reg = vi->tied_hard_reg;
    783   }
    784   if (out->first == (u32)~0u) out->first = 0;
    785 }
    786 
    787 static void opt_init_preg_info_from_ranges(Func* f,
    788                                            const OptLiveRangeSet* ranges) {
    789   OptPRegInfo* old = f->preg_info;
    790   OptPRegInfo* info = arena_zarray(f->arena, OptPRegInfo,
    791                                    opt_reg_count(f) ? opt_reg_count(f) : 1u);
    792   for (PReg v = 0; v < opt_reg_count(f); ++v) {
    793     i32 tied = old ? old[v].tied_hard_reg : -1;
    794     u32 forbidden = old ? old[v].forbidden_hard_regs : 0;
    795     u32 allowed = old ? old[v].allowed_hard_regs : 0;
    796     u32 clobbered = old ? old[v].clobbered_hard_regs : 0;
    797     u32 old_frequency = old ? old[v].frequency : 0;
    798     i8 pref = old ? old[v].preferred_hard_reg : (i8)-1;
    799     OptPRegInfo* vi = &info[v];
    800     vi->tied_hard_reg = tied;
    801     vi->preferred_hard_reg = pref;
    802     vi->hard_reg = REG_NONE;
    803     vi->spill_slot = FRAME_SLOT_NONE;
    804     vi->alloc_kind = OPT_ALLOC_NONE;
    805     vi->cls = opt_reg_cls(f, v);
    806     vi->forbidden_hard_regs = forbidden;
    807     vi->allowed_hard_regs = allowed;
    808     vi->clobbered_hard_regs = clobbered;
    809     if (!ranges || v == PREG_NONE || v == 0 ||
    810         ranges->first_range_by_preg[v] == OPT_RANGE_NONE) {
    811       continue;
    812     }
    813     u32 first = (u32)~0u;
    814     u32 last = 0;
    815     for (u32 r = ranges->first_range_by_preg[v]; r != OPT_RANGE_NONE;
    816          r = ranges->ranges[r].next) {
    817       const OptLiveRange* lr = &ranges->ranges[r];
    818       if (lr->start < first) first = lr->start;
    819       if (lr->end > last) last = lr->end;
    820     }
    821     vi->first_pos = first == (u32)~0u ? 0 : first + 1u;
    822     vi->last_pos = last;
    823     vi->live_length = ranges->live_length_by_preg[v];
    824     vi->use_freq = ranges->use_freq_by_preg[v];
    825     vi->def_freq = ranges->def_freq_by_preg[v];
    826     vi->live_block_freq = ranges->live_block_freq_by_preg[v];
    827     vi->live_across_call_freq = ranges->live_across_call_freq_by_preg[v];
    828     vi->spill_cost = ranges->spill_cost_by_preg[v];
    829     vi->frequency = vi->spill_cost;
    830     if (old_frequency > vi->frequency) vi->frequency = old_frequency;
    831   }
    832   f->preg_info = info;
    833 }
    834 
    835 static void bits_clear(u64* bits, u32 words) {
    836   for (u32 i = 0; i < words; ++i) bits[i] = 0;
    837 }
    838 
    839 static void live_update_before(u64* live, const u64* use, const u64* def,
    840                                u32 words) {
    841   for (u32 w = 0; w < words; ++w) live[w] = (live[w] & ~def[w]) | use[w];
    842 }
    843 
    844 static void live_copy_block_out(Func* f, const OptLiveInfo* live_info, u32 b,
    845                                 u64* live, u32 words) {
    846   (void)f;
    847   bits_clear(live, words);
    848   if (live_info) {
    849     const OptBitset* out = &live_info->blocks[b].live_out;
    850     for (u32 w = 0; w < words && w < out->nwords; ++w) live[w] = out->words[w];
    851   }
    852 }
    853 
    854 static u32 live_copy_block_out_active(const OptLiveInfo* live_info, u32 b,
    855                                       u64* live, u32 words,
    856                                       u32 old_active_words) {
    857   for (u32 w = 0; w < old_active_words; ++w) live[w] = 0;
    858   if (!live_info) return 0;
    859   const OptBitset* out = &live_info->blocks[b].live_out;
    860   u32 active = words < out->active_words ? words : out->active_words;
    861   for (u32 w = 0; w < active; ++w) live[w] = out->words[w];
    862   return active;
    863 }
    864 
    865 static void opt_apply_asm_constraints_from_live(Func* f,
    866                                                 const OptLiveInfo* live_info) {
    867   int has_asm = 0;
    868   for (u32 b = 0; b < f->nblocks && !has_asm; ++b) {
    869     Block* bl = &f->blocks[b];
    870     for (u32 i = 0; i < bl->ninsts; ++i) {
    871       if ((IROp)bl->insts[i].op == IR_ASM_BLOCK) {
    872         has_asm = 1;
    873         break;
    874       }
    875     }
    876   }
    877   /* The live walk drives both inline-asm operand constraints and
    878    * per-instruction fixed-register clobbers (Func.inst_clobbers); run it if
    879    * either is present. */
    880   if (!has_asm && !f->inst_clobbers) return;
    881 
    882   u32 words = live_info ? live_info->words : f->opt_live_words;
    883   if (!words) words = bit_words(opt_reg_count(f));
    884   f->opt_live_words = (u16)words;
    885 
    886   u64* live = arena_zarray(f->arena, u64, words ? words : 1u);
    887   u64* use = arena_zarray(f->arena, u64, words ? words : 1u);
    888   u64* def = arena_zarray(f->arena, u64, words ? words : 1u);
    889   for (u32 b = 0; b < f->nblocks; ++b) {
    890     Block* bl = &f->blocks[b];
    891     live_copy_block_out(f, live_info, b, live, words);
    892     for (u32 ri = bl->ninsts; ri > 0; --ri) {
    893       u32 i = ri - 1u;
    894       Inst* in = &bl->insts[i];
    895       bits_clear(use, words);
    896       bits_clear(def, words);
    897       BitsCtx bc = {use, def};
    898       opt_walk_inst_operands(f, in, collect_bits, &bc);
    899       if ((IROp)in->op == IR_ASM_BLOCK)
    900         apply_asm_register_constraints(f, in, use, def, live);
    901       apply_machine_reg_clobbers(f, in, def, live);
    902       live_update_before(live, use, def, words);
    903     }
    904   }
    905 }
    906 
    907 static int spill_slot_compatible(Func* f, FrameSlot fs, PReg v) {
    908   if (fs == FRAME_SLOT_NONE || fs > f->nframe_slots) return 0;
    909   IRFrameSlot* s = &f->frame_slots[fs - 1u];
    910   u32 size = type_size_fallback(f, opt_reg_type(f, v));
    911   u32 align = size >= 8 ? 8 : size;
    912   if (s->kind != FS_SPILL) return 0;
    913   if (s->size < size) return 0;
    914   if (s->align < align) return 0;
    915   return 1;
    916 }
    917 
    918 /* Compute conflict_locs = union of used_locs[j] for j in every live range
    919  * point of every PReg in `root`'s coalesce group. */
    920 static void alloc_compute_group_conflicts(Func* f, OptAllocator* a,
    921                                           const OptLiveRangeSet* ranges,
    922                                           PReg root) {
    923   for (u32 w = 0; w < a->loc_words; ++w) a->conflict_locs[w] = 0;
    924   for (PReg v = 1; v < opt_reg_count(f); ++v) {
    925     if (ranges->first_range_by_preg[v] == OPT_RANGE_NONE) continue;
    926     if (!alloc_group_member(f, root, v)) continue;
    927     for (u32 ri = ranges->first_range_by_preg[v]; ri != OPT_RANGE_NONE;
    928          ri = ranges->ranges[ri].next) {
    929       const OptLiveRange* lr = &ranges->ranges[ri];
    930       u32 end = lr->end < a->point_count ? lr->end : a->point_count;
    931       for (u32 j = lr->start; j < end; ++j) {
    932         ++a->hard_point_visits;
    933         const u64* row = used_locs_row(a, j);
    934         for (u32 w = 0; w < a->loc_words; ++w) {
    935           a->conflict_locs[w] |= row[w];
    936           ++a->hard_word_ors;
    937         }
    938       }
    939     }
    940   }
    941 }
    942 
    943 /* Mark `loc_bit` as occupied at every point covered by `root`'s group's
    944  * live ranges. */
    945 static void alloc_mark_group_loc(Func* f, OptAllocator* a,
    946                                  const OptLiveRangeSet* ranges, PReg root,
    947                                  u32 loc_bit) {
    948   u32 w = loc_bit / 64u;
    949   u64 mask = 1ull << (loc_bit % 64u);
    950   for (PReg v = 1; v < opt_reg_count(f); ++v) {
    951     if (ranges->first_range_by_preg[v] == OPT_RANGE_NONE) continue;
    952     if (!alloc_group_member(f, root, v)) continue;
    953     for (u32 ri = ranges->first_range_by_preg[v]; ri != OPT_RANGE_NONE;
    954          ri = ranges->ranges[ri].next) {
    955       const OptLiveRange* lr = &ranges->ranges[ri];
    956       u32 end = lr->end < a->point_count ? lr->end : a->point_count;
    957       for (u32 j = lr->start; j < end; ++j) {
    958         ++a->hard_mark_points;
    959         used_locs_row(a, j)[w] |= mask;
    960       }
    961     }
    962   }
    963 }
    964 
    965 static void alloc_assign_group_hard(Func* f, OptAllocator* a,
    966                                     const OptLiveRangeSet* ranges, PReg root,
    967                                     Reg r) {
    968   u8 cls = f->preg_info[root].cls;
    969   u32 bit = hard_loc_bit(cls, r);
    970   for (PReg v = 1; v < opt_reg_count(f); ++v) {
    971     if (ranges->first_range_by_preg[v] == OPT_RANGE_NONE) continue;
    972     if (!alloc_group_member(f, root, v)) continue;
    973     OptPRegInfo* vi = &f->preg_info[v];
    974     vi->alloc_kind = OPT_ALLOC_HARD;
    975     vi->hard_reg = r;
    976     a->locs[v].kind = OPT_LOC_HARD;
    977     a->locs[v].cls = vi->cls;
    978     a->locs[v].hard_reg = r;
    979     a->locs[v].spill_slot = FRAME_SLOT_NONE;
    980   }
    981   alloc_mark_group_loc(f, a, ranges, root, bit);
    982   if (bit < a->hard_loc_bits) a->hard_open[bit] = 1;
    983 }
    984 
    985 static void alloc_assign_group_stack(Func* f, OptAllocator* a,
    986                                      const OptLiveRangeSet* ranges, PReg root) {
    987   /* Try to reuse an existing stack slot whose bit is clear in conflict_locs
    988    * and whose frame slot is compatible. The conflict_locs scratch must
    989    * already be populated for `root` by the caller. */
    990   u32 stack_idx = (u32)~0u;
    991   for (u32 k = 0; k < a->stack_slot_count; ++k) {
    992     u32 bit = a->hard_loc_bits + k;
    993     if (loc_bit_in_conflict(a->conflict_locs, bit)) continue;
    994     if (!spill_slot_compatible(f, a->stack_slots[k], root)) continue;
    995     stack_idx = k;
    996     break;
    997   }
    998   if (stack_idx == (u32)~0u) {
    999     FrameSlot fs = spill_slot_for(f, root);
   1000     stack_idx = alloc_alloc_stack_slot(f, a, fs);
   1001     /* alloc_alloc_stack_slot may have widened a->loc_words: refresh
   1002      * conflict_locs (callers don't reuse it after this). */
   1003   }
   1004   FrameSlot slot = a->stack_slots[stack_idx];
   1005   for (PReg v = 1; v < opt_reg_count(f); ++v) {
   1006     if (ranges->first_range_by_preg[v] == OPT_RANGE_NONE) continue;
   1007     if (!alloc_group_member(f, root, v)) continue;
   1008     OptPRegInfo* vi = &f->preg_info[v];
   1009     vi->spill_slot = slot;
   1010     vi->alloc_kind = OPT_ALLOC_SPILL;
   1011     a->locs[v].kind = OPT_LOC_STACK;
   1012     a->locs[v].cls = vi->cls;
   1013     a->locs[v].hard_reg = REG_NONE;
   1014     a->locs[v].spill_slot = slot;
   1015   }
   1016   u32 bit = a->hard_loc_bits + stack_idx;
   1017   u32 w = bit / 64u;
   1018   u64 mask = 1ull << (bit % 64u);
   1019   for (PReg v = 1; v < opt_reg_count(f); ++v) {
   1020     if (ranges->first_range_by_preg[v] == OPT_RANGE_NONE) continue;
   1021     if (!alloc_group_member(f, root, v)) continue;
   1022     for (u32 ri = ranges->first_range_by_preg[v]; ri != OPT_RANGE_NONE;
   1023          ri = ranges->ranges[ri].next) {
   1024       const OptLiveRange* lr = &ranges->ranges[ri];
   1025       u32 end = lr->end < a->point_count ? lr->end : a->point_count;
   1026       for (u32 j = lr->start; j < end; ++j) {
   1027         ++a->stack_mark_points;
   1028         used_locs_row(a, j)[w] |= mask;
   1029       }
   1030     }
   1031   }
   1032 }
   1033 
   1034 static int alloc_group_conflicts_bit(const OptAllocator* a, u32 bit) {
   1035   if (bit / 64u >= a->loc_words) return 1;
   1036   return loc_bit_in_conflict(a->conflict_locs, bit);
   1037 }
   1038 
   1039 static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges,
   1040                               OptAllocator* a, int allow_live_range_split) {
   1041   (void)allow_live_range_split; /* live-range splitting deferred per
   1042                                    doc/plan/OPTIMIZER.md; the parameter is
   1043                                    passed through for ABI compatibility. */
   1044   memset(a, 0, sizeof *a);
   1045   a->point_count = ranges->point_count ? ranges->point_count : 1u;
   1046   a->hard_loc_bits = OPT_REG_CLASSES * 32u;
   1047   a->loc_words = alloc_loc_words_for_bits(a->hard_loc_bits);
   1048   a->used_locs =
   1049       arena_zarray(f->arena, u64, (u64)a->point_count * a->loc_words);
   1050   a->conflict_locs = arena_zarray(f->arena, u64, a->loc_words);
   1051   a->locs =
   1052       arena_zarray(f->arena, OptLoc, opt_reg_count(f) ? opt_reg_count(f) : 1u);
   1053   a->hard_open = arena_zarray(f->arena, u8, a->hard_loc_bits);
   1054   a->stack_slots = NULL;
   1055   a->stack_slot_count = 0;
   1056   a->stack_slot_cap = 0;
   1057 
   1058   /* Build candidate list: every coalesce-root PReg that has live ranges. */
   1059   u32 ncands = 0;
   1060   for (PReg v = 1; v < opt_reg_count(f); ++v) {
   1061     if (ranges->first_range_by_preg[v] == OPT_RANGE_NONE) continue;
   1062     if (alloc_coalesce_root(f, v) != v) continue;
   1063     ++ncands;
   1064   }
   1065   OptAllocCandidate* cands =
   1066       arena_array(f->arena, OptAllocCandidate, ncands ? ncands : 1u);
   1067   u32 n = 0;
   1068   for (PReg v = 1; v < opt_reg_count(f); ++v) {
   1069     if (ranges->first_range_by_preg[v] == OPT_RANGE_NONE) continue;
   1070     PReg root = alloc_coalesce_root(f, v);
   1071     if (root != v) continue;
   1072     OptAllocGroupInfo gi;
   1073     alloc_group_info(f, ranges, root, &gi);
   1074     cands[n].v = v;
   1075     cands[n].spill_cost = gi.spill_cost;
   1076     cands[n].live_length = gi.live_length;
   1077     cands[n].tied = gi.tied_hard_reg >= 0;
   1078     memset(cands[n].pad, 0, sizeof cands[n].pad);
   1079     ++n;
   1080   }
   1081   alloc_sort_candidates(cands, n);
   1082 
   1083   for (u32 i = 0; i < n; ++i) {
   1084     PReg v = cands[i].v;
   1085     OptAllocGroupInfo gi;
   1086     alloc_group_info(f, ranges, v, &gi);
   1087     OptPRegInfo* vi = &f->preg_info[v];
   1088     u8 cls = gi.cls;
   1089     alloc_compute_group_conflicts(f, a, ranges, v);
   1090 
   1091     if (gi.tied_hard_reg >= 0) {
   1092       Reg fixed = (Reg)gi.tied_hard_reg;
   1093       /* Machinize has already validated inline-asm hard-register pins against
   1094        * the target's operand-register policy. Some legal pins are ABI registers
   1095        * outside the standard allocable set (aa64 x0, rv64 a7), so the allocator
   1096        * accepts validated physical registers here and relies on the
   1097        * conflict/clobber checks below for placement correctness. */
   1098       if (!hard_available(f, cls, fixed) && !phys_info_for(f, cls, fixed)) {
   1099         SrcLoc loc = {0, 0, 0};
   1100         compiler_panic(
   1101             f->c, loc,
   1102             "opt regalloc: fixed hard reg %u unavailable in class %u",
   1103             (unsigned)fixed, (unsigned)cls);
   1104       }
   1105       if (fixed < 32 && (gi.forbidden_hard_regs & (1u << fixed))) {
   1106         SrcLoc loc = {0, 0, 0};
   1107         compiler_panic(f->c, loc,
   1108                        "opt regalloc: fixed hard reg %u is clobbered",
   1109                        (unsigned)fixed);
   1110       }
   1111       if (fixed >= 32 ||
   1112           (gi.allowed_hard_regs &&
   1113            (gi.allowed_hard_regs & (1u << fixed)) == 0)) {
   1114         SrcLoc loc = {0, 0, 0};
   1115         compiler_panic(f->c, loc,
   1116                        "opt regalloc: fixed hard reg %u violates asm "
   1117                        "constraint",
   1118                        (unsigned)fixed);
   1119       }
   1120       u32 bit = hard_loc_bit(cls, fixed);
   1121       if (fixed >= 32 || alloc_group_conflicts_bit(a, bit)) {
   1122         SrcLoc loc = {0, 0, 0};
   1123         compiler_panic(f->c, loc, "opt regalloc: conflicting fixed hard reg %u",
   1124                        (unsigned)fixed);
   1125       }
   1126       alloc_assign_group_hard(f, a, ranges, v, fixed);
   1127       continue;
   1128     }
   1129 
   1130     int found = 0;
   1131     Reg best = REG_NONE;
   1132     u32 best_score = 0xffffffffu;
   1133     for (u32 r = 0; r < f->opt_hard_reg_count[cls]; ++r) {
   1134       Reg hr = f->opt_hard_regs[cls][r];
   1135       if (hr >= 32) continue;
   1136       if (gi.allowed_hard_regs && (gi.allowed_hard_regs & (1u << hr)) == 0)
   1137         continue;
   1138       if (gi.forbidden_hard_regs & (1u << hr)) continue;
   1139       u32 bit = hard_loc_bit(cls, hr);
   1140       if (alloc_group_conflicts_bit(a, bit)) continue;
   1141       u32 score = hard_reg_alloc_score(f, a, vi, hr);
   1142       if (!found || score < best_score) {
   1143         found = 1;
   1144         best = hr;
   1145         best_score = score;
   1146       }
   1147     }
   1148     if (gi.allowed_hard_regs) {
   1149       for (Reg hr = 0; hr < 32; ++hr) {
   1150         const CGPhysRegInfo* pi;
   1151         if ((gi.allowed_hard_regs & (1u << hr)) == 0) continue;
   1152         if (hard_available(f, cls, hr)) continue;
   1153         if (gi.forbidden_hard_regs & (1u << hr)) continue;
   1154         pi = phys_info_for(f, cls, hr);
   1155         if (!pi || (pi->flags & CG_REG_RESERVED)) continue;
   1156         u32 bit = hard_loc_bit(cls, hr);
   1157         if (alloc_group_conflicts_bit(a, bit)) continue;
   1158         u32 score = hard_reg_alloc_score(f, a, vi, hr);
   1159         if (!found || score < best_score) {
   1160           found = 1;
   1161           best = hr;
   1162           best_score = score;
   1163         }
   1164       }
   1165     }
   1166     /* Also consider the preferred hard reg if it's outside the standard
   1167      * allocable set (e.g. x0 on aa64: reserved as the ABI ret reg, not in
   1168      * aa_int_allocable). Used by apply_abi_aliasing_hints to let an IR_CALL
   1169      * result PReg or IR_RET value PReg land directly in x0, eliding the
   1170      * materialization move emit_call/emit_ret would otherwise emit. Only
   1171      * short-lived PRegs benefit: a value live across a call cannot survive in
   1172      * a caller-saved hint reg, and this is the *only* path that can reach an
   1173      * out-of-allocable-set reg, so guard it explicitly. The +1000 caller-save
   1174      * penalty in hard_reg_alloc_score only deflects the hint when a cheaper
   1175      * reg is found; under high register pressure (found == 0) the fallback
   1176      * below would otherwise take the hint reg regardless of score, parking a
   1177      * cross-call value in x0 where it collides with the next call's result. */
   1178     if (vi->preferred_hard_reg >= 0 &&
   1179         !(vi->live_across_call_freq &&
   1180           is_caller_saved(f, cls, (Reg)vi->preferred_hard_reg))) {
   1181       Reg hint = (Reg)vi->preferred_hard_reg;
   1182       int already_tried = 0;
   1183       if (hint < 32 &&
   1184           (!gi.allowed_hard_regs || (gi.allowed_hard_regs & (1u << hint)))) {
   1185         for (u32 r = 0; r < f->opt_hard_reg_count[cls]; ++r) {
   1186           if (f->opt_hard_regs[cls][r] == hint) {
   1187             already_tried = 1;
   1188             break;
   1189           }
   1190         }
   1191         if (!already_tried && !(gi.forbidden_hard_regs & (1u << hint))) {
   1192           u32 bit = hard_loc_bit(cls, hint);
   1193           int hint_safe = !alloc_group_conflicts_bit(a, bit);
   1194           /* The bitmap conflict can be falsely positive when an
   1195            * already-assigned PReg ends exactly where v begins — the
   1196            * swap-friendly pattern like `sub x0, x21, x0`, where the previous
   1197            * call's result occupies x0 and the sub both reads it and writes
   1198            * the new value. Fall back to a precise per-PReg interference check
   1199            * that allows the unit-length overlap (same rule used by
   1200            * opt_coalesce_ranges for moves). */
   1201           if (!hint_safe) {
   1202             int real_conflict = 0;
   1203             for (PReg u = 1; u < opt_reg_count(f); ++u) {
   1204               if (u == v) continue;
   1205               const OptPRegInfo* ui = &f->preg_info[u];
   1206               if (ui->alloc_kind != OPT_ALLOC_HARD) continue;
   1207               if (ui->hard_reg != hint) continue;
   1208               if (opt_ranges_overlap_kind(ranges, u, v) >= 2) {
   1209                 real_conflict = 1;
   1210                 break;
   1211               }
   1212             }
   1213             if (!real_conflict) hint_safe = 1;
   1214           }
   1215           if (hint_safe) {
   1216             u32 score = hard_reg_alloc_score(f, a, vi, hint);
   1217             if (!found || score < best_score) {
   1218               found = 1;
   1219               best = hint;
   1220               best_score = score;
   1221             }
   1222           }
   1223         }
   1224       }
   1225     }
   1226     if (found) {
   1227       alloc_assign_group_hard(f, a, ranges, v, best);
   1228     } else if (gi.allowed_hard_regs) {
   1229       SrcLoc loc = {0, 0, 0};
   1230       compiler_panic(f->c, loc,
   1231                      "opt regalloc: no hard register satisfies asm constraint");
   1232     } else {
   1233       alloc_assign_group_stack(f, a, ranges, v);
   1234     }
   1235   }
   1236 
   1237   /* Report storage metrics in u64 words (used_locs is the only bitmap). */
   1238   u32 total_words = a->point_count * a->loc_words;
   1239   u32 hard_words = alloc_loc_words_for_bits(a->hard_loc_bits) * a->point_count;
   1240   if (hard_words > total_words) hard_words = total_words;
   1241   f->opt_alloc_hard_loc_words = hard_words;
   1242   f->opt_alloc_stack_loc_words = total_words - hard_words;
   1243   f->opt_alloc_stack_slots = a->stack_slot_count;
   1244   f->opt_used_loc_words = total_words;
   1245   f->opt_alloc_hard_point_visits = a->hard_point_visits;
   1246   f->opt_alloc_stack_point_visits = a->stack_point_visits;
   1247   f->opt_alloc_hard_word_ors = a->hard_word_ors;
   1248   f->opt_alloc_stack_word_ors = a->stack_word_ors;
   1249   f->opt_alloc_hard_mark_points = a->hard_mark_points;
   1250   f->opt_alloc_stack_mark_points = a->stack_mark_points;
   1251   f->preg_locs = a->locs;
   1252 }
   1253 
   1254 typedef struct RewriteList {
   1255   Inst* data;
   1256   u32 n;
   1257   u32 cap;
   1258 } RewriteList;
   1259 
   1260 typedef enum EdgeMatKind {
   1261   EDGE_MAT_STORE,
   1262   EDGE_MAT_RELOAD,
   1263 } EdgeMatKind;
   1264 
   1265 typedef struct EdgeMat {
   1266   u32 pred;
   1267   u32 succ;
   1268   PReg v;
   1269   Reg hard_reg;
   1270   u8 kind;
   1271   u8 pad[3];
   1272 } EdgeMat;
   1273 
   1274 typedef struct EdgeMatPlan {
   1275   EdgeMat* mats;
   1276   u32 n;
   1277   u32 cap;
   1278 } EdgeMatPlan;
   1279 
   1280 typedef struct RewriteOut {
   1281   Inst* data;
   1282   u32 cap;
   1283   u32 start;
   1284 } RewriteOut;
   1285 
   1286 static Inst* list_push(Func* f, RewriteList* l, IROp op) {
   1287   if (l->n == l->cap) {
   1288     u32 ncap = l->cap ? l->cap * 2u : 16u;
   1289     Inst* nb = arena_zarray(f->arena, Inst, ncap);
   1290     if (l->data) memcpy(nb, l->data, sizeof(Inst) * l->n);
   1291     l->data = nb;
   1292     l->cap = ncap;
   1293   }
   1294   Inst* in = &l->data[l->n++];
   1295   memset(in, 0, sizeof *in);
   1296   in->op = (u16)op;
   1297   ir_assign_inst_id(f, in);
   1298   ++f->opt_rewrite_inserted_insts;
   1299   return in;
   1300 }
   1301 
   1302 static void list_reset(RewriteList* l) {
   1303   if (l) l->n = 0;
   1304 }
   1305 
   1306 static void out_init(Func* f, RewriteOut* out, u32 cap) {
   1307   out->cap = cap ? cap : 16u;
   1308   out->data = arena_zarray(f->arena, Inst, out->cap);
   1309   out->start = out->cap;
   1310 }
   1311 
   1312 static Inst* out_push_front(Func* f, RewriteOut* out, IROp op) {
   1313   if (out->start == 0) {
   1314     u32 n = out->cap;
   1315     u32 ncap = out->cap ? out->cap * 2u : 16u;
   1316     Inst* nb = arena_zarray(f->arena, Inst, ncap);
   1317     if (out->data && n)
   1318       memcpy(nb + (ncap - n), out->data + out->start, sizeof(Inst) * n);
   1319     out->data = nb;
   1320     out->cap = ncap;
   1321     out->start = ncap - n;
   1322   }
   1323   Inst* in = &out->data[--out->start];
   1324   memset(in, 0, sizeof *in);
   1325   in->op = (u16)op;
   1326   ir_assign_inst_id(f, in);
   1327   return in;
   1328 }
   1329 
   1330 static void out_prepend_list_reverse(Func* f, RewriteOut* out,
   1331                                      const RewriteList* list) {
   1332   for (u32 i = list->n; i > 0; --i) {
   1333     Inst* dst = out_push_front(f, out, (IROp)list->data[i - 1u].op);
   1334     *dst = list->data[i - 1u];
   1335   }
   1336 }
   1337 
   1338 static void out_prepend_inst(Func* f, RewriteOut* out, const Inst* in) {
   1339   Inst* dst = out_push_front(f, out, (IROp)in->op);
   1340   *dst = *in;
   1341 }
   1342 
   1343 static MemAccess spill_mem(Func* f, PReg v) {
   1344   MemAccess m;
   1345   memset(&m, 0, sizeof m);
   1346   m.type = opt_reg_type(f, v);
   1347   m.size = type_size_fallback(f, opt_reg_type(f, v));
   1348   m.align = m.size >= 8 ? 8 : m.size;
   1349   m.alias.kind = ALIAS_LOCAL;
   1350   m.alias.v.local_id = (i32)spill_slot_for(f, v);
   1351   return m;
   1352 }
   1353 
   1354 static Operand spill_addr(Func* f, PReg v) {
   1355   Operand o;
   1356   memset(&o, 0, sizeof o);
   1357   o.kind = OPK_LOCAL;
   1358   o.cls = RC_INT;
   1359   o.type = opt_reg_type(f, v);
   1360   o.v.frame_slot = spill_slot_for(f, v);
   1361   return o;
   1362 }
   1363 
   1364 static Operand hard_operand(Func* f, PReg v) {
   1365   Operand o;
   1366   memset(&o, 0, sizeof o);
   1367   o.kind = OPK_REG;
   1368   o.cls = opt_preg_loc_cls(f, v);
   1369   o.type = opt_reg_type(f, v);
   1370   o.v.reg = opt_preg_hard_reg(f, v);
   1371   return o;
   1372 }
   1373 
   1374 static Operand hard_operand_reg(Func* f, PReg v, Reg hard_reg) {
   1375   Operand o;
   1376   memset(&o, 0, sizeof o);
   1377   o.kind = OPK_REG;
   1378   o.cls = opt_preg_loc_cls(f, v);
   1379   o.type = opt_reg_type(f, v);
   1380   o.v.reg = hard_reg;
   1381   return o;
   1382 }
   1383 
   1384 static void append_store_preg(Func* f, RewriteList* out, PReg v) {
   1385   Inst* st = list_push(f, out, IR_STORE);
   1386   st->opnds = arena_array(f->arena, Operand, 2);
   1387   st->opnds[0] = spill_addr(f, v);
   1388   st->opnds[1] = hard_operand(f, v);
   1389   st->nopnds = 2;
   1390   st->extra.mem = spill_mem(f, v);
   1391 }
   1392 
   1393 static void append_load_preg(Func* f, RewriteList* out, PReg v) {
   1394   Inst* ld = list_push(f, out, IR_LOAD);
   1395   ld->opnds = arena_array(f->arena, Operand, 2);
   1396   ld->opnds[0] = hard_operand(f, v);
   1397   ld->opnds[1] = spill_addr(f, v);
   1398   ld->nopnds = 2;
   1399   ld->extra.mem = spill_mem(f, v);
   1400 }
   1401 
   1402 static void append_store_preg_hard(Func* f, RewriteList* out, PReg v,
   1403                                    Reg hard_reg) {
   1404   Inst* st = list_push(f, out, IR_STORE);
   1405   st->opnds = arena_array(f->arena, Operand, 2);
   1406   st->opnds[0] = spill_addr(f, v);
   1407   st->opnds[1] = hard_operand_reg(f, v, hard_reg);
   1408   st->nopnds = 2;
   1409   st->extra.mem = spill_mem(f, v);
   1410 }
   1411 
   1412 static void append_load_preg_hard(Func* f, RewriteList* out, PReg v,
   1413                                   Reg hard_reg) {
   1414   Inst* ld = list_push(f, out, IR_LOAD);
   1415   ld->opnds = arena_array(f->arena, Operand, 2);
   1416   ld->opnds[0] = hard_operand_reg(f, v, hard_reg);
   1417   ld->opnds[1] = spill_addr(f, v);
   1418   ld->nopnds = 2;
   1419   ld->extra.mem = spill_mem(f, v);
   1420 }
   1421 
   1422 static Reg scratch_for(Func* f, u8 cls, u32* next) {
   1423   u32 n = f->opt_scratch_reg_count[cls];
   1424   if (!n) return REG_NONE;
   1425   Reg r = f->opt_scratch_regs[cls][*next % n];
   1426   ++*next;
   1427   return r;
   1428 }
   1429 
   1430 typedef struct RewriteCtx {
   1431   RewriteList* before;
   1432   RewriteList* after;
   1433   u32 next_scratch[OPT_REG_CLASSES];
   1434   u32 raw_point;
   1435 } RewriteCtx;
   1436 
   1437 static const OptAllocSegment* split_segment_at(Func* f, PReg v, u32 raw_point) {
   1438   if (!f->opt_first_segment_by_preg || v >= opt_reg_count(f)) return NULL;
   1439   for (u32 si = f->opt_first_segment_by_preg[v]; si != OPT_RANGE_NONE;
   1440        si = f->opt_alloc_segments[si].next) {
   1441     const OptAllocSegment* s = &f->opt_alloc_segments[si];
   1442     if (s->start <= raw_point && raw_point < s->end) return s;
   1443   }
   1444   return NULL;
   1445 }
   1446 
   1447 static void rewrite_one_operand(Func* f, Inst* owner, Operand* op, int is_def,
   1448                                 void* arg) {
   1449   RewriteCtx* c = (RewriteCtx*)arg;
   1450   if (op->kind != OPK_REG) return;
   1451   PReg v = (PReg)op->v.reg;
   1452   if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return;
   1453   u8 alloc_kind = opt_preg_alloc_kind(f, v);
   1454   if (alloc_kind == OPT_ALLOC_HARD) {
   1455     op->v.reg = opt_preg_hard_reg(f, v);
   1456     return;
   1457   }
   1458   if (alloc_kind == OPT_ALLOC_SPLIT) {
   1459     const OptAllocSegment* seg = split_segment_at(f, v, c->raw_point);
   1460     if (seg && seg->loc_kind == OPT_LOC_HARD) {
   1461       op->v.reg = seg->hard_reg;
   1462       return;
   1463     }
   1464   } else if (alloc_kind != OPT_ALLOC_SPILL) {
   1465     return;
   1466   }
   1467   u8 cls = opt_preg_loc_cls(f, v);
   1468   Reg scratch = scratch_for(f, cls, &c->next_scratch[cls]);
   1469   if (scratch == (Reg)REG_NONE) {
   1470     SrcLoc loc = owner ? owner->loc : (SrcLoc){0, 0, 0};
   1471     compiler_panic(f->c, loc,
   1472                    "opt rewrite: no scratch register for spilled class %u",
   1473                    (unsigned)cls);
   1474   }
   1475   op->v.reg = scratch;
   1476   if (!is_def) {
   1477     Inst* ld = list_push(f, c->before, IR_LOAD);
   1478     ld->opnds = arena_array(f->arena, Operand, 2);
   1479     ld->opnds[0] = *op;
   1480     ld->opnds[1] = spill_addr(f, v);
   1481     ld->nopnds = 2;
   1482     ld->extra.mem = spill_mem(f, v);
   1483   } else {
   1484     Inst* st = list_push(f, c->after, IR_STORE);
   1485     st->opnds = arena_array(f->arena, Operand, 2);
   1486     st->opnds[0] = spill_addr(f, v);
   1487     st->opnds[1] = *op;
   1488     st->nopnds = 2;
   1489     st->extra.mem = spill_mem(f, v);
   1490   }
   1491 }
   1492 
   1493 static void rewrite_call_arg_operand(Func* f, Operand* op) {
   1494   if (!op || op->kind != OPK_REG) return;
   1495   PReg v = (PReg)op->v.reg;
   1496   if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return;
   1497   u8 alloc_kind = opt_preg_alloc_kind(f, v);
   1498   if (alloc_kind == OPT_ALLOC_HARD) {
   1499     op->v.reg = opt_preg_hard_reg(f, v);
   1500   } else if (alloc_kind == OPT_ALLOC_SPILL || alloc_kind == OPT_ALLOC_SPLIT) {
   1501     *op = spill_addr(f, v);
   1502   }
   1503 }
   1504 
   1505 static void rewrite_store_value_operand(Func* f, Inst* owner, Operand* op,
   1506                                         RewriteCtx* ctx) {
   1507   PReg v;
   1508   u8 alloc_kind;
   1509   const OptAllocSegment* seg;
   1510   if (!op || op->kind != OPK_REG) return;
   1511   v = (PReg)op->v.reg;
   1512   if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return;
   1513   alloc_kind = opt_preg_alloc_kind(f, v);
   1514   if (alloc_kind == OPT_ALLOC_HARD) {
   1515     op->v.reg = opt_preg_hard_reg(f, v);
   1516     return;
   1517   }
   1518   if (alloc_kind == OPT_ALLOC_SPLIT) {
   1519     seg = split_segment_at(f, v, ctx->raw_point);
   1520     if (seg && seg->loc_kind == OPT_LOC_HARD) {
   1521       op->v.reg = seg->hard_reg;
   1522       return;
   1523     }
   1524     *op = spill_addr(f, v);
   1525     return;
   1526   }
   1527   if (alloc_kind == OPT_ALLOC_SPILL) {
   1528     *op = spill_addr(f, v);
   1529     return;
   1530   }
   1531   rewrite_one_operand(f, owner, op, 0, ctx);
   1532 }
   1533 
   1534 static void rewrite_call_arg_indirect_base(Func* f, Inst* owner, Operand* op,
   1535                                            RewriteCtx* ctx) {
   1536   if (!op || op->kind != OPK_INDIRECT) return;
   1537   Operand base = *op;
   1538   base.kind = OPK_REG;
   1539   base.cls = RC_INT;
   1540   base.v.reg = op->v.ind.base;
   1541   rewrite_one_operand(f, owner, &base, 0, ctx);
   1542   op->v.ind.base = base.v.reg;
   1543 }
   1544 
   1545 static void rewrite_call_arg_value(Func* f, Inst* owner, CGABIValue* v,
   1546                                    RewriteCtx* ctx) {
   1547   if (!v) return;
   1548   rewrite_call_arg_indirect_base(f, owner, &v->storage, ctx);
   1549   rewrite_call_arg_operand(f, &v->storage);
   1550   for (u32 i = 0; i < v->nparts; ++i) {
   1551     Operand* op = (Operand*)&v->parts[i].op;
   1552     rewrite_call_arg_indirect_base(f, owner, op, ctx);
   1553     rewrite_call_arg_operand(f, op);
   1554   }
   1555 }
   1556 
   1557 typedef struct RewriteCallSaveCtx {
   1558   Func* f;
   1559   RewriteList* out;
   1560   const InstRefs* refs;
   1561   const Inst* call;
   1562   int emit_restore;
   1563 } RewriteCallSaveCtx;
   1564 
   1565 static void rewrite_call_save_one(PReg v, void* arg) {
   1566   RewriteCallSaveCtx* c = (RewriteCallSaveCtx*)arg;
   1567   Func* f = c->f;
   1568   if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return;
   1569   if (c->refs && refs_has_def(c->refs, v)) return;
   1570   if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_HARD) return;
   1571   u8 cls = opt_preg_loc_cls(f, v);
   1572   Reg hr = opt_preg_hard_reg(f, v);
   1573   if (cls >= OPT_REG_CLASSES || hr >= 32u) return;
   1574   if ((opt_call_clobber_mask_for(f, c->call, cls) & (1u << hr)) == 0) return;
   1575   if (c->emit_restore)
   1576     append_load_preg(f, c->out, v);
   1577   else
   1578     append_store_preg(f, c->out, v);
   1579 }
   1580 
   1581 static void append_live_call_saves(Func* f, RewriteList* out, const Inst* call,
   1582                                    const u64* live_after, u32 live_active_words,
   1583                                    const InstRefs* refs,
   1584                                    const PReg* call_save_pregs,
   1585                                    u32 ncall_save_pregs, int emit_restore) {
   1586   RewriteCallSaveCtx ctx = {f, out, refs, call, emit_restore};
   1587   f->opt_rewrite_live_words_touched += ncall_save_pregs;
   1588   for (u32 i = 0; i < ncall_save_pregs; ++i) {
   1589     PReg v = call_save_pregs[i];
   1590     u32 w = v / 64u;
   1591     if (w >= live_active_words) continue;
   1592     if (!bit_has(live_after, v)) continue;
   1593     rewrite_call_save_one(v, &ctx);
   1594   }
   1595 }
   1596 
   1597 static PReg* rewrite_collect_call_save_pregs(Func* f, u32* count_out) {
   1598   u32 n = 0;
   1599   for (PReg v = 1; v < opt_reg_count(f); ++v) {
   1600     OptPRegInfo* vi = &f->preg_info[v];
   1601     if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_HARD) continue;
   1602     if (!vi->live_across_call_freq) continue;
   1603     ++n;
   1604   }
   1605   PReg* pregs = arena_array(f->arena, PReg, n ? n : 1u);
   1606   u32 w = 0;
   1607   for (PReg v = 1; v < opt_reg_count(f); ++v) {
   1608     OptPRegInfo* vi = &f->preg_info[v];
   1609     if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_HARD) continue;
   1610     if (!vi->live_across_call_freq) continue;
   1611     pregs[w++] = v;
   1612   }
   1613   *count_out = n;
   1614   return pregs;
   1615 }
   1616 
   1617 static void append_split_reloads_at(Func* f, RewriteList* out, u32 block,
   1618                                     u32 raw_point) {
   1619   if (!f->opt_first_segment_by_preg) return;
   1620   for (PReg v = 1; v < opt_reg_count(f); ++v) {
   1621     if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_SPLIT) continue;
   1622     for (u32 si = f->opt_first_segment_by_preg[v]; si != OPT_RANGE_NONE;
   1623          si = f->opt_alloc_segments[si].next) {
   1624       const OptAllocSegment* s = &f->opt_alloc_segments[si];
   1625       if (s->block == block && s->start == raw_point && s->reload_at_start &&
   1626           s->loc_kind == OPT_LOC_HARD) {
   1627         append_load_preg_hard(f, out, v, s->hard_reg);
   1628       }
   1629     }
   1630   }
   1631 }
   1632 
   1633 static void append_split_stores_at(Func* f, RewriteList* out, u32 block,
   1634                                    u32 raw_point) {
   1635   if (!f->opt_first_segment_by_preg) return;
   1636   for (PReg v = 1; v < opt_reg_count(f); ++v) {
   1637     if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_SPLIT) continue;
   1638     for (u32 si = f->opt_first_segment_by_preg[v]; si != OPT_RANGE_NONE;
   1639          si = f->opt_alloc_segments[si].next) {
   1640       const OptAllocSegment* s = &f->opt_alloc_segments[si];
   1641       if (s->block == block && s->end == raw_point && s->store_at_end &&
   1642           s->loc_kind == OPT_LOC_HARD) {
   1643         append_store_preg_hard(f, out, v, s->hard_reg);
   1644       }
   1645     }
   1646   }
   1647 }
   1648 
   1649 static void edge_mat_push(Func* f, EdgeMatPlan* plan, u32 pred, u32 succ,
   1650                           PReg v, Reg hard_reg, EdgeMatKind kind) {
   1651   if (!plan || pred >= f->nblocks || succ >= f->nblocks) return;
   1652   if (hard_reg == (Reg)REG_NONE) return;
   1653   if (plan->n == plan->cap) {
   1654     u32 ncap = plan->cap ? plan->cap * 2u : 16u;
   1655     EdgeMat* nm = arena_array(f->arena, EdgeMat, ncap);
   1656     if (plan->mats) memcpy(nm, plan->mats, sizeof(plan->mats[0]) * plan->n);
   1657     plan->mats = nm;
   1658     plan->cap = ncap;
   1659   }
   1660   EdgeMat* m = &plan->mats[plan->n++];
   1661   memset(m, 0, sizeof *m);
   1662   m->pred = pred;
   1663   m->succ = succ;
   1664   m->v = v;
   1665   m->hard_reg = hard_reg;
   1666   m->kind = (u8)kind;
   1667 }
   1668 
   1669 static void prepare_split_edge_materialization(Func* f, EdgeMatPlan* plan) {
   1670   if (!f || !plan || !f->opt_first_segment_by_preg) return;
   1671   u32* block_base = arena_array(f->arena, u32, f->nblocks ? f->nblocks : 1u);
   1672   u32 raw = 0;
   1673   for (u32 b = 0; b < f->nblocks; ++b) {
   1674     block_base[b] = raw;
   1675     raw += f->blocks[b].ninsts ? f->blocks[b].ninsts : 1u;
   1676   }
   1677 
   1678   for (PReg v = 1; v < opt_reg_count(f); ++v) {
   1679     if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_SPLIT) continue;
   1680     for (u32 si = f->opt_first_segment_by_preg[v]; si != OPT_RANGE_NONE;
   1681          si = f->opt_alloc_segments[si].next) {
   1682       OptAllocSegment* s = &f->opt_alloc_segments[si];
   1683       if (s->loc_kind != OPT_LOC_HARD || s->block >= f->nblocks) continue;
   1684       Block* bl = &f->blocks[s->block];
   1685       u32 block_start = block_base[s->block];
   1686       u32 block_end = block_start + (bl->ninsts ? bl->ninsts : 1u);
   1687       if (s->reload_at_start && s->start == block_start && bl->npreds) {
   1688         for (u32 p = 0; p < bl->npreds; ++p)
   1689           edge_mat_push(f, plan, bl->preds[p], s->block, v, s->hard_reg,
   1690                         EDGE_MAT_RELOAD);
   1691         s->reload_at_start = 0;
   1692       }
   1693       if (s->store_at_end && s->end == block_end && bl->nsucc) {
   1694         for (u32 e = 0; e < bl->nsucc; ++e)
   1695           edge_mat_push(f, plan, s->block, bl->succ[e], v, s->hard_reg,
   1696                         EDGE_MAT_STORE);
   1697         s->store_at_end = 0;
   1698       }
   1699     }
   1700   }
   1701 }
   1702 
   1703 static int lower_is_terminator(const Inst* in) {
   1704   if (!in) return 0;
   1705   switch ((IROp)in->op) {
   1706     case IR_BR:
   1707     case IR_CONDBR:
   1708     case IR_CMP_BRANCH:
   1709     case IR_SWITCH:
   1710     case IR_INDIRECT_BRANCH:
   1711     case IR_RET:
   1712     case IR_UNREACHABLE:
   1713     case IR_BREAK_TO:
   1714     case IR_CONTINUE_TO:
   1715       return 1;
   1716     case IR_INTRINSIC: {
   1717       IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux;
   1718       return aux && (aux->kind == INTRIN_LONGJMP || aux->kind == INTRIN_TRAP);
   1719     }
   1720     default:
   1721       return 0;
   1722   }
   1723 }
   1724 
   1725 static void block_insert_list(Func* f, u32 block, const RewriteList* list) {
   1726   if (!f || block >= f->nblocks || !list || !list->n) return;
   1727   Block* bl = &f->blocks[block];
   1728   u32 term = bl->ninsts && lower_is_terminator(&bl->insts[bl->ninsts - 1u]);
   1729   u32 insert_at = bl->ninsts - term;
   1730   Inst* insts = arena_zarray(f->arena, Inst, bl->ninsts + list->n);
   1731   if (insert_at) memcpy(insts, bl->insts, sizeof(Inst) * insert_at);
   1732   memcpy(insts + insert_at, list->data, sizeof(Inst) * list->n);
   1733   if (bl->ninsts > insert_at) {
   1734     memcpy(insts + insert_at + list->n, bl->insts + insert_at,
   1735            sizeof(Inst) * (bl->ninsts - insert_at));
   1736   }
   1737   bl->insts = insts;
   1738   bl->ninsts += list->n;
   1739   bl->cap = bl->ninsts;
   1740 }
   1741 
   1742 typedef struct EdgePlace {
   1743   u32 pred;
   1744   u32 succ;
   1745   u32 block;
   1746 } EdgePlace;
   1747 
   1748 static u32 edge_place_for(Func* f, EdgePlace** places, u32* nplaces, u32* cap,
   1749                           u32 pred, u32 succ) {
   1750   for (u32 i = 0; i < *nplaces; ++i)
   1751     if ((*places)[i].pred == pred && (*places)[i].succ == succ)
   1752       return (*places)[i].block;
   1753   if (*nplaces == *cap) {
   1754     u32 ncap = *cap ? *cap * 2u : 16u;
   1755     EdgePlace* np = arena_array(f->arena, EdgePlace, ncap);
   1756     if (*places) memcpy(np, *places, sizeof((*places)[0]) * *nplaces);
   1757     *places = np;
   1758     *cap = ncap;
   1759   }
   1760   u32 block = opt_split_edge(f, pred, succ);
   1761   EdgePlace* p = &(*places)[(*nplaces)++];
   1762   p->pred = pred;
   1763   p->succ = succ;
   1764   p->block = block;
   1765   return block;
   1766 }
   1767 
   1768 static void apply_split_edge_materialization(Func* f, const EdgeMatPlan* plan) {
   1769   if (!f || !plan || !plan->n) return;
   1770   EdgePlace* places = NULL;
   1771   u32 nplaces = 0;
   1772   u32 place_cap = 0;
   1773   for (u32 i = 0; i < plan->n; ++i) {
   1774     const EdgeMat* m = &plan->mats[i];
   1775     if (m->pred < f->nblocks && m->succ < f->nblocks)
   1776       (void)edge_place_for(f, &places, &nplaces, &place_cap, m->pred, m->succ);
   1777   }
   1778   for (u32 pass = 0; pass < 2; ++pass) {
   1779     EdgeMatKind kind = pass == 0 ? EDGE_MAT_STORE : EDGE_MAT_RELOAD;
   1780     for (u32 i = 0; i < plan->n; ++i) {
   1781       const EdgeMat* m = &plan->mats[i];
   1782       if ((EdgeMatKind)m->kind != kind) continue;
   1783       u32 block =
   1784           edge_place_for(f, &places, &nplaces, &place_cap, m->pred, m->succ);
   1785       RewriteList list;
   1786       memset(&list, 0, sizeof list);
   1787       if (kind == EDGE_MAT_STORE)
   1788         append_store_preg_hard(f, &list, m->v, m->hard_reg);
   1789       else
   1790         append_load_preg_hard(f, &list, m->v, m->hard_reg);
   1791       block_insert_list(f, block, &list);
   1792     }
   1793   }
   1794   opt_build_cfg(f);
   1795 }
   1796 
   1797 static void rewrite_func(Func* f, const OptLiveInfo* live_info) {
   1798   u32 words = live_info ? live_info->words : f->opt_live_words;
   1799   if (!words) words = bit_words(opt_reg_count(f));
   1800   f->opt_live_words = (u16)words;
   1801   f->opt_rewrite_inserted_insts = 0;
   1802   f->opt_rewrite_live_words_touched = 0;
   1803 
   1804   u64* live = arena_zarray(f->arena, u64, words ? words : 1u);
   1805   u32 ncall_save_pregs = 0;
   1806   PReg* call_save_pregs = rewrite_collect_call_save_pregs(f, &ncall_save_pregs);
   1807   InstRefs refs;
   1808   memset(&refs, 0, sizeof refs);
   1809   u32 live_active_words = 0;
   1810   u32* block_base = arena_array(f->arena, u32, f->nblocks ? f->nblocks : 1u);
   1811   u32 raw = 0;
   1812   for (u32 b = 0; b < f->nblocks; ++b) {
   1813     block_base[b] = raw;
   1814     raw += f->blocks[b].ninsts ? f->blocks[b].ninsts : 1u;
   1815   }
   1816   for (u32 b = 0; b < f->nblocks; ++b) {
   1817     Block* bl = &f->blocks[b];
   1818     RewriteOut out;
   1819     out_init(f, &out, bl->ninsts + 16u);
   1820     RewriteList before, after, call_saves, call_restores;
   1821     memset(&before, 0, sizeof before);
   1822     memset(&after, 0, sizeof after);
   1823     memset(&call_saves, 0, sizeof call_saves);
   1824     memset(&call_restores, 0, sizeof call_restores);
   1825     live_active_words = live_copy_block_out_active(live_info, b, live, words,
   1826                                                    live_active_words);
   1827     f->opt_rewrite_live_words_touched += live_active_words;
   1828 
   1829     for (u32 ri = bl->ninsts; ri > 0; --ri) {
   1830       u32 i = ri - 1u;
   1831       Inst in = bl->insts[i];
   1832       if ((IROp)in.op == IR_PARAM_DECL) {
   1833         out_prepend_inst(f, &out, &in);
   1834         continue;
   1835       }
   1836       refs_reset(&refs);
   1837       opt_walk_inst_operands(f, &in, refs_collect, &refs);
   1838       list_reset(&before);
   1839       list_reset(&after);
   1840       list_reset(&call_saves);
   1841       list_reset(&call_restores);
   1842       RewriteCtx ctx;
   1843       memset(&ctx, 0, sizeof ctx);
   1844       ctx.before = &before;
   1845       ctx.after = &after;
   1846       ctx.raw_point = block_base[b] + i;
   1847       if ((IROp)in.op == IR_CALL) {
   1848         IRCallAux* aux = (IRCallAux*)in.extra.aux;
   1849         if (aux) {
   1850           if (aux->use_plan_replay) {
   1851             rewrite_one_operand(f, &in, &aux->plan.callee, 0, &ctx);
   1852             for (u32 k = 0; k < aux->plan.nargs; ++k) {
   1853               rewrite_call_arg_indirect_base(f, &in, &aux->plan.args[k].src,
   1854                                              &ctx);
   1855               rewrite_call_arg_operand(f, &aux->plan.args[k].src);
   1856             }
   1857             for (u32 k = 0; k < aux->plan.nrets; ++k)
   1858               rewrite_one_operand(f, &in, &aux->plan.rets[k].dst, 1, &ctx);
   1859           } else {
   1860             rewrite_one_operand(f, &in, &aux->desc.callee, 0, &ctx);
   1861             for (u32 k = 0; k < aux->desc.nargs; ++k)
   1862               rewrite_call_arg_value(f, &in, (CGABIValue*)&aux->desc.args[k],
   1863                                      &ctx);
   1864             opt_walk_abivalue(f, &in, &aux->desc.ret, 1, rewrite_one_operand,
   1865                               &ctx);
   1866           }
   1867         }
   1868       } else if ((IROp)in.op == IR_STORE && in.nopnds >= 2) {
   1869         opt_walk_operand(f, &in, &in.opnds[0], 0, rewrite_one_operand, &ctx);
   1870         rewrite_store_value_operand(f, &in, &in.opnds[1], &ctx);
   1871       } else {
   1872         opt_walk_inst_operands(f, &in, rewrite_one_operand, &ctx);
   1873       }
   1874       if ((IROp)in.op == IR_CALL) {
   1875         append_live_call_saves(f, &call_saves, &in, live, live_active_words,
   1876                                &refs, call_save_pregs, ncall_save_pregs, 0);
   1877         append_live_call_saves(f, &call_restores, &in, live, live_active_words,
   1878                                &refs, call_save_pregs, ncall_save_pregs, 1);
   1879       }
   1880       append_split_reloads_at(f, &before, b, block_base[b] + i);
   1881       append_split_stores_at(f, &after, b, block_base[b] + i + 1u);
   1882 
   1883       out_prepend_list_reverse(f, &out, &after);
   1884       out_prepend_list_reverse(f, &out, &call_restores);
   1885       out_prepend_inst(f, &out, &in);
   1886       out_prepend_list_reverse(f, &out, &call_saves);
   1887       out_prepend_list_reverse(f, &out, &before);
   1888       live_active_words =
   1889           live_update_refs_before_active(live, live_active_words, words, &refs);
   1890       f->opt_rewrite_live_words_touched += refs.nuses + refs.ndefs;
   1891     }
   1892     bl->insts = out.data + out.start;
   1893     bl->ninsts = out.cap - out.start;
   1894     bl->cap = bl->ninsts;
   1895   }
   1896   f->opt_rewritten = 1;
   1897 }
   1898 
   1899 void opt_lower_to_mir(Func* f, const OptLiveInfo* live_info) {
   1900   if (!f) return;
   1901   Func phys = *f;
   1902   phys.blocks = f->blocks;
   1903   phys.opt_rewritten = 0;
   1904   phys.mir = NULL;
   1905 
   1906   rewrite_func(&phys, live_info);
   1907 
   1908   MFunc* m = arena_zarray(f->arena, MFunc, 1);
   1909   m->blocks = phys.blocks;
   1910   m->nblocks = phys.nblocks;
   1911   m->entry = phys.entry;
   1912   m->emit_order_n = phys.emit_order_n;
   1913   m->emit_order_cap = phys.emit_order_n;
   1914   m->emit_order =
   1915       arena_array(f->arena, u32, phys.emit_order_n ? phys.emit_order_n : 1u);
   1916   if (phys.emit_order_n)
   1917     memcpy(m->emit_order, phys.emit_order, sizeof(u32) * phys.emit_order_n);
   1918 
   1919   f->mir = m;
   1920   f->blocks = phys.blocks;
   1921   f->nblocks = phys.nblocks;
   1922   f->blocks_cap = phys.blocks_cap;
   1923   f->frame_slots = phys.frame_slots;
   1924   f->nframe_slots = phys.nframe_slots;
   1925   f->frame_slots_cap = phys.frame_slots_cap;
   1926   f->next_inst_id = phys.next_inst_id;
   1927   f->opt_rewrite_inserted_insts = phys.opt_rewrite_inserted_insts;
   1928   f->opt_rewrite_live_words_touched = phys.opt_rewrite_live_words_touched;
   1929 }
   1930 
   1931 static void rewrite_dump_sb(Writer* w, const StrBuf* sb) {
   1932   kit_writer_write(w, strbuf_cstr(sb), strbuf_len(sb));
   1933 }
   1934 
   1935 void opt_rewrite_dump(Func* f, Writer* w) {
   1936   if (!f || !w) return;
   1937   char buf[96];
   1938   StrBuf sb;
   1939   strbuf_init(&sb, buf, sizeof buf);
   1940   strbuf_puts(&sb, "rewrite blocks=");
   1941   strbuf_put_u64(&sb, (u64)(unsigned)f->nblocks);
   1942   strbuf_puts(&sb, " pregs=");
   1943   strbuf_put_u64(&sb, (u64)(unsigned)opt_reg_count(f));
   1944   strbuf_puts(&sb, " rewritten=");
   1945   strbuf_put_u64(&sb, (u64)(unsigned)f->opt_rewritten);
   1946   strbuf_putc(&sb, '\n');
   1947   rewrite_dump_sb(w, &sb);
   1948   for (u32 b = 0; b < f->nblocks; ++b) {
   1949     Block* bl = &f->blocks[b];
   1950     strbuf_reset(&sb);
   1951     strbuf_puts(&sb, "block ");
   1952     strbuf_put_u64(&sb, (u64)(unsigned)b);
   1953     strbuf_puts(&sb, " insts=");
   1954     strbuf_put_u64(&sb, (u64)(unsigned)bl->ninsts);
   1955     strbuf_putc(&sb, '\n');
   1956     rewrite_dump_sb(w, &sb);
   1957     for (u32 i = 0; i < bl->ninsts; ++i) {
   1958       Inst* in = &bl->insts[i];
   1959       strbuf_reset(&sb);
   1960       strbuf_puts(&sb, "  ");
   1961       strbuf_put_u64(&sb, (u64)(unsigned)i);
   1962       strbuf_puts(&sb, " op=");
   1963       strbuf_put_u64(&sb, (u64)(unsigned)in->op);
   1964       strbuf_puts(&sb, " operands=");
   1965       strbuf_put_u64(&sb, (u64)(unsigned)in->nopnds);
   1966       strbuf_putc(&sb, '\n');
   1967       rewrite_dump_sb(w, &sb);
   1968     }
   1969   }
   1970 }
   1971 
   1972 static int all_defs_dead(Func* f, Inst* in, u64* live) {
   1973   if (in->def != 0 && in->def < opt_reg_count(f) && bit_has(live, in->def))
   1974     return 0;
   1975   for (u32 i = 0; i < in->ndefs; ++i) {
   1976     PReg r = (PReg)in->defs[i];
   1977     if (r != 0 && r < opt_reg_count(f) && bit_has(live, r)) return 0;
   1978   }
   1979   return 1;
   1980 }
   1981 
   1982 void opt_dead_def_elim_with_live(Func* f, const OptLiveInfo* live_info) {
   1983   u32 words = live_info ? live_info->words : f->opt_live_words;
   1984   if (!words) words = bit_words(opt_reg_count(f));
   1985   f->opt_dde_live_words_touched = 0;
   1986   InstRefs refs;
   1987   memset(&refs, 0, sizeof refs);
   1988   for (u32 b = 0; b < f->nblocks; ++b) {
   1989     Block* bl = &f->blocks[b];
   1990     u64* live = arena_zarray(f->arena, u64, words ? words : 1u);
   1991     f->opt_dde_live_words_touched += words;
   1992     live_copy_block_out(f, live_info, b, live, words);
   1993 
   1994     Inst* new_insts = arena_array(f->arena, Inst, bl->ninsts);
   1995     u32 w = 0;
   1996     for (u32 ri = bl->ninsts; ri > 0; --ri) {
   1997       u32 i = ri - 1u;
   1998       Inst* in = &bl->insts[i];
   1999       if (!opt_inst_has_side_effect(f, in) && all_defs_dead(f, in, live)) {
   2000         continue;
   2001       }
   2002       new_insts[w++] = *in;
   2003 
   2004       refs_reset(&refs);
   2005       opt_walk_inst_operands(f, in, refs_collect, &refs);
   2006       live_update_refs_before(live, &refs);
   2007       f->opt_dde_live_words_touched += refs.nuses + refs.ndefs;
   2008     }
   2009 
   2010     for (u32 i = 0; i < w / 2; ++i) {
   2011       Inst tmp = new_insts[i];
   2012       new_insts[i] = new_insts[w - 1 - i];
   2013       new_insts[w - 1 - i] = tmp;
   2014     }
   2015 
   2016     bl->insts = new_insts;
   2017     bl->ninsts = w;
   2018     bl->cap = w;
   2019   }
   2020 }
   2021 
   2022 void opt_dead_def_elim(Func* f) {
   2023   OptLiveInfo live;
   2024   opt_live_blocks(f, &live);
   2025   opt_dead_def_elim_with_live(f, &live);
   2026 }
   2027 
   2028 /* Collect the def and use PRegs of one instruction (operands + aux fan-out). */
   2029 typedef struct AllocVerifyRefs {
   2030   PReg defs[256];
   2031   u32 ndefs;
   2032   PReg uses[256];
   2033   u32 nuses;
   2034 } AllocVerifyRefs;
   2035 
   2036 static void alloc_verify_collect(Func* f, Inst* in, Operand* op, int is_def,
   2037                                  void* ctx) {
   2038   (void)in;
   2039   AllocVerifyRefs* r = (AllocVerifyRefs*)ctx;
   2040   PReg p;
   2041   if (!op || op->kind != OPK_REG) return;
   2042   p = (PReg)op->v.reg;
   2043   if (p == 0 || p >= opt_reg_count(f)) return;
   2044   if (is_def) {
   2045     if (r->ndefs < 256u) r->defs[r->ndefs++] = p;
   2046   } else {
   2047     if (r->nuses < 256u) r->uses[r->nuses++] = p;
   2048   }
   2049 }
   2050 
   2051 /* Post-allocation interference verifier for the O1 (no-coalesce) path. Since
   2052  * O1 never coalesces, two *distinct* PRegs that are live simultaneously must
   2053  * never share a hard register. Re-derive per-instruction liveness from the
   2054  * block live-out sets and panic if any definition collides with a different
   2055  * live value in the same hard reg — turning a silent miscompile into a hard
   2056  * error. */
   2057 static void opt_verify_alloc(Func* f, const OptLiveInfo* live) {
   2058   u32 nregs = opt_reg_count(f);
   2059   u8* cur;
   2060   if (nregs <= 1u || !live) return;
   2061   /* No "left in incoming reg" pre-check: the hint path's
   2062    * opt_ranges_overlap_kind precision check already permits the unit-overlap
   2063    * between a param PReg and its own incoming reg (= "no entry move"), and
   2064    * the standard allocator's bitmap rejects every other overlap. The
   2065    * per-instruction interference scan below is the residual safety net. */
   2066   cur = arena_array(f->arena, u8, nregs);
   2067   for (u32 b = 0; b < f->nblocks; ++b) {
   2068     Block* bl = &f->blocks[b];
   2069     const OptBlockLive* lb = &live->blocks[b];
   2070     memset(cur, 0, nregs);
   2071     for (PReg p = 1; p < nregs; ++p)
   2072       if (opt_bitset_has(&lb->live_out, p)) cur[p] = 1u;
   2073     for (u32 ri = bl->ninsts; ri > 0; --ri) {
   2074       Inst* in = &bl->insts[ri - 1u];
   2075       AllocVerifyRefs refs;
   2076       refs.ndefs = 0;
   2077       refs.nuses = 0;
   2078       opt_walk_inst_operands(f, in, alloc_verify_collect, &refs);
   2079       for (u32 k = 0; k < refs.ndefs; ++k) {
   2080         PReg d = refs.defs[k];
   2081         u8 d_kind = opt_preg_alloc_kind(f, d);
   2082         if (d_kind != OPT_ALLOC_HARD && d_kind != OPT_ALLOC_SPILL) continue;
   2083         for (PReg p = 1; p < nregs; ++p) {
   2084           u8 p_kind;
   2085           if (!cur[p] || p == d) continue;
   2086           p_kind = opt_preg_alloc_kind(f, p);
   2087           if (p_kind == OPT_ALLOC_HARD && d_kind == OPT_ALLOC_HARD &&
   2088               opt_preg_loc_cls(f, p) == opt_preg_loc_cls(f, d) &&
   2089               opt_preg_hard_reg(f, p) == opt_preg_hard_reg(f, d)) {
   2090             SrcLoc loc = {0, 0, 0};
   2091             compiler_panic(f->c, loc,
   2092                            "opt regalloc: O1 interference — pregs %u and %u "
   2093                            "share cls%u reg%u, both live at block %u inst %u "
   2094                            "(op %u)",
   2095                            (unsigned)d, (unsigned)p,
   2096                            (unsigned)opt_preg_loc_cls(f, d),
   2097                            (unsigned)opt_preg_hard_reg(f, d), b, ri - 1u,
   2098                            (unsigned)in->op);
   2099           }
   2100           if (p_kind == OPT_ALLOC_SPILL && d_kind == OPT_ALLOC_SPILL &&
   2101               opt_preg_spill_slot(f, p) == opt_preg_spill_slot(f, d)) {
   2102             SrcLoc loc = {0, 0, 0};
   2103             compiler_panic(f->c, loc,
   2104                            "opt regalloc: O1 interference — pregs %u and %u "
   2105                            "share spill slot %u, both live at block %u inst %u "
   2106                            "(op %u)",
   2107                            (unsigned)d, (unsigned)p,
   2108                            (unsigned)opt_preg_spill_slot(f, d), b, ri - 1u,
   2109                            (unsigned)in->op);
   2110           }
   2111         }
   2112       }
   2113       for (u32 k = 0; k < refs.ndefs; ++k) cur[refs.defs[k]] = 0u;
   2114       for (u32 k = 0; k < refs.nuses; ++k) cur[refs.uses[k]] = 1u;
   2115     }
   2116   }
   2117 }
   2118 
   2119 static void opt_regalloc_place(Func* f, int allow_live_range_split,
   2120                                OptLiveInfo* live_out) {
   2121   metrics_scope_begin(f->c, "opt.live_ranges.regalloc");
   2122   OptLiveInfo live;
   2123   opt_live_blocks(f, &live);
   2124   OptLiveRangeSet ranges;
   2125   opt_live_ranges_build(f, &live, &ranges);
   2126   opt_init_preg_info_from_ranges(f, &ranges);
   2127   opt_apply_asm_constraints_from_live(f, &live);
   2128   apply_abi_aliasing_hints(f);
   2129   /* The ABI hint passes clear forbid bits to steer values toward ABI registers
   2130    * (ret reg, arg regs). Restore the hard machine-clobber forbids afterward so
   2131    * no allocation path (normal, preferred-reg, or two-address group) can place
   2132    * a value in a register that an instruction live across it destroys. */
   2133   if (f->preg_info)
   2134     for (PReg v = 1; v < opt_reg_count(f); ++v)
   2135       f->preg_info[v].forbidden_hard_regs |=
   2136           f->preg_info[v].clobbered_hard_regs;
   2137   /* MIR coalesces only at -O2 (mir-gen.c:9431); match that here. At O1 the
   2138    * point-bitmap allocator emits copies through the natural conflict-free
   2139    * path. IRF_NO_COALESCE protects SSA edge copies inserted at O2. */
   2140   if (allow_live_range_split) opt_coalesce_ranges(f, &ranges);
   2141   metrics_count(f->c, "opt.live_words", f->opt_live_words);
   2142   metrics_count(f->c, "opt.ranges", ranges.nranges);
   2143   metrics_count(f->c, "opt.range_points", ranges.point_count);
   2144   metrics_count(f->c, "opt.range_raw_points", ranges.raw_point_count);
   2145   metrics_count(f->c, "opt.range_max_per_preg", ranges.max_ranges_per_preg);
   2146   metrics_count(f->c, "opt.range_max_length", ranges.max_live_length);
   2147   metrics_count(f->c, "opt.range_whole_block_spans", ranges.whole_block_spans);
   2148   metrics_count(f->c, "opt.live.bitset_words_touched",
   2149                 live.bitset_words_touched);
   2150   metrics_count(f->c, "opt.live.dataflow_iterations", live.dataflow_iterations);
   2151   metrics_count(f->c, "opt.live.dataflow_block_visits",
   2152                 live.dataflow_block_visits);
   2153   metrics_count(f->c, "opt.range.point_visits", ranges.range_point_visits);
   2154   metrics_count(f->c, "opt.range.preg_scans", ranges.preg_scans);
   2155   metrics_count(f->c, "opt.range.live_words_touched",
   2156                 ranges.live_words_touched);
   2157   metrics_count(f->c, "opt.conflict_bytes", 0);
   2158   metrics_scope_end(f->c, "opt.live_ranges.regalloc");
   2159 
   2160   OptAllocator alloc;
   2161   opt_assign_ranges(f, &ranges, &alloc, allow_live_range_split);
   2162   if (!allow_live_range_split) opt_verify_alloc(f, &live);
   2163   if (live_out) *live_out = live;
   2164 }
   2165 
   2166 void opt_regalloc_locations(Func* f, int allow_live_range_split,
   2167                             OptLiveInfo* live_out) {
   2168   opt_regalloc_place(f, allow_live_range_split, live_out);
   2169 }
   2170 
   2171 void opt_regalloc(Func* f, int allow_live_range_split) {
   2172   OptLiveInfo live;
   2173   opt_regalloc_place(f, allow_live_range_split, &live);
   2174   EdgeMatPlan edge_mats;
   2175   memset(&edge_mats, 0, sizeof edge_mats);
   2176   if (allow_live_range_split) prepare_split_edge_materialization(f, &edge_mats);
   2177   rewrite_func(f, &live);
   2178   apply_split_edge_materialization(f, &edge_mats);
   2179 }