kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

pass_native_emit.c (64744B)


      1 #include <string.h>
      2 
      3 #include "cg/type.h"
      4 #include "core/metrics.h"
      5 #include "core/pool.h"
      6 #include "opt/opt_internal.h"
      7 
      8 #undef Operand
      9 #undef CGParamDesc
     10 #undef CGCallDesc
     11 #undef CGFuncDesc
     12 #undef CGLocalStorage
     13 #undef CGABIValue
     14 #undef CGABIPart
     15 #undef CGCallPlan
     16 #undef CGCallPlanMove
     17 #undef CGCallPlanRet
     18 #undef CGScopeDesc
     19 
     20 typedef struct NativeEmitCtx {
     21   Compiler* c;
     22   Func* f;
     23   NativeTarget* target;
     24   NativeFrameSlot* slot_map;
     25   MCLabel* labels;
     26   u8* label_placed;
     27   ObjSecId local_static_sec;
     28   ObjSymId local_static_sym;
     29   u32 local_static_base;
     30   u32 local_static_size;
     31   u8 local_static_active;
     32   /* Set by emit_block for the IR_RET that is the last inst of the last block
     33    * in emit_order. emit_ret consults it to skip the trailing
     34    * branch-to-epilogue: func_end places the epilogue label at the very next
     35    * position, so the branch would just jump to the next 4 bytes. */
     36   u8 emitting_terminal_ret;
     37 } NativeEmitCtx;
     38 
     39 static _Noreturn void emit_panic(NativeEmitCtx* e, SrcLoc loc,
     40                                  const char* msg) {
     41   compiler_panic(e->c, loc, "opt native emit: %s", msg);
     42 }
     43 
     44 static void emit_local_static_begin(NativeEmitCtx* e,
     45                                     const CGLocalStaticDataDesc* desc,
     46                                     SrcLoc loc) {
     47   Sym name;
     48   SecKind kind;
     49   u16 flags;
     50   u32 align;
     51   if (!desc) emit_panic(e, loc, "missing local static data descriptor");
     52   if (e->local_static_active) emit_panic(e, loc, "nested local static data");
     53   if (desc->attrs.section) {
     54     name = (Sym)desc->attrs.section;
     55     kind =
     56         (desc->attrs.flags & KIT_CG_DATADEF_READONLY) ? SEC_RODATA : SEC_DATA;
     57     flags = (desc->attrs.flags & KIT_CG_DATADEF_READONLY)
     58                 ? SF_ALLOC
     59                 : (SF_ALLOC | SF_WRITE);
     60   } else if (desc->attrs.flags & KIT_CG_DATADEF_READONLY) {
     61     name = pool_intern_slice(e->c->global, SLICE_LIT(".rodata"));
     62     kind = SEC_RODATA;
     63     flags = SF_ALLOC;
     64   } else {
     65     name = pool_intern_slice(e->c->global, SLICE_LIT(".data"));
     66     kind = SEC_DATA;
     67     flags = SF_ALLOC | SF_WRITE;
     68   }
     69   align = desc->align ? desc->align : 1u;
     70   e->local_static_sec = obj_section(e->target->obj, name, kind, flags, align);
     71   e->local_static_base =
     72       obj_align_to(e->target->obj, e->local_static_sec, align);
     73   e->local_static_size = 0;
     74   e->local_static_sym = desc->sym;
     75   e->local_static_active = 1;
     76 }
     77 
     78 static void emit_local_static_write(NativeEmitCtx* e, const u8* data, u64 len,
     79                                     SrcLoc loc) {
     80   u8 zero[64];
     81   u64 orig_len = len;
     82   if (!e->local_static_active) emit_panic(e, loc, "local static data inactive");
     83   if (!len) return;
     84   if (data) {
     85     obj_write(e->target->obj, e->local_static_sec, data, (size_t)len);
     86   } else {
     87     memset(zero, 0, sizeof zero);
     88     while (len >= sizeof zero) {
     89       obj_write(e->target->obj, e->local_static_sec, zero, sizeof zero);
     90       len -= sizeof zero;
     91     }
     92     if (len) obj_write(e->target->obj, e->local_static_sec, zero, (size_t)len);
     93   }
     94   e->local_static_size += (u32)orig_len;
     95 }
     96 
     97 static void emit_local_static_label_addr(NativeEmitCtx* e, MCLabel target,
     98                                          i64 addend, u32 width, SrcLoc loc) {
     99   u8 zero[8];
    100   u32 off;
    101   RelocKind kind;
    102   if (!e->local_static_active) emit_panic(e, loc, "local static data inactive");
    103   /* A jump-table / label-address slot is one target pointer wide: 8 bytes
    104    * (R_ABS64) on a 64-bit target, 4 bytes (R_ABS32) on rv32/ELFCLASS32. */
    105   if (width == 8u)
    106     kind = R_ABS64;
    107   else if (width == 4u)
    108     kind = R_ABS32;
    109   else {
    110     emit_panic(e, loc, "unsupported local static label width");
    111     return;
    112   }
    113   memset(zero, 0, sizeof zero);
    114   off = e->local_static_base + e->local_static_size;
    115   obj_write(e->target->obj, e->local_static_sec, zero, width);
    116   e->target->mc->emit_label_data_reloc(e->target->mc, e->local_static_sec, off,
    117                                        target, kind, width, addend);
    118   e->local_static_size += width;
    119 }
    120 
    121 static void emit_local_static_end(NativeEmitCtx* e, SrcLoc loc) {
    122   if (!e->local_static_active) emit_panic(e, loc, "local static data inactive");
    123   obj_symbol_define_live(e->target->obj, e->local_static_sym,
    124                          e->local_static_sec, e->local_static_base,
    125                          e->local_static_size);
    126   e->local_static_active = 0;
    127   e->local_static_sec = OBJ_SEC_NONE;
    128   e->local_static_sym = OBJ_SYM_NONE;
    129   e->local_static_base = 0;
    130   e->local_static_size = 0;
    131 }
    132 
    133 static u32 type_size_or(Compiler* c, KitCgTypeId type, u32 fallback) {
    134   u64 n = type ? cg_type_size(c, type) : 0u;
    135   if (!n || n > 0xffffffffull) return fallback;
    136   return (u32)n;
    137 }
    138 
    139 static u32 type_align_or(Compiler* c, KitCgTypeId type, u32 fallback) {
    140   u64 n = type ? cg_type_align(c, type) : 0u;
    141   if (!n || n > 0xffffffffull) return fallback;
    142   return (u32)n;
    143 }
    144 
    145 static MemAccess mem_for_type(Compiler* c, KitCgTypeId type) {
    146   MemAccess mem;
    147   memset(&mem, 0, sizeof mem);
    148   mem.type = type;
    149   mem.size = type_size_or(c, type, 8u);
    150   mem.align = type_align_or(c, type, mem.size >= 8u ? 8u : mem.size);
    151   return mem;
    152 }
    153 
    154 static NativeAllocClass class_for_type(NativeEmitCtx* e, KitCgTypeId type) {
    155   if (e->target->class_for_type)
    156     return e->target->class_for_type(e->target, type);
    157   return cg_type_is_float(e->c, type) ? NATIVE_REG_FP : NATIVE_REG_INT;
    158 }
    159 
    160 static NativeLoc loc_none(void) {
    161   NativeLoc loc;
    162   memset(&loc, 0, sizeof loc);
    163   return loc;
    164 }
    165 
    166 static NativeLoc loc_reg(KitCgTypeId type, NativeAllocClass cls, Reg reg) {
    167   NativeLoc loc;
    168   memset(&loc, 0, sizeof loc);
    169   loc.kind = NATIVE_LOC_REG;
    170   loc.cls = (u8)cls;
    171   loc.type = type;
    172   loc.v.reg = reg;
    173   return loc;
    174 }
    175 
    176 static NativeLoc loc_frame(KitCgTypeId type, NativeAllocClass cls,
    177                            NativeFrameSlot slot) {
    178   NativeLoc loc;
    179   memset(&loc, 0, sizeof loc);
    180   loc.kind = NATIVE_LOC_FRAME;
    181   loc.cls = (u8)cls;
    182   loc.type = type;
    183   loc.v.frame = slot;
    184   return loc;
    185 }
    186 
    187 static NativeLoc loc_imm(KitCgTypeId type, i64 imm) {
    188   NativeLoc loc;
    189   memset(&loc, 0, sizeof loc);
    190   loc.kind = NATIVE_LOC_IMM;
    191   loc.cls = NATIVE_REG_INT;
    192   loc.type = type;
    193   loc.v.imm = imm;
    194   return loc;
    195 }
    196 
    197 static NativeLoc loc_global(KitCgTypeId type, ObjSymId sym, i64 addend) {
    198   NativeLoc loc;
    199   memset(&loc, 0, sizeof loc);
    200   loc.kind = NATIVE_LOC_GLOBAL;
    201   loc.cls = NATIVE_REG_INT;
    202   loc.type = type;
    203   loc.v.global.sym = sym;
    204   loc.v.global.addend = addend;
    205   return loc;
    206 }
    207 
    208 static int loc_same_frame(NativeLoc a, NativeLoc b) {
    209   return a.kind == NATIVE_LOC_FRAME && b.kind == NATIVE_LOC_FRAME &&
    210          a.v.frame == b.v.frame;
    211 }
    212 
    213 static Reg scratch_reg(NativeEmitCtx* e, NativeAllocClass cls, Reg a, Reg b,
    214                        SrcLoc loc) {
    215   u32 c = (u32)cls;
    216   if (c < OPT_REG_CLASSES) {
    217     for (u32 i = 0; i < e->f->opt_scratch_reg_count[c]; ++i) {
    218       Reg r = e->f->opt_scratch_regs[c][i];
    219       if (r != a && r != b) return r;
    220     }
    221   }
    222   emit_panic(e, loc, "no scratch register for native emission");
    223 }
    224 
    225 static int scratch_available(NativeEmitCtx* e, NativeAllocClass cls, Reg a,
    226                              Reg b) {
    227   u32 c = (u32)cls;
    228   if (c < OPT_REG_CLASSES) {
    229     for (u32 i = 0; i < e->f->opt_scratch_reg_count[c]; ++i) {
    230       Reg r = e->f->opt_scratch_regs[c][i];
    231       if (r != a && r != b) return 1;
    232     }
    233   }
    234   return 0;
    235 }
    236 
    237 static NativeLoc scratch_loc(NativeEmitCtx* e, KitCgTypeId type,
    238                              NativeAllocClass cls, Reg a, Reg b, SrcLoc loc) {
    239   return loc_reg(type, cls, scratch_reg(e, cls, a, b, loc));
    240 }
    241 
    242 static NativeFrameSlot map_slot(NativeEmitCtx* e, NativeFrameSlot slot,
    243                                 SrcLoc loc) {
    244   if (slot == NATIVE_FRAME_SLOT_NONE) return NATIVE_FRAME_SLOT_NONE;
    245   if (slot > e->f->nframe_slots) emit_panic(e, loc, "bad frame slot");
    246   if (!e->slot_map[slot]) emit_panic(e, loc, "unmapped frame slot");
    247   return e->slot_map[slot];
    248 }
    249 
    250 static MCLabel ensure_label(NativeEmitCtx* e, u32 block, SrcLoc loc) {
    251   if (block >= e->f->nblocks) emit_panic(e, loc, "bad block label");
    252   if (e->labels[block] == MC_LABEL_NONE)
    253     e->labels[block] = e->target->label_new(e->target);
    254   return e->labels[block];
    255 }
    256 
    257 static NativeAddr addr_from_loc(NativeEmitCtx* e, NativeLoc loc,
    258                                 SrcLoc src_loc) {
    259   NativeAddr addr;
    260   memset(&addr, 0, sizeof addr);
    261   addr.base_type = loc.type;
    262   switch ((NativeLocKind)loc.kind) {
    263     case NATIVE_LOC_FRAME:
    264       addr.base_kind = NATIVE_ADDR_BASE_FRAME;
    265       addr.base.frame = loc.v.frame;
    266       return addr;
    267     case NATIVE_LOC_STACK:
    268       addr.base_kind = NATIVE_ADDR_BASE_FRAME;
    269       addr.base.frame = loc.v.stack.slot;
    270       addr.offset = loc.v.stack.offset;
    271       return addr;
    272     case NATIVE_LOC_GLOBAL:
    273       addr.base_kind = NATIVE_ADDR_BASE_GLOBAL;
    274       addr.base.global.sym = loc.v.global.sym;
    275       addr.base.global.addend = loc.v.global.addend;
    276       return addr;
    277     case NATIVE_LOC_REG:
    278       addr.base_kind = NATIVE_ADDR_BASE_REG;
    279       addr.cls = loc.cls;
    280       addr.base.reg = loc.v.reg;
    281       return addr;
    282     case NATIVE_LOC_ADDR:
    283       return loc.v.addr;
    284     default:
    285       emit_panic(e, src_loc, "location is not addressable");
    286   }
    287 }
    288 
    289 static NativeAddr addr_from_operand(NativeEmitCtx* e, const OptOperand* op,
    290                                     SrcLoc loc) {
    291   NativeAddr addr;
    292   memset(&addr, 0, sizeof addr);
    293   if (!op) emit_panic(e, loc, "missing address operand");
    294   addr.base_type = op->type;
    295   switch ((OptOperandKind)op->kind) {
    296     case OPT_OPK_LOCAL:
    297       addr.base_kind = NATIVE_ADDR_BASE_FRAME;
    298       addr.base.frame = map_slot(e, op->v.frame_slot, loc);
    299       return addr;
    300     case OPT_OPK_GLOBAL:
    301       addr.base_kind = NATIVE_ADDR_BASE_GLOBAL;
    302       addr.base.global.sym = op->v.global.sym;
    303       addr.base.global.addend = op->v.global.addend;
    304       return addr;
    305     case OPT_OPK_INDIRECT:
    306       addr.base_kind = NATIVE_ADDR_BASE_REG;
    307       addr.cls = NATIVE_REG_INT;
    308       addr.base.reg = op->v.ind.base;
    309       addr.index_kind = op->v.ind.index == (Reg)REG_NONE
    310                             ? NATIVE_ADDR_INDEX_NONE
    311                             : NATIVE_ADDR_INDEX_REG;
    312       addr.index_cls = NATIVE_REG_INT;
    313       addr.index.reg = op->v.ind.index;
    314       addr.log2_scale = op->v.ind.log2_scale;
    315       addr.offset = op->v.ind.ofs;
    316       return addr;
    317     case OPT_OPK_REG:
    318       addr.base_kind = NATIVE_ADDR_BASE_REG;
    319       addr.cls = op->cls;
    320       addr.base.reg = op->v.reg;
    321       return addr;
    322     default:
    323       emit_panic(e, loc, "operand is not addressable");
    324   }
    325 }
    326 
    327 static NativeAddr pointer_addr_from_operand(NativeEmitCtx* e,
    328                                             const OptOperand* op, SrcLoc loc,
    329                                             Reg avoid_a, Reg avoid_b) {
    330   NativeAddr addr;
    331   memset(&addr, 0, sizeof addr);
    332   if (!op) emit_panic(e, loc, "missing pointer operand");
    333   addr.base_type = op->type;
    334   switch ((OptOperandKind)op->kind) {
    335     case OPT_OPK_LOCAL: {
    336       NativeAddr frame;
    337       NativeLoc dst;
    338       NativeAllocClass cls;
    339       Reg r;
    340       /* An OPK_LOCAL in a pointer-address position is ambiguous. When the
    341        * operand's type is a pointer, the local *holds* the pointer value and
    342        * must be loaded to get the address. Otherwise the local *is* the
    343        * aggregate storage and its frame home is the address directly — loading
    344        * it would dereference the aggregate's first 8 bytes as a pointer (e.g.
    345        * an `__int128` call result copied by `agg_copy`). Mirrors the
    346        * single-pass path's nd_addr_pointer. */
    347       if (!cg_type_is_ptr(e->c, op->type)) {
    348         addr.base_kind = NATIVE_ADDR_BASE_FRAME;
    349         addr.base.frame = map_slot(e, op->v.frame_slot, loc);
    350         return addr;
    351       }
    352       cls = class_for_type(e, op->type);
    353       r = scratch_reg(e, cls, avoid_a, avoid_b, loc);
    354       memset(&frame, 0, sizeof frame);
    355       frame.base_kind = NATIVE_ADDR_BASE_FRAME;
    356       frame.base.frame = map_slot(e, op->v.frame_slot, loc);
    357       frame.base_type = op->type;
    358       dst = loc_reg(op->type, cls, r);
    359       e->target->load(e->target, dst, frame, mem_for_type(e->c, op->type));
    360       addr.base_kind = NATIVE_ADDR_BASE_REG;
    361       addr.cls = (u8)cls;
    362       addr.base.reg = r;
    363       return addr;
    364     }
    365     case OPT_OPK_GLOBAL:
    366       addr.base_kind = NATIVE_ADDR_BASE_GLOBAL;
    367       addr.base.global.sym = op->v.global.sym;
    368       addr.base.global.addend = op->v.global.addend;
    369       return addr;
    370     case OPT_OPK_INDIRECT:
    371       return addr_from_operand(e, op, loc);
    372     case OPT_OPK_REG:
    373       addr.base_kind = NATIVE_ADDR_BASE_REG;
    374       addr.cls = op->cls;
    375       addr.base.reg = op->v.reg;
    376       return addr;
    377     default:
    378       emit_panic(e, loc, "operand is not a pointer address");
    379   }
    380 }
    381 
    382 static Reg addr_base_reg(const NativeAddr* addr) {
    383   return addr && addr->base_kind == NATIVE_ADDR_BASE_REG ? addr->base.reg
    384                                                          : REG_NONE;
    385 }
    386 
    387 static Reg addr_index_reg(const NativeAddr* addr) {
    388   return addr && addr->index_kind == NATIVE_ADDR_INDEX_REG ? addr->index.reg
    389                                                            : REG_NONE;
    390 }
    391 
    392 static void collapse_addr_to_reg(NativeEmitCtx* e, NativeAddr* addr,
    393                                  SrcLoc loc) {
    394   /* Materialize the full address into a reserved scratch register. We must not
    395    * reuse the base register as the destination: the register allocator may keep
    396    * that value live past this memory op (e.g. a pointer stored into several of
    397    * its own fields and then returned), so an in-place `add base, base, #off`
    398    * would corrupt it. Avoid both base and index so load_addr can still read
    399    * them. */
    400   Reg r = scratch_reg(e, NATIVE_REG_INT, addr_base_reg(addr),
    401                       addr_index_reg(addr), loc);
    402   NativeLoc dst = loc_reg(addr->base_type, NATIVE_REG_INT, r);
    403   e->target->load_addr(e->target, dst, *addr);
    404   memset(addr, 0, sizeof *addr);
    405   addr->base_kind = NATIVE_ADDR_BASE_REG;
    406   addr->cls = NATIVE_REG_INT;
    407   addr->base.reg = r;
    408   addr->base_type = dst.type;
    409 }
    410 
    411 /* Collapse an address the target cannot encode for this access (e.g. an
    412  * index scale aarch64 cannot fold into a load/store) into a single base
    413  * register via load_addr. Mirrors NativeDirectTarget's nd_addr_materialize so
    414  * the O1 emit path legalizes the same address shapes as direct -O0 emission. */
    415 static void legalize_addr(NativeEmitCtx* e, NativeAddr* addr, MemAccess mem,
    416                           SrcLoc loc) {
    417   if (e->target->addr_legal && !e->target->addr_legal(e->target, addr, mem))
    418     collapse_addr_to_reg(e, addr, loc);
    419 }
    420 
    421 static NativeLoc loc_from_operand(NativeEmitCtx* e, const OptOperand* op,
    422                                   SrcLoc loc) {
    423   if (!op) return loc_none();
    424   switch ((OptOperandKind)op->kind) {
    425     case OPT_OPK_REG:
    426       return loc_reg(op->type, (NativeAllocClass)op->cls, op->v.reg);
    427     case OPT_OPK_IMM:
    428       return loc_imm(op->type, op->v.imm);
    429     case OPT_OPK_GLOBAL:
    430       return loc_global(op->type, op->v.global.sym, op->v.global.addend);
    431     case OPT_OPK_LOCAL:
    432       return loc_frame(op->type, class_for_type(e, op->type),
    433                        map_slot(e, op->v.frame_slot, loc));
    434     case OPT_OPK_INDIRECT: {
    435       NativeLoc out = loc_none();
    436       out.kind = NATIVE_LOC_ADDR;
    437       out.cls = op->cls;
    438       out.type = op->type;
    439       out.v.addr = addr_from_operand(e, op, loc);
    440       return out;
    441     }
    442   }
    443   emit_panic(e, loc, "bad operand kind");
    444 }
    445 
    446 static NativeLoc materialize(NativeEmitCtx* e, NativeLoc src,
    447                              NativeAllocClass cls, KitCgTypeId type,
    448                              Reg avoid_a, Reg avoid_b, SrcLoc loc) {
    449   NativeLoc dst;
    450   NativeAddr addr;
    451   MemAccess mem;
    452   if (src.kind == NATIVE_LOC_REG) return src;
    453   dst = scratch_loc(e, type ? type : src.type, cls, avoid_a, avoid_b, loc);
    454   switch ((NativeLocKind)src.kind) {
    455     case NATIVE_LOC_IMM:
    456       e->target->load_imm(e->target, dst, src.v.imm);
    457       return dst;
    458     case NATIVE_LOC_GLOBAL:
    459       addr = addr_from_loc(e, src, loc);
    460       e->target->load_addr(e->target, dst, addr);
    461       return dst;
    462     case NATIVE_LOC_FRAME:
    463     case NATIVE_LOC_STACK:
    464     case NATIVE_LOC_ADDR:
    465       addr = addr_from_loc(e, src, loc);
    466       mem = mem_for_type(e->c, dst.type);
    467       e->target->load(e->target, dst, addr, mem);
    468       return dst;
    469     default:
    470       emit_panic(e, loc, "cannot materialize location");
    471   }
    472 }
    473 
    474 static void write_loc(NativeEmitCtx* e, NativeLoc dst, NativeLoc src,
    475                       MemAccess mem, SrcLoc loc) {
    476   NativeAddr addr;
    477   NativeLoc tmp;
    478   if (dst.kind == NATIVE_LOC_NONE) return;
    479   if (loc_same_frame(dst, src)) return;
    480   if (dst.kind == NATIVE_LOC_REG) {
    481     if (src.kind == NATIVE_LOC_REG) {
    482       if (dst.v.reg != src.v.reg || dst.cls != src.cls)
    483         e->target->move(e->target, dst, src);
    484       return;
    485     }
    486     /* An immediate goes straight into the destination register; routing it
    487      * through a scratch and then moving would cost an extra instruction. */
    488     if (src.kind == NATIVE_LOC_IMM) {
    489       e->target->load_imm(e->target, dst, src.v.imm);
    490       return;
    491     }
    492     tmp = materialize(e, src, (NativeAllocClass)dst.cls, dst.type, dst.v.reg,
    493                       REG_NONE, loc);
    494     if (tmp.v.reg != dst.v.reg || tmp.cls != dst.cls)
    495       e->target->move(e->target, dst, tmp);
    496     return;
    497   }
    498   addr = addr_from_loc(e, dst, loc);
    499   if (src.kind != NATIVE_LOC_REG)
    500     src = materialize(e, src, (NativeAllocClass)dst.cls, dst.type, REG_NONE,
    501                       REG_NONE, loc);
    502   e->target->store(e->target, addr, src, mem);
    503 }
    504 
    505 /* For an arithmetic / compare source operand: keep it as an immediate when it
    506  * is a constant the target can encode for `use` (so no register is wasted
    507  * materializing it); otherwise materialize into a register. */
    508 static NativeLoc operand_imm_or_reg(NativeEmitCtx* e, const OptOperand* op,
    509                                     NativeImmUse use, u32 sub, Reg avoid_a,
    510                                     Reg avoid_b, SrcLoc loc) {
    511   if (op->kind == OPK_IMM && e->target->imm_legal &&
    512       e->target->imm_legal(e->target, use, sub, op->type, op->v.imm))
    513     return loc_imm(op->type, op->v.imm);
    514   return materialize(e, loc_from_operand(e, op, loc),
    515                      class_for_type(e, op->type), op->type, avoid_a, avoid_b,
    516                      loc);
    517 }
    518 
    519 static Reg loc_avoid_reg(NativeLoc l) {
    520   return l.kind == NATIVE_LOC_REG ? l.v.reg : REG_NONE;
    521 }
    522 
    523 static int type_is_aggregate_or_large(NativeEmitCtx* e, KitCgTypeId type) {
    524   /* "Large" = wider than one machine word (ptr_size): such a value cannot move
    525    * through a single register, so IR_COPY/IR_LOAD/IR_STORE of it must go through
    526    * copy_bytes. 8 on rv64/x64/aa64, 4 on rv32 (so an 8-byte i64/double is large
    527    * there and is copied as two words rather than truncated into one register). */
    528   return type && (cg_type_is_aggregate(e->c, type) ||
    529                   type_size_or(e->c, type, 8u) > e->c->target.ptr_size);
    530 }
    531 
    532 /* Copy an aggregate / oversized value between two memory locations. dst and
    533  * src must be addressable (frame/global/indirect/reg-as-pointer); used for
    534  * IR_COPY/IR_LOAD/IR_STORE whose value type cannot move through one register.
    535  */
    536 static void emit_agg_move(NativeEmitCtx* e, NativeAddr da, NativeAddr sa,
    537                           KitCgTypeId type) {
    538   AggregateAccess acc;
    539   memset(&acc, 0, sizeof acc);
    540   acc.type = type;
    541   acc.size = type_size_or(e->c, type, 8u);
    542   acc.align = type_align_or(e->c, type, 8u);
    543   acc.mem = mem_for_type(e->c, type);
    544   e->target->copy_bytes(e->target, da, sa, acc);
    545 }
    546 
    547 static CGFuncDesc semantic_func_desc(NativeEmitCtx* e) {
    548   OptCGFuncDesc* in = &e->f->desc;
    549   CGFuncDesc out;
    550   memset(&out, 0, sizeof out);
    551   out.sym = in->sym;
    552   out.text_section_id = in->text_section_id;
    553   out.group_id = in->group_id;
    554   out.fn_type = in->fn_type;
    555   out.result_type = in->result_type;
    556   out.nparams = in->nparams;
    557   out.loc = in->loc;
    558   out.flags = in->flags;
    559   out.inline_policy = in->inline_policy;
    560   out.atomize = in->atomize;
    561   if (in->nparams && in->params) {
    562     CGParamDesc* params = arena_zarray(e->f->arena, CGParamDesc, in->nparams);
    563     for (u32 i = 0; i < in->nparams; ++i) {
    564       params[i].index = in->params[i].index;
    565       params[i].name = in->params[i].name;
    566       params[i].type = in->params[i].type;
    567       params[i].size = in->params[i].size;
    568       params[i].align = in->params[i].align;
    569       params[i].flags = in->params[i].flags;
    570       params[i].loc = in->params[i].loc;
    571     }
    572     out.params = params;
    573   }
    574   return out;
    575 }
    576 
    577 static CGParamDesc semantic_param_desc(const IRParam* p) {
    578   CGParamDesc out;
    579   memset(&out, 0, sizeof out);
    580   out.index = p->index;
    581   out.name = p->name;
    582   out.type = p->type;
    583   out.size = p->size;
    584   out.align = p->align;
    585   out.flags = p->flags;
    586   out.loc = p->loc;
    587   return out;
    588 }
    589 
    590 static NativeLoc loc_for_preg(NativeEmitCtx* e, PReg preg, KitCgTypeId type,
    591                               SrcLoc loc) {
    592   u8 kind = opt_preg_alloc_kind(e->f, preg);
    593   if (kind == OPT_ALLOC_HARD)
    594     return loc_reg(type, (NativeAllocClass)opt_preg_loc_cls(e->f, preg),
    595                    opt_preg_hard_reg(e->f, preg));
    596   if (kind == OPT_ALLOC_SPILL)
    597     return loc_frame(type, class_for_type(e, type),
    598                      map_slot(e, opt_preg_spill_slot(e->f, preg), loc));
    599   return loc_none();
    600 }
    601 
    602 static void bind_params(NativeEmitCtx* e) {
    603   for (u32 i = 0; i < e->f->nparams; ++i) {
    604     IRParam* p = &e->f->params[i];
    605     CGParamDesc sd = semantic_param_desc(p);
    606     NativeLoc dst;
    607     if (p->storage.kind == CG_LOCAL_STORAGE_REG)
    608       dst = loc_for_preg(e, (PReg)p->storage.v.reg, p->type, p->loc);
    609     else
    610       dst = loc_frame(p->type, class_for_type(e, p->type),
    611                       map_slot(e, p->storage.v.frame_slot, p->loc));
    612     if (e->target->bind_param) e->target->bind_param(e->target, &sd, dst);
    613   }
    614   /* Let a backend that defers register-destination binds resolve them now (as a
    615    * parallel copy), once every param's incoming location has been read. */
    616   if (e->target->bind_params_end) e->target->bind_params_end(e->target);
    617 }
    618 
    619 /* The parameter value is placed into its allocated location by bind_param at
    620  * function entry; the IR_PARAM_DECL marker emits nothing. */
    621 static void emit_param_decl(NativeEmitCtx* e, Inst* in) {
    622   (void)e;
    623   (void)in;
    624 }
    625 
    626 static NativeFrameSlot temp_slot(NativeEmitCtx* e, KitCgTypeId type, SrcLoc loc,
    627                                  NativeFrameSlotKind kind) {
    628   NativeFrameSlotDesc d;
    629   memset(&d, 0, sizeof d);
    630   d.type = type;
    631   d.loc = loc;
    632   d.size = type_size_or(e->c, type, 8u);
    633   d.align = type_align_or(e->c, type, d.size >= 8u ? 8u : d.size);
    634   d.kind = kind;
    635   return e->target->frame_slot(e->target, &d);
    636 }
    637 
    638 static NativeLoc abi_storage_loc(NativeEmitCtx* e, const OptCGABIValue* v,
    639                                  SrcLoc loc) {
    640   if (!v) return loc_none();
    641   return loc_from_operand(e, &v->storage, loc);
    642 }
    643 
    644 static void emit_call(NativeEmitCtx* e, Inst* in) {
    645   IRCallAux* aux = (IRCallAux*)in->extra.aux;
    646   NativeCallDesc d;
    647   NativeCallPlan plan;
    648   NativeLoc* args = NULL;
    649   NativeLoc* results = NULL;
    650   NativeLoc final_result = loc_none();
    651   NativeFrameSlot result_slot = NATIVE_FRAME_SLOT_NONE;
    652   MemAccess result_mem;
    653   if (!aux) return;
    654   memset(&d, 0, sizeof d);
    655   memset(&plan, 0, sizeof plan);
    656   if (aux->desc.nargs)
    657     args = arena_zarray(e->f->arena, NativeLoc, aux->desc.nargs);
    658   for (u32 i = 0; i < aux->desc.nargs; ++i)
    659     args[i] = abi_storage_loc(e, &aux->desc.args[i], in->loc);
    660   if (aux->desc.ret.storage.kind) {
    661     KitCgTypeId rty = aux->desc.ret.type;
    662     results = arena_zarray(e->f->arena, NativeLoc, 1);
    663     final_result = abi_storage_loc(e, &aux->desc.ret, in->loc);
    664     /* Hand plan_call the value's real destination directly whenever it is a
    665      * register or a frame slot: a scalar result is a single move out of the ABI
    666      * result register, and an aggregate / oversized result — which plan_call or
    667      * the callee writes in parts and so must land in memory — lands straight in
    668      * its frame home. Routing either through a fresh temp slot (store then
    669      * reload / copy_bytes) was a pure round trip on every call. The temp slot
    670      * is a fallback for the rare result whose storage is neither a register nor
    671      * a frame slot (e.g. written into a global); lowering hoists aggregates to
    672      * a frame home (opt_lower_to_mir), so this branch is scalar-only in
    673      * practice. */
    674     if (final_result.kind == NATIVE_LOC_REG ||
    675         final_result.kind == NATIVE_LOC_FRAME) {
    676       results[0] = final_result;
    677     } else {
    678       result_slot = temp_slot(e, rty, in->loc, NATIVE_FRAME_SLOT_SPILL);
    679       results[0] = loc_frame(rty, class_for_type(e, rty), result_slot);
    680     }
    681   }
    682   d.fn_type = aux->desc.fn_type;
    683   d.callee = loc_from_operand(e, &aux->desc.callee, in->loc);
    684   d.args = args;
    685   d.results = results;
    686   d.nargs = aux->desc.nargs;
    687   d.nresults = results ? 1u : 0u;
    688   d.flags = aux->desc.flags;
    689   d.tail_policy = aux->desc.tail_policy;
    690   d.inline_policy = aux->desc.inline_policy;
    691   e->target->plan_call(e->target, &d, &plan);
    692   for (u32 i = 0; i < plan.nargs; ++i)
    693     write_loc(e, plan.args[i].dst, plan.args[i].src, plan.args[i].mem, in->loc);
    694   if (plan.callee.kind != NATIVE_LOC_REG &&
    695       plan.callee.kind != NATIVE_LOC_GLOBAL)
    696     plan.callee = materialize(e, plan.callee, NATIVE_REG_INT, plan.callee.type,
    697                               REG_NONE, REG_NONE, in->loc);
    698   e->target->emit_call(e->target, &plan);
    699   for (u32 i = 0; i < plan.nrets; ++i)
    700     write_loc(e, plan.rets[i].dst, plan.rets[i].src, plan.rets[i].mem, in->loc);
    701   if (result_slot && final_result.kind != NATIVE_LOC_NONE) {
    702     KitCgTypeId rty = aux->desc.ret.type;
    703     NativeLoc tmp = loc_frame(rty, class_for_type(e, rty), result_slot);
    704     result_mem = mem_for_type(e->c, rty);
    705     if (final_result.kind != NATIVE_LOC_REG &&
    706         (cg_type_is_aggregate(e->c, rty) ||
    707          type_size_or(e->c, rty, 8u) > e->c->target.ptr_size)) {
    708       /* Aggregate / oversized result: move bytes rather than a scalar copy
    709        * (which would exceed the single-register width). The result was either
    710        * written in parts by plan_call's rets, or by the callee via the sret
    711        * pointer; either way it now lives in the temp slot. */
    712       AggregateAccess acc;
    713       NativeAddr da = addr_from_loc(e, final_result, in->loc);
    714       NativeAddr sa = addr_from_loc(e, tmp, in->loc);
    715       memset(&acc, 0, sizeof acc);
    716       acc.type = rty;
    717       acc.size = type_size_or(e->c, rty, 8u);
    718       acc.align = type_align_or(e->c, rty, 8u);
    719       acc.mem = result_mem;
    720       e->target->copy_bytes(e->target, da, sa, acc);
    721     } else {
    722       write_loc(e, final_result, tmp, result_mem, in->loc);
    723     }
    724   }
    725 }
    726 
    727 static void emit_ret(NativeEmitCtx* e, Inst* in, const CGFuncDesc* fd) {
    728   IRRetAux* aux = (IRRetAux*)in->extra.aux;
    729   NativeLoc value = loc_none();
    730   const NativeLoc* values = NULL;
    731   NativeCallPlanRet* rets = NULL;
    732   u32 nrets = 0;
    733   if (aux && aux->present) {
    734     /* Hand plan_ret the value's location directly. For an aggregate / oversized
    735      * result it is a memory location (plan_ret copies to the sret pointer or
    736      * reads parts into the return registers); for a scalar it is the value's
    737      * register or slot, which plan_ret moves into the return register. The old
    738      * code spilled scalars to a fresh slot and reloaded them, a pure round
    739      * trip on every return. */
    740     value = abi_storage_loc(e, &aux->val, in->loc);
    741     values = &value;
    742   }
    743   e->target->plan_ret(e->target, fd, values, &rets, &nrets);
    744   for (u32 i = 0; i < nrets; ++i)
    745     write_loc(e, rets[i].dst, rets[i].src, rets[i].mem, in->loc);
    746   /* Skip the trailing branch-to-epilogue when this IR_RET is the very last
    747    * inst emitted: func_end will place the epilogue label at mc->pos right
    748    * after this, so the branch would jump to the next 4 bytes. The actual
    749    * `ret` instruction lives in func_end's restore-frame sequence and is
    750    * unaffected. */
    751   if (!e->emitting_terminal_ret) e->target->ret(e->target);
    752 }
    753 
    754 static void emit_inst(NativeEmitCtx* e, u32 block, u32 order_index, Inst* in,
    755                       const CGFuncDesc* fd) {
    756   NativeLoc dst, a, b, src, tmp;
    757   NativeAddr addr, addr2;
    758   Reg dst_reg;
    759   (void)block;
    760   if (e->target->set_loc) e->target->set_loc(e->target, in->loc);
    761   switch ((IROp)in->op) {
    762     case IR_NOP:
    763     case IR_CONST_I:
    764     case IR_CONST_BYTES:
    765     case IR_PHI:
    766     case IR_SCOPE_BEGIN:
    767     case IR_SCOPE_END:
    768       return;
    769     case IR_PARAM_DECL:
    770       emit_param_decl(e, in);
    771       return;
    772     case IR_LOAD_IMM:
    773       dst = loc_from_operand(e, &in->opnds[0], in->loc);
    774       write_loc(e, dst, loc_imm(in->opnds[0].type, in->extra.imm),
    775                 mem_for_type(e->c, in->opnds[0].type), in->loc);
    776       return;
    777     case IR_LOAD_CONST:
    778       dst = loc_from_operand(e, &in->opnds[0], in->loc);
    779       if (dst.kind != NATIVE_LOC_REG)
    780         dst = materialize(e, dst, class_for_type(e, in->opnds[0].type),
    781                           in->opnds[0].type, REG_NONE, REG_NONE, in->loc);
    782       e->target->load_const(e->target, dst, in->extra.cbytes);
    783       return;
    784     case IR_COPY:
    785       if (type_is_aggregate_or_large(e, in->opnds[0].type)) {
    786         emit_agg_move(e, addr_from_operand(e, &in->opnds[0], in->loc),
    787                       addr_from_operand(e, &in->opnds[1], in->loc),
    788                       in->opnds[0].type);
    789         return;
    790       }
    791       dst = loc_from_operand(e, &in->opnds[0], in->loc);
    792       src = loc_from_operand(e, &in->opnds[1], in->loc);
    793       write_loc(e, dst, src, mem_for_type(e->c, in->opnds[0].type), in->loc);
    794       return;
    795     case IR_LOAD:
    796       if (type_is_aggregate_or_large(e, in->opnds[0].type)) {
    797         addr = addr_from_operand(e, &in->opnds[1], in->loc);
    798         emit_agg_move(e, addr_from_operand(e, &in->opnds[0], in->loc), addr,
    799                       in->opnds[0].type);
    800         return;
    801       }
    802       dst = loc_from_operand(e, &in->opnds[0], in->loc);
    803       addr = addr_from_operand(e, &in->opnds[1], in->loc);
    804       legalize_addr(e, &addr, in->extra.mem, in->loc);
    805       if (dst.kind == NATIVE_LOC_REG) {
    806         e->target->load(e->target, dst, addr, in->extra.mem);
    807       } else {
    808         if (!scratch_available(e, class_for_type(e, in->opnds[0].type),
    809                                addr_base_reg(&addr), addr_index_reg(&addr)))
    810           collapse_addr_to_reg(e, &addr, in->loc);
    811         tmp = scratch_loc(e, in->opnds[0].type,
    812                           class_for_type(e, in->opnds[0].type),
    813                           addr_base_reg(&addr), addr_index_reg(&addr), in->loc);
    814         e->target->load(e->target, tmp, addr, in->extra.mem);
    815         write_loc(e, dst, tmp, in->extra.mem, in->loc);
    816       }
    817       return;
    818     case IR_STORE:
    819       if (type_is_aggregate_or_large(e, in->opnds[1].type)) {
    820         emit_agg_move(e, addr_from_operand(e, &in->opnds[0], in->loc),
    821                       addr_from_operand(e, &in->opnds[1], in->loc),
    822                       in->opnds[1].type);
    823         return;
    824       }
    825       addr = addr_from_operand(e, &in->opnds[0], in->loc);
    826       legalize_addr(e, &addr, in->extra.mem, in->loc);
    827       src = loc_from_operand(e, &in->opnds[1], in->loc);
    828       /* Storing a constant 0 from the hardware zero register avoids
    829        * materializing 0 into a scratch first (e.g. `strb wzr, [..]` rather than
    830        * `movz w9,0; strb w9, [..]`). */
    831       if (src.kind == NATIVE_LOC_IMM && src.v.imm == 0 &&
    832           e->target->has_store_zero_reg &&
    833           class_for_type(e, in->opnds[1].type) == NATIVE_REG_INT)
    834         src = loc_reg(in->opnds[1].type, NATIVE_REG_INT,
    835                       e->target->store_zero_reg);
    836       /* Source register aliases the address base/index (e.g. `*p = (T)p`).
    837        * Collapse the address into a scratch register: collapse_addr_to_reg
    838        * selects a scratch distinct from both base and index — hence distinct
    839        * from `src` — so the store reads `src` and writes through the fresh
    840        * scratch with no alias. This stays entirely in registers; the frame is
    841        * fully planned before emission, so emit never allocates a slot here. */
    842       if (src.kind == NATIVE_LOC_REG && (src.v.reg == addr_base_reg(&addr) ||
    843                                          src.v.reg == addr_index_reg(&addr)))
    844         collapse_addr_to_reg(e, &addr, in->loc);
    845       if (src.kind != NATIVE_LOC_REG) {
    846         if (!scratch_available(e, class_for_type(e, in->opnds[1].type),
    847                                addr_base_reg(&addr), addr_index_reg(&addr)))
    848           collapse_addr_to_reg(e, &addr, in->loc);
    849         src = materialize(e, src, class_for_type(e, in->opnds[1].type),
    850                           in->opnds[1].type, addr_base_reg(&addr),
    851                           addr_index_reg(&addr), in->loc);
    852       }
    853       e->target->store(e->target, addr, src, in->extra.mem);
    854       return;
    855     case IR_ADDR_OF: {
    856       NativeLoc real = loc_from_operand(e, &in->opnds[0], in->loc);
    857       addr = addr_from_operand(e, &in->opnds[1], in->loc);
    858       dst = real;
    859       if (dst.kind != NATIVE_LOC_REG)
    860         dst = scratch_loc(e, in->opnds[0].type,
    861                           class_for_type(e, in->opnds[0].type), REG_NONE,
    862                           REG_NONE, in->loc);
    863       e->target->load_addr(e->target, dst, addr);
    864       if (real.kind != NATIVE_LOC_REG)
    865         write_loc(e, real, dst, mem_for_type(e->c, in->opnds[0].type), in->loc);
    866       return;
    867     }
    868     case IR_TLS_ADDR_OF: {
    869       IRTlsAux* aux = (IRTlsAux*)in->extra.aux;
    870       dst = loc_from_operand(e, &in->opnds[0], in->loc);
    871       if (dst.kind != NATIVE_LOC_REG)
    872         dst = materialize(e, dst, NATIVE_REG_INT, in->opnds[0].type, REG_NONE,
    873                           REG_NONE, in->loc);
    874       e->target->tls_addr_of(e->target, dst, aux->sym, aux->addend);
    875       return;
    876     }
    877     case IR_AGG_COPY: {
    878       IRAggAux* aux = (IRAggAux*)in->extra.aux;
    879       addr = pointer_addr_from_operand(e, &in->opnds[0], in->loc, REG_NONE,
    880                                        REG_NONE);
    881       addr2 = pointer_addr_from_operand(
    882           e, &in->opnds[1], in->loc,
    883           addr.base_kind == NATIVE_ADDR_BASE_REG ? addr.base.reg : REG_NONE,
    884           REG_NONE);
    885       e->target->copy_bytes(e->target, addr, addr2, aux->access);
    886       return;
    887     }
    888     case IR_AGG_SET: {
    889       IRAggAux* aux = (IRAggAux*)in->extra.aux;
    890       addr = pointer_addr_from_operand(e, &in->opnds[0], in->loc, REG_NONE,
    891                                        REG_NONE);
    892       src = loc_from_operand(e, &in->opnds[1], in->loc);
    893       if (src.kind != NATIVE_LOC_REG) {
    894         if (!scratch_available(e, NATIVE_REG_INT, addr_base_reg(&addr),
    895                                addr_index_reg(&addr)))
    896           collapse_addr_to_reg(e, &addr, in->loc);
    897         src = materialize(e, src, NATIVE_REG_INT, in->opnds[1].type,
    898                           addr_base_reg(&addr), addr_index_reg(&addr), in->loc);
    899       }
    900       e->target->set_bytes(e->target, addr, src, aux->access);
    901       return;
    902     }
    903     case IR_BITFIELD_LOAD: {
    904       IRBitFieldAux* aux = (IRBitFieldAux*)in->extra.aux;
    905       dst = loc_from_operand(e, &in->opnds[0], in->loc);
    906       addr = addr_from_operand(e, &in->opnds[1], in->loc);
    907       if (dst.kind != NATIVE_LOC_REG)
    908         dst = materialize(e, dst, class_for_type(e, in->opnds[0].type),
    909                           in->opnds[0].type, REG_NONE, REG_NONE, in->loc);
    910       e->target->bitfield_load(e->target, dst, addr, aux->access);
    911       return;
    912     }
    913     case IR_BITFIELD_STORE: {
    914       IRBitFieldAux* aux = (IRBitFieldAux*)in->extra.aux;
    915       addr = addr_from_operand(e, &in->opnds[0], in->loc);
    916       src = loc_from_operand(e, &in->opnds[1], in->loc);
    917       if (src.kind != NATIVE_LOC_REG)
    918         src = materialize(e, src, class_for_type(e, in->opnds[1].type),
    919                           in->opnds[1].type, REG_NONE, REG_NONE, in->loc);
    920       e->target->bitfield_store(e->target, addr, src, aux->access);
    921       return;
    922     }
    923     case IR_BINOP:
    924       dst = loc_from_operand(e, &in->opnds[0], in->loc);
    925       dst_reg = dst.kind == NATIVE_LOC_REG ? dst.v.reg : REG_NONE;
    926       b = loc_from_operand(e, &in->opnds[2], in->loc);
    927       a = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc),
    928                       class_for_type(e, in->opnds[1].type), in->opnds[1].type,
    929                       dst_reg, loc_avoid_reg(b), in->loc);
    930       b = operand_imm_or_reg(e, &in->opnds[2], NATIVE_IMM_BINOP,
    931                              (u32)in->extra.imm, a.v.reg, dst_reg, in->loc);
    932       if (dst.kind != NATIVE_LOC_REG)
    933         dst = scratch_loc(e, in->opnds[0].type,
    934                           class_for_type(e, in->opnds[0].type), a.v.reg,
    935                           loc_avoid_reg(b), in->loc);
    936       e->target->binop(e->target, (BinOp)in->extra.imm, dst, a, b);
    937       if (in->opnds[0].kind != OPK_REG)
    938         write_loc(e, loc_from_operand(e, &in->opnds[0], in->loc), dst,
    939                   mem_for_type(e->c, in->opnds[0].type), in->loc);
    940       return;
    941     case IR_UNOP:
    942       dst = loc_from_operand(e, &in->opnds[0], in->loc);
    943       dst_reg = dst.kind == NATIVE_LOC_REG ? dst.v.reg : REG_NONE;
    944       a = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc),
    945                       class_for_type(e, in->opnds[1].type), in->opnds[1].type,
    946                       dst_reg, REG_NONE, in->loc);
    947       if (dst.kind != NATIVE_LOC_REG)
    948         dst = scratch_loc(e, in->opnds[0].type,
    949                           class_for_type(e, in->opnds[0].type), a.v.reg,
    950                           REG_NONE, in->loc);
    951       e->target->unop(e->target, (UnOp)in->extra.imm, dst, a);
    952       if (in->opnds[0].kind != OPK_REG)
    953         write_loc(e, loc_from_operand(e, &in->opnds[0], in->loc), dst,
    954                   mem_for_type(e->c, in->opnds[0].type), in->loc);
    955       return;
    956     case IR_CMP:
    957       dst = loc_from_operand(e, &in->opnds[0], in->loc);
    958       dst_reg = dst.kind == NATIVE_LOC_REG ? dst.v.reg : REG_NONE;
    959       b = loc_from_operand(e, &in->opnds[2], in->loc);
    960       a = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc),
    961                       class_for_type(e, in->opnds[1].type), in->opnds[1].type,
    962                       dst_reg, loc_avoid_reg(b), in->loc);
    963       b = operand_imm_or_reg(e, &in->opnds[2], NATIVE_IMM_CMP,
    964                              (u32)in->extra.imm, a.v.reg, dst_reg, in->loc);
    965       if (dst.kind != NATIVE_LOC_REG)
    966         dst = scratch_loc(e, in->opnds[0].type,
    967                           class_for_type(e, in->opnds[0].type), a.v.reg,
    968                           loc_avoid_reg(b), in->loc);
    969       e->target->cmp(e->target, (CmpOp)in->extra.imm, dst, a, b);
    970       if (in->opnds[0].kind != OPK_REG)
    971         write_loc(e, loc_from_operand(e, &in->opnds[0], in->loc), dst,
    972                   mem_for_type(e->c, in->opnds[0].type), in->loc);
    973       return;
    974     case IR_CONVERT:
    975       dst = loc_from_operand(e, &in->opnds[0], in->loc);
    976       dst_reg = dst.kind == NATIVE_LOC_REG ? dst.v.reg : REG_NONE;
    977       src = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc),
    978                         class_for_type(e, in->opnds[1].type), in->opnds[1].type,
    979                         dst_reg, REG_NONE, in->loc);
    980       if (dst.kind != NATIVE_LOC_REG)
    981         dst = scratch_loc(e, in->opnds[0].type,
    982                           class_for_type(e, in->opnds[0].type), src.v.reg,
    983                           REG_NONE, in->loc);
    984       e->target->convert(e->target, (ConvKind)in->extra.imm, dst, src);
    985       if (in->opnds[0].kind != OPK_REG)
    986         write_loc(e, loc_from_operand(e, &in->opnds[0], in->loc), dst,
    987                   mem_for_type(e->c, in->opnds[0].type), in->loc);
    988       return;
    989     case IR_CALL:
    990       emit_call(e, in);
    991       return;
    992     case IR_BR:
    993       e->target->jump(e->target,
    994                       ensure_label(e, e->f->blocks[block].succ[0], in->loc));
    995       return;
    996     case IR_CMP_BRANCH: {
    997       u32 next = order_index + 1u < e->f->emit_order_n
    998                      ? e->f->emit_order[order_index + 1u]
    999                      : UINT32_MAX;
   1000       b = loc_from_operand(e, &in->opnds[1], in->loc);
   1001       a = materialize(e, loc_from_operand(e, &in->opnds[0], in->loc),
   1002                       class_for_type(e, in->opnds[0].type), in->opnds[0].type,
   1003                       REG_NONE, loc_avoid_reg(b), in->loc);
   1004       b = operand_imm_or_reg(e, &in->opnds[1], NATIVE_IMM_CMP,
   1005                              (u32)in->extra.imm, a.v.reg, REG_NONE, in->loc);
   1006       e->target->cmp_branch(
   1007           e->target, (CmpOp)in->extra.imm, a, b,
   1008           ensure_label(e, e->f->blocks[block].succ[0], in->loc));
   1009       if (e->f->blocks[block].nsucc > 1u && e->f->blocks[block].succ[1] != next)
   1010         e->target->jump(e->target,
   1011                         ensure_label(e, e->f->blocks[block].succ[1], in->loc));
   1012       return;
   1013     }
   1014     case IR_SWITCH: {
   1015       IRSwitchAux* aux = (IRSwitchAux*)in->extra.aux;
   1016       NativeLoc sel =
   1017           materialize(e, loc_from_operand(e, &in->opnds[0], in->loc),
   1018                       class_for_type(e, in->opnds[0].type), in->opnds[0].type,
   1019                       REG_NONE, REG_NONE, in->loc);
   1020       NativeLoc imm =
   1021           scratch_loc(e, in->opnds[0].type, (NativeAllocClass)sel.cls,
   1022                       sel.v.reg, REG_NONE, in->loc);
   1023       for (u32 i = 0; aux && i < aux->ncases; ++i) {
   1024         e->target->load_imm(e->target, imm, (i64)aux->cases[i].value);
   1025         e->target->cmp_branch(e->target, CMP_EQ, sel, imm,
   1026                               ensure_label(e, aux->cases[i].block, in->loc));
   1027       }
   1028       if (aux)
   1029         e->target->jump(e->target,
   1030                         ensure_label(e, aux->default_block, in->loc));
   1031       return;
   1032     }
   1033     case IR_INDIRECT_BRANCH: {
   1034       IRIndirectAux* aux = (IRIndirectAux*)in->extra.aux;
   1035       MCLabel* labels = aux && aux->ntargets
   1036                             ? arena_array(e->f->arena, MCLabel, aux->ntargets)
   1037                             : NULL;
   1038       for (u32 i = 0; aux && i < aux->ntargets; ++i)
   1039         labels[i] = ensure_label(e, aux->targets[i], in->loc);
   1040       src = materialize(e, loc_from_operand(e, &in->opnds[0], in->loc),
   1041                         NATIVE_REG_INT, in->opnds[0].type, REG_NONE, REG_NONE,
   1042                         in->loc);
   1043       e->target->indirect_branch(e->target, src, labels,
   1044                                  aux ? aux->ntargets : 0u);
   1045       return;
   1046     }
   1047     case IR_LOAD_LABEL_ADDR:
   1048       dst = loc_from_operand(e, &in->opnds[0], in->loc);
   1049       if (dst.kind != NATIVE_LOC_REG)
   1050         dst = materialize(e, dst, NATIVE_REG_INT, in->opnds[0].type, REG_NONE,
   1051                           REG_NONE, in->loc);
   1052       e->target->load_label_addr(e->target, dst,
   1053                                  ensure_label(e, (u32)in->extra.imm, in->loc));
   1054       return;
   1055     case IR_LOCAL_STATIC_DATA_BEGIN: {
   1056       CgIrLocalStaticBeginAux* aux = (CgIrLocalStaticBeginAux*)in->extra.aux;
   1057       emit_local_static_begin(e, aux ? &aux->desc : NULL, in->loc);
   1058       return;
   1059     }
   1060     case IR_LOCAL_STATIC_DATA_WRITE: {
   1061       CgIrLocalStaticWriteAux* aux = (CgIrLocalStaticWriteAux*)in->extra.aux;
   1062       if (!aux) emit_panic(e, in->loc, "missing local static data write");
   1063       emit_local_static_write(e, aux->has_data ? aux->data : NULL, aux->len,
   1064                               in->loc);
   1065       return;
   1066     }
   1067     case IR_LOCAL_STATIC_DATA_LABEL_ADDR: {
   1068       CgIrLocalStaticLabelAux* aux = (CgIrLocalStaticLabelAux*)in->extra.aux;
   1069       if (!aux) emit_panic(e, in->loc, "missing local static label data");
   1070       (void)aux->address_space;
   1071       emit_local_static_label_addr(e,
   1072                                    ensure_label(e, (u32)aux->target, in->loc),
   1073                                    aux->addend, aux->width, in->loc);
   1074       return;
   1075     }
   1076     case IR_LOCAL_STATIC_DATA_END:
   1077       emit_local_static_end(e, in->loc);
   1078       return;
   1079     case IR_RET:
   1080       emit_ret(e, in, fd);
   1081       return;
   1082     case IR_ALLOCA:
   1083       dst = loc_from_operand(e, &in->opnds[0], in->loc);
   1084       src = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc),
   1085                         NATIVE_REG_INT, in->opnds[1].type, REG_NONE, REG_NONE,
   1086                         in->loc);
   1087       if (dst.kind != NATIVE_LOC_REG)
   1088         dst = scratch_loc(e, in->opnds[0].type, NATIVE_REG_INT, src.v.reg,
   1089                           REG_NONE, in->loc);
   1090       e->target->alloca_(e->target, dst, src, (u32)in->extra.imm);
   1091       return;
   1092     case IR_ATOMIC_LOAD: {
   1093       IRAtomicAux* aux = (IRAtomicAux*)in->extra.aux;
   1094       dst = loc_from_operand(e, &in->opnds[0], in->loc);
   1095       addr = pointer_addr_from_operand(e, &in->opnds[1], in->loc, REG_NONE,
   1096                                        REG_NONE);
   1097       if (dst.kind != NATIVE_LOC_REG)
   1098         dst = scratch_loc(e, in->opnds[0].type,
   1099                           class_for_type(e, in->opnds[0].type), REG_NONE,
   1100                           REG_NONE, in->loc);
   1101       e->target->atomic_load(e->target, dst, addr, aux->mem, aux->mo);
   1102       if (in->opnds[0].kind != OPK_REG)
   1103         write_loc(e, loc_from_operand(e, &in->opnds[0], in->loc), dst, aux->mem,
   1104                   in->loc);
   1105       return;
   1106     }
   1107     case IR_ATOMIC_STORE: {
   1108       IRAtomicAux* aux = (IRAtomicAux*)in->extra.aux;
   1109       addr = pointer_addr_from_operand(e, &in->opnds[0], in->loc, REG_NONE,
   1110                                        REG_NONE);
   1111       src = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc),
   1112                         class_for_type(e, in->opnds[1].type), in->opnds[1].type,
   1113                         REG_NONE, REG_NONE, in->loc);
   1114       e->target->atomic_store(e->target, addr, src, aux->mem, aux->mo);
   1115       return;
   1116     }
   1117     case IR_ATOMIC_RMW: {
   1118       IRAtomicAux* aux = (IRAtomicAux*)in->extra.aux;
   1119       dst = loc_from_operand(e, &in->opnds[0], in->loc);
   1120       addr = pointer_addr_from_operand(e, &in->opnds[1], in->loc, REG_NONE,
   1121                                        REG_NONE);
   1122       src = materialize(e, loc_from_operand(e, &in->opnds[2], in->loc),
   1123                         class_for_type(e, in->opnds[2].type), in->opnds[2].type,
   1124                         REG_NONE, REG_NONE, in->loc);
   1125       if (dst.kind != NATIVE_LOC_REG)
   1126         dst = scratch_loc(e, in->opnds[0].type,
   1127                           class_for_type(e, in->opnds[0].type), src.v.reg,
   1128                           REG_NONE, in->loc);
   1129       e->target->atomic_rmw(e->target, (KitCgAtomicOp)aux->op, dst, addr, src,
   1130                             aux->mem, aux->mo);
   1131       if (in->opnds[0].kind != OPK_REG)
   1132         write_loc(e, loc_from_operand(e, &in->opnds[0], in->loc), dst, aux->mem,
   1133                   in->loc);
   1134       return;
   1135     }
   1136     case IR_ATOMIC_CAS: {
   1137       IRCasAux* aux = (IRCasAux*)in->extra.aux;
   1138       NativeLoc ok;
   1139       NativeLoc expected;
   1140       NativeLoc desired;
   1141       dst = loc_from_operand(e, &in->opnds[0], in->loc);
   1142       ok = loc_from_operand(e, &in->opnds[1], in->loc);
   1143       addr = pointer_addr_from_operand(e, &in->opnds[2], in->loc, REG_NONE,
   1144                                        REG_NONE);
   1145       expected = materialize(e, loc_from_operand(e, &in->opnds[3], in->loc),
   1146                              class_for_type(e, in->opnds[3].type),
   1147                              in->opnds[3].type, REG_NONE, REG_NONE, in->loc);
   1148       desired =
   1149           materialize(e, loc_from_operand(e, &in->opnds[4], in->loc),
   1150                       class_for_type(e, in->opnds[4].type), in->opnds[4].type,
   1151                       expected.v.reg, REG_NONE, in->loc);
   1152       if (dst.kind != NATIVE_LOC_REG)
   1153         dst = scratch_loc(e, in->opnds[0].type,
   1154                           class_for_type(e, in->opnds[0].type), expected.v.reg,
   1155                           desired.v.reg, in->loc);
   1156       if (ok.kind != NATIVE_LOC_REG)
   1157         ok = scratch_loc(e, in->opnds[1].type,
   1158                          class_for_type(e, in->opnds[1].type), dst.v.reg,
   1159                          expected.v.reg, in->loc);
   1160       e->target->atomic_cas(e->target, dst, ok, addr, expected, desired,
   1161                             aux->mem, aux->success, aux->failure);
   1162       if (in->opnds[0].kind != OPK_REG)
   1163         write_loc(e, loc_from_operand(e, &in->opnds[0], in->loc), dst, aux->mem,
   1164                   in->loc);
   1165       if (in->opnds[1].kind != OPK_REG)
   1166         write_loc(e, loc_from_operand(e, &in->opnds[1], in->loc), ok,
   1167                   mem_for_type(e->c, in->opnds[1].type), in->loc);
   1168       return;
   1169     }
   1170     case IR_VA_START: {
   1171       NativeLoc ap = materialize(e, loc_from_operand(e, &in->opnds[0], in->loc),
   1172                                  NATIVE_REG_INT, in->opnds[0].type, REG_NONE,
   1173                                  REG_NONE, in->loc);
   1174       e->target->va_start_(e->target, ap);
   1175       return;
   1176     }
   1177     case IR_VA_END: {
   1178       NativeLoc ap = materialize(e, loc_from_operand(e, &in->opnds[0], in->loc),
   1179                                  NATIVE_REG_INT, in->opnds[0].type, REG_NONE,
   1180                                  REG_NONE, in->loc);
   1181       e->target->va_end_(e->target, ap);
   1182       return;
   1183     }
   1184     case IR_VA_COPY: {
   1185       NativeLoc d = materialize(e, loc_from_operand(e, &in->opnds[0], in->loc),
   1186                                 NATIVE_REG_INT, in->opnds[0].type, REG_NONE,
   1187                                 REG_NONE, in->loc);
   1188       NativeLoc s = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc),
   1189                                 NATIVE_REG_INT, in->opnds[1].type, d.v.reg,
   1190                                 REG_NONE, in->loc);
   1191       e->target->va_copy_(e->target, d, s);
   1192       return;
   1193     }
   1194     case IR_VA_ARG: {
   1195       KitCgTypeId ty = in->opnds[0].type;
   1196       NativeLoc ap = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc),
   1197                                  NATIVE_REG_INT, in->opnds[1].type, REG_NONE,
   1198                                  REG_NONE, in->loc);
   1199       NativeLoc res;
   1200       if (type_is_aggregate_or_large(e, ty)) {
   1201         /* A value too wide for one register (an 8-byte i64/double on a 32-bit
   1202          * target, or an aggregate) can't pass through a scratch register; hand
   1203          * the target its memory destination so it can copy the value directly.
   1204          */
   1205         e->target->va_arg_(e->target, loc_from_operand(e, &in->opnds[0],
   1206                                                        in->loc),
   1207                            ap, ty);
   1208         return;
   1209       }
   1210       /* The result must land in a register distinct from the va_list pointer;
   1211        * fetch into a scratch register, then write to the real destination. */
   1212       res = scratch_loc(e, ty, class_for_type(e, ty), ap.v.reg, REG_NONE,
   1213                         in->loc);
   1214       e->target->va_arg_(e->target, res, ap, ty);
   1215       write_loc(e, loc_from_operand(e, &in->opnds[0], in->loc), res,
   1216                 mem_for_type(e->c, ty), in->loc);
   1217       return;
   1218     }
   1219     case IR_ASM_BLOCK: {
   1220       IRAsmAux* aux = (IRAsmAux*)in->extra.aux;
   1221       NativeLoc* out_locs = aux && aux->nout
   1222                                 ? arena_array(e->f->arena, NativeLoc, aux->nout)
   1223                                 : NULL;
   1224       NativeLoc* in_locs = aux && aux->nin
   1225                                ? arena_array(e->f->arena, NativeLoc, aux->nin)
   1226                                : NULL;
   1227       /* The optimizer has already allocated registers for the asm operands and
   1228        * placed the input values / consumes the output values through the normal
   1229        * use/def data flow. We only convert each operand to its NativeLoc; the
   1230        * NativeTarget hook binds the pre-allocated registers to the template and
   1231        * saves/restores any callee-saved registers the asm clobbers. */
   1232       for (u32 i = 0; aux && i < aux->nout; ++i)
   1233         out_locs[i] = loc_from_operand(e, &aux->out_ops[i], in->loc);
   1234       for (u32 i = 0; aux && i < aux->nin; ++i)
   1235         in_locs[i] = loc_from_operand(e, &aux->in_ops[i], in->loc);
   1236       e->target->asm_block(e->target, aux ? aux->tmpl : "",
   1237                            aux ? aux->outs : NULL, aux ? aux->nout : 0,
   1238                            out_locs, aux ? aux->ins : NULL, aux ? aux->nin : 0,
   1239                            in_locs, aux ? aux->clobbers : NULL,
   1240                            aux ? aux->nclob : 0);
   1241       return;
   1242     }
   1243     case IR_BREAK_TO:
   1244     case IR_CONTINUE_TO:
   1245       emit_panic(e, in->loc, "operation is not wired to NativeTarget yet");
   1246     case IR_FENCE:
   1247       e->target->fence(e->target, (KitCgMemOrder)in->extra.imm);
   1248       return;
   1249     case IR_UNREACHABLE:
   1250       e->target->trap(e->target);
   1251       return;
   1252     case IR_INTRINSIC: {
   1253       IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux;
   1254       NativeLoc* dsts = aux && aux->ndst
   1255                             ? arena_array(e->f->arena, NativeLoc, aux->ndst)
   1256                             : NULL;
   1257       NativeLoc* args = aux && aux->narg
   1258                             ? arena_array(e->f->arena, NativeLoc, aux->narg)
   1259                             : NULL;
   1260       for (u32 i = 0; aux && i < aux->ndst; ++i)
   1261         dsts[i] = loc_from_operand(e, &aux->dsts[i], in->loc);
   1262       for (u32 i = 0; aux && i < aux->narg; ++i) {
   1263         if (aux->args[i].kind == OPK_IMM) {
   1264           args[i] = loc_from_operand(e, &aux->args[i], in->loc);
   1265         } else {
   1266           args[i] = materialize(e, loc_from_operand(e, &aux->args[i], in->loc),
   1267                                 class_for_type(e, aux->args[i].type),
   1268                                 aux->args[i].type, REG_NONE, REG_NONE, in->loc);
   1269         }
   1270       }
   1271       e->target->intrinsic(e->target, aux->kind, dsts, aux->ndst, args,
   1272                            aux->narg);
   1273       return;
   1274     }
   1275     default:
   1276       emit_panic(e, in->loc, "unknown IR op");
   1277   }
   1278 }
   1279 
   1280 static int native_emit_terminates(const Inst* in) {
   1281   if (!in) return 0;
   1282   switch ((IROp)in->op) {
   1283     case IR_BR:
   1284     case IR_CONDBR:
   1285     case IR_CMP_BRANCH:
   1286     case IR_SWITCH:
   1287     case IR_INDIRECT_BRANCH:
   1288     case IR_RET:
   1289     case IR_UNREACHABLE:
   1290     case IR_BREAK_TO:
   1291     case IR_CONTINUE_TO:
   1292       return 1;
   1293     case IR_INTRINSIC: {
   1294       IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux;
   1295       return aux && (aux->kind == INTRIN_LONGJMP || aux->kind == INTRIN_TRAP);
   1296     }
   1297     default:
   1298       return 0;
   1299   }
   1300 }
   1301 
   1302 static void emit_block(NativeEmitCtx* e, u32 block, u32 order_index,
   1303                        const CGFuncDesc* fd) {
   1304   if (block >= e->f->nblocks) return;
   1305   if (!e->label_placed[block]) {
   1306     e->label_placed[block] = 1u;
   1307     e->target->label_place(e->target,
   1308                            ensure_label(e, block, (SrcLoc){0, 0, 0}));
   1309   }
   1310   Block* bl = &e->f->blocks[block];
   1311   int is_last_block = order_index + 1u == e->f->emit_order_n;
   1312   for (u32 i = 0; i < bl->ninsts; ++i) {
   1313     e->emitting_terminal_ret = is_last_block && i + 1u == bl->ninsts &&
   1314                                (IROp)bl->insts[i].op == IR_RET;
   1315     emit_inst(e, block, order_index, &bl->insts[i], fd);
   1316   }
   1317   e->emitting_terminal_ret = 0;
   1318   if (bl->nsucc == 1u &&
   1319       (bl->ninsts == 0 ||
   1320        !native_emit_terminates(&bl->insts[bl->ninsts - 1u]))) {
   1321     u32 next = order_index + 1u < e->f->emit_order_n
   1322                    ? e->f->emit_order[order_index + 1u]
   1323                    : UINT32_MAX;
   1324     if (bl->succ[0] != next)
   1325       e->target->jump(e->target,
   1326                       ensure_label(e, bl->succ[0], (SrcLoc){0, 0, 0}));
   1327   }
   1328 }
   1329 
   1330 #define EMIT_MAX_REG_CLASSES 4u
   1331 
   1332 static void collect_used_reg(Func* f, Inst* in, OptOperand* op, int is_def,
   1333                              void* ctx) {
   1334   u32* used = (u32*)ctx;
   1335   (void)f;
   1336   (void)in;
   1337   (void)is_def;
   1338   if (op && op->kind == OPT_OPK_REG && op->cls < EMIT_MAX_REG_CLASSES &&
   1339       op->v.reg < 32u)
   1340     used[op->cls] |= 1u << op->v.reg;
   1341 }
   1342 
   1343 /* After register allocation the MIR names hard registers directly, so we scan
   1344  * it for the callee-saved registers the allocator assigned. Fills `used[cls]`
   1345  * (one bitmask per alloc class, masked to each class's callee-saved set) and
   1346  * returns the class count. The masks feed NativeKnownFrameDesc so the backend
   1347  * reserves the save slots as part of the up-front frame. */
   1348 static u32 compute_callee_saved_used(NativeEmitCtx* e, u32* used, u32 cap) {
   1349   NativeTarget* t = e->target;
   1350   const NativeRegInfo* ri = t->regs;
   1351   u32 nclasses;
   1352   for (u32 i = 0; i < cap; ++i) used[i] = 0;
   1353   if (!ri) return 0;
   1354   for (u32 b = 0; b < e->f->nblocks; ++b) {
   1355     Block* bl = &e->f->blocks[b];
   1356     for (u32 i = 0; i < bl->ninsts; ++i)
   1357       opt_walk_inst_operands(e->f, &bl->insts[i], collect_used_reg, used);
   1358   }
   1359   nclasses = ri->nclasses < cap ? ri->nclasses : cap;
   1360   for (u32 i = 0; i < ri->nclasses; ++i) {
   1361     const NativeAllocClassInfo* ci = &ri->classes[i];
   1362     if (ci->cls < cap)
   1363       used[ci->cls] &=
   1364           native_target_callee_saved_mask(t, (NativeAllocClass)ci->cls);
   1365   }
   1366   return nclasses;
   1367 }
   1368 
   1369 /* Plan the complete call frame before any code is emitted, then hand it to the
   1370  * backend via func_begin_known_frame so the prologue is emitted final. The
   1371  * optimizer knows everything the frame needs after register allocation and MIR
   1372  * lowering: the callee-saved set (scanned from the MIR), every static frame
   1373  * slot (f->frame_slots), and the outgoing-arg area (the max over all calls of
   1374  * the pure call_stack_bytes query). The body therefore allocates no slots, so
   1375  * the frame is final up front and nothing is back-patched. Populates
   1376  * e->slot_map from the backend-assigned slot handles for the body to use. */
   1377 static void plan_frame(NativeEmitCtx* e, const CGFuncDesc* fd) {
   1378   NativeTarget* t = e->target;
   1379   NativeKnownFrameDesc frame;
   1380   NativeFrameSlotDesc* slots = NULL;
   1381   NativeFrameSlot* out_slots = NULL;
   1382   u32 used[EMIT_MAX_REG_CLASSES];
   1383   u32 nclasses;
   1384   u32 max_args = 0, max_outgoing = 0;
   1385   u8 has_alloca = 0;
   1386   u8 needs_scratch_spill = 0;
   1387   u8 has_call = 0;
   1388   u8 has_asm = 0;
   1389   u8 reads_frame = 0;
   1390   u32 nasm_clob = 0;
   1391   u32 asm_clobber_abi_sets = 0;
   1392   Sym* asm_clobbers = NULL;
   1393   memset(&frame, 0, sizeof frame);
   1394   nclasses = t->reserve_callee_saves
   1395                  ? compute_callee_saved_used(e, used, EMIT_MAX_REG_CLASSES)
   1396                  : 0u;
   1397   /* Outgoing-arg area = max stack-arg bytes over all calls; also note alloca.
   1398    */
   1399   for (u32 b = 0; b < e->f->nblocks; ++b) {
   1400     Block* bl = &e->f->blocks[b];
   1401     for (u32 i = 0; i < bl->ninsts; ++i) {
   1402       Inst* in = &bl->insts[i];
   1403       if ((IROp)in->op == IR_ALLOCA) {
   1404         has_alloca = 1;
   1405       } else if ((IROp)in->op == IR_ATOMIC_RMW) {
   1406         needs_scratch_spill = 1;
   1407       } else if ((IROp)in->op == IR_CALL) {
   1408         IRCallAux* aux = (IRCallAux*)in->extra.aux;
   1409         /* Any call (regular or sibling/tail) means the function is not a leaf:
   1410          * it clobbers the return-address register and the stack below sp. */
   1411         has_call = 1;
   1412         if (aux && aux->desc.nargs > max_args) max_args = aux->desc.nargs;
   1413       } else if ((IROp)in->op == IR_ASM_BLOCK) {
   1414         /* Inline asm may clobber the return-address register or the red zone
   1415          * opaquely; disqualifies the frame-eliding tiers (see has_asm). Its
   1416          * callee-saved register clobbers and hard-register operand pins are
   1417          * equally opaque to the operand scan below; count them now so the
   1418          * backend can fold them into the saved set (collected into a single Sym
   1419          * list in a second pass below). */
   1420         IRAsmAux* aux = (IRAsmAux*)in->extra.aux;
   1421         has_asm = 1;
   1422         if (aux) {
   1423           nasm_clob += aux->nclob;
   1424           for (u32 k = 0; k < aux->nout; ++k)
   1425             if (aux->outs[k].reg) ++nasm_clob;
   1426           for (u32 k = 0; k < aux->nin; ++k)
   1427             if (aux->ins[k].reg) ++nasm_clob;
   1428           asm_clobber_abi_sets |= aux->clobber_abi_sets;
   1429         }
   1430       } else if ((IROp)in->op == IR_INTRINSIC) {
   1431         /* __builtin_frame_address / __builtin_return_address read the frame
   1432          * record, so the function must keep one (disables the rv64 frameless
   1433          * leaf tier; see NativeKnownFrameDesc.reads_frame). */
   1434         IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux;
   1435         if (aux && (aux->kind == INTRIN_FRAME_ADDRESS ||
   1436                     aux->kind == INTRIN_RETURN_ADDRESS))
   1437           reads_frame = 1;
   1438       }
   1439     }
   1440   }
   1441   /* Gather the union of every asm block's clobber names and hard-register
   1442    * operand pins. The backend resolves them with its own clobber parser
   1443    * (machinize's resolve_name is unset on every backend, so aux->clobber_mask is
   1444    * unreliable here). */
   1445   if (nasm_clob) {
   1446     u32 n = 0;
   1447     asm_clobbers = arena_array(e->f->arena, Sym, nasm_clob);
   1448     for (u32 b = 0; b < e->f->nblocks; ++b) {
   1449       Block* bl = &e->f->blocks[b];
   1450       for (u32 i = 0; i < bl->ninsts; ++i) {
   1451         Inst* in = &bl->insts[i];
   1452         IRAsmAux* aux;
   1453         if ((IROp)in->op != IR_ASM_BLOCK) continue;
   1454         aux = (IRAsmAux*)in->extra.aux;
   1455         for (u32 k = 0; aux && k < aux->nclob; ++k)
   1456           asm_clobbers[n++] = aux->clobbers[k];
   1457         for (u32 k = 0; aux && k < aux->nout; ++k)
   1458           if (aux->outs[k].reg) asm_clobbers[n++] = aux->outs[k].reg;
   1459         for (u32 k = 0; aux && k < aux->nin; ++k)
   1460           if (aux->ins[k].reg) asm_clobbers[n++] = aux->ins[k].reg;
   1461       }
   1462     }
   1463     nasm_clob = n;
   1464   }
   1465   if (t->call_stack_bytes) {
   1466     NativeLoc* args =
   1467         max_args ? arena_zarray(e->f->arena, NativeLoc, max_args) : NULL;
   1468     for (u32 b = 0; b < e->f->nblocks; ++b) {
   1469       Block* bl = &e->f->blocks[b];
   1470       for (u32 i = 0; i < bl->ninsts; ++i) {
   1471         Inst* in = &bl->insts[i];
   1472         IRCallAux* aux;
   1473         NativeCallDesc d;
   1474         u32 sb;
   1475         if ((IROp)in->op != IR_CALL) continue;
   1476         aux = (IRCallAux*)in->extra.aux;
   1477         if (!aux) continue;
   1478         memset(&d, 0, sizeof d);
   1479         d.fn_type = aux->desc.fn_type;
   1480         d.flags = aux->desc.flags;
   1481         d.nargs = aux->desc.nargs;
   1482         for (u32 k = 0; k < aux->desc.nargs; ++k) {
   1483           memset(&args[k], 0, sizeof args[k]);
   1484           args[k].type = aux->desc.args[k].type;
   1485         }
   1486         d.args = args;
   1487         sb = t->call_stack_bytes(t, &d);
   1488         if (sb > max_outgoing) max_outgoing = sb;
   1489       }
   1490     }
   1491   }
   1492   e->slot_map =
   1493       arena_zarray(e->f->arena, NativeFrameSlot, e->f->nframe_slots + 1u);
   1494   if (e->f->nframe_slots) {
   1495     slots = arena_zarray(e->f->arena, NativeFrameSlotDesc, e->f->nframe_slots);
   1496     out_slots = arena_zarray(e->f->arena, NativeFrameSlot, e->f->nframe_slots);
   1497     for (u32 i = 0; i < e->f->nframe_slots; ++i) {
   1498       IRFrameSlot* s = &e->f->frame_slots[i];
   1499       NativeFrameSlotDesc* d = &slots[i];
   1500       memset(d, 0, sizeof *d);
   1501       d->type = s->type;
   1502       d->name = s->name;
   1503       d->loc = s->loc;
   1504       d->size = s->size;
   1505       d->align = s->align;
   1506       d->kind = s->kind;
   1507       d->flags = s->flags;
   1508     }
   1509   }
   1510   frame.slots = slots;
   1511   frame.nslots = e->f->nframe_slots;
   1512   frame.max_outgoing = max_outgoing;
   1513   frame.callee_saved_used = nclasses ? used : NULL;
   1514   frame.ncallee_classes = nclasses;
   1515   frame.has_alloca = has_alloca;
   1516   frame.needs_scratch_spill = needs_scratch_spill;
   1517   frame.is_leaf = !has_call;
   1518   frame.has_asm = has_asm;
   1519   frame.reads_frame = reads_frame;
   1520   frame.asm_clobbers = asm_clobbers;
   1521   frame.nasm_clobbers = nasm_clob;
   1522   frame.asm_clobber_abi_sets = asm_clobber_abi_sets;
   1523   t->func_begin_known_frame(t, fd, &frame, out_slots);
   1524   for (u32 i = 0; i < e->f->nframe_slots; ++i)
   1525     e->slot_map[e->f->frame_slots[i].id] = out_slots[i];
   1526 }
   1527 
   1528 void opt_emit_native(Compiler* c, Func* f, NativeTarget* target) {
   1529   NativeEmitCtx e;
   1530   Func view;
   1531   CGFuncDesc fd;
   1532   if (!f || !target) return;
   1533   memset(&e, 0, sizeof e);
   1534   if (f->mir) {
   1535     view = *f;
   1536     view.blocks = f->mir->blocks;
   1537     view.nblocks = f->mir->nblocks;
   1538     view.entry = f->mir->entry;
   1539     view.emit_order = f->mir->emit_order;
   1540     view.emit_order_n = f->mir->emit_order_n;
   1541     view.emit_order_cap = f->mir->emit_order_cap;
   1542     view.opt_rewritten = 1;
   1543     view.mir = NULL;
   1544     e.f = &view;
   1545   } else {
   1546     e.f = f;
   1547   }
   1548   e.c = c;
   1549   e.target = target;
   1550   metrics_scope_begin(c, "opt.native_emit.setup");
   1551   e.labels = arena_array(e.f->arena, MCLabel, e.f->nblocks ? e.f->nblocks : 1u);
   1552   e.label_placed =
   1553       arena_zarray(e.f->arena, u8, e.f->nblocks ? e.f->nblocks : 1u);
   1554   for (u32 i = 0; i < e.f->nblocks; ++i) e.labels[i] = MC_LABEL_NONE;
   1555   fd = semantic_func_desc(&e);
   1556   metrics_scope_end(c, "opt.native_emit.setup");
   1557 
   1558   metrics_scope_begin(c, "opt.native_emit.func_begin");
   1559   /* The optimizer has the whole frame after regalloc + MIR lowering, so it
   1560    * plans it up front (plan_frame) and drives func_begin_known_frame: the
   1561    * backend emits a final prologue with no reserved NOP region and no
   1562    * back-patching. The body allocates no frame slots, so the frame stays final;
   1563    * allocas and tail epilogues are emitted final too. (Contrast the
   1564    * single-pass NativeDirectTarget path, which reserves and patches.) */
   1565   plan_frame(&e, &fd);
   1566   bind_params(&e);
   1567   metrics_scope_end(c, "opt.native_emit.func_begin");
   1568 
   1569   metrics_scope_begin(c, "opt.native_emit.body");
   1570   for (u32 i = 0; i < e.f->emit_order_n; ++i)
   1571     emit_block(&e, e.f->emit_order[i], i, &fd);
   1572   metrics_scope_end(c, "opt.native_emit.body");
   1573 
   1574   metrics_scope_begin(c, "opt.native_emit.func_end");
   1575   target->func_end(target);
   1576   metrics_scope_end(c, "opt.native_emit.func_end");
   1577 }