kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

cg_ir_lower.c (44612B)


      1 #include <string.h>
      2 
      3 #include "cg/ir.h"
      4 #include "cg/type.h"
      5 #include "opt/opt_internal.h"
      6 
      7 #undef Operand
      8 #undef CGParamDesc
      9 #undef CGCallDesc
     10 #undef CGFuncDesc
     11 #undef CGLocalStorage
     12 #undef FrameSlotDesc
     13 
/* Lowering-time record for one semantic (CG IR) local. Entries live in
 * CgIrLower.locals and are indexed by `id - 1` (see local_map). */
typedef struct OptLocalMap {
  OptCGLocalStorage storage; /* REG or FRAME storage picked by lower_locals */
  NativeFrameSlot home_slot; /* frame home, or FRAME_SLOT_NONE if none */
  KitCgTypeId type;          /* copied from the source local's desc.type */
  u32 size;                  /* copied from desc.size */
  u32 align;                 /* copied from desc.align */
  u8 cls;                    /* register class from local_reg_class */
  u8 address_taken;          /* nonzero: values are read via the frame home */
  u8 pad[2];
} OptLocalMap;
     24 
/* Lowering state for one function.
 *
 * mat_local/mat_role/mat_reg (nmat live entries) are a per-instruction record
 * of FRAME-storage locals that were materialized into a fresh PReg so they can
 * serve as the base or index of an indirect address (see
 * materialize_frame_base, materialize_frame_index and
 * resolve_materialized_reg). prematerialize_indirect_bases resets nmat to 0
 * for each instruction it scans; remember_materialized_reg panics once
 * CG_IR_LOWER_MAX_MAT entries are in use. */
#define CG_IR_LOWER_MAX_MAT 8u
typedef struct CgIrLower {
  Compiler* c;         /* compiler context (target info, panics) */
  const CgIrFunc* src; /* CG IR function being lowered */
  Func* f;             /* optimizer IR function being built */
  OptLocalMap* locals; /* nlocals entries, indexed by local id - 1 */
  u32 nlocals;
  u32* label_block;    /* label id -> block id; UINT32_MAX when unmapped */
  u32 nlabels;
  u32* inst_block;     /* source instruction index -> block id */
  u8* leader;          /* per source instruction: nonzero starts a block */
  CGLocal mat_local[CG_IR_LOWER_MAX_MAT];
  u8 mat_role[CG_IR_LOWER_MAX_MAT]; /* CgIrMatRole stored as u8 */
  Reg mat_reg[CG_IR_LOWER_MAX_MAT];
  u32 nmat;
} CgIrLower;
     44 
/* Role a materialized local plays inside an OPK_INDIRECT operand. The same
 * local may be recorded once per role (lookups match on local AND role). */
typedef enum CgIrMatRole {
  CG_IR_MAT_BASE = 0,  /* used as v.ind.base */
  CG_IR_MAT_INDEX = 1, /* used as v.ind.index */
} CgIrMatRole;
     49 
     50 static _Noreturn void lower_panic(CgIrLower* l, SrcLoc loc, const char* msg) {
     51   compiler_panic(l->c, loc, "opt cg-ir lower: %s", msg);
     52 }
     53 
     54 static u8 local_reg_class(Compiler* c, KitCgTypeId ty) {
     55   return opt_value_reg_class(c, ty);
     56 }
     57 
     58 static OptCGFuncDesc lower_func_desc(Arena* a, const struct CGFuncDesc* in) {
     59   OptCGFuncDesc out;
     60   memset(&out, 0, sizeof out);
     61   if (!in) return out;
     62   out.sym = in->sym;
     63   out.text_section_id = in->text_section_id;
     64   out.group_id = in->group_id;
     65   out.fn_type = in->fn_type;
     66   out.result_type = in->result_type;
     67   out.nparams = in->nparams;
     68   out.loc = in->loc;
     69   out.flags = in->flags;
     70   out.inline_policy = in->inline_policy;
     71   out.atomize = in->atomize;
     72   if (in->nparams && in->params) {
     73     OptCGParamDesc* params = arena_zarray(a, OptCGParamDesc, in->nparams);
     74     for (u32 i = 0; i < in->nparams; ++i) {
     75       params[i].index = in->params[i].index;
     76       params[i].name = in->params[i].name;
     77       params[i].type = in->params[i].type;
     78       params[i].size = in->params[i].size;
     79       params[i].align = in->params[i].align;
     80       params[i].flags = in->params[i].flags;
     81       params[i].loc = in->params[i].loc;
     82     }
     83     out.params = params;
     84   }
     85   return out;
     86 }
     87 
     88 static NativeFrameSlotDesc local_slot_desc(const CgIrLocal* in, u8 kind) {
     89   NativeFrameSlotDesc out;
     90   memset(&out, 0, sizeof out);
     91   out.type = in->desc.type;
     92   out.name = in->desc.name;
     93   out.loc = in->desc.loc;
     94   out.size = in->desc.size;
     95   out.align = in->desc.align;
     96   out.kind = kind;
     97   if (in->address_taken || (in->desc.flags & CG_LOCAL_ADDR_TAKEN))
     98     out.flags |= FSF_ADDR_TAKEN;
     99   if (in->desc.flags & CG_LOCAL_MEMORY_REQUIRED)
    100     out.flags |= FSF_MEMORY_REQUIRED;
    101   return out;
    102 }
    103 
    104 static OptLocalMap* local_map(CgIrLower* l, CGLocal id, SrcLoc loc) {
    105   if (id == CG_LOCAL_NONE || id > l->nlocals)
    106     lower_panic(l, loc, "bad semantic local");
    107   return &l->locals[id - 1u];
    108 }
    109 
    110 static int local_needs_home(const CgIrLocal* in) {
    111   return in->address_taken ||
    112          (in->desc.flags & (CG_LOCAL_ADDR_TAKEN | CG_LOCAL_MEMORY_REQUIRED));
    113 }
    114 
    115 static int operand_uses_local_addr(const Operand* op, CGLocal local) {
    116   if (!op) return 0;
    117   if (op->kind == OPK_LOCAL) return op->v.local == local;
    118   return 0;
    119 }
    120 
    121 /* AGG_COPY/AGG_SET take their dest/src as *pointer values* to the aggregate —
    122  * the emitter derefs an OPK_LOCAL pointer operand via pointer_addr_from_operand
    123  * (it loads the pointer; it does not address the local's own slot). So a
    124  * pointer-typed local operand of an aggregate op uses the local's VALUE, not
    125  * its address, and must not force the local to a frame home. Only a non-pointer
    126  * operand (the aggregate-typed local itself) genuinely addresses its storage.
    127  * (STORE/LOAD/ADDR_OF use addr_from_operand, where an OPK_LOCAL always
    128  * addresses the slot, so they keep operand_uses_local_addr.) */
    129 static int operand_uses_local_agg_addr(Compiler* c, const Operand* op,
    130                                        CGLocal local) {
    131   if (!op || op->kind != OPK_LOCAL || op->v.local != local) return 0;
    132   return !cg_type_is_ptr(c, op->type);
    133 }
    134 
    135 static int local_address_used_in_cg_ir(Compiler* c, const CgIrFunc* f,
    136                                        CGLocal local) {
    137   for (u32 i = 0; i < f->ninsts; ++i) {
    138     const CgIrInst* in = &f->insts[i];
    139     switch ((CgIrOp)in->op) {
    140       case CG_IR_LOAD:
    141       case CG_IR_BITFIELD_LOAD:
    142         if (in->nopnds > 1u && operand_uses_local_addr(&in->opnds[1], local))
    143           return 1;
    144         break;
    145       case CG_IR_STORE:
    146       case CG_IR_BITFIELD_STORE:
    147         if (in->nopnds > 0u && operand_uses_local_addr(&in->opnds[0], local))
    148           return 1;
    149         break;
    150       case CG_IR_AGG_SET:
    151         if (in->nopnds > 0u &&
    152             operand_uses_local_agg_addr(c, &in->opnds[0], local))
    153           return 1;
    154         break;
    155       case CG_IR_ADDR_OF:
    156         if (in->nopnds > 1u && operand_uses_local_addr(&in->opnds[1], local))
    157           return 1;
    158         break;
    159       case CG_IR_AGG_COPY:
    160         if ((in->nopnds > 0u &&
    161              operand_uses_local_agg_addr(c, &in->opnds[0], local)) ||
    162             (in->nopnds > 1u &&
    163              operand_uses_local_agg_addr(c, &in->opnds[1], local)))
    164           return 1;
    165         break;
    166       /* VA_START/VA_ARG/VA_END/VA_COPY consume a pointer *value* (the address
    167        * of the va_list, produced by an earlier ADDR_OF); they do not take the
    168        * address of their pointer operand, so they must not force it to a frame
    169        * slot. */
    170       default:
    171         break;
    172     }
    173   }
    174   return 0;
    175 }
    176 
    177 static void lower_locals(CgIrLower* l) {
    178   l->nlocals = l->src->nlocals;
    179   l->locals =
    180       arena_zarray(l->f->arena, OptLocalMap, l->nlocals ? l->nlocals : 1u);
    181   for (u32 i = 0; i < l->src->nlocals; ++i) {
    182     const CgIrLocal* in = &l->src->locals[i];
    183     OptLocalMap* m;
    184     if (in->id == CG_LOCAL_NONE || in->id > l->src->nlocals)
    185       lower_panic(l, in->desc.loc, "non-dense semantic local table");
    186     m = &l->locals[in->id - 1u];
    187     m->type = in->desc.type;
    188     m->size = in->desc.size;
    189     m->align = in->desc.align;
    190     m->cls = local_reg_class(l->c, in->desc.type);
    191     /* Aggregates and oversized scalars cannot live in a single PReg; they need
    192      * a memory home regardless of whether their address is taken. "Oversized"
    193      * is wider than the machine word (ptr_size): 8 on rv64/x64/aa64, 4 on rv32
    194      * — so an 8-byte i64/double on rv32 is homed in memory like an i128 is on a
    195      * 64-bit target (the cg layer also flags these CG_LOCAL_MEMORY_REQUIRED). */
    196     m->address_taken = local_needs_home(in) ||
    197                        local_address_used_in_cg_ir(l->c, l->src, in->id) ||
    198                        cg_type_is_aggregate(l->c, in->desc.type) ||
    199                        cg_type_size(l->c, in->desc.type) >
    200                            (u64)l->c->target.ptr_size;
    201 
    202     PReg r = ir_alloc_preg(l->f, in->desc.type, m->cls);
    203     if (m->address_taken) {
    204       m->storage.kind = CG_LOCAL_STORAGE_FRAME;
    205     } else {
    206       m->storage.kind = CG_LOCAL_STORAGE_REG;
    207       m->storage.v.reg = (Reg)r;
    208     }
    209 
    210     if (m->address_taken) {
    211       NativeFrameSlotDesc fsd =
    212           local_slot_desc(in, in->is_param ? FS_PARAM : FS_LOCAL);
    213       m->home_slot = ir_frame_slot_new(l->f, &fsd);
    214       m->storage.v.frame_slot = m->home_slot;
    215     } else {
    216       m->home_slot = FRAME_SLOT_NONE;
    217     }
    218     (void)ir_local_add(l->f, &in->desc, m->storage);
    219     l->f->locals[l->f->nlocals - 1u].address_taken = m->address_taken;
    220     l->f->locals[l->f->nlocals - 1u].home_slot = m->home_slot;
    221   }
    222 }
    223 
    224 static const CgIrParam* find_param(const CgIrFunc* f, CGLocal local) {
    225   for (u32 i = 0; i < f->nparams; ++i)
    226     if (f->params[i].local == local) return &f->params[i];
    227   return NULL;
    228 }
    229 
    230 static void lower_params(CgIrLower* l) {
    231   /* Resolve the function-level ABI info once so we can attach per-param
    232    * ABIArgInfo to each IRParam. Consumers (set_preg_pref_for_params, the
    233    * native bind_param emit path) read p->abi without going through
    234    * f->desc.abi, so this stays scoped to the param plumbing and does not
    235    * activate the dormant f->desc.abi-gated passes (e.g.
    236    * apply_param_incoming_register_hazards, opt_verify_alloc's incoming
    237    * check), which have known issues with tail-call shuffles. */
    238   const ABIFuncInfo* fi = NULL;
    239   if (l->c && l->c->abi && l->f->desc.fn_type)
    240     fi = abi_cg_func_info(l->c->abi, l->f->desc.fn_type);
    241   for (u32 i = 0; i < l->src->nlocals; ++i) {
    242     const CgIrLocal* loc = &l->src->locals[i];
    243     if (!loc->is_param) continue;
    244     const CgIrParam* p = find_param(l->src, loc->id);
    245     OptLocalMap* m = local_map(l, loc->id, loc->desc.loc);
    246     OptCGParamDesc d;
    247     memset(&d, 0, sizeof d);
    248     if (p) {
    249       d.index = p->desc.index;
    250       d.name = p->desc.name;
    251       d.type = p->desc.type;
    252       d.size = p->desc.size;
    253       d.align = p->desc.align;
    254       d.flags = p->desc.flags;
    255       d.loc = p->desc.loc;
    256     } else {
    257       d.index = loc->param_index;
    258       d.name = loc->desc.name;
    259       d.type = loc->desc.type;
    260       d.size = loc->desc.size;
    261       d.align = loc->desc.align;
    262       d.flags = loc->desc.flags;
    263       d.loc = loc->desc.loc;
    264     }
    265     d.storage = m->storage;
    266     if (fi && d.index < fi->nparams) d.abi = &fi->params[d.index];
    267     ir_param_add(l->f, &d);
    268   }
    269 }
    270 
    271 static int cg_inst_terminates(const CgIrInst* in) {
    272   if (!in) return 0;
    273   switch ((CgIrOp)in->op) {
    274     case CG_IR_BR:
    275     case CG_IR_RET:
    276     case CG_IR_UNREACHABLE:
    277     case CG_IR_CMP_BRANCH:
    278     case CG_IR_SWITCH:
    279     case CG_IR_INDIRECT_BRANCH:
    280     case CG_IR_BREAK_TO:
    281     case CG_IR_CONTINUE_TO:
    282       return 1;
    283     case CG_IR_INTRINSIC: {
    284       const CgIrIntrinsicAux* aux = (const CgIrIntrinsicAux*)in->extra.aux;
    285       return aux && (aux->kind == INTRIN_LONGJMP || aux->kind == INTRIN_TRAP);
    286     }
    287     default:
    288       return 0;
    289   }
    290 }
    291 
    292 static u32 label_id_max(const CgIrFunc* f) {
    293   u32 max = 0;
    294   for (u32 i = 0; i < f->nlabels; ++i)
    295     if (f->labels[i].id > max) max = f->labels[i].id;
    296   return max;
    297 }
    298 
    299 static void mark_label_leader(CgIrLower* l, Label label, const u32* place) {
    300   if (label == LABEL_NONE || label > l->nlabels || place[label] == UINT32_MAX)
    301     return;
    302   l->leader[place[label]] = 1;
    303 }
    304 
/* Compute l->leader[] (one flag per source instruction; nonzero starts a
 * basic block) and record in label_place[] the instruction index of each
 * label's first CG_IR_LABEL.
 *
 * l->leader must hold at least ninsts + 1 entries: the clearing loop writes
 * index ninsts as well. label_place is only written where it currently holds
 * UINT32_MAX, so the first LABEL instruction for an id wins.
 * NOTE(review): presumably the caller pre-fills label_place with UINT32_MAX
 * and sizes it nlabels + 1 -- confirm at the call site. */
static void mark_leaders(CgIrLower* l, u32* label_place) {
  const CgIrFunc* f = l->src;
  for (u32 i = 0; i <= f->ninsts; ++i) l->leader[i] = 0;
  /* The first instruction always starts a block. */
  if (f->ninsts) l->leader[0] = 1;
  /* Pass 1: every LABEL is a leader; remember where each label sits so the
   * second pass can resolve branch targets that appear before their label. */
  for (u32 i = 0; i < f->ninsts; ++i) {
    const CgIrInst* in = &f->insts[i];
    if ((CgIrOp)in->op == CG_IR_LABEL) {
      Label label = (Label)in->extra.imm;
      l->leader[i] = 1;
      if (label && label <= l->nlabels && label_place[label] == UINT32_MAX)
        label_place[label] = i;
    }
  }
  /* Pass 2: split after terminators and at every referenced label. */
  for (u32 i = 0; i < f->ninsts; ++i) {
    const CgIrInst* in = &f->insts[i];
    /* The instruction following a terminator starts a new block. */
    if (cg_inst_terminates(in) && i + 1u < f->ninsts) l->leader[i + 1u] = 1;
    switch ((CgIrOp)in->op) {
      case CG_IR_BR:
      case CG_IR_LOAD_LABEL_ADDR:
        /* Target label carried in extra.imm. */
        mark_label_leader(l, (Label)in->extra.imm, label_place);
        break;
      case CG_IR_CMP_BRANCH: {
        CgIrCmpBranchAux* aux = (CgIrCmpBranchAux*)in->extra.aux;
        if (i + 1u < f->ninsts) l->leader[i + 1u] = 1;
        if (aux) mark_label_leader(l, aux->target, label_place);
        break;
      }
      case CG_IR_SWITCH: {
        CgIrSwitchAux* aux = (CgIrSwitchAux*)in->extra.aux;
        if (i + 1u < f->ninsts) l->leader[i + 1u] = 1;
        if (aux) {
          /* Default target plus every case target. */
          mark_label_leader(l, aux->default_label, label_place);
          for (u32 c = 0; c < aux->ncases; ++c)
            mark_label_leader(l, aux->cases[c].label, label_place);
        }
        break;
      }
      case CG_IR_INDIRECT_BRANCH: {
        CgIrIndirectAux* aux = (CgIrIndirectAux*)in->extra.aux;
        if (aux) {
          for (u32 t = 0; t < aux->ntargets; ++t)
            mark_label_leader(l, aux->targets[t], label_place);
        }
        break;
      }
      case CG_IR_SCOPE_BEGIN:
        /* A new block starts right after the scope opens. */
        if (i + 1u < f->ninsts) l->leader[i + 1u] = 1;
        break;
      case CG_IR_SCOPE_END:
        /* SCOPE_END is isolated: it is a leader and so is its successor. */
        l->leader[i] = 1;
        if (i + 1u < f->ninsts) l->leader[i + 1u] = 1;
        break;
      default:
        break;
    }
  }
}
    362 
/* Create the IR blocks from the leader flags and build the two lookup tables
 * l->inst_block (source instruction -> block) and l->label_block (label id ->
 * block, UINT32_MAX for the unused slot 0).
 *
 * Blocks are created in source-instruction order, so the statement order
 * below determines block ids. The block created while l->f has exactly one
 * block becomes the function entry. */
static void make_blocks(CgIrLower* l, const u32* label_place) {
  const CgIrFunc* f = l->src;
  u32 cur = UINT32_MAX;
  l->inst_block = arena_zarray(l->f->arena, u32, f->ninsts ? f->ninsts : 1u);
  /* Open a new block at every leader (and at the first instruction even if
   * it were not flagged); every instruction joins the currently open block. */
  for (u32 i = 0; i < f->ninsts; ++i) {
    if (l->leader[i] || cur == UINT32_MAX) {
      cur = ir_block_new(l->f);
      ir_note_emit(l->f, cur);
      if (l->f->nblocks == 1u) l->f->entry = cur;
    }
    l->inst_block[i] = cur;
  }
  l->label_block =
      arena_zarray(l->f->arena, u32, l->nlabels ? l->nlabels + 1u : 1u);
  for (u32 i = 0; i <= l->nlabels; ++i) l->label_block[i] = UINT32_MAX;
  for (u32 label = 1; label <= l->nlabels; ++label) {
    if (label_place[label] != UINT32_MAX) {
      /* A placed label maps to the block of the instruction AFTER its LABEL
       * instruction, or to the LABEL's own block when it is the last
       * instruction. */
      u32 place = label_place[label];
      l->label_block[label] = (place + 1u < f->ninsts)
                                  ? l->inst_block[place + 1u]
                                  : l->inst_block[place];
    } else {
      /* A label id that was never placed still gets a fresh block of its own;
       * this block is not passed to ir_note_emit here. */
      l->label_block[label] = ir_block_new(l->f);
    }
  }
  /* No block at all (no instructions and no labels): create a lone entry. */
  if (!l->f->nblocks) {
    l->f->entry = ir_block_new(l->f);
    ir_note_emit(l->f, l->f->entry);
  }
  /* Rebuild the emit order from scratch, noting the block of every source
   * instruction in order.
   * NOTE(review): this calls ir_note_emit once per instruction, so it
   * presumably ignores a block that is already noted -- confirm. */
  l->f->emit_order_n = 0;
  for (u32 i = 0; i < f->ninsts; ++i) ir_note_emit(l->f, l->inst_block[i]);
  /* NOTE(review): with ninsts == 0 but nlabels > 0, l->f->entry is not
   * assigned in this function before being used here -- confirm it is valid. */
  if (!f->ninsts) ir_note_emit(l->f, l->f->entry);
}
    396 
    397 static void emit_param_decls(CgIrLower* l) {
    398   if (!l->f->nparams || l->f->entry >= l->f->nblocks) return;
    399   /* Emit the IR_PARAM_DECL phantom defs into a dedicated prologue block that
    400    * falls through to the body, and make it the function entry. This keeps the
    401    * parameter defs out of the body's first block, which matters when the body
    402    * begins with a loop: that first block is then the loop header and the
    403    * back-edge targets it. With the param_decls in the header, liveness reads
    404    * each parameter as redefined every iteration (killing the liveness of an
    405    * induction variable carried in a parameter register), and because the entry
    406    * block's label is not placed by the emitter the back-edge resolves to a
    407    * branch-to-self. Both miscompile loop-first functions at -O1. The prologue
    408    * block emits no code (param_decls are markers, the fall-through is free, and
    409    * the entry label is elided), so this is free in the common case. */
    410   u32 prologue = ir_block_new(l->f);
    411   l->f->entry = prologue;
    412   ir_note_emit(l->f, prologue);
    413   for (u32 i = l->f->emit_order_n - 1u; i > 0; --i)
    414     l->f->emit_order[i] = l->f->emit_order[i - 1u];
    415   l->f->emit_order[0] = prologue;
    416   for (u32 i = 0; i < l->f->nparams; ++i) {
    417     IRParam* p = &l->f->params[i];
    418     Inst* in = ir_emit(l->f, prologue, IR_PARAM_DECL);
    419     IRParamDeclAux* aux = arena_znew(l->f->arena, IRParamDeclAux);
    420     in->loc = p->loc;
    421     in->type = p->type;
    422     if (p->storage.kind == CG_LOCAL_STORAGE_REG) in->def = p->storage.v.reg;
    423     memset(aux, 0, sizeof *aux);
    424     aux->desc.index = p->index;
    425     aux->desc.name = p->name;
    426     aux->desc.type = p->type;
    427     aux->desc.size = p->size;
    428     aux->desc.align = p->align;
    429     aux->desc.flags = p->flags;
    430     aux->desc.loc = p->loc;
    431     aux->desc.storage = p->storage;
    432     aux->desc.abi = p->abi;
    433     in->extra.aux = aux;
    434   }
    435 }
    436 
    437 static u32 block_for_label(CgIrLower* l, Label label, SrcLoc loc) {
    438   if (label == LABEL_NONE || label > l->nlabels ||
    439       l->label_block[label] == UINT32_MAX)
    440     lower_panic(l, loc, "bad label");
    441   return l->label_block[label];
    442 }
    443 
    444 static u32 fallthrough_block(CgIrLower* l, u32 inst_index) {
    445   if (inst_index + 1u >= l->src->ninsts) return UINT32_MAX;
    446   return l->inst_block[inst_index + 1u];
    447 }
    448 
    449 static void set_succ1(CgIrLower* l, u32 block, u32 succ) {
    450   if (succ == UINT32_MAX) {
    451     l->f->blocks[block].nsucc = 0;
    452     return;
    453   }
    454   l->f->blocks[block].succ[0] = succ;
    455   l->f->blocks[block].nsucc = 1;
    456 }
    457 
    458 static OptOperand* dup_opt_ops(CgIrLower* l, const OptOperand* ops, u32 n) {
    459   if (!n) return NULL;
    460   OptOperand* out = arena_array(l->f->arena, OptOperand, n);
    461   memcpy(out, ops, sizeof(*out) * n);
    462   return out;
    463 }
    464 
    465 static OptOperand opt_reg_operand(OptLocalMap* m) {
    466   OptOperand out;
    467   memset(&out, 0, sizeof out);
    468   out.kind = OPK_REG;
    469   out.cls = m->cls;
    470   out.type = m->type;
    471   out.v.reg = m->storage.v.reg;
    472   return out;
    473 }
    474 
    475 static OptOperand opt_frame_operand(OptLocalMap* m) {
    476   OptOperand out;
    477   memset(&out, 0, sizeof out);
    478   out.kind = OPK_LOCAL;
    479   out.cls = RC_INT;
    480   out.type = m->type;
    481   out.v.frame_slot = m->home_slot;
    482   return out;
    483 }
    484 
    485 /* Base/index register for an OPK_INDIRECT whose base is a local. A REG-storage
    486  * local supplies its value register directly. A FRAME-storage local (its
    487  * address was taken, e.g. `int **q = &p; p->f = ...`) holds the pointer value
    488  * in its frame home, so storage.v.reg is meaningless; load the home into a
    489  * fresh PReg. prematerialize_indirect_bases emits that load before the using
    490  * instruction; here we just look the result up (l->mat_*). */
    491 static Reg resolve_materialized_reg(CgIrLower* l, CGLocal local,
    492                                     CgIrMatRole role, SrcLoc loc) {
    493   OptLocalMap* m = local_map(l, local, loc);
    494   if (m->storage.kind == CG_LOCAL_STORAGE_REG) return m->storage.v.reg;
    495   for (u32 i = 0; i < l->nmat; ++i)
    496     if (l->mat_local[i] == local && l->mat_role[i] == (u8)role)
    497       return l->mat_reg[i];
    498   lower_panic(l, loc, role == CG_IR_MAT_INDEX
    499                           ? "indirect index local not materialized"
    500                           : "indirect base local not materialized");
    501 }
    502 
    503 static KitCgTypeId pointer_sized_int_type(CgIrLower* l) {
    504   return builtin_id(l->c->target.ptr_size <= 4u ? KIT_CG_BUILTIN_I32
    505                                                 : KIT_CG_BUILTIN_I64);
    506 }
    507 
    508 static void remember_materialized_reg(CgIrLower* l, CGLocal local,
    509                                       CgIrMatRole role, Reg r, SrcLoc loc) {
    510   if (l->nmat >= CG_IR_LOWER_MAX_MAT)
    511     lower_panic(l, loc, "too many frame indirect operands in one instruction");
    512   l->mat_local[l->nmat] = local;
    513   l->mat_role[l->nmat] = (u8)role;
    514   l->mat_reg[l->nmat] = r;
    515   l->nmat++;
    516 }
    517 
    518 static int materialized_reg_exists(CgIrLower* l, CGLocal local,
    519                                    CgIrMatRole role) {
    520   for (u32 i = 0; i < l->nmat; ++i)
    521     if (l->mat_local[i] == local && l->mat_role[i] == (u8)role) return 1;
    522   return 0;
    523 }
    524 
    525 static OptOperand opt_frame_operand_as(OptLocalMap* m, KitCgTypeId type) {
    526   OptOperand out = opt_frame_operand(m);
    527   out.type = type ? type : m->type;
    528   return out;
    529 }
    530 
    531 /* Emit the pre-materialization needed for a FRAME-storage local used as an
    532  * OPK_INDIRECT base. A pointer-typed local holds the base pointer value and is
    533  * loaded. A non-pointer local names storage, so its frame address is the base. */
    534 static void materialize_frame_base(CgIrLower* l, u32 block, CGLocal local,
    535                                    SrcLoc loc) {
    536   OptLocalMap* m = local_map(l, local, loc);
    537   if (m->storage.kind == CG_LOCAL_STORAGE_REG) return;
    538   if (materialized_reg_exists(l, local, CG_IR_MAT_BASE)) return;
    539   PReg r = ir_alloc_preg(l->f, m->type, RC_INT);
    540   OptOperand ops[2];
    541   ops[1] = opt_frame_operand(m);
    542   if (cg_type_is_ptr(l->c, m->type)) {
    543     /* The local *holds* a pointer; load that value to use as the base. */
    544     Inst* ld = ir_emit(l->f, block, IR_LOAD);
    545     ld->loc = loc;
    546     memset(&ops[0], 0, sizeof ops[0]);
    547     ops[0].kind = OPK_REG;
    548     ops[0].cls = RC_INT;
    549     ops[0].type = m->type;
    550     ops[0].v.reg = (Reg)r;
    551     ld->opnds = dup_opt_ops(l, ops, 2);
    552     ld->nopnds = 2;
    553     ld->def = (Val)r;
    554     ld->type = m->type;
    555     memset(&ld->extra.mem, 0, sizeof ld->extra.mem);
    556     ld->extra.mem.type = m->type;
    557     ld->extra.mem.size = m->size ? m->size : 8u;
    558     ld->extra.mem.align = m->align ? m->align : 8u;
    559   } else {
    560     /* The local *is* the storage; its frame address is the base. */
    561     Inst* ao = ir_emit(l->f, block, IR_ADDR_OF);
    562     ao->loc = loc;
    563     memset(&ops[0], 0, sizeof ops[0]);
    564     ops[0].kind = OPK_REG;
    565     ops[0].cls = RC_INT;
    566     ops[0].type = m->type;
    567     ops[0].v.reg = (Reg)r;
    568     ao->opnds = dup_opt_ops(l, ops, 2);
    569     ao->nopnds = 2;
    570     ao->def = (Val)r;
    571     ao->type = m->type;
    572   }
    573   remember_materialized_reg(l, local, CG_IR_MAT_BASE, (Reg)r, loc);
    574 }
    575 
    576 /* Emit `r = load <local home>` for a FRAME-storage local used as an
    577  * OPK_INDIRECT index. Unlike a non-pointer base, an index always needs the
    578  * local's value. On rv32, Toy indexes are i64 and therefore memory-backed; the
    579  * address calculation only consumes the pointer-width low word. */
    580 static void materialize_frame_index(CgIrLower* l, u32 block, CGLocal local,
    581                                     SrcLoc loc) {
    582   OptLocalMap* m = local_map(l, local, loc);
    583   if (m->storage.kind == CG_LOCAL_STORAGE_REG) return;
    584   if (materialized_reg_exists(l, local, CG_IR_MAT_INDEX)) return;
    585   KitCgTypeId idx_ty = pointer_sized_int_type(l);
    586   PReg r = ir_alloc_preg(l->f, idx_ty, RC_INT);
    587   OptOperand ops[2];
    588   Inst* ld = ir_emit(l->f, block, IR_LOAD);
    589   ld->loc = loc;
    590   memset(&ops[0], 0, sizeof ops[0]);
    591   ops[0].kind = OPK_REG;
    592   ops[0].cls = RC_INT;
    593   ops[0].type = idx_ty;
    594   ops[0].v.reg = (Reg)r;
    595   ops[1] = opt_frame_operand_as(m, idx_ty);
    596   ld->opnds = dup_opt_ops(l, ops, 2);
    597   ld->nopnds = 2;
    598   ld->def = (Val)r;
    599   ld->type = idx_ty;
    600   memset(&ld->extra.mem, 0, sizeof ld->extra.mem);
    601   ld->extra.mem.type = idx_ty;
    602   ld->extra.mem.size = l->c->target.ptr_size;
    603   ld->extra.mem.align = m->align && m->align < l->c->target.ptr_size
    604                             ? m->align
    605                             : l->c->target.ptr_size;
    606   remember_materialized_reg(l, local, CG_IR_MAT_INDEX, (Reg)r, loc);
    607 }
    608 
    609 /* Scan the CG instruction's operands for OPK_INDIRECT bases/indices that are
    610  * FRAME-storage locals and pre-load them (see materialize_frame_base). */
    611 static void prematerialize_indirect_bases(CgIrLower* l, const CgIrInst* in,
    612                                           u32 block) {
    613   l->nmat = 0;
    614   for (u32 i = 0; i < in->nopnds; ++i) {
    615     const Operand* op = &in->opnds[i];
    616     if (op->kind != OPK_INDIRECT) continue;
    617     materialize_frame_base(l, block, op->v.ind.base, in->loc);
    618     if (op->v.ind.index != CG_LOCAL_NONE)
    619       materialize_frame_index(l, block, op->v.ind.index, in->loc);
    620   }
    621 }
    622 
    623 static OptOperand lower_operand_value(CgIrLower* l, const Operand* in,
    624                                       SrcLoc loc);
    625 
    626 static OptOperand lower_operand_addr(CgIrLower* l, const Operand* in,
    627                                      SrcLoc loc) {
    628   OptOperand out;
    629   memset(&out, 0, sizeof out);
    630   if (!in) return out;
    631   out.type = in->type;
    632   switch ((OpKind)in->kind) {
    633     case OPK_LOCAL: {
    634       OptLocalMap* m = local_map(l, in->v.local, loc);
    635       if (m->home_slot == FRAME_SLOT_NONE) {
    636         const CgIrLocal* src = &l->src->locals[in->v.local - 1u];
    637         NativeFrameSlotDesc fsd =
    638             local_slot_desc(src, src->is_param ? FS_PARAM : FS_LOCAL);
    639         m->home_slot = ir_frame_slot_new(l->f, &fsd);
    640         m->address_taken = 1;
    641         if (in->v.local - 1u < l->f->nlocals) {
    642           l->f->locals[in->v.local - 1u].address_taken = 1;
    643           l->f->locals[in->v.local - 1u].home_slot = m->home_slot;
    644         }
    645       }
    646       return opt_frame_operand(m);
    647     }
    648     case OPK_GLOBAL:
    649       out.kind = OPK_GLOBAL;
    650       out.cls = RC_INT;
    651       out.v.global.sym = in->v.global.sym;
    652       out.v.global.addend = in->v.global.addend;
    653       return out;
    654     case OPK_INDIRECT: {
    655       out.kind = OPK_INDIRECT;
    656       out.cls = RC_INT;
    657       out.v.ind.base =
    658           resolve_materialized_reg(l, in->v.ind.base, CG_IR_MAT_BASE, loc);
    659       out.v.ind.index = REG_NONE;
    660       if (in->v.ind.index != CG_LOCAL_NONE)
    661         out.v.ind.index =
    662             resolve_materialized_reg(l, in->v.ind.index, CG_IR_MAT_INDEX, loc);
    663       out.v.ind.log2_scale = in->v.ind.log2_scale;
    664       out.v.ind.ofs = in->v.ind.ofs;
    665       return out;
    666     }
    667     case OPK_IMM:
    668     default:
    669       lower_panic(l, loc, "operand is not addressable");
    670   }
    671 }
    672 
    673 static OptOperand lower_operand_value(CgIrLower* l, const Operand* in,
    674                                       SrcLoc loc) {
    675   OptOperand out;
    676   memset(&out, 0, sizeof out);
    677   if (!in) return out;
    678   out.type = in->type;
    679   switch ((OpKind)in->kind) {
    680     case OPK_IMM:
    681       out.kind = OPK_IMM;
    682       out.cls = RC_INT;
    683       out.v.imm = in->v.imm;
    684       return out;
    685     case OPK_LOCAL: {
    686       OptLocalMap* m = local_map(l, in->v.local, loc);
    687       return m->address_taken ? opt_frame_operand(m) : opt_reg_operand(m);
    688     }
    689     case OPK_GLOBAL:
    690       out.kind = OPK_GLOBAL;
    691       out.cls = RC_INT;
    692       out.v.global.sym = in->v.global.sym;
    693       out.v.global.addend = in->v.global.addend;
    694       return out;
    695     case OPK_INDIRECT:
    696       return lower_operand_addr(l, in, loc);
    697     default:
    698       lower_panic(l, loc, "bad operand kind");
    699   }
    700 }
    701 
    702 static void set_inst_def(Inst* out, const OptOperand* op) {
    703   if (op && op->kind == OPK_REG) {
    704     out->def = (Val)op->v.reg;
    705     out->type = op->type;
    706   }
    707 }
    708 
    709 /* Lower `n` value operands. When `defs_first` is set, opnds[0] is the
    710  * instruction's destination (def); otherwise all operands are uses. Branch
    711  * terminators (CMP_BRANCH, SWITCH, INDIRECT_BRANCH) read their first operand
    712  * and define nothing, so they must pass defs_first=0 -- otherwise dead-def
    713  * elimination treats the branch as a redefinition of the tested value and
    714  * removes the real producer. */
    715 static void lower_value_ops_ex(CgIrLower* l, Inst* out, const CgIrInst* in,
    716                                u32 n, int defs_first) {
    717   OptOperand tmp[5];
    718   if (n > 5u) lower_panic(l, in->loc, "too many operands");
    719   for (u32 i = 0; i < n; ++i)
    720     tmp[i] = lower_operand_value(l, &in->opnds[i], in->loc);
    721   out->opnds = dup_opt_ops(l, tmp, n);
    722   out->nopnds = n;
    723   if (n && defs_first) set_inst_def(out, &out->opnds[0]);
    724 }
    725 
    726 static void lower_value_ops(CgIrLower* l, Inst* out, const CgIrInst* in,
    727                             u32 n) {
    728   lower_value_ops_ex(l, out, in, n, 1);
    729 }
    730 
    731 static void lower_use_ops(CgIrLower* l, Inst* out, const CgIrInst* in, u32 n) {
    732   lower_value_ops_ex(l, out, in, n, 0);
    733 }
    734 
    735 static void lower_addr_value_ops(CgIrLower* l, Inst* out, const CgIrInst* in,
    736                                  u32 naddr, u32 nvalue) {
    737   OptOperand tmp[5];
    738   u32 n = naddr + nvalue;
    739   if (n > 5u) lower_panic(l, in->loc, "too many operands");
    740   for (u32 i = 0; i < naddr; ++i)
    741     tmp[i] = lower_operand_addr(l, &in->opnds[i], in->loc);
    742   for (u32 i = 0; i < nvalue; ++i)
    743     tmp[naddr + i] = lower_operand_value(l, &in->opnds[naddr + i], in->loc);
    744   out->opnds = dup_opt_ops(l, tmp, n);
    745   out->nopnds = n;
    746 }
    747 
    748 static OptCGABIValue abi_value_for_local(CgIrLower* l, CGLocal local,
    749                                          SrcLoc loc) {
    750   OptCGABIValue out;
    751   memset(&out, 0, sizeof out);
    752   OptLocalMap* m = local_map(l, local, loc);
    753   out.type = m->type;
    754   out.storage = m->address_taken ? opt_frame_operand(m) : opt_reg_operand(m);
    755   return out;
    756 }
    757 
    758 static void lower_call(CgIrLower* l, Inst* out, const CgIrInst* in) {
    759   const CgIrCallAux* src = (const CgIrCallAux*)in->extra.aux;
    760   IRCallAux* aux = arena_znew(l->f->arena, IRCallAux);
    761   memset(aux, 0, sizeof *aux);
    762   if (!src) {
    763     out->extra.aux = aux;
    764     return;
    765   }
    766   aux->desc.fn_type = src->desc.fn_type;
    767   aux->desc.callee = lower_operand_value(l, &src->desc.callee, in->loc);
    768   aux->desc.nargs = src->desc.nargs;
    769   aux->desc.flags = src->desc.flags;
    770   aux->desc.tail_policy = src->desc.tail_policy;
    771   aux->desc.inline_policy = src->desc.inline_policy;
    772   /* Cache the function ABI on the desc so downstream passes (e.g. the
    773    * regalloc hint pass that steers call-arg sources toward their ABI dest
    774    * register) don't have to re-derive it per call. abi_cg_func_info is the
    775    * canonical lookup. */
    776   if (l->f->c && l->f->c->abi)
    777     aux->desc.abi = abi_cg_func_info(l->f->c->abi, src->desc.fn_type);
    778   if (src->desc.nargs) {
    779     aux->desc.args = arena_zarray(l->f->arena, OptCGABIValue, src->desc.nargs);
    780     for (u32 i = 0; i < src->desc.nargs; ++i)
    781       aux->desc.args[i] = abi_value_for_local(l, src->desc.args[i], in->loc);
    782   }
    783   if (src->desc.result != CG_LOCAL_NONE) {
    784     aux->desc.ret = abi_value_for_local(l, src->desc.result, in->loc);
    785     set_inst_def(out, &aux->desc.ret.storage);
    786   }
    787   out->type = src->desc.fn_type;
    788   out->extra.aux = aux;
    789 }
    790 
    791 static void lower_ret(CgIrLower* l, Inst* out, const CgIrInst* in) {
    792   const CgIrRetAux* src = (const CgIrRetAux*)in->extra.aux;
    793   IRRetAux* aux = arena_znew(l->f->arena, IRRetAux);
    794   if (src && src->present) {
    795     aux->present = 1;
    796     aux->val = abi_value_for_local(l, src->value, in->loc);
    797   }
    798   out->extra.aux = aux;
    799 }
    800 
    801 static void lower_intrinsic(CgIrLower* l, Inst* out, const CgIrInst* in) {
    802   const CgIrIntrinsicAux* src = (const CgIrIntrinsicAux*)in->extra.aux;
    803   IRIntrinAux* aux = arena_znew(l->f->arena, IRIntrinAux);
    804   if (src) {
    805     aux->kind = src->kind;
    806     aux->ndst = src->ndst;
    807     aux->narg = src->narg;
    808     aux->dsts =
    809         src->ndst ? arena_array(l->f->arena, OptOperand, src->ndst) : NULL;
    810     aux->args =
    811         src->narg ? arena_array(l->f->arena, OptOperand, src->narg) : NULL;
    812     for (u32 i = 0; i < src->ndst; ++i)
    813       aux->dsts[i] = lower_operand_value(l, &src->dsts[i], in->loc);
    814     for (u32 i = 0; i < src->narg; ++i)
    815       aux->args[i] = lower_operand_value(l, &src->args[i], in->loc);
    816     if (src->ndst) {
    817       u32 ndefs = 0;
    818       for (u32 i = 0; i < src->ndst; ++i)
    819         if (aux->dsts[i].kind == OPK_REG) ++ndefs;
    820       if (ndefs) {
    821         u32 d = 0;
    822         out->ndefs = ndefs;
    823         out->defs = arena_array(l->f->arena, Val, ndefs);
    824         for (u32 i = 0; i < src->ndst; ++i)
    825           if (aux->dsts[i].kind == OPK_REG)
    826             out->defs[d++] = aux->dsts[i].v.reg;
    827         out->def = out->defs[0];
    828       }
    829       out->type = aux->dsts[0].type;
    830     }
    831   }
    832   out->extra.aux = aux;
    833 }
    834 
    835 static void lower_asm(CgIrLower* l, Inst* out, const CgIrInst* in) {
    836   const CgIrAsmAux* src = (const CgIrAsmAux*)in->extra.aux;
    837   IRAsmAux* aux = arena_znew(l->f->arena, IRAsmAux);
    838   if (src) {
    839     aux->tmpl = src->tmpl;
    840     aux->outs = src->outs;
    841     aux->ins = src->ins;
    842     aux->clobbers = src->clobbers;
    843     aux->nout = src->nout;
    844     aux->nin = src->nin;
    845     aux->nclob = src->nclob;
    846     aux->clobber_abi_sets = src->clobber_abi_sets;
    847     aux->out_ops =
    848         src->nout ? arena_array(l->f->arena, OptOperand, src->nout) : NULL;
    849     aux->in_ops =
    850         src->nin ? arena_array(l->f->arena, OptOperand, src->nin) : NULL;
    851     for (u32 i = 0; i < src->nout; ++i)
    852       aux->out_ops[i] = lower_operand_value(l, &src->out_ops[i], in->loc);
    853     for (u32 i = 0; i < src->nin; ++i)
    854       aux->in_ops[i] = lower_operand_value(l, &src->in_ops[i], in->loc);
    855     if (src->nout) {
    856       u32 ndefs = 0;
    857       for (u32 i = 0; i < src->nout; ++i)
    858         if (aux->out_ops[i].kind == OPK_REG) ++ndefs;
    859       if (ndefs) {
    860         u32 d = 0;
    861         out->ndefs = ndefs;
    862         out->defs = arena_array(l->f->arena, Val, ndefs);
    863         for (u32 i = 0; i < src->nout; ++i)
    864           if (aux->out_ops[i].kind == OPK_REG)
    865             out->defs[d++] = aux->out_ops[i].v.reg;
    866         out->def = out->defs[0];
    867       }
    868       out->type = aux->out_ops[0].type;
    869     }
    870   }
    871   out->extra.aux = aux;
    872 }
    873 
    874 static void lower_one_inst(CgIrLower* l, u32 idx) {
    875   const CgIrInst* in = &l->src->insts[idx];
    876   u32 block = l->inst_block[idx];
    877   Inst* out = NULL;
    878   IROp op = IR_NOP;
    879   switch ((CgIrOp)in->op) {
    880     case CG_IR_LABEL:
    881       return;
    882     case CG_IR_LOAD_IMM:
    883       op = IR_LOAD_IMM;
    884       break;
    885     case CG_IR_LOAD_CONST:
    886       op = IR_LOAD_CONST;
    887       break;
    888     case CG_IR_COPY:
    889       op = IR_COPY;
    890       break;
    891     case CG_IR_LOAD:
    892       op = IR_LOAD;
    893       break;
    894     case CG_IR_STORE:
    895       op = IR_STORE;
    896       break;
    897     case CG_IR_ADDR_OF:
    898       op = IR_ADDR_OF;
    899       break;
    900     case CG_IR_TLS_ADDR_OF:
    901       op = IR_TLS_ADDR_OF;
    902       break;
    903     case CG_IR_AGG_COPY:
    904       op = IR_AGG_COPY;
    905       break;
    906     case CG_IR_AGG_SET:
    907       op = IR_AGG_SET;
    908       break;
    909     case CG_IR_BITFIELD_LOAD:
    910       op = IR_BITFIELD_LOAD;
    911       break;
    912     case CG_IR_BITFIELD_STORE:
    913       op = IR_BITFIELD_STORE;
    914       break;
    915     case CG_IR_BINOP:
    916       op = IR_BINOP;
    917       break;
    918     case CG_IR_UNOP:
    919       op = IR_UNOP;
    920       break;
    921     case CG_IR_CMP:
    922       op = IR_CMP;
    923       break;
    924     case CG_IR_CONVERT:
    925       op = IR_CONVERT;
    926       break;
    927     case CG_IR_CALL:
    928       op = IR_CALL;
    929       break;
    930     case CG_IR_RET:
    931       op = IR_RET;
    932       break;
    933     case CG_IR_UNREACHABLE:
    934       op = IR_UNREACHABLE;
    935       break;
    936     case CG_IR_BR:
    937       op = IR_BR;
    938       break;
    939     case CG_IR_CMP_BRANCH:
    940       op = IR_CMP_BRANCH;
    941       break;
    942     case CG_IR_SWITCH:
    943       op = IR_SWITCH;
    944       break;
    945     case CG_IR_INDIRECT_BRANCH:
    946       op = IR_INDIRECT_BRANCH;
    947       break;
    948     case CG_IR_LOAD_LABEL_ADDR:
    949       op = IR_LOAD_LABEL_ADDR;
    950       break;
    951     case CG_IR_LOCAL_STATIC_DATA_BEGIN:
    952       op = IR_LOCAL_STATIC_DATA_BEGIN;
    953       break;
    954     case CG_IR_LOCAL_STATIC_DATA_WRITE:
    955       op = IR_LOCAL_STATIC_DATA_WRITE;
    956       break;
    957     case CG_IR_LOCAL_STATIC_DATA_LABEL_ADDR:
    958       op = IR_LOCAL_STATIC_DATA_LABEL_ADDR;
    959       break;
    960     case CG_IR_LOCAL_STATIC_DATA_END:
    961       op = IR_LOCAL_STATIC_DATA_END;
    962       break;
    963     case CG_IR_SCOPE_BEGIN:
    964       op = IR_SCOPE_BEGIN;
    965       break;
    966     case CG_IR_SCOPE_END:
    967       op = IR_SCOPE_END;
    968       break;
    969     case CG_IR_BREAK_TO:
    970       op = IR_BREAK_TO;
    971       break;
    972     case CG_IR_CONTINUE_TO:
    973       op = IR_CONTINUE_TO;
    974       break;
    975     case CG_IR_ALLOCA:
    976       op = IR_ALLOCA;
    977       break;
    978     case CG_IR_VA_START:
    979       op = IR_VA_START;
    980       break;
    981     case CG_IR_VA_ARG:
    982       op = IR_VA_ARG;
    983       break;
    984     case CG_IR_VA_END:
    985       op = IR_VA_END;
    986       break;
    987     case CG_IR_VA_COPY:
    988       op = IR_VA_COPY;
    989       break;
    990     case CG_IR_ATOMIC_LOAD:
    991       op = IR_ATOMIC_LOAD;
    992       break;
    993     case CG_IR_ATOMIC_STORE:
    994       op = IR_ATOMIC_STORE;
    995       break;
    996     case CG_IR_ATOMIC_RMW:
    997       op = IR_ATOMIC_RMW;
    998       break;
    999     case CG_IR_ATOMIC_CAS:
   1000       op = IR_ATOMIC_CAS;
   1001       break;
   1002     case CG_IR_FENCE:
   1003       op = IR_FENCE;
   1004       break;
   1005     case CG_IR_INTRINSIC:
   1006       op = IR_INTRINSIC;
   1007       break;
   1008     case CG_IR_ASM_BLOCK:
   1009       op = IR_ASM_BLOCK;
   1010       break;
   1011     default:
   1012       op = IR_NOP;
   1013       break;
   1014   }
   1015   /* Pre-load any FRAME-resident pointer locals used as indirect bases so the
   1016    * load dominates this instruction (which is emitted next). */
   1017   prematerialize_indirect_bases(l, in, block);
   1018   out = ir_emit(l->f, block, op);
   1019   out->loc = in->loc;
   1020   switch ((CgIrOp)in->op) {
   1021     case CG_IR_LOAD_IMM:
   1022       lower_value_ops(l, out, in, 1);
   1023       out->extra.imm = in->extra.imm;
   1024       break;
   1025     case CG_IR_LOAD_CONST:
   1026       lower_value_ops(l, out, in, 1);
   1027       out->extra.cbytes = in->extra.cbytes;
   1028       break;
   1029     case CG_IR_COPY:
   1030     case CG_IR_BINOP:
   1031     case CG_IR_UNOP:
   1032     case CG_IR_CMP:
   1033     case CG_IR_CONVERT:
   1034     case CG_IR_ALLOCA:
   1035     case CG_IR_VA_ARG:
   1036       lower_value_ops(l, out, in, in->nopnds);
   1037       out->extra.imm = in->extra.imm;
   1038       break;
   1039     case CG_IR_LOAD:
   1040     case CG_IR_BITFIELD_LOAD: {
   1041       OptOperand ops[2];
   1042       ops[0] = lower_operand_value(l, &in->opnds[0], in->loc);
   1043       ops[1] = lower_operand_addr(l, &in->opnds[1], in->loc);
   1044       out->opnds = dup_opt_ops(l, ops, 2);
   1045       out->nopnds = 2;
   1046       set_inst_def(out, &out->opnds[0]);
   1047       if ((CgIrOp)in->op == CG_IR_LOAD)
   1048         out->extra.mem = in->extra.mem;
   1049       else
   1050         out->extra.aux = in->extra.aux;
   1051       break;
   1052     }
   1053     case CG_IR_ATOMIC_LOAD: {
   1054       OptOperand ops[2];
   1055       ops[0] = lower_operand_value(l, &in->opnds[0], in->loc);
   1056       ops[1] = lower_operand_value(l, &in->opnds[1], in->loc);
   1057       out->opnds = dup_opt_ops(l, ops, 2);
   1058       out->nopnds = 2;
   1059       set_inst_def(out, &out->opnds[0]);
   1060       out->extra.aux = in->extra.aux;
   1061       break;
   1062     }
   1063     case CG_IR_STORE:
   1064     case CG_IR_BITFIELD_STORE:
   1065       lower_addr_value_ops(l, out, in, 1, in->nopnds - 1u);
   1066       if ((CgIrOp)in->op == CG_IR_STORE)
   1067         out->extra.mem = in->extra.mem;
   1068       else
   1069         out->extra.aux = in->extra.aux;
   1070       break;
   1071     case CG_IR_AGG_COPY:
   1072     case CG_IR_AGG_SET:
   1073       /* Aggregate ops take their operands as pointer *values* to the aggregates
   1074        * (the emitter derefs them via pointer_addr_from_operand). Lowering them
   1075        * as values keeps a pointer local in its register instead of forcing a
   1076        * frame home — the home would otherwise break the local's other uses as
   1077        * an indirect base, whose lowering reads storage.v.reg. */
   1078       lower_use_ops(l, out, in, in->nopnds);
   1079       out->extra.aux = in->extra.aux;
   1080       break;
   1081     case CG_IR_ATOMIC_STORE: {
   1082       OptOperand ops[2];
   1083       ops[0] = lower_operand_value(l, &in->opnds[0], in->loc);
   1084       ops[1] = lower_operand_value(l, &in->opnds[1], in->loc);
   1085       out->opnds = dup_opt_ops(l, ops, 2);
   1086       out->nopnds = 2;
   1087       out->extra.aux = in->extra.aux;
   1088       break;
   1089     }
   1090     case CG_IR_ADDR_OF: {
   1091       OptOperand ops[2];
   1092       ops[0] = lower_operand_value(l, &in->opnds[0], in->loc);
   1093       ops[1] = lower_operand_addr(l, &in->opnds[1], in->loc);
   1094       out->opnds = dup_opt_ops(l, ops, 2);
   1095       out->nopnds = 2;
   1096       set_inst_def(out, &out->opnds[0]);
   1097       break;
   1098     }
   1099     case CG_IR_TLS_ADDR_OF:
   1100       lower_value_ops(l, out, in, 1);
   1101       out->extra.aux = in->extra.aux;
   1102       break;
   1103     case CG_IR_CALL:
   1104       lower_call(l, out, in);
   1105       break;
   1106     case CG_IR_RET:
   1107       lower_ret(l, out, in);
   1108       l->f->blocks[block].nsucc = 0;
   1109       break;
   1110     case CG_IR_UNREACHABLE:
   1111       /* Terminator with no successors: control does not leave this block. */
   1112       l->f->blocks[block].nsucc = 0;
   1113       break;
   1114     case CG_IR_BR:
   1115       out->extra.imm = block_for_label(l, (Label)in->extra.imm, in->loc);
   1116       set_succ1(l, block, (u32)out->extra.imm);
   1117       break;
   1118     case CG_IR_CMP_BRANCH: {
   1119       CgIrCmpBranchAux* aux = (CgIrCmpBranchAux*)in->extra.aux;
   1120       lower_use_ops(l, out, in, 2);
   1121       out->extra.imm = aux ? aux->op : CMP_NE;
   1122       ir_block_set_nsucc(l->f, block, 2);
   1123       l->f->blocks[block].succ[0] =
   1124           aux ? block_for_label(l, aux->target, in->loc) : UINT32_MAX;
   1125       l->f->blocks[block].succ[1] = fallthrough_block(l, idx);
   1126       break;
   1127     }
   1128     case CG_IR_SWITCH: {
   1129       CgIrSwitchAux* src = (CgIrSwitchAux*)in->extra.aux;
   1130       IRSwitchAux* aux = arena_znew(l->f->arena, IRSwitchAux);
   1131       lower_use_ops(l, out, in, 1);
   1132       if (src) {
   1133         aux->selector_type = src->selector_type;
   1134         aux->ncases = src->ncases;
   1135         aux->hint = src->hint;
   1136         aux->has_default = src->default_label != LABEL_NONE;
   1137         aux->default_block =
   1138             aux->has_default ? block_for_label(l, src->default_label, in->loc)
   1139                              : fallthrough_block(l, idx);
   1140         if (src->ncases) {
   1141           aux->cases = arena_array(l->f->arena, IRSwitchAuxCase, src->ncases);
   1142           for (u32 i = 0; i < src->ncases; ++i) {
   1143             aux->cases[i].value = src->cases[i].value;
   1144             aux->cases[i].block =
   1145                 block_for_label(l, src->cases[i].label, in->loc);
   1146           }
   1147         }
   1148         ir_block_set_nsucc(l->f, block, src->ncases + 1u);
   1149         for (u32 i = 0; i < src->ncases; ++i)
   1150           l->f->blocks[block].succ[i] = aux->cases[i].block;
   1151         l->f->blocks[block].succ[src->ncases] = aux->default_block;
   1152       }
   1153       out->extra.aux = aux;
   1154       break;
   1155     }
   1156     case CG_IR_INDIRECT_BRANCH: {
   1157       CgIrIndirectAux* src = (CgIrIndirectAux*)in->extra.aux;
   1158       IRIndirectAux* aux = arena_znew(l->f->arena, IRIndirectAux);
   1159       lower_use_ops(l, out, in, 1);
   1160       if (src && src->ntargets) {
   1161         aux->ntargets = src->ntargets;
   1162         aux->targets = arena_array(l->f->arena, u32, src->ntargets);
   1163         ir_block_set_nsucc(l->f, block, src->ntargets);
   1164         for (u32 i = 0; i < src->ntargets; ++i) {
   1165           aux->targets[i] = block_for_label(l, src->targets[i], in->loc);
   1166           l->f->blocks[block].succ[i] = aux->targets[i];
   1167         }
   1168       }
   1169       out->extra.aux = aux;
   1170       break;
   1171     }
   1172     case CG_IR_LOAD_LABEL_ADDR:
   1173       lower_value_ops(l, out, in, 1);
   1174       out->extra.imm = block_for_label(l, (Label)in->extra.imm, in->loc);
   1175       break;
   1176     case CG_IR_LOCAL_STATIC_DATA_BEGIN:
   1177       out->extra.aux = in->extra.aux;
   1178       break;
   1179     case CG_IR_LOCAL_STATIC_DATA_WRITE:
   1180       out->extra.aux = in->extra.aux;
   1181       break;
   1182     case CG_IR_LOCAL_STATIC_DATA_LABEL_ADDR: {
   1183       CgIrLocalStaticLabelAux* src = (CgIrLocalStaticLabelAux*)in->extra.aux;
   1184       CgIrLocalStaticLabelAux* aux =
   1185           arena_znew(l->f->arena, CgIrLocalStaticLabelAux);
   1186       if (src) {
   1187         *aux = *src;
   1188         aux->target = (Label)block_for_label(l, src->target, in->loc);
   1189       }
   1190       out->extra.aux = aux;
   1191       break;
   1192     }
   1193     case CG_IR_LOCAL_STATIC_DATA_END:
   1194       break;
   1195     case CG_IR_SCOPE_BEGIN: {
   1196       CgIrScopeAux* src = (CgIrScopeAux*)in->extra.aux;
   1197       IRScopeAux* aux = arena_znew(l->f->arena, IRScopeAux);
   1198       if (src) {
   1199         aux->scope_id = src->scope;
   1200         aux->desc.kind = src->desc.kind;
   1201         aux->desc.break_label = src->desc.break_label;
   1202         aux->desc.continue_label = src->desc.continue_label;
   1203         aux->desc.result_type = src->desc.result_type;
   1204       }
   1205       out->extra.aux = aux;
   1206       break;
   1207     }
   1208     case CG_IR_SCOPE_END:
   1209     case CG_IR_BREAK_TO:
   1210     case CG_IR_CONTINUE_TO:
   1211       out->extra.imm = in->extra.imm;
   1212       break;
   1213     case CG_IR_VA_START:
   1214     case CG_IR_VA_END:
   1215       /* The operand is a pointer value (the address of the va_list object),
   1216        * produced by an earlier ADDR_OF. Lower as a value so it can live in a
   1217        * register; the backend va hook consumes the pointer. */
   1218       lower_use_ops(l, out, in, 1);
   1219       break;
   1220     case CG_IR_VA_COPY:
   1221       lower_use_ops(l, out, in, 2);
   1222       break;
   1223     case CG_IR_ATOMIC_RMW:
   1224       lower_value_ops(l, out, in, 3);
   1225       out->extra.aux = in->extra.aux;
   1226       break;
   1227     case CG_IR_ATOMIC_CAS:
   1228       lower_value_ops(l, out, in, 5);
   1229       out->ndefs = 2;
   1230       out->defs = arena_array(l->f->arena, Val, 2);
   1231       out->defs[0] = out->opnds[0].v.reg;
   1232       out->defs[1] = out->opnds[1].v.reg;
   1233       out->def = out->defs[0];
   1234       out->type = out->opnds[0].type;
   1235       {
   1236         const CgIrAtomicAux* src = (const CgIrAtomicAux*)in->extra.aux;
   1237         IRCasAux* aux = arena_znew(l->f->arena, IRCasAux);
   1238         if (src) {
   1239           aux->mem = src->mem;
   1240           aux->success = src->order;
   1241           aux->failure = src->failure;
   1242         }
   1243         out->extra.aux = aux;
   1244       }
   1245       break;
   1246     case CG_IR_FENCE:
   1247       out->extra.imm = in->extra.imm;
   1248       break;
   1249     case CG_IR_INTRINSIC:
   1250       lower_intrinsic(l, out, in);
   1251       break;
   1252     case CG_IR_ASM_BLOCK:
   1253       lower_asm(l, out, in);
   1254       break;
   1255     default:
   1256       out->extra.aux = in->extra.aux;
   1257       break;
   1258   }
   1259 }
   1260 
   1261 static void add_fallthrough_succs(CgIrLower* l) {
   1262   for (u32 b = 0; b < l->f->nblocks; ++b) {
   1263     Block* bl = &l->f->blocks[b];
   1264     if (bl->nsucc) continue;
   1265     if (bl->ninsts) {
   1266       Inst* last = &bl->insts[bl->ninsts - 1u];
   1267       switch ((IROp)last->op) {
   1268         case IR_BR:
   1269         case IR_CONDBR:
   1270         case IR_CMP_BRANCH:
   1271         case IR_SWITCH:
   1272         case IR_INDIRECT_BRANCH:
   1273         case IR_RET:
   1274         case IR_UNREACHABLE:
   1275         case IR_BREAK_TO:
   1276         case IR_CONTINUE_TO:
   1277           continue;
   1278         case IR_INTRINSIC: {
   1279           IRIntrinAux* aux = (IRIntrinAux*)last->extra.aux;
   1280           if (aux && (aux->kind == INTRIN_LONGJMP || aux->kind == INTRIN_TRAP))
   1281             continue;
   1282           break;
   1283         }
   1284         default:
   1285           break;
   1286       }
   1287     }
   1288     for (u32 i = 0; i + 1u < l->f->emit_order_n; ++i) {
   1289       if (l->f->emit_order[i] == b) {
   1290         set_succ1(l, b, l->f->emit_order[i + 1u]);
   1291         break;
   1292       }
   1293     }
   1294   }
   1295 }
   1296 
   1297 Func* opt_func_from_cg_ir(Compiler* c, const CgIrFunc* src) {
   1298   if (!c || !src) return NULL;
   1299   OptCGFuncDesc desc = lower_func_desc(c->tu, &src->desc);
   1300   Func* f = ir_func_new(c, &desc);
   1301   CgIrLower l;
   1302   memset(&l, 0, sizeof l);
   1303   l.c = c;
   1304   l.src = src;
   1305   l.f = f;
   1306   l.nlabels = label_id_max(src);
   1307   u32* label_place =
   1308       arena_array(f->arena, u32, l.nlabels ? l.nlabels + 1u : 1u);
   1309   for (u32 i = 0; i <= l.nlabels; ++i) label_place[i] = UINT32_MAX;
   1310   l.leader = arena_zarray(f->arena, u8, src->ninsts + 1u);
   1311   lower_locals(&l);
   1312   lower_params(&l);
   1313   mark_leaders(&l, label_place);
   1314   make_blocks(&l, label_place);
   1315   emit_param_decls(&l);
   1316   for (u32 i = 0; i < src->ninsts; ++i) lower_one_inst(&l, i);
   1317   add_fallthrough_succs(&l);
   1318   opt_build_cfg(f);
   1319   return f;
   1320 }