kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

emit.c (184324B)


      1 /* Wasm CGTarget emission.
      2  *
      3  * Records CGTarget operations into a per-function WIR list, then linearizes
      4  * to a WasmFunc body at func_end. Each SSA Reg becomes a Wasm local; control
      5  * flow that fits the kit_cg_if_begin/else/end pattern lowers to
      6  * if/else/end, while CG scopes (SCOPE_LOOP) lower to (block (loop ...)).
      7  *
      8  * TODO: complete IR-to-WASM coverage for:
      9  *   - bitfield load/store
     10  *   - multiple call/return results
     11  *   - address-taken parameters
     12  *   - ABI multipart params
     13  *   - dynamic memcpy/memset via memory.* calls
     14  *   - file-scope asm
     15  *   - atomics
     16  *   - intrinsics
     17  */
     18 
     19 #include <stdarg.h>
     20 #include <string.h>
     21 
     22 #include "abi/abi.h"
     23 #include "arch/wasm/internal.h"
     24 #include "cg/type.h"
     25 #include "core/arena.h"
     26 #include "core/buf.h"
     27 #include "core/heap.h"
     28 #include "core/pool.h"
     29 #include "obj/obj.h"
     30 #include "obj/wasm_imports.h"
     31 
     32 /* Shared Wasm core: in-memory WasmModule, helpers (wasm_add_func,
     33  * wasm_intern_func_type, wasm_func_add_insn, ...), and wasm_encode for
     34  * the final flush from emit_wasm. */
     35 #include "wasm/wasm.h"
     36 
     37 /* -----------------------------------------------------------------
     38  * Helpers
     39  * ----------------------------------------------------------------- */
     40 
     41 static SrcLoc cur_loc(WTarget* t) {
     42   /* Prefer the most recent statement loc the frontend reported via
     43    * wasm_set_loc — gives diagnostics the actual failing line, not the
     44    * function definition's line. Fall back to the function loc when no
     45    * statement loc has been set (line == 0). */
     46   if (t->cur_stmt_loc.line) return t->cur_stmt_loc;
     47   if (t->cur_fn_desc) return t->cur_fn_desc->loc;
     48   SrcLoc l = {0, 0, 0};
     49   return l;
     50 }
     51 
     52 static _Noreturn void wfail(WTarget* t, const char* fmt, ...) {
     53   va_list ap;
     54   va_start(ap, fmt);
     55   compiler_panicv(t->c, cur_loc(t), fmt, ap);
     56 }
     57 
     58 static _Noreturn void wfail_at(WTarget* t, SrcLoc loc, const char* fmt, ...) {
     59   va_list ap;
     60   va_start(ap, fmt);
     61   compiler_panicv(t->c, loc, fmt, ap);
     62 }
     63 
     64 static struct WasmModule* ensure_module(WTarget* t);
     65 
     66 static const char* pool_sym_cstr(Pool* p, Sym sym, size_t* len_out) {
     67   Slice sl = pool_slice(p, sym);
     68   if (len_out) *len_out = sl.len;
     69   return sl.s;
     70 }
     71 
     72 static WasmValType valtype_for_size_kind(WTarget* t, u32 size, u8 scalar_kind) {
     73   if (scalar_kind == ABI_SC_FLOAT) {
     74     if (size == 4) return WASM_VAL_F32;
     75     if (size == 8) return WASM_VAL_F64;
     76     /* The only C float wider than f64 is binary128 long double, which has
     77      * no wasm value type. Report it specifically rather than as a generic
     78      * size error. */
     79     if (size == 16) wfail(t, "wasm: long double not supported");
     80     wfail(t, "wasm: unsupported float size %u", size);
     81   }
     82   if (size <= 4) return WASM_VAL_I32;
     83   if (size == 8) return WASM_VAL_I64;
     84   wfail(t, "wasm: unsupported integer size %u", size);
     85 }
     86 
     87 static WasmValType valtype_for_type(WTarget* t, KitCgTypeId ty) {
     88   ABITypeInfo ti = abi_cg_type_info(t->c->abi, ty);
     89   if (ti.scalar_kind == ABI_SC_VOID) {
     90     wfail(t, "wasm: void value type requested");
     91   }
     92   if (ti.scalar_kind == ABI_SC_PTR) return WASM_VAL_I32; /* wasm32 ILP32 */
     93   return valtype_for_size_kind(t, ti.size, ti.scalar_kind);
     94 }
     95 
     96 static u32 align_to_u32(u32 v, u32 a) {
     97   if (!a) return v;
     98   return (v + a - 1u) & ~(a - 1u);
     99 }
    100 
    101 /* Low-memory guard: leaves the first NULL_GUARD bytes of linear memory
    102  * unassigned so addr==0 never resolves to a real symbol. */
    103 #define WASM_DATA_NULL_GUARD 16u
    104 
    105 static void type_size_align(WTarget* t, KitCgTypeId ty, u32 fallback_size,
    106                             u32 fallback_align, u32* size_out, u32* align_out) {
    107   ABITypeInfo ti;
    108   if (ty) {
    109     ti = abi_cg_type_info(t->c->abi, ty);
    110     *size_out = ti.size ? ti.size : fallback_size;
    111     *align_out = ti.align ? ti.align : fallback_align;
    112   } else {
    113     *size_out = fallback_size;
    114     *align_out = fallback_align;
    115   }
    116   if (!*size_out) *size_out = 1;
    117   if (!*align_out) *align_out = 1;
    118 }
    119 
    120 static u32 add_wasm_local(WTarget* t, WasmValType vt) {
    121   if (!t->cur_func) wfail(t, "wasm: local allocation outside a function");
    122   return wasm_func_push_local(t->c, t->module, t->cur_func, vt);
    123 }
    124 
    125 static void ensure_linear_memory(WTarget* t) {
    126   ensure_module(t);
    127   if (!t->has_memory) {
    128     WasmMemory* mem = wasm_add_memory(t->c, t->module);
    129     mem->min_pages = 1;
    130     t->has_memory = 1;
    131   }
    132 }
    133 
    134 /* Atomic memory ops require the linear memory to be declared shared and to
    135  * carry a maximum size. We promote the (single) module memory to shared on
    136  * first atomic emission. max_pages is provisionally set to the wasm32 limit
    137  * (65536, i.e. 4 GiB); wasm_materialize_data tightens it to match min_pages
    138  * after the final layout is known so embedders can pre-reserve a snug arena. */
    139 static void ensure_shared_memory(WTarget* t) {
    140   ensure_linear_memory(t);
    141   WasmMemory* mem = &t->module->memories[0];
    142   if (!mem->shared) {
    143     mem->shared = 1;
    144   }
    145   if (!mem->has_max) {
    146     mem->has_max = 1;
    147     mem->max_pages = 65536u;
    148   }
    149 }
    150 
    151 static void ensure_stack_pointer(WTarget* t) {
    152   ensure_linear_memory(t);
    153   if (!t->has_stack_pointer) {
    154     WasmGlobal* g = wasm_add_global(t->c, t->module);
    155     g->name = wasm_strdup(t->module->heap, "__stack_pointer",
    156                           sizeof("__stack_pointer") - 1u);
    157     g->type = WASM_VAL_I32;
    158     g->mutable_ = 1;
    159     g->init.kind = WASM_INSN_I32_CONST;
    160     g->init.imm = 65536;
    161     t->stack_pointer_global = t->module->nglobals - 1u;
    162     t->stack_size = 65536;
    163     t->has_stack_pointer = 1;
    164   }
    165 }
    166 
    167 /* Map an SSA Reg to its WasmFunc local index, allocating on first use. */
    168 static u32 reg_local(WTarget* t, Reg r, KitCgTypeId ty, RegClass cls) {
    169   Heap* h = t->c->ctx->heap;
    170   if (r == REG_NONE) wfail(t, "wasm: REG_NONE used as operand");
    171   if (r >= t->reg_cap) {
    172     u32 nc = t->reg_cap ? t->reg_cap : 16u;
    173     while (nc <= r) nc *= 2u;
    174     u32* nl = (u32*)h->realloc(h, t->reg_to_local, sizeof(u32) * t->reg_cap,
    175                                sizeof(u32) * nc, _Alignof(u32));
    176     KitCgTypeId* nt = (KitCgTypeId*)h->realloc(
    177         h, t->reg_type, sizeof(KitCgTypeId) * t->reg_cap,
    178         sizeof(KitCgTypeId) * nc, _Alignof(KitCgTypeId));
    179     u8* nc_arr = (u8*)h->realloc(h, t->reg_cls, t->reg_cap, nc, 1);
    180     if (!nl || !nt || !nc_arr) wfail(t, "wasm: out of memory");
    181     for (u32 i = t->reg_cap; i < nc; ++i) {
    182       nl[i] = 0xffffffffu;
    183       nt[i] = KIT_CG_TYPE_NONE;
    184       nc_arr[i] = 0;
    185     }
    186     t->reg_to_local = nl;
    187     t->reg_type = nt;
    188     t->reg_cls = nc_arr;
    189     t->reg_cap = nc;
    190   }
    191   /* CG may reuse the same Reg id with different value types: api_ensure_reg
    192    * for an SV_CMP reuses one of the cmp operands' Regs (originally e.g. i64)
    193    * to hold the i32 cmp result. Detect a type change and rebind to a fresh
    194    * wasm local; the previous binding is dead from CG's point of view. */
    195   if (t->reg_to_local[r] != 0xffffffffu && t->reg_type[r]) {
    196     WasmValType cached_vt = valtype_for_type(t, t->reg_type[r]);
    197     WasmValType want_vt = valtype_for_type(t, ty);
    198     if (cached_vt == want_vt) return t->reg_to_local[r];
    199     /* fall through to allocate fresh */
    200   }
    201   {
    202     WasmValType vt = valtype_for_type(t, ty);
    203     t->reg_to_local[r] = add_wasm_local(t, vt);
    204     t->reg_type[r] = ty;
    205     t->reg_cls[r] = (u8)cls;
    206   }
    207   return t->reg_to_local[r];
    208 }
    209 
    210 /* -----------------------------------------------------------------
    211  * WIR appending
    212  * ----------------------------------------------------------------- */
    213 
    214 static WIR* wir_push(WTarget* t) {
    215   Heap* h = t->c->ctx->heap;
    216   if (t->nwir == t->wir_cap) {
    217     u32 nc = t->wir_cap ? t->wir_cap * 2u : 64u;
    218     void* p = h->realloc(h, t->wir, sizeof(WIR) * t->wir_cap, sizeof(WIR) * nc,
    219                          _Alignof(WIR));
    220     if (!p) wfail(t, "wasm: out of memory");
    221     t->wir = (WIR*)p;
    222     t->wir_cap = nc;
    223   }
    224   WIR* w = &t->wir[t->nwir++];
    225   memset(w, 0, sizeof *w);
    226   return w;
    227 }
    228 
    229 /* Operand-kind encoding stored in WIR's imm_kind / imm_kind_b. */
    230 enum {
    231   WOP_REG = 0,
    232   WOP_IMM = 1,
    233   WOP_LOCAL = 2,
    234   WOP_WASM_LOCAL = 3,
    235   WOP_ADDR = 4
    236 };
    237 
    238 static void wir_capture_operand(WIR* w, int which, Operand op) {
    239   u32 kind;
    240   i64 ival;
    241   Reg r = REG_NONE;
    242   switch (op.kind) {
    243     case OPK_REG:
    244       kind = WOP_REG;
    245       r = op.v.reg;
    246       ival = 0;
    247       break;
    248     case OPK_IMM:
    249       kind = WOP_IMM;
    250       ival = op.v.imm;
    251       break;
    252     case OPK_LOCAL:
    253       kind = WOP_LOCAL;
    254       ival = (i64)op.v.frame_slot;
    255       break;
    256     default:
    257       kind = 99u;
    258       ival = 0;
    259       break;
    260   }
    261   if (which == 0) {
    262     w->imm_kind = kind;
    263     w->imm_a = ival;
    264     w->a = r;
    265   } else {
    266     w->imm_kind_b = kind;
    267     w->imm_b = ival;
    268     w->b = r;
    269   }
    270 }
    271 
    272 /* -----------------------------------------------------------------
    273  * Labels
    274  * ----------------------------------------------------------------- */
    275 
    276 Label wasm_label_new(CGTarget* tg) {
    277   WTarget* t = (WTarget*)tg;
    278   Heap* h = t->c->ctx->heap;
    279   if (t->nlabels == t->labels_cap) {
    280     u32 nc = t->labels_cap ? t->labels_cap * 2u : 16u;
    281     void* p = h->realloc(h, t->labels, sizeof(WLabel) * t->labels_cap,
    282                          sizeof(WLabel) * nc, _Alignof(WLabel));
    283     if (!p) wfail(t, "wasm: out of memory");
    284     t->labels = (WLabel*)p;
    285     t->labels_cap = nc;
    286   }
    287   u32 id = t->nlabels++;
    288   memset(&t->labels[id], 0, sizeof t->labels[id]);
    289   return (Label)(id + 1u);
    290 }
    291 
    292 static WLabel* lookup_label(WTarget* t, Label l) {
    293   if (l == LABEL_NONE || l - 1u >= t->nlabels) return NULL;
    294   return &t->labels[l - 1u];
    295 }
    296 
    297 void wasm_label_place(CGTarget* tg, Label l) {
    298   WTarget* t = (WTarget*)tg;
    299   WLabel* lbl = lookup_label(t, l);
    300   if (!lbl) wfail(t, "wasm: label_place on unknown label");
    301   /* If this label is registered to a scope, the scope ops drive the wasm
    302    * structure — placement here is a no-op. */
    303   if (lbl->kind == WLBL_SCOPE_BREAK || lbl->kind == WLBL_SCOPE_CONT) {
    304     lbl->placed = 1;
    305     t->dead = 0;
    306     return;
    307   }
    308   /* Free-standing label placement: record but emit nothing. The CG layer
    309    * sometimes places scope continue/break labels just before/after a
    310    * scope_begin/end pair; for wasm the structured scope ops drive the
    311    * `br N` targets, so these placements are no-ops. A subsequent jump or
    312    * cmp_branch that lands on an unbound label will diagnose. */
    313   lbl->placed = 1;
    314   lbl->wir_index = t->nwir;
    315   if (lbl->kind == WLBL_UNBOUND) lbl->kind = WLBL_FORWARD;
    316   WIR* w = wir_push(t);
    317   w->op = WIR_LABEL;
    318   w->labels[0] = l;
    319   t->dead = 0;
    320 }
    321 
    322 void wasm_jump(CGTarget* tg, Label l) {
    323   WTarget* t = (WTarget*)tg;
    324   if (t->dead) return;
    325   WLabel* lbl = lookup_label(t, l);
    326   if (!lbl) wfail(t, "wasm: jump to unknown label");
    327   WIR* w = wir_push(t);
    328   w->op = WIR_JUMP;
    329   w->labels[0] = l;
    330   t->dead = 1;
    331 }
    332 
    333 void wasm_cmp_branch(CGTarget* tg, CmpOp op, Operand a, Operand b, Label l) {
    334   WTarget* t = (WTarget*)tg;
    335   if (t->dead) return;
    336   if (!lookup_label(t, l)) wfail(t, "wasm: cmp_branch to unknown label");
    337   WIR* w = wir_push(t);
    338   w->op = WIR_CMP_BRANCH;
    339   w->cgop = (u8)op;
    340   wir_capture_operand(w, 0, a);
    341   wir_capture_operand(w, 1, b);
    342   w->type = a.type ? a.type : b.type;
    343   w->labels[0] = l;
    344 }
    345 
    346 void wasm_switch(CGTarget* tg, const CGSwitchDesc* d) {
    347   WTarget* t = (WTarget*)tg;
    348   WIR* w;
    349   if (t->dead) return;
    350   if (!d) wfail(t, "wasm: switch without descriptor");
    351   if (d->default_label == LABEL_NONE)
    352     wfail(t, "wasm: switch without default label");
    353   if (d->ncases && !d->cases) wfail(t, "wasm: switch case count without cases");
    354   if (d->selector.kind != OPK_REG && d->selector.kind != OPK_IMM &&
    355       d->selector.kind != OPK_LOCAL)
    356     wfail(t, "wasm: switch selector has unsupported operand kind");
    357   if (!lookup_label(t, d->default_label))
    358     wfail(t, "wasm: switch default label is unknown");
    359   for (u32 i = 0; i < d->ncases; ++i) {
    360     if (!lookup_label(t, d->cases[i].label))
    361       wfail(t, "wasm: switch case label is unknown");
    362   }
    363 
    364   w = wir_push(t);
    365   w->op = WIR_SWITCH;
    366   wir_capture_operand(w, 0, d->selector);
    367   w->type = d->selector_type;
    368   w->labels[0] = d->default_label;
    369   w->switch_ncases = d->ncases;
    370   if (d->ncases) {
    371     Heap* h = t->c->ctx->heap;
    372     w->switch_cases = (CGSwitchCase*)h->alloc(
    373         h, sizeof(CGSwitchCase) * d->ncases, _Alignof(CGSwitchCase));
    374     if (!w->switch_cases) wfail(t, "wasm: out of memory");
    375     memcpy(w->switch_cases, d->cases, sizeof(CGSwitchCase) * d->ncases);
    376   }
    377   t->dead = 1;
    378 }
    379 
    380 /* -----------------------------------------------------------------
    381  * Scopes
    382  * ----------------------------------------------------------------- */
    383 
    384 CGScope wasm_scope_begin(CGTarget* tg, const CGScopeDesc* d) {
    385   WTarget* t = (WTarget*)tg;
    386   if (t->nscopes >= 32u) wfail(t, "wasm: too many nested scopes (max 32)");
    387   WScope* s = &t->scopes[t->nscopes];
    388   memset(s, 0, sizeof *s);
    389   s->id = ++t->next_scope_id;
    390   s->cg_kind = d->kind;
    391   s->break_lbl = d->break_label;
    392   s->cont_lbl = d->continue_label;
    393   s->result_type = d->result_type;
    394 
    395   /* Wire scope's break/continue labels to this scope so jump()/cmp_branch()
    396    * to them can lower to wasm `br`. */
    397   if (d->break_label != LABEL_NONE) {
    398     WLabel* lbl = lookup_label(t, d->break_label);
    399     if (lbl) {
    400       lbl->kind = WLBL_SCOPE_BREAK;
    401       lbl->scope_id = s->id;
    402     }
    403   }
    404   if (d->continue_label != LABEL_NONE) {
    405     WLabel* lbl = lookup_label(t, d->continue_label);
    406     if (lbl) {
    407       lbl->kind = WLBL_SCOPE_CONT;
    408       lbl->scope_id = s->id;
    409     }
    410   }
    411 
    412   WIR* open = wir_push(t);
    413   open->op = WIR_SCOPE_OPEN;
    414   open->scope_id = s->id;
    415   open->cgop = d->kind;
    416   open->dst = REG_NONE;
    417 
    418   s->placed_in_wir = 1;
    419   t->nscopes++;
    420   return (CGScope)s->id;
    421 }
    422 
    423 static WScope* scope_by_id(WTarget* t, u32 id) {
    424   for (u32 i = 0; i < t->nscopes; ++i) {
    425     if (t->scopes[i].id == id) return &t->scopes[i];
    426   }
    427   return NULL;
    428 }
    429 
    430 void wasm_scope_end(CGTarget* tg, CGScope sc) {
    431   WTarget* t = (WTarget*)tg;
    432   WScope* s = scope_by_id(t, (u32)sc);
    433   if (!s) wfail(t, "wasm: scope_end on unknown scope");
    434   WIR* w = wir_push(t);
    435   w->op = WIR_SCOPE_CLOSE;
    436   w->scope_id = s->id;
    437   /* Pop the scope from the stack. CG always closes in LIFO order. */
    438   if (t->nscopes == 0 || t->scopes[t->nscopes - 1u].id != s->id)
    439     wfail(t, "wasm: scope_end out of LIFO order");
    440   t->nscopes--;
    441   t->dead = 0;
    442 }
    443 
    444 void wasm_break_to(CGTarget* tg, CGScope sc) {
    445   WTarget* t = (WTarget*)tg;
    446   if (t->dead) return;
    447   WScope* s = scope_by_id(t, (u32)sc);
    448   if (!s) wfail(t, "wasm: break_to unknown scope");
    449   WIR* w = wir_push(t);
    450   w->op = WIR_JUMP;
    451   w->labels[0] = s->break_lbl;
    452   t->dead = 1;
    453 }
    454 
    455 void wasm_continue_to(CGTarget* tg, CGScope sc) {
    456   WTarget* t = (WTarget*)tg;
    457   if (t->dead) return;
    458   WScope* s = scope_by_id(t, (u32)sc);
    459   if (!s) wfail(t, "wasm: continue_to unknown scope");
    460   WIR* w = wir_push(t);
    461   w->op = WIR_JUMP;
    462   w->labels[0] = s->cont_lbl;
    463   t->dead = 1;
    464 }
    465 
    466 /* -----------------------------------------------------------------
    467  * Function lifecycle
    468  * ----------------------------------------------------------------- */
    469 
    470 /* Forward decl — promotes an undefined function symbol's WasmFunc to an
    471  * import using the supplied ABI to build the wasm signature. */
    472 static void promote_import_func(WTarget* t, ObjSymId sym, WasmFunc* f,
    473                                 const ABIFuncInfo* abi);
    474 
    475 /* Lookup or allocate a Wasm function index for an ObjSymId. Returns
    476  * (wasm_func_idx, *out_func) on success. */
    477 static u32 sym_to_wasm_func(WTarget* t, ObjSymId sym, WasmFunc** out_func) {
    478   Heap* h = t->c->ctx->heap;
    479   if (sym >= t->sym_to_func_cap) {
    480     u32 nc = t->sym_to_func_cap ? t->sym_to_func_cap : 16u;
    481     while (nc <= sym) nc *= 2u;
    482     u32* p =
    483         (u32*)h->realloc(h, t->sym_to_func, sizeof(u32) * t->sym_to_func_cap,
    484                          sizeof(u32) * nc, _Alignof(u32));
    485     if (!p) wfail(t, "wasm: out of memory");
    486     for (u32 i = t->sym_to_func_cap; i < nc; ++i) p[i] = 0;
    487     t->sym_to_func = p;
    488     t->sym_to_func_cap = nc;
    489   }
    490   if (t->sym_to_func[sym]) {
    491     u32 idx = t->sym_to_func[sym] - 1u;
    492     if (out_func) *out_func = &t->module->funcs[idx];
    493     return idx;
    494   }
    495   /* Create a fresh WasmFunc and link. */
    496   WasmFunc* f = wasm_add_func(t->c, t->module);
    497   u32 idx = t->module->nfuncs - 1u;
    498   t->sym_to_func[sym] = idx + 1u;
    499   if (out_func) *out_func = f;
    500   return idx;
    501 }
    502 
    503 static WSlot* slot_push(WTarget* t) {
    504   Heap* h = t->c->ctx->heap;
    505   if (t->nslots == t->slots_cap) {
    506     u32 nc = t->slots_cap ? t->slots_cap * 2u : 16u;
    507     WSlot* ns = (WSlot*)h->realloc(h, t->slots, sizeof(WSlot) * t->slots_cap,
    508                                    sizeof(WSlot) * nc, _Alignof(WSlot));
    509     if (!ns) wfail(t, "wasm: out of memory");
    510     t->slots = ns;
    511     t->slots_cap = nc;
    512   }
    513   WSlot* s = &t->slots[t->nslots++];
    514   memset(s, 0, sizeof *s);
    515   return s;
    516 }
    517 
    518 /* True iff `ty` maps to a single wasm value type (i32/i64/f32/f64) and so can
    519  * live directly in a wasm local. Aggregates (records/arrays) and anything
    520  * wider than 8 bytes must be homed in linear memory. */
    521 static int type_is_wasm_scalar(WTarget* t, KitCgTypeId ty) {
    522   ABITypeInfo ti;
    523   if (!ty) return 0;
    524   ti = abi_cg_type_info(t->c->abi, ty);
    525   if (ti.scalar_kind == ABI_SC_VOID) return 0;
    526   if (ti.scalar_kind == ABI_SC_PTR) return 1;
    527   return ti.size <= 8u;
    528 }
    529 
    530 static FrameSlot alloc_frame_slot_kind(WTarget* t, KitCgTypeId ty, u32 size,
    531                                        u32 align, int stack_backed) {
    532   WSlot* s;
    533   u32 slot_id;
    534   type_size_align(t, ty, size, align, &size, &align);
    535   /* A non-scalar type has no wasm value type; force it into linear memory. */
    536   if (!stack_backed && !type_is_wasm_scalar(t, ty)) stack_backed = 1;
    537   s = slot_push(t);
    538   s->type = ty;
    539   s->size = size;
    540   s->align = align;
    541   if (stack_backed) {
    542     ensure_stack_pointer(t);
    543     t->frame_size = align_to_u32(t->frame_size, align);
    544     s->kind = W_SLOT_STACK;
    545     s->frame_offset = t->frame_size;
    546     t->frame_size += size;
    547     if (align > t->frame_align) t->frame_align = align;
    548     t->has_stack_frame = 1;
    549   } else {
    550     s->kind = W_SLOT_LOCAL;
    551     s->wasm_local = add_wasm_local(t, valtype_for_type(t, ty));
    552   }
    553   slot_id = t->nslots;
    554   return (FrameSlot)slot_id;
    555 }
    556 
    557 static WSlot* slot_for(WTarget* t, FrameSlot fs) {
    558   if (fs == FRAME_SLOT_NONE) wfail(t, "wasm: FRAME_SLOT_NONE used");
    559   u32 idx = fs - 1u;
    560   if (idx >= t->nslots) wfail(t, "wasm: bad frame slot id");
    561   return &t->slots[idx];
    562 }
    563 
    564 static void promote_slot_to_stack(WTarget* t, WSlot* s) {
    565   if (s->kind == W_SLOT_STACK) return;
    566   ensure_stack_pointer(t);
    567   t->frame_size = align_to_u32(t->frame_size, s->align ? s->align : 1u);
    568   s->frame_offset = t->frame_size;
    569   t->frame_size += s->size ? s->size : 1u;
    570   if (s->align > t->frame_align) t->frame_align = s->align;
    571   s->kind = W_SLOT_STACK;
    572   t->has_stack_frame = 1;
    573 }
    574 
    575 void wasm_func_begin(CGTarget* tg, const CGFuncDesc* d) {
    576   WTarget* t = (WTarget*)tg;
    577   WasmFunc* f;
    578   u32 idx;
    579   const CgType* fnty;
    580   const ABIFuncInfo* abi;
    581   Heap* h = t->c->ctx->heap;
    582 
    583   ensure_module(t);
    584   t->cur_fn_desc = d;
    585   memset(&t->cur_stmt_loc, 0, sizeof t->cur_stmt_loc);
    586   t->nwir = 0;
    587   t->nlabels = 0;
    588   t->nslots = 0;
    589   t->nscopes = 0;
    590   t->next_scope_id = 0;
    591   t->frame_size = 0;
    592   t->frame_align = 1;
    593   t->frame_base_local = 0xffffffffu;
    594   t->frame_saved_sp_local = 0xffffffffu;
    595   t->has_stack_frame = 0;
    596   t->dead = 0;
    597   t->sret_param_local = 0xffffffffu;
    598   t->va_ptr_param_local = 0xffffffffu;
    599   t->cur_has_sret = 0;
    600   t->cur_is_variadic = 0;
    601   t->varcall_saved_sp_local = 0xffffffffu;
    602   t->varcall_buf_local = 0xffffffffu;
    603   t->va_arg_tmp_addr_local = 0xffffffffu;
    604   t->nparams_cg = 0;
    605   t->nbyval_copies = 0;
    606   /* Wipe reg map. */
    607   for (u32 i = 0; i < t->reg_cap; ++i) t->reg_to_local[i] = 0xffffffffu;
    608 
    609   idx = sym_to_wasm_func(t, d->sym, &f);
    610   t->cur_func_idx = idx;
    611   t->cur_func = f;
    612 
    613   fnty = cg_type_get(t->c, d->fn_type);
    614   if (!fnty || fnty->kind != KIT_CG_TYPE_FUNC)
    615     wfail(t, "wasm: func_begin without function type");
    616   abi = d->abi;
    617   if (!abi) wfail(t, "wasm: func_begin with no ABI info");
    618 
    619   /* Build the wasm function's param layout:
    620    *   [sret_ptr]? [param_0] [param_1] ...
    621    * with IGNORE'd CG params dropped. Record per-CG-param the wasm-local
    622    * index so wasm_param can place each frame slot on the right local.
    623    * params/locals/local_names grow on demand — no fixed cap. */
    624   f->nparams = 0;
    625   if (abi->has_sret) {
    626     t->sret_param_local =
    627         wasm_func_push_param(t->c, t->module, f, WASM_VAL_I32);
    628     t->cur_has_sret = 1;
    629     ensure_linear_memory(t);
    630   }
    631   if (fnty->func.nparams > t->param_local_idx_cap) {
    632     u32 nc = t->param_local_idx_cap ? t->param_local_idx_cap : 4u;
    633     while (nc < fnty->func.nparams) nc *= 2u;
    634     u32* p = (u32*)h->realloc(h, t->param_local_idx,
    635                               sizeof(u32) * t->param_local_idx_cap,
    636                               sizeof(u32) * nc, _Alignof(u32));
    637     if (!p) wfail(t, "wasm: out of memory");
    638     t->param_local_idx = p;
    639     t->param_local_idx_cap = nc;
    640   }
    641   t->nparams_cg = fnty->func.nparams;
    642   for (u32 i = 0; i < fnty->func.nparams; ++i) {
    643     const ABIArgInfo* ai = &abi->params[i];
    644     if (ai->kind == ABI_ARG_IGNORE) {
    645       t->param_local_idx[i] = 0xffffffffu;
    646       continue;
    647     }
    648     if (ai->kind == ABI_ARG_INDIRECT) {
    649       t->param_local_idx[i] =
    650           wasm_func_push_param(t->c, t->module, f, WASM_VAL_I32);
    651       ensure_linear_memory(t);
    652     } else {
    653       if (ai->nparts != 1)
    654         wfail(t, "wasm: multi-part DIRECT param %u not yet implemented", i);
    655       const ABIArgPart* p = &ai->parts[0];
    656       WasmValType vt = valtype_for_size_kind(
    657           t, p->size, p->cls == ABI_CLASS_FP ? ABI_SC_FLOAT : ABI_SC_INT);
    658       t->param_local_idx[i] = wasm_func_push_param(t->c, t->module, f, vt);
    659     }
    660   }
    661   /* Variadic: append hidden i32 va_ptr trailing param. Must match
    662    * abi_to_wasm_func_type so indirect calls' signature interning agrees. */
    663   if (abi->variadic) {
    664     t->va_ptr_param_local =
    665         wasm_func_push_param(t->c, t->module, f, WASM_VAL_I32);
    666     t->cur_is_variadic = 1;
    667     ensure_linear_memory(t);
    668   }
    669   f->nresults = 0;
    670   if (!abi->has_sret && abi->ret.kind == ABI_ARG_DIRECT &&
    671       abi->ret.nparts == 1) {
    672     const ABIArgPart* p = &abi->ret.parts[0];
    673     f->results[0] = valtype_for_size_kind(
    674         t, p->size, p->cls == ABI_CLASS_FP ? ABI_SC_FLOAT : ABI_SC_INT);
    675     f->nresults = 1;
    676   }
    677   f->typeidx = wasm_intern_func_type(t->c, t->module, f);
    678 
    679   /* Export under the symbol's name when the symbol is globally bound. */
    680   const ObjSym* sym = obj_symbol_get(t->obj, d->sym);
    681   if (sym && sym->bind != SB_LOCAL) {
    682     const char* name = pool_sym_cstr(t->c->global, sym->name, NULL);
    683     if (name && *name) {
    684       Heap* h = t->c->ctx->heap;
    685       size_t nlen = strlen(name);
    686       char* dup = (char*)h->alloc(h, nlen + 1u, 1);
    687       memcpy(dup, name, nlen + 1u);
    688       f->export_name = dup;
    689       WasmExport* e = wasm_add_export(t->c, t->module);
    690       char* exp_name = (char*)h->alloc(h, nlen + 1u, 1);
    691       memcpy(exp_name, name, nlen + 1u);
    692       e->name = exp_name;
    693       e->kind = 0; /* function export */
    694       e->index = idx;
    695     }
    696   }
    697 }
    698 
    699 CGLocalStorage wasm_param(CGTarget* tg, const CGParamDesc* d) {
    700   WTarget* t = (WTarget*)tg;
    701   CGLocalStorage ls;
    702   Heap* h = t->c->ctx->heap;
    703   u32 wli;
    704   WSlot* s;
    705   memset(&ls, 0, sizeof ls);
    706   if (d->index >= t->nparams_cg)
    707     wfail(t, "wasm: param index %u out of range (nparams=%u)", d->index,
    708           t->nparams_cg);
    709   wli = t->param_local_idx[d->index];
    710   if (wli == 0xffffffffu) {
    711     /* ABI_ARG_IGNORE — no wasm storage. Push a placeholder slot so the
    712      * returned FrameSlot is meaningful to CG; it never gets emitted. */
    713     s = slot_push(t);
    714     s->type = d->type;
    715     s->size = d->size;
    716     s->align = d->align ? d->align : 1u;
    717     s->kind = W_SLOT_LOCAL;
    718     s->wasm_local = 0;
    719     ls.kind = CG_LOCAL_STORAGE_FRAME;
    720     ls.v.frame_slot = (FrameSlot)t->nslots;
    721     return ls;
    722   }
    723   if (d->abi && d->abi->kind == ABI_ARG_INDIRECT) {
    724     /* byval: callee receives an i32 pointer; copy the aggregate into a
    725      * caller-isolated stack-backed slot at function entry. */
    726     u32 size = d->size;
    727     u32 align = d->align ? d->align : 1u;
    728     type_size_align(t, d->type, size, align, &size, &align);
    729     ensure_stack_pointer(t);
    730     s = slot_push(t);
    731     s->type = d->type;
    732     s->size = size;
    733     s->align = align;
    734     s->kind = W_SLOT_STACK;
    735     t->frame_size = align_to_u32(t->frame_size, align);
    736     s->frame_offset = t->frame_size;
    737     t->frame_size += size;
    738     if (align > t->frame_align) t->frame_align = align;
    739     t->has_stack_frame = 1;
    740     /* Queue prologue copy-in from the pointer's wasm-local into &slot. */
    741     if (t->nbyval_copies == t->byval_copies_cap) {
    742       u32 nc = t->byval_copies_cap ? t->byval_copies_cap * 2u : 4u;
    743       WByvalCopy* nb = (WByvalCopy*)h->realloc(
    744           h, t->byval_copies, sizeof(WByvalCopy) * t->byval_copies_cap,
    745           sizeof(WByvalCopy) * nc, _Alignof(WByvalCopy));
    746       if (!nb) wfail(t, "wasm: out of memory");
    747       t->byval_copies = nb;
    748       t->byval_copies_cap = nc;
    749     }
    750     WByvalCopy* bc = &t->byval_copies[t->nbyval_copies++];
    751     bc->ptr_wasm_local = wli;
    752     bc->dst_slot_id = t->nslots - 1u;
    753     ls.kind = CG_LOCAL_STORAGE_FRAME;
    754     ls.v.frame_slot = (FrameSlot)t->nslots;
    755     return ls;
    756   }
    757   if (d->flags & CG_LOCAL_ADDR_TAKEN) {
    758     wfail(t, "wasm: address-taken parameter not yet implemented");
    759   }
    760   s = slot_push(t);
    761   s->type = d->type;
    762   s->size = d->size;
    763   s->align = d->align ? d->align : 1u;
    764   s->kind = W_SLOT_LOCAL;
    765   s->wasm_local = wli;
    766   ls.kind = CG_LOCAL_STORAGE_FRAME;
    767   ls.v.frame_slot = (FrameSlot)t->nslots;
    768   return ls;
    769 }
    770 
    771 /* Allocate a frame slot backed by a fresh wasm local. */
    772 static FrameSlot alloc_frame_slot(WTarget* t, KitCgTypeId ty) {
    773   return alloc_frame_slot_kind(t, ty, 0, 0, 0);
    774 }
    775 
    776 FrameSlot wasm_frame_slot(CGTarget* tg, const FrameSlotDesc* d) {
    777   WTarget* t = (WTarget*)tg;
    778   if (!d->type && !d->size) wfail(t, "wasm: frame slot without type/size");
    779   return alloc_frame_slot_kind(
    780       t, d->type, d->size, d->align,
    781       (d->flags & FSF_ADDR_TAKEN) != 0 || d->kind == FS_ALLOCA);
    782 }
    783 
    784 CGLocalStorage wasm_local(CGTarget* tg, const CGLocalDesc* d) {
    785   WTarget* t = (WTarget*)tg;
    786   CGLocalStorage ls;
    787   memset(&ls, 0, sizeof ls);
    788   if (d->flags & (CG_LOCAL_ADDR_TAKEN | CG_LOCAL_MEMORY_REQUIRED)) {
    789     ls.kind = CG_LOCAL_STORAGE_FRAME;
    790     ls.v.frame_slot = alloc_frame_slot_kind(t, d->type, d->size, d->align, 1);
    791     return ls;
    792   }
    793   ls.kind = CG_LOCAL_STORAGE_FRAME;
    794   ls.v.frame_slot = alloc_frame_slot(t, d->type);
    795   return ls;
    796 }
    797 
    798 /* -----------------------------------------------------------------
    799  * Data-movement records
    800  * ----------------------------------------------------------------- */
    801 
    802 void wasm_load_imm(CGTarget* tg, Operand dst, i64 imm) {
    803   WTarget* t = (WTarget*)tg;
    804   if (t->dead) return;
    805   if (dst.kind != OPK_REG) wfail(t, "wasm: load_imm dst must be REG");
    806   WIR* w = wir_push(t);
    807   w->op = WIR_LOAD_IMM;
    808   w->dst = dst.v.reg;
    809   w->imm = imm;
    810   w->type = dst.type;
    811   w->cls = dst.cls;
    812 }
    813 
    814 void wasm_load_const(CGTarget* tg, Operand dst, ConstBytes cb) {
    815   WTarget* t = (WTarget*)tg;
    816   if (t->dead) return;
    817   if (dst.kind != OPK_REG) wfail(t, "wasm: load_const dst must be REG");
    818   WasmValType vt = valtype_for_type(t, cb.type);
    819   WIR* w = wir_push(t);
    820   w->dst = dst.v.reg;
    821   w->type = cb.type;
    822   w->cls = dst.cls;
    823   if (vt == WASM_VAL_F32) {
    824     if (cb.size != 4) wfail(t, "wasm: f32 const must be 4 bytes");
    825     float f;
    826     memcpy(&f, cb.bytes, 4);
    827     w->op = WIR_LOAD_CONST_F;
    828     w->fp_imm = (double)f;
    829   } else if (vt == WASM_VAL_F64) {
    830     if (cb.size != 8) wfail(t, "wasm: f64 const must be 8 bytes");
    831     double v;
    832     memcpy(&v, cb.bytes, 8);
    833     w->op = WIR_LOAD_CONST_F;
    834     w->fp_imm = v;
    835   } else {
    836     i64 v = 0;
    837     memcpy(&v, cb.bytes, cb.size < 8 ? cb.size : 8u);
    838     /* Sign-extend for small signed types so the immediate has the expected
    839      * bit pattern. */
    840     if (cb.size == 1)
    841       v = (i64)(i8)v;
    842     else if (cb.size == 2)
    843       v = (i64)(i16)v;
    844     else if (cb.size == 4)
    845       v = (i64)(i32)v;
    846     w->op = WIR_LOAD_IMM;
    847     w->imm = v;
    848   }
    849 }
    850 
    851 void wasm_copy(CGTarget* tg, Operand dst, Operand src) {
    852   WTarget* t = (WTarget*)tg;
    853   if (t->dead) return;
    854   if (dst.kind != OPK_REG || src.kind != OPK_REG)
    855     wfail(t, "wasm: copy operands must both be REG");
    856   WIR* w = wir_push(t);
    857   w->op = WIR_COPY;
    858   w->dst = dst.v.reg;
    859   w->a = src.v.reg;
    860   w->type = dst.type;
    861 }
    862 
    863 void wasm_binop(CGTarget* tg, BinOp op, Operand dst, Operand a, Operand b) {
    864   WTarget* t = (WTarget*)tg;
    865   if (t->dead) return;
    866   if (dst.kind != OPK_REG) wfail(t, "wasm: binop dst must be REG");
    867   WIR* w = wir_push(t);
    868   w->op = WIR_BINOP;
    869   w->cgop = (u8)op;
    870   w->dst = dst.v.reg;
    871   wir_capture_operand(w, 0, a);
    872   wir_capture_operand(w, 1, b);
    873   w->type = dst.type;
    874   w->cls = dst.cls;
    875 }
    876 
    877 void wasm_unop(CGTarget* tg, UnOp op, Operand dst, Operand a) {
    878   WTarget* t = (WTarget*)tg;
    879   if (t->dead) return;
    880   if (dst.kind != OPK_REG) wfail(t, "wasm: unop dst must be REG");
    881   WIR* w = wir_push(t);
    882   w->op = WIR_UNOP;
    883   w->cgop = (u8)op;
    884   w->dst = dst.v.reg;
    885   wir_capture_operand(w, 0, a);
    886   w->type = dst.type;
    887   w->cls = dst.cls;
    888 }
    889 
    890 void wasm_cmp(CGTarget* tg, CmpOp op, Operand dst, Operand a, Operand b) {
    891   WTarget* t = (WTarget*)tg;
    892   if (t->dead) return;
    893   if (dst.kind != OPK_REG) wfail(t, "wasm: cmp dst must be REG");
    894   WIR* w = wir_push(t);
    895   w->op = WIR_CMP;
    896   w->cgop = (u8)op;
    897   w->dst = dst.v.reg;
    898   wir_capture_operand(w, 0, a);
    899   wir_capture_operand(w, 1, b);
    900   w->type = dst.type;
    901   w->type2 = a.type ? a.type : b.type;
    902   w->cls = dst.cls;
    903 }
    904 
    905 void wasm_convert(CGTarget* tg, ConvKind ck, Operand dst, Operand src) {
    906   WTarget* t = (WTarget*)tg;
    907   if (t->dead) return;
    908   if (dst.kind != OPK_REG) wfail(t, "wasm: convert dst must be REG");
    909   WIR* w = wir_push(t);
    910   w->op = WIR_CONVERT;
    911   w->cgop = (u8)ck;
    912   w->dst = dst.v.reg;
    913   wir_capture_operand(w, 0, src);
    914   w->type = dst.type;
    915   w->type2 = src.type;
    916   w->cls = dst.cls;
    917 }
    918 
    919 /* Build (or reuse) the wasm typeidx for a function-typed indirect call. The
    920  * signature shape must exactly match a direct call to the same C type:
    921  * - hidden i32 sret pointer prepended when ABI has_sret
    922  * - per-param: i32 pointer for ABI_ARG_INDIRECT, else the DIRECT scalar
    923  *   produced by the wasm32 BasicCABI classifier (IGNORE params dropped)
    924  * - result: empty when has_sret, else the DIRECT scalar
    925  *
    926  * call_indirect's runtime type check compares this typeidx against the
    927  * funcref's recorded type, so any mismatch with the direct-call path would
    928  * trap. The temporary WasmFunc is stack-allocated; wasm_intern_func_type
    929  * copies the param array on insertion. */
    930 /* Translate an ABI function signature into the wasm-level param/result list.
    931  * Used both for indirect-call signature interning and for import-function
    932  * type synthesis. `what` names the call site in diagnostics. Returns the
    933  * interned type index. The caller-provided buffer `params` (length `cap`) is
    934  * filled from index 0; *nparams_out is the count written. */
    935 static u32 abi_to_wasm_func_type(WTarget* t, const ABIFuncInfo* abi,
    936                                  WasmValType* params, u32 cap, u32* nparams_out,
    937                                  WasmValType* result_out, u32* nresults_out,
    938                                  const char* what) {
    939   WasmFunc tmp;
    940   memset(&tmp, 0, sizeof tmp);
    941   tmp.params = params;
    942   tmp.cap_params = cap;
    943   if (abi->has_sret) {
    944     if (tmp.nparams >= cap) wfail(t, "wasm: %s has too many params", what);
    945     params[tmp.nparams++] = WASM_VAL_I32;
    946   }
    947   for (u32 i = 0; i < abi->nparams; ++i) {
    948     const ABIArgInfo* ai = &abi->params[i];
    949     if (ai->kind == ABI_ARG_IGNORE) continue;
    950     if (tmp.nparams >= cap) wfail(t, "wasm: %s has too many params", what);
    951     if (ai->kind == ABI_ARG_INDIRECT) {
    952       params[tmp.nparams++] = WASM_VAL_I32;
    953     } else {
    954       if (ai->nparts != 1)
    955         wfail(t, "wasm: %s has multi-part DIRECT param (unsupported)", what);
    956       const ABIArgPart* p = &ai->parts[0];
    957       params[tmp.nparams++] = valtype_for_size_kind(
    958           t, p->size, p->cls == ABI_CLASS_FP ? ABI_SC_FLOAT : ABI_SC_INT);
    959     }
    960   }
    961   /* Variadic functions take one hidden trailing i32 va_ptr — the address of
    962    * the caller-packed varargs buffer in linear memory. See wasm_call's
    963    * variadic packing and wasm_va_start in this file. */
    964   if (abi->variadic) {
    965     if (tmp.nparams >= cap) wfail(t, "wasm: %s has too many params", what);
    966     params[tmp.nparams++] = WASM_VAL_I32;
    967   }
    968   tmp.nresults = 0;
    969   if (!abi->has_sret && abi->ret.kind == ABI_ARG_DIRECT &&
    970       abi->ret.nparts == 1) {
    971     const ABIArgPart* p = &abi->ret.parts[0];
    972     tmp.results[0] = valtype_for_size_kind(
    973         t, p->size, p->cls == ABI_CLASS_FP ? ABI_SC_FLOAT : ABI_SC_INT);
    974     tmp.nresults = 1;
    975   }
    976   if (nparams_out) *nparams_out = tmp.nparams;
    977   if (result_out && tmp.nresults) *result_out = tmp.results[0];
    978   if (nresults_out) *nresults_out = tmp.nresults;
    979   return wasm_intern_func_type(t->c, t->module, &tmp);
    980 }
    981 
    982 static u32 intern_indirect_signature(WTarget* t, const ABIFuncInfo* abi) {
    983   WasmValType params[64];
    984   return abi_to_wasm_func_type(t, abi, params, 64u, NULL, NULL, NULL,
    985                                "indirect call");
    986 }
    987 
    988 /* Promote `f` (already allocated for `sym` via sym_to_wasm_func) into a wasm
    989  * `(import "<module>" "<field>" (func ...))` entry. The signature is
    990  * synthesized from the supplied ABIFuncInfo, mirroring the layout the
    991  * caller-side WIR_CALL pushes onto the stack: hidden i32 sret-pointer when
    992  * has_sret, followed by lowered params, with a single i32/i64/f32/f64
    993  * result for direct, non-sret returns. Module/field default to "env" / the
    994  * symbol's name; either may be overridden by
    995  * `__attribute__((import_module/import_name))`. */
    996 static void promote_import_func(WTarget* t, ObjSymId sym, WasmFunc* f,
    997                                 const ABIFuncInfo* abi) {
    998   Heap* h = t->c->ctx->heap;
    999   const ObjSym* os;
   1000   const char* sym_name;
   1001   size_t sym_name_len = 0;
   1002   Sym attr_module = 0;
   1003   Sym attr_name = 0;
   1004   const char* mod_str = "env";
   1005   size_t mod_len = sizeof("env") - 1u;
   1006   if (!t->module) return;
   1007   if (f->is_import) return;
   1008   os = obj_symbol_get(t->obj, sym);
   1009   if (!os) return;
   1010   if (os->section_id != OBJ_SEC_NONE) return; /* already defined locally */
   1011   if (os->kind != SK_UNDEF && os->kind != SK_FUNC) return;
   1012   if (!abi)
   1013     wfail(t,
   1014           "wasm: cannot synthesize import signature for '%s' "
   1015           "(missing ABI info)",
   1016           pool_sym_cstr(t->c->global, os->name, NULL));
   1017   if (f->ninsns != 0)
   1018     wfail(t, "wasm: cannot promote function with emitted body to import");
   1019   /* Synthesize the wasm type from the ABI. Diagnoses unsupported shapes
   1020    * (varargs => multi-part DIRECT or extra parts; by-value aggregates that
   1021    * the ABI didn't already lower to ABI_ARG_INDIRECT) by naming the import
   1022    * symbol so the error points at the C declaration. */
   1023   WasmValType params[64];
   1024   u32 nparams = 0;
   1025   WasmValType result_vt = 0;
   1026   u32 nresults = 0;
   1027   char what[160];
   1028   sym_name = pool_sym_cstr(t->c->global, os->name, &sym_name_len);
   1029   if (!sym_name) sym_name = "(anonymous)";
   1030   /* Snprintf-free: build a short context string by hand to avoid pulling in
   1031    * stdio. The buffer is large enough for any plausible C identifier. */
   1032   {
   1033     const char* prefix = "import '";
   1034     const char* suffix = "'";
   1035     size_t plen = 8u; /* strlen(prefix) */
   1036     size_t slen = 1u; /* strlen(suffix) */
   1037     size_t nlen = sym_name_len ? sym_name_len : strlen(sym_name);
   1038     if (nlen > sizeof(what) - plen - slen - 1u)
   1039       nlen = sizeof(what) - plen - slen - 1u;
   1040     memcpy(what, prefix, plen);
   1041     memcpy(what + plen, sym_name, nlen);
   1042     memcpy(what + plen + nlen, suffix, slen);
   1043     what[plen + nlen + slen] = 0;
   1044   }
   1045   f->typeidx = abi_to_wasm_func_type(t, abi, params, 64u, &nparams, &result_vt,
   1046                                      &nresults, what);
   1047   f->has_typeidx = 1;
   1048   /* Mirror the synthesized params/results onto the WasmFunc so the import
   1049    * encoder writes the matching signature. */
   1050   wasm_func_set_params(t->c, t->module, f, params, nparams);
   1051   f->nresults = nresults;
   1052   if (nresults) f->results[0] = result_vt;
   1053   /* Resolve module/name overrides set via __attribute__((import_module/
   1054    * import_name)) on the C declaration. */
   1055   (void)wasm_imports_get(t->obj, os->name, &attr_module, &attr_name);
   1056   if (attr_module) mod_str = pool_sym_cstr(t->c->global, attr_module, &mod_len);
   1057   const char* name_str =
   1058       attr_name ? pool_sym_cstr(t->c->global, attr_name, &sym_name_len)
   1059                 : sym_name;
   1060   size_t name_len = attr_name
   1061                         ? sym_name_len
   1062                         : (sym_name_len ? sym_name_len : strlen(name_str));
   1063   f->is_import = 1;
   1064   {
   1065     char* m = (char*)h->alloc(h, mod_len + 1u, 1);
   1066     if (!m) wfail(t, "wasm: out of memory");
   1067     memcpy(m, mod_str, mod_len);
   1068     m[mod_len] = 0;
   1069     f->import_module = m;
   1070   }
   1071   {
   1072     char* n = (char*)h->alloc(h, name_len + 1u, 1);
   1073     if (!n) wfail(t, "wasm: out of memory");
   1074     memcpy(n, name_str, name_len);
   1075     n[name_len] = 0;
   1076     f->import_name = n;
   1077   }
   1078 }
   1079 
   1080 const char* wasm_tail_call_unrealizable_reason(CGTarget* tg,
   1081                                                const CGCallDesc* d) {
   1082   (void)tg;
   1083   /* Variadic tail calls are not realizable on wasm: varargs are packed into a
   1084    * buffer carved from this function's linear-memory frame, which return_call
   1085    * tears down before the callee reads it. sret is realizable — the tail
   1086    * forwards the function's own incoming sret pointer (see wasm_call). wasm
   1087    * function parameters are wasm locals, so there is no caller stack-arg area
   1088    * to overflow. */
   1089   if (d->abi && d->abi->variadic)
   1090     return "wasm cannot tail-call a variadic function (its vararg buffer "
   1091            "lives in the frame a sibling call tears down)";
   1092   return NULL;
   1093 }
   1094 
   1095 void wasm_call(CGTarget* tg, const CGCallDesc* d) {
   1096   WTarget* t = (WTarget*)tg;
   1097   if (t->dead) return;
   1098   int is_indirect = (d->callee.kind != OPK_GLOBAL);
   1099   if (is_indirect && d->callee.kind != OPK_REG)
   1100     wfail(t, "wasm: indirect call callee must be a register (got opkind %u)",
   1101           (unsigned)d->callee.kind);
   1102   Heap* h = t->c->ctx->heap;
   1103   int callee_has_sret = (d->abi && d->abi->has_sret) ? 1 : 0;
   1104   int callee_variadic = (d->abi && d->abi->variadic) ? 1 : 0;
   1105   int is_tail = (d->flags & CG_CALL_TAIL) ? 1 : 0;
   1106   if (is_tail) {
   1107     /* Realizability is decided by CG via wasm_tail_call_unrealizable_reason
   1108      * before CG_CALL_TAIL is set: variadic tails are rejected there, and sret
   1109      * tails forward the incoming sret pointer (handled in the WIR emit). */
   1110     ensure_module(t);
   1111     t->module->features |= WASM_FEATURE_TAIL_CALLS;
   1112   }
   1113   u32 nfixed = (u32)d->nargs;
   1114   u32 nvar = 0u;
   1115   if (callee_variadic) {
   1116     if (d->nargs < d->abi->nparams)
   1117       wfail(t, "wasm: variadic call has fewer args (%u) than fixed params (%u)",
   1118             (unsigned)d->nargs, (unsigned)d->abi->nparams);
   1119     nfixed = d->abi->nparams;
   1120     nvar = (u32)d->nargs - nfixed;
   1121   }
   1122   WIR* w = wir_push(t);
   1123   if (is_indirect) {
   1124     if (!d->abi) wfail(t, "wasm: indirect call without ABIFuncInfo");
   1125     ensure_module(t);
   1126     w->op = WIR_CALL_INDIRECT;
   1127     w->a = d->callee.v.reg;
   1128     w->imm = (i64)intern_indirect_signature(t, d->abi);
   1129   } else {
   1130     w->op = WIR_CALL;
   1131     w->call_sym = d->callee.v.global.sym;
   1132     /* Direct calls into externally-defined functions become wasm imports.
   1133      * Synthesize the import signature now while the ABI is available — the
   1134      * WIR emit loop only has the symbol index. The C frontend mints SK_FUNC
   1135      * for `extern foo(...)` declarations; the "undefined" signal is
   1136      * `section_id == OBJ_SEC_NONE`. SK_UNDEF can appear when a symbol's
   1137      * kind hasn't been pinned down yet. */
   1138     {
   1139       const ObjSym* os = obj_symbol_get(t->obj, d->callee.v.global.sym);
   1140       if (os && os->section_id == OBJ_SEC_NONE &&
   1141           (os->kind == SK_UNDEF || os->kind == SK_FUNC)) {
   1142         WasmFunc* f;
   1143         ensure_module(t);
   1144         (void)sym_to_wasm_func(t, d->callee.v.global.sym, &f);
   1145         if (!f->is_import)
   1146           promote_import_func(t, d->callee.v.global.sym, f, d->abi);
   1147       }
   1148     }
   1149   }
   1150   w->call_narg = nfixed;
   1151   w->type = d->ret.type;
   1152   w->call_has_sret = (u8)callee_has_sret;
   1153   w->call_variadic = (u8)callee_variadic;
   1154   w->call_tail = (u8)is_tail;
   1155   w->call_nvar = nvar;
   1156   if (callee_variadic) ensure_linear_memory(t);
   1157   if (callee_has_sret) {
   1158     /* Caller allocated a frame slot for the aggregate result via
   1159      * api_alloc_call_ret_storage; pass its address as the prepended i32. */
   1160     w->call_sret_addr = d->ret.storage;
   1161     ensure_linear_memory(t);
   1162   }
   1163   if (nfixed) {
   1164     w->call_args = (Reg*)h->alloc(h, sizeof(Reg) * nfixed, _Alignof(Reg));
   1165     w->call_arg_imms = (i64*)h->alloc(h, sizeof(i64) * nfixed, _Alignof(i64));
   1166     w->call_arg_kinds = (u8*)h->alloc(h, nfixed, 1);
   1167     w->call_arg_types = (KitCgTypeId*)h->alloc(h, sizeof(KitCgTypeId) * nfixed,
   1168                                                _Alignof(KitCgTypeId));
   1169     w->call_arg_addrs =
   1170         (Operand*)h->alloc(h, sizeof(Operand) * nfixed, _Alignof(Operand));
   1171     memset(w->call_arg_addrs, 0, sizeof(Operand) * nfixed);
   1172     for (u32 i = 0; i < nfixed; ++i) {
   1173       const CGABIValue* av = &d->args[i];
   1174       w->call_arg_types[i] = av->type;
   1175       int is_indirect = (av->abi && av->abi->kind == ABI_ARG_INDIRECT);
   1176       if (is_indirect) {
   1177         if (av->storage.kind != OPK_LOCAL && av->storage.kind != OPK_INDIRECT &&
   1178             av->storage.kind != OPK_GLOBAL) {
   1179           wfail(t, "wasm: byval call arg %u storage kind %u must be an address",
   1180                 i, (unsigned)av->storage.kind);
   1181         }
   1182         w->call_arg_kinds[i] = WOP_ADDR;
   1183         w->call_args[i] = REG_NONE;
   1184         w->call_arg_imms[i] = 0;
   1185         w->call_arg_addrs[i] = av->storage;
   1186         ensure_linear_memory(t);
   1187       } else if (av->storage.kind == OPK_REG) {
   1188         w->call_arg_kinds[i] = 0;
   1189         w->call_args[i] = av->storage.v.reg;
   1190         w->call_arg_imms[i] = 0;
   1191       } else if (av->storage.kind == OPK_IMM) {
   1192         w->call_arg_kinds[i] = 1;
   1193         w->call_args[i] = REG_NONE;
   1194         w->call_arg_imms[i] = av->storage.v.imm;
   1195       } else {
   1196         wfail(t,
   1197               "wasm: call arg %u has unsupported operand kind %u (only "
   1198               "REG/IMM scalar args are supported in v1)",
   1199               i, (unsigned)av->storage.kind);
   1200       }
   1201     }
   1202   }
   1203   if (nvar) {
   1204     w->call_var_regs = (Reg*)h->alloc(h, sizeof(Reg) * nvar, _Alignof(Reg));
   1205     w->call_var_imms = (i64*)h->alloc(h, sizeof(i64) * nvar, _Alignof(i64));
   1206     w->call_var_kinds = (u8*)h->alloc(h, nvar, 1);
   1207     w->call_var_types = (KitCgTypeId*)h->alloc(h, sizeof(KitCgTypeId) * nvar,
   1208                                                _Alignof(KitCgTypeId));
   1209     for (u32 i = 0; i < nvar; ++i) {
   1210       const CGABIValue* av = &d->args[nfixed + i];
   1211       const CgType* aty = av->type ? cg_type_get(t->c, av->type) : NULL;
   1212       w->call_var_types[i] = av->type;
   1213       if (aty &&
   1214           (aty->kind == KIT_CG_TYPE_RECORD || aty->kind == KIT_CG_TYPE_ARRAY)) {
   1215         wfail(t, "wasm target: aggregate variadic arg %u not yet supported", i);
   1216       }
   1217       if (av->storage.kind == OPK_REG) {
   1218         w->call_var_kinds[i] = WOP_REG;
   1219         w->call_var_regs[i] = av->storage.v.reg;
   1220         w->call_var_imms[i] = 0;
   1221       } else if (av->storage.kind == OPK_IMM) {
   1222         w->call_var_kinds[i] = WOP_IMM;
   1223         w->call_var_regs[i] = REG_NONE;
   1224         w->call_var_imms[i] = av->storage.v.imm;
   1225       } else {
   1226         wfail(t,
   1227               "wasm target: variadic arg %u has unsupported operand kind %u "
   1228               "(only REG/IMM scalar args supported in v1)",
   1229               i, (unsigned)av->storage.kind);
   1230       }
   1231     }
   1232   }
   1233   if (callee_has_sret) {
   1234     /* The call has no wasm result; the buffer pointed to by the sret arg
   1235      * holds the aggregate. */
   1236     w->dst = REG_NONE;
   1237   } else if (d->ret.storage.kind == OPK_REG &&
   1238              d->ret.storage.v.reg != REG_NONE) {
   1239     w->dst = d->ret.storage.v.reg;
   1240   } else {
   1241     w->dst = REG_NONE;
   1242   }
   1243 }
   1244 
   1245 void wasm_ret(CGTarget* tg, const CGABIValue* v) {
   1246   WTarget* t = (WTarget*)tg;
   1247   if (t->dead) return;
   1248   WIR* w = wir_push(t);
   1249   w->op = WIR_RET;
   1250   if (t->cur_has_sret && v && v->abi && v->abi->kind == ABI_ARG_INDIRECT) {
   1251     /* Aggregate sret return: emit a memcpy from av.storage to the buffer
   1252      * pointed to by the hidden sret parameter, then a void return. */
   1253     w->addr = v->storage;
   1254     w->type = v->type;
   1255     w->agg.size = (u32)abi_cg_sizeof(t->c->abi, v->type);
   1256     w->agg.align = 1u;
   1257     w->cgop = 1; /* tag: sret copy */
   1258     w->dst = REG_NONE;
   1259   } else if (v && v->storage.kind == OPK_REG && v->storage.v.reg != REG_NONE) {
   1260     w->dst = v->storage.v.reg;
   1261     w->type = v->type;
   1262   } else if (v && v->storage.kind == OPK_IMM) {
   1263     w->imm_kind = 1;
   1264     w->imm_a = v->storage.v.imm;
   1265     w->type = v->type;
   1266     w->dst = REG_NONE;
   1267   } else {
   1268     w->dst = REG_NONE;
   1269   }
   1270   t->dead = 1;
   1271 }
   1272 
   1273 void wasm_load(CGTarget* tg, Operand dst, Operand addr, MemAccess mem) {
   1274   WTarget* t = (WTarget*)tg;
   1275   if (t->dead) return;
   1276   if (dst.kind != OPK_REG) wfail(t, "wasm: load dst must be REG");
   1277   if (addr.kind != OPK_LOCAL ||
   1278       slot_for(t, addr.v.frame_slot)->kind == W_SLOT_STACK)
   1279     ensure_linear_memory(t);
   1280   WIR* w = wir_push(t);
   1281   w->op = (addr.kind == OPK_LOCAL &&
   1282            slot_for(t, addr.v.frame_slot)->kind == W_SLOT_LOCAL)
   1283               ? WIR_LOAD_LOCAL
   1284               : WIR_LOAD_MEM;
   1285   w->dst = dst.v.reg;
   1286   w->addr = addr;
   1287   if (addr.kind == OPK_LOCAL) {
   1288     WSlot* s = slot_for(t, addr.v.frame_slot);
   1289     w->imm =
   1290         (w->op == WIR_LOAD_LOCAL) ? (i64)s->wasm_local : (i64)addr.v.frame_slot;
   1291   }
   1292   w->mem = mem;
   1293   w->type = dst.type;
   1294   w->cls = dst.cls;
   1295 }
   1296 
   1297 void wasm_store(CGTarget* tg, Operand addr, Operand src, MemAccess mem) {
   1298   WTarget* t = (WTarget*)tg;
   1299   if (t->dead) return;
   1300   if (addr.kind != OPK_LOCAL ||
   1301       slot_for(t, addr.v.frame_slot)->kind == W_SLOT_STACK)
   1302     ensure_linear_memory(t);
   1303   WIR* w = wir_push(t);
   1304   w->op = (addr.kind == OPK_LOCAL &&
   1305            slot_for(t, addr.v.frame_slot)->kind == W_SLOT_LOCAL)
   1306               ? WIR_STORE_LOCAL
   1307               : WIR_STORE_MEM;
   1308   w->addr = addr;
   1309   if (addr.kind == OPK_LOCAL) {
   1310     WSlot* s = slot_for(t, addr.v.frame_slot);
   1311     w->imm = (w->op == WIR_STORE_LOCAL) ? (i64)s->wasm_local
   1312                                         : (i64)addr.v.frame_slot;
   1313   }
   1314   wir_capture_operand(w, 0, src);
   1315   w->mem = mem;
   1316   /* The store's value type is the accessed type. When storing through a
   1317    * pointer register, addr.type is the (possibly void) pointer rvalue type,
   1318    * not the pointee — so prefer the MemAccess type, which always describes
   1319    * the element being written, before falling back to the operands. */
   1320   w->type = mem.type ? mem.type : (addr.type ? addr.type : src.type);
   1321 }
   1322 
   1323 /* Variadic CG hooks. va_list on wasm32 is a single i32 pointer into a
   1324  * caller-packed buffer of 8-byte slots (see wasm_call's variadic packing).
   1325  * va_start writes the hidden va_ptr param into *ap; va_arg loads T from
   1326  * *ap and advances *ap by 8; va_end is a no-op; va_copy copies the i32. */
   1327 void wasm_va_start(CGTarget* tg, Operand ap_addr) {
   1328   WTarget* t = (WTarget*)tg;
   1329   if (t->dead) return;
   1330   if (!t->cur_is_variadic || t->va_ptr_param_local == 0xffffffffu)
   1331     wfail(t, "wasm: va_start in non-variadic function");
   1332   ensure_linear_memory(t);
   1333   WIR* w = wir_push(t);
   1334   w->op = WIR_VA_START;
   1335   w->addr = ap_addr;
   1336 }
   1337 
   1338 void wasm_va_arg(CGTarget* tg, Operand dst, Operand ap_addr, KitCgTypeId type) {
   1339   WTarget* t = (WTarget*)tg;
   1340   if (t->dead) return;
   1341   if (dst.kind != OPK_REG) wfail(t, "wasm: va_arg dst must be REG");
   1342   const CgType* aty = type ? cg_type_get(t->c, type) : NULL;
   1343   if (aty &&
   1344       (aty->kind == KIT_CG_TYPE_RECORD || aty->kind == KIT_CG_TYPE_ARRAY)) {
   1345     wfail(t, "wasm target: va_arg of aggregate type not yet supported");
   1346   }
   1347   ensure_linear_memory(t);
   1348   WIR* w = wir_push(t);
   1349   w->op = WIR_VA_ARG;
   1350   w->dst = dst.v.reg;
   1351   w->addr = ap_addr;
   1352   w->type = type;
   1353   w->cls = dst.cls;
   1354 }
   1355 
   1356 void wasm_va_end(CGTarget* tg, Operand ap_addr) {
   1357   WTarget* t = (WTarget*)tg;
   1358   (void)ap_addr;
   1359   if (t->dead) return;
   1360   /* No-op: nothing to release. */
   1361 }
   1362 
   1363 void wasm_va_copy(CGTarget* tg, Operand dst_ap_addr, Operand src_ap_addr) {
   1364   WTarget* t = (WTarget*)tg;
   1365   if (t->dead) return;
   1366   ensure_linear_memory(t);
   1367   WIR* w = wir_push(t);
   1368   w->op = WIR_VA_COPY;
   1369   w->addr = dst_ap_addr;
   1370   w->call_sret_addr = src_ap_addr; /* reused slot — see WIR comment */
   1371 }
   1372 
   1373 void wasm_addr_of(CGTarget* tg, Operand dst, Operand lv) {
   1374   WTarget* t = (WTarget*)tg;
   1375   if (t->dead) return;
   1376   if (dst.kind != OPK_REG) wfail(t, "wasm: addr_of dst must be REG");
   1377   if (lv.kind == OPK_LOCAL) {
   1378     WSlot* s = slot_for(t, lv.v.frame_slot);
   1379     if (s->kind == W_SLOT_LOCAL) {
   1380       u32 old_local = s->wasm_local;
   1381       promote_slot_to_stack(t, s);
   1382       WIR* st = wir_push(t);
   1383       st->op = WIR_STORE_MEM;
   1384       st->addr = lv;
   1385       st->type = s->type;
   1386       st->mem.type = s->type;
   1387       st->mem.size = s->size;
   1388       st->mem.align = s->align;
   1389       st->imm_kind = WOP_WASM_LOCAL;
   1390       st->imm_a = old_local;
   1391     }
   1392   } else {
   1393     ensure_linear_memory(t);
   1394   }
   1395   WIR* w = wir_push(t);
   1396   w->op = WIR_ADDR_OF;
   1397   w->dst = dst.v.reg;
   1398   w->addr = lv;
   1399   w->type = dst.type;
   1400   w->cls = dst.cls;
   1401 }
   1402 
   1403 void wasm_alloca(CGTarget* tg, Operand dst, Operand size, u32 align) {
   1404   WTarget* t = (WTarget*)tg;
   1405   if (t->dead) return;
   1406   ensure_linear_memory(t);
   1407   if (dst.kind != OPK_REG) wfail(t, "wasm: alloca dst must be REG");
   1408   ensure_stack_pointer(t);
   1409   t->has_stack_frame = 1;
   1410   WIR* w = wir_push(t);
   1411   w->op = WIR_ALLOCA;
   1412   w->dst = dst.v.reg;
   1413   wir_capture_operand(w, 0, size);
   1414   w->type = dst.type;
   1415   w->type2 = size.type;
   1416   w->cls = dst.cls;
   1417   w->imm = align ? align : 16u;
   1418 }
   1419 
   1420 void wasm_copy_bytes(CGTarget* tg, Operand dst, Operand src,
   1421                      AggregateAccess a) {
   1422   WTarget* t = (WTarget*)tg;
   1423   if (t->dead) return;
   1424   ensure_linear_memory(t);
   1425   WIR* w = wir_push(t);
   1426   w->op = WIR_COPY_BYTES;
   1427   w->addr = dst;
   1428   wir_capture_operand(w, 0, src);
   1429   w->agg = a;
   1430 }
   1431 
   1432 void wasm_set_bytes(CGTarget* tg, Operand dst, Operand byte,
   1433                     AggregateAccess a) {
   1434   WTarget* t = (WTarget*)tg;
   1435   if (t->dead) return;
   1436   WIR* w = wir_push(t);
   1437   w->op = WIR_SET_BYTES;
   1438   w->addr = dst;
   1439   wir_capture_operand(w, 0, byte);
   1440   w->agg = a;
   1441 }
   1442 
   1443 /* Atomic ops. CG forces `addr` to a REG and accepts reg-or-imm for value
   1444  * operands. Wasm only models seq_cst; the KitCgMemOrder argument is captured
   1445  * but not encoded — every emitted atomic op is sequentially consistent. The
   1446  * caller-provided MemAccess carries the type and natural alignment we need
   1447  * for the memarg width. */
   1448 static void atomic_require_addr_reg(WTarget* t, Operand addr,
   1449                                     const char* what) {
   1450   if (addr.kind != OPK_REG)
   1451     wfail(t, "wasm: %s address must be in a register (got opkind %u)", what,
   1452           (unsigned)addr.kind);
   1453 }
   1454 
   1455 void wasm_atomic_load(CGTarget* tg, Operand dst, Operand addr, MemAccess mem,
   1456                       KitCgMemOrder mo) {
   1457   WTarget* t = (WTarget*)tg;
   1458   (void)mo;
   1459   if (t->dead) return;
   1460   if (dst.kind != OPK_REG) wfail(t, "wasm: atomic_load dst must be REG");
   1461   atomic_require_addr_reg(t, addr, "atomic_load");
   1462   ensure_shared_memory(t);
   1463   WIR* w = wir_push(t);
   1464   w->op = WIR_ATOMIC_LOAD;
   1465   w->dst = dst.v.reg;
   1466   w->a = addr.v.reg;
   1467   w->mem = mem;
   1468   w->type = dst.type ? dst.type : mem.type;
   1469   w->cls = dst.cls;
   1470 }
   1471 
   1472 void wasm_atomic_store(CGTarget* tg, Operand addr, Operand src, MemAccess mem,
   1473                        KitCgMemOrder mo) {
   1474   WTarget* t = (WTarget*)tg;
   1475   (void)mo;
   1476   if (t->dead) return;
   1477   atomic_require_addr_reg(t, addr, "atomic_store");
   1478   if (src.kind != OPK_REG && src.kind != OPK_IMM)
   1479     wfail(t, "wasm: atomic_store value must be REG or IMM");
   1480   ensure_shared_memory(t);
   1481   WIR* w = wir_push(t);
   1482   w->op = WIR_ATOMIC_STORE;
   1483   w->a = addr.v.reg;
   1484   wir_capture_operand(w, 1, src);
   1485   w->mem = mem;
   1486   w->type = mem.type ? mem.type : src.type;
   1487 }
   1488 
   1489 void wasm_atomic_rmw(CGTarget* tg, KitCgAtomicOp op, Operand dst, Operand addr,
   1490                      Operand val, MemAccess mem, KitCgMemOrder mo) {
   1491   WTarget* t = (WTarget*)tg;
   1492   (void)mo;
   1493   if (t->dead) return;
   1494   if (dst.kind != OPK_REG) wfail(t, "wasm: atomic_rmw dst must be REG");
   1495   atomic_require_addr_reg(t, addr, "atomic_rmw");
   1496   if (val.kind != OPK_REG && val.kind != OPK_IMM)
   1497     wfail(t, "wasm: atomic_rmw value must be REG or IMM");
   1498   /* KIT_CG_ATOMIC_NAND has no native wasm-threads opcode; the linearizer
   1499    * expands it into an atomic cmpxchg retry loop (see WIR_ATOMIC_RMW). */
   1500   ensure_shared_memory(t);
   1501   WIR* w = wir_push(t);
   1502   w->op = WIR_ATOMIC_RMW;
   1503   w->cgop = (u8)op;
   1504   w->dst = dst.v.reg;
   1505   w->a = addr.v.reg;
   1506   wir_capture_operand(w, 1, val);
   1507   w->mem = mem;
   1508   w->type = dst.type ? dst.type : mem.type;
   1509   w->cls = dst.cls;
   1510 }
   1511 
   1512 void wasm_atomic_cas(CGTarget* tg, Operand prior, Operand ok, Operand addr,
   1513                      Operand expected, Operand desired, MemAccess mem,
   1514                      KitCgMemOrder success, KitCgMemOrder failure) {
   1515   WTarget* t = (WTarget*)tg;
   1516   (void)success;
   1517   (void)failure;
   1518   if (t->dead) return;
   1519   if (prior.kind != OPK_REG) wfail(t, "wasm: atomic_cas prior must be REG");
   1520   if (ok.kind != OPK_REG) wfail(t, "wasm: atomic_cas ok must be REG");
   1521   atomic_require_addr_reg(t, addr, "atomic_cas");
   1522   if (expected.kind != OPK_REG && expected.kind != OPK_IMM)
   1523     wfail(t, "wasm: atomic_cas expected must be REG or IMM");
   1524   if (desired.kind != OPK_REG && desired.kind != OPK_IMM)
   1525     wfail(t, "wasm: atomic_cas desired must be REG or IMM");
   1526   ensure_shared_memory(t);
   1527   WIR* w = wir_push(t);
   1528   w->op = WIR_ATOMIC_CAS;
   1529   w->dst = prior.v.reg;
   1530   w->dst2 = ok.v.reg;
   1531   w->a = addr.v.reg;
   1532   wir_capture_operand(w, 1, expected);
   1533   /* Capture desired into op_c/imm_kind_c/imm_c (third operand slot). */
   1534   if (desired.kind == OPK_REG) {
   1535     w->imm_kind_c = WOP_REG;
   1536     w->op_c = desired.v.reg;
   1537   } else {
   1538     w->imm_kind_c = WOP_IMM;
   1539     w->imm_c = desired.v.imm;
   1540   }
   1541   w->mem = mem;
   1542   w->type = prior.type ? prior.type : mem.type;
   1543   w->cls = prior.cls;
   1544   w->type2 = ok.type;
   1545 }
   1546 
   1547 void wasm_fence(CGTarget* tg, KitCgMemOrder mo) {
   1548   WTarget* t = (WTarget*)tg;
   1549   (void)mo;
   1550   if (t->dead) return;
   1551   /* Wasm atomic.fence does not require a memory to exist, but in practice it
   1552    * is meaningful only inside a module that has shared memory. We don't
   1553    * force-create memory here to avoid producing a bogus memory for fence-only
   1554    * modules. */
   1555   WIR* w = wir_push(t);
   1556   w->op = WIR_FENCE;
   1557 }
   1558 
   1559 /* Forward decls: defined further down. */
   1560 static WasmValType type_valtype(WTarget* t, KitCgTypeId ty);
   1561 void wasm_emit_unreachable(WTarget* t);
   1562 
   1563 /* Per-intrinsic-name diagnostic text. Used both by the recorder for
   1564  * SETJMP/LONGJMP (which we still reject) and for the fallback panic so
   1565  * users see "wasm target: __builtin_clz ..." instead of a numeric kind. */
   1566 static const char* intrin_name(IntrinKind k) {
   1567   switch (k) {
   1568     case INTRIN_NONE:
   1569       return "<none>";
   1570     case INTRIN_POPCOUNT:
   1571       return "__builtin_popcount";
   1572     case INTRIN_CTZ:
   1573       return "__builtin_ctz";
   1574     case INTRIN_CLZ:
   1575       return "__builtin_clz";
   1576     case INTRIN_BSWAP:
   1577       return "__builtin_bswap";
   1578     case INTRIN_MEMMOVE:
   1579       return "memmove";
   1580     case INTRIN_PREFETCH:
   1581       return "__builtin_prefetch";
   1582     case INTRIN_ASSUME_ALIGNED:
   1583       return "__builtin_assume_aligned";
   1584     case INTRIN_EXPECT:
   1585       return "__builtin_expect";
   1586     case INTRIN_TRAP:
   1587       return "__builtin_trap";
   1588     case INTRIN_SYSCALL:
   1589       return "__kit_syscall";
   1590     case INTRIN_SETJMP:
   1591       return "setjmp";
   1592     case INTRIN_LONGJMP:
   1593       return "longjmp";
   1594     case INTRIN_SADD_OVERFLOW:
   1595       return "__builtin_sadd_overflow";
   1596     case INTRIN_UADD_OVERFLOW:
   1597       return "__builtin_uadd_overflow";
   1598     case INTRIN_SSUB_OVERFLOW:
   1599       return "__builtin_ssub_overflow";
   1600     case INTRIN_USUB_OVERFLOW:
   1601       return "__builtin_usub_overflow";
   1602     case INTRIN_SMUL_OVERFLOW:
   1603       return "__builtin_smul_overflow";
   1604     case INTRIN_UMUL_OVERFLOW:
   1605       return "__builtin_umul_overflow";
   1606     case INTRIN_CPU_NOP:
   1607       return "cpu_nop";
   1608     case INTRIN_CPU_YIELD:
   1609       return "cpu_yield";
   1610     case INTRIN_WFI:
   1611       return "wfi";
   1612     case INTRIN_WFE:
   1613       return "wfe";
   1614     case INTRIN_SEV:
   1615       return "sev";
   1616     case INTRIN_ISB:
   1617       return "isb";
   1618     case INTRIN_DMB:
   1619       return "dmb";
   1620     case INTRIN_DSB:
   1621       return "dsb";
   1622     case INTRIN_IRQ_SAVE:
   1623       return "irq_save";
   1624     case INTRIN_IRQ_RESTORE:
   1625       return "irq_restore";
   1626     case INTRIN_IRQ_ENABLE:
   1627       return "irq_enable";
   1628     case INTRIN_IRQ_DISABLE:
   1629       return "irq_disable";
   1630     case INTRIN_FRAME_ADDRESS:
   1631       return "frame_address";
   1632     case INTRIN_RETURN_ADDRESS:
   1633       return "return_address";
   1634   }
   1635   return "<unknown>";
   1636 }
   1637 
   1638 void wasm_intrinsic(CGTarget* tg, IntrinKind k, Operand* dst, u32 ndst,
   1639                     const Operand* args, u32 nargs) {
   1640   WTarget* t = (WTarget*)tg;
   1641   if (t->dead) return;
   1642 
   1643   switch (k) {
   1644     case INTRIN_TRAP:
   1645       wasm_emit_unreachable(t);
   1646       return;
   1647 
   1648     case INTRIN_PREFETCH:
   1649       /* No-op hint. */
   1650       return;
   1651 
   1652     case INTRIN_EXPECT:
   1653     case INTRIN_ASSUME_ALIGNED:
   1654       /* Pass-through hint: result = first argument. CG always allocates a
   1655        * dst reg for these; copy arg[0] there so downstream uses see the
   1656        * expected value. CG keeps the first arg as OPK_IMM when it was a
   1657        * literal so the constant flows through unchanged. */
   1658       if (ndst == 1 && nargs >= 1) {
   1659         if (args[0].kind == OPK_IMM) {
   1660           wasm_load_imm(tg, dst[0], args[0].v.imm);
   1661         } else {
   1662           wasm_copy(tg, dst[0], args[0]);
   1663         }
   1664       }
   1665       return;
   1666 
   1667     case INTRIN_MEMMOVE: {
   1668       /* memmove lowers to memory.copy, which is spec-defined to handle overlap
   1669        * correctly. CG forces (dst, src) to REG and passes size as OPK_IMM
   1670        * (kit_cg_memmove). */
   1671       if (nargs != 3 || args[0].kind != OPK_REG || args[1].kind != OPK_REG) {
   1672         compiler_panic(t->c, cur_loc(t),
   1673                        "wasm target: %s requires register pointers",
   1674                        intrin_name(k));
   1675         return;
   1676       }
   1677       if (args[2].kind != OPK_IMM) {
   1678         compiler_panic(t->c, cur_loc(t),
   1679                        "wasm target: %s with non-constant size is not yet "
   1680                        "supported",
   1681                        intrin_name(k));
   1682         return;
   1683       }
   1684       ensure_linear_memory(t);
   1685       AggregateAccess a;
   1686       memset(&a, 0, sizeof a);
   1687       a.size = (u32)args[2].v.imm;
   1688       a.align = 1;
   1689       WIR* w = wir_push(t);
   1690       w->op = WIR_COPY_BYTES;
   1691       w->addr = args[0];
   1692       wir_capture_operand(w, 0, args[1]);
   1693       w->agg = a;
   1694       return;
   1695     }
   1696 
   1697     case INTRIN_CLZ:
   1698     case INTRIN_CTZ:
   1699     case INTRIN_POPCOUNT:
   1700     case INTRIN_BSWAP: {
   1701       if (ndst != 1 || nargs != 1 || dst[0].kind != OPK_REG ||
   1702           args[0].kind != OPK_REG) {
   1703         compiler_panic(t->c, cur_loc(t),
   1704                        "wasm target: %s requires single REG operand",
   1705                        intrin_name(k));
   1706         return;
   1707       }
   1708       WIR* w = wir_push(t);
   1709       w->op = WIR_INTRINSIC;
   1710       w->cgop = (u8)k;
   1711       w->dst = dst[0].v.reg;
   1712       w->a = args[0].v.reg;
   1713       w->type = dst[0].type;
   1714       /* clz/ctz/popcount return int (i32) but operate at the operand's width
   1715        * (e.g. __builtin_ctzl over an i64). The wasm op width must follow the
   1716        * operand, with a wrap to the i32 dst afterward. type2 carries it. */
   1717       w->type2 = args[0].type;
   1718       w->cls = dst[0].cls;
   1719       return;
   1720     }
   1721 
   1722     case INTRIN_SADD_OVERFLOW:
   1723     case INTRIN_UADD_OVERFLOW:
   1724     case INTRIN_SSUB_OVERFLOW:
   1725     case INTRIN_USUB_OVERFLOW:
   1726     case INTRIN_SMUL_OVERFLOW:
   1727     case INTRIN_UMUL_OVERFLOW: {
   1728       if (ndst != 2 || nargs != 2 || dst[0].kind != OPK_REG ||
   1729           dst[1].kind != OPK_REG) {
   1730         compiler_panic(t->c, cur_loc(t),
   1731                        "wasm target: %s requires 2 args + 2 result regs",
   1732                        intrin_name(k));
   1733         return;
   1734       }
   1735       /* Reject i64 mul-overflow for now: wasm core has no widening 64x64
   1736        * multiply, so the standard expansion would need partial-product
   1737        * synthesis. 32-bit (the common shape on wasm32) is supported. */
   1738       WasmValType vt = type_valtype(t, dst[0].type);
   1739       if (vt == WASM_VAL_I64 &&
   1740           (k == INTRIN_SMUL_OVERFLOW || k == INTRIN_UMUL_OVERFLOW)) {
   1741         compiler_panic(t->c, cur_loc(t),
   1742                        "wasm target: 64-bit checked-overflow multiply is "
   1743                        "not yet supported");
   1744         return;
   1745       }
   1746       WIR* w = wir_push(t);
   1747       w->op = WIR_INTRINSIC;
   1748       w->cgop = (u8)k;
   1749       w->dst = dst[0].v.reg;
   1750       w->dst2 = dst[1].v.reg;
   1751       w->type = dst[0].type;
   1752       w->cls = dst[0].cls;
   1753       wir_capture_operand(w, 0, args[0]);
   1754       wir_capture_operand(w, 1, args[1]);
   1755       return;
   1756     }
   1757 
   1758     case INTRIN_SETJMP:
   1759     case INTRIN_LONGJMP:
   1760       compiler_panic(t->c, cur_loc(t),
   1761                      "wasm target: %s is not yet supported (no exception/"
   1762                      "stack-unwind runtime)",
   1763                      intrin_name(k));
   1764       return;
   1765 
   1766     /* Baremetal/CPU-control intrinsics have no wasm lowering;
   1767      * kit_cg_target_supports_intrinsic reports them false so frontends
   1768      * diagnose before reaching here. Fall through to the generic panic. */
   1769     case INTRIN_CPU_NOP:
   1770     case INTRIN_CPU_YIELD:
   1771     case INTRIN_WFI:
   1772     case INTRIN_WFE:
   1773     case INTRIN_SEV:
   1774     case INTRIN_ISB:
   1775     case INTRIN_DMB:
   1776     case INTRIN_DSB:
   1777     case INTRIN_IRQ_SAVE:
   1778     case INTRIN_IRQ_RESTORE:
   1779     case INTRIN_IRQ_ENABLE:
   1780     case INTRIN_IRQ_DISABLE:
   1781     case INTRIN_SYSCALL:
   1782     /* No frame-pointer chain in wasm; reported unsupported up front. */
   1783     case INTRIN_FRAME_ADDRESS:
   1784     case INTRIN_RETURN_ADDRESS:
   1785     case INTRIN_NONE:
   1786       break;
   1787   }
   1788   compiler_panic(t->c, cur_loc(t),
   1789                  "wasm target: intrinsic %s not yet implemented",
   1790                  intrin_name(k));
   1791 }
   1792 
   1793 /* Inline asm v1 — see doc/WASM.md "Inline asm" for the surface contract.
   1794  *
 * Template syntax: WAT instruction sequence, parsed and validated as the body
 * of a synthetic function whose params are the inputs (local indices
 * 0 .. nin-1), whose locals are the outputs (indices nin .. nin+nout-1), and
 * which declares no results. The snippet reads inputs and writes outputs with
 * local.get/set/tee; outputs are read back from their locals after the body,
 * not popped from the value stack.
   1799  *
   1800  * Constraints: "r" → wasm local; "i" → const-folded; "m" → i32 address.
   1801  * Numeric tie-back constraints ("0","1",...) reuse the referenced output's
   1802  * slot (cg/asm.c expands them into duplicate input entries at the end).
   1803  *
   1804  * Disallowed in v1: escaping br/br_if/br_table, return/return_call*,
   1805  * call_indirect, snippet-internal locals, output count > 1, register
   1806  * clobbers (only `memory` is accepted). */
   1807 static WasmValType wasm_asm_operand_vt(WTarget* t, KitCgTypeId ty,
   1808                                        const char* what, SrcLoc loc) {
   1809   WasmValType vt;
   1810   if (!ty) wfail_at(t, loc, "wasm target: asm %s with no CG type", what);
   1811   vt = valtype_for_type(t, ty);
   1812   if (vt != WASM_VAL_I32 && vt != WASM_VAL_I64 && vt != WASM_VAL_F32 &&
   1813       vt != WASM_VAL_F64)
   1814     wfail_at(t, loc, "wasm target: asm %s of non-scalar type not supported",
   1815              what);
   1816   return vt;
   1817 }
   1818 
/* Record an inline-asm statement as a WIR_ASM_BLOCK entry.
 *
 *   tmpl            - NUL-terminated WAT template text (measured with strlen).
 *   outs / out_ops  - nout output constraints and their operands.
 *   ins / in_ops    - nin input constraints and their operands.
 *   clob            - nclob clobber symbols.
 *
 * Steps, in order: reject clobbers other than `memory` and any constraint
 * with a hard register; build a scratch WasmFunc whose params are the inputs
 * and whose locals are the outputs; parse the template into it; walk the
 * parsed instructions rejecting escaping branches, returns/tail calls and
 * out-of-range local indices; validate the body; copy the instructions and
 * operand descriptions into heap arrays hung off a new WIR entry; free the
 * scratch function's storage.
 *
 * Does nothing when the target's dead flag is set.
 *
 * NOTE(review): code following each wfail/wfail_at uses the value that was
 * just rejected, so those helpers presumably do not return — confirm. If
 * they unwind, the scratch buffers are not released on failure paths. */
void wasm_asm_block(CGTarget* tg, const char* tmpl, const AsmConstraint* outs,
                    u32 nout, Operand* out_ops, const AsmConstraint* ins,
                    u32 nin, const Operand* in_ops, const Sym* clob,
                    u32 nclob) {
  WTarget* t = (WTarget*)tg;
  Heap* h = t->c->ctx->heap;
  Sym sym_memory;
  WasmFunc scratch;
  SrcLoc loc = cur_loc(t);
  u32 depth;
  u32 i;

  if (t->dead) return;

  /* Clobber policy: only `memory` is meaningful on wasm (effective no-op
   * because cg/asm.c already spilled live SSA values). Reject named-register
   * clobbers explicitly. */
  sym_memory = pool_intern_slice(t->c->global, SLICE_LIT("memory"));
  for (i = 0; i < nclob; ++i) {
    if (clob[i] != sym_memory)
      wfail_at(t, loc, "wasm target: asm register clobbers not yet supported");
  }
  /* A constraint that names a hard register is rejected on either side. */
  for (i = 0; i < nout; ++i) {
    if (outs[i].reg)
      wfail_at(t, loc, "wasm target: asm hard-register operands not supported");
  }
  for (i = 0; i < nin; ++i) {
    if (ins[i].reg)
      wfail_at(t, loc, "wasm target: asm hard-register operands not supported");
  }

  /* Build a scratch WasmFunc with the synthetic signature. Layout is:
   *   params  = input types (indices 0 .. nin-1)
   *   locals  = output types (indices nin .. nin+nout-1)
   *   results = empty
   * Snippets use local.get/set/tee N to read/write either side. The author
   * is responsible for writing each output via local.set N (>= nin); empty
   * snippets paired with `+r` / numeric tieback constraints get identity
   * behavior because the input and output share a wasm local at lowering. */
  memset(&scratch, 0, sizeof scratch);
  for (i = 0; i < nin; ++i) {
    WasmValType vt = wasm_asm_operand_vt(t, ins[i].type, "input operand", loc);
    /* Constraint-specific checks: "i" requires an immediate operand; "m"
     * requires an indirect (i32 address). Only the first character of the
     * constraint string is inspected. */
    if (ins[i].str && ins[i].str[0] == 'i' && in_ops[i].kind != OPK_IMM)
      wfail_at(t, loc, "wasm target: asm 'i' input must be an immediate");
    if (ins[i].str && ins[i].str[0] == 'm') {
      if (in_ops[i].kind != OPK_INDIRECT)
        wfail_at(t, loc, "wasm target: asm 'm' input must be indirect");
      /* The param holds the address, so it is i32 whatever the pointee is. */
      vt = WASM_VAL_I32;
    }
    wasm_func_push_param(t->c, t->module, &scratch, vt);
  }
  for (i = 0; i < nout; ++i) {
    WasmValType vt =
        wasm_asm_operand_vt(t, outs[i].type, "output operand", loc);
    wasm_func_push_local(t->c, t->module, &scratch, vt);
  }
  /* No declared result — outputs are read from locals at the end of the
   * lowering, not popped from the value stack. */

  /* Parse the template into scratch.insns. */
  wasm_parse_wat_body(t->c, t->module, &scratch, tmpl, strlen(tmpl), loc);

  /* Walk the parsed body to reject constructs that escape or aren't
   * supported in v1. Track control depth so br/br_if/br_table with imm >=
   * depth (i.e. would branch out of the snippet) are rejected.
   * NOTE(review): no call_indirect or nout > 1 check is visible in this
   * function; confirm whether those restrictions are enforced elsewhere. */
  depth = 0;
  for (i = 0; i < scratch.ninsns; ++i) {
    WasmInsn* in = &scratch.insns[i];
    switch (in->kind) {
      case WASM_INSN_BLOCK:
      case WASM_INSN_LOOP:
      case WASM_INSN_IF:
        depth++;
        break;
      case WASM_INSN_END:
        /* Guarded so a surplus `end` cannot wrap the unsigned counter. */
        if (depth) depth--;
        break;
      case WASM_INSN_BR:
      case WASM_INSN_BR_IF:
        if (in->imm < 0 || (u64)in->imm >= depth)
          wfail_at(t, in->loc,
                   "wasm target: asm template branch escapes snippet");
        break;
      case WASM_INSN_BR_TABLE: {
        u32 k;
        for (k = 0; k < in->ntargets; ++k)
          if (in->targets[k] >= depth)
            wfail_at(t, in->loc,
                     "wasm target: asm template br_table escapes snippet");
        break;
      }
      case WASM_INSN_RETURN:
      case WASM_INSN_RETURN_CALL:
      case WASM_INSN_RETURN_CALL_INDIRECT:
      case WASM_INSN_RETURN_CALL_REF:
        wfail_at(t, in->loc,
                 "wasm target: return/tail-call in asm template not "
                 "supported");
        break;
      case WASM_INSN_LOCAL_GET:
      case WASM_INSN_LOCAL_SET:
      case WASM_INSN_LOCAL_TEE:
        /* Valid indices are exactly the nin params plus the nout locals. */
        if (in->imm < 0 || (u64)in->imm >= (u64)(nin + nout))
          wfail_at(t, in->loc,
                   "wasm target: asm template references local beyond "
                   "declared operands (snippet-internal locals not "
                   "supported)");
        break;
      default:
        break;
    }
  }

  /* Validate the body against the synthetic signature. */
  wasm_validate_func(t->c, t->module, &scratch);

  {
    /* Build the WIR_ASM_BLOCK payload. raw_insns is owned by the WIR; the
     * other arrays are too. Sizes are zero-when-empty per the WIR teardown
     * conventions. */
    WIR* w = wir_push(t);
    w->op = WIR_ASM_BLOCK;
    w->raw_ninsns = scratch.ninsns;
    if (scratch.ninsns) {
      w->raw_insns = (WasmInsn*)h->alloc(h, sizeof(WasmInsn) * scratch.ninsns,
                                         _Alignof(WasmInsn));
      if (!w->raw_insns) wfail(t, "wasm: out of memory");
      /* Element-wise byte copy: any pointer stored inside a WasmInsn (for
       * example `targets`) is copied as a pointer, not duplicated. */
      memcpy(w->raw_insns, scratch.insns, sizeof(WasmInsn) * scratch.ninsns);
    }
    w->asm_nin = nin;
    w->asm_nout = nout;
    if (nin) {
      /* Five parallel per-input arrays, all indexed by input position. */
      w->asm_in_kinds = (u8*)h->alloc(h, nin, 1);
      w->asm_in_imms = (i64*)h->alloc(h, sizeof(i64) * nin, _Alignof(i64));
      w->asm_in_regs = (Reg*)h->alloc(h, sizeof(Reg) * nin, _Alignof(Reg));
      w->asm_in_types = (KitCgTypeId*)h->alloc(h, sizeof(KitCgTypeId) * nin,
                                               _Alignof(KitCgTypeId));
      w->asm_in_share_out = (i32*)h->alloc(h, sizeof(i32) * nin, _Alignof(i32));
      if (!w->asm_in_kinds || !w->asm_in_imms || !w->asm_in_regs ||
          !w->asm_in_types || !w->asm_in_share_out)
        wfail(t, "wasm: out of memory");
      /* -1 means "this input is not tied to any output". */
      for (i = 0; i < nin; ++i) w->asm_in_share_out[i] = -1;
      for (i = 0; i < nin; ++i) {
        Operand op = in_ops[i];
        /* Numeric tieback constraints ("0".."9") share the matching
         * output's wasm local. cg/asm.c also rewrites +r inout outputs
         * into duplicate inputs at the tail of ins[], using the same
         * numeric encoding. */
        const char* s = ins[i].str;
        if (s && s[0] >= '0' && s[0] <= '9' && s[1] == '\0') {
          int idx = s[0] - '0';
          /* A digit naming a nonexistent output leaves the input untied. */
          if ((u32)idx < nout) w->asm_in_share_out[i] = idx;
        }
        switch (op.kind) {
          case OPK_REG:
            w->asm_in_kinds[i] = WOP_REG;
            w->asm_in_regs[i] = op.v.reg;
            w->asm_in_imms[i] = 0;
            w->asm_in_types[i] = op.type ? op.type : ins[i].type;
            break;
          case OPK_IMM:
            w->asm_in_kinds[i] = WOP_IMM;
            w->asm_in_regs[i] = REG_NONE;
            w->asm_in_imms[i] = op.v.imm;
            w->asm_in_types[i] = op.type ? op.type : ins[i].type;
            break;
          case OPK_INDIRECT:
            /* For "m" constraint: the input local holds the i32 address
             * `base + ofs` of the lvalue. We re-use asm_in_imms (unused
             * for WOP_REG operands) to carry the displacement so the
             * linearizer can splice in `i32.const ofs; i32.add` after
             * pushing the base local. */
            w->asm_in_kinds[i] = WOP_REG;
            w->asm_in_regs[i] = op.v.ind.base;
            w->asm_in_imms[i] = (i64)op.v.ind.ofs;
            w->asm_in_types[i] = builtin_id(KIT_CG_BUILTIN_I32);
            break;
          default:
            wfail_at(t, loc, "wasm target: unsupported asm input operand kind");
        }
      }
    }
    if (nout) {
      w->asm_out_regs = (Reg*)h->alloc(h, sizeof(Reg) * nout, _Alignof(Reg));
      w->asm_out_types = (KitCgTypeId*)h->alloc(h, sizeof(KitCgTypeId) * nout,
                                                _Alignof(KitCgTypeId));
      if (!w->asm_out_regs || !w->asm_out_types)
        wfail(t, "wasm: out of memory");
      for (i = 0; i < nout; ++i) {
        if (out_ops[i].kind != OPK_REG)
          wfail_at(t, loc, "wasm target: asm output must be a register");
        w->asm_out_regs[i] = out_ops[i].v.reg;
        /* Prefer the operand's own type; fall back to the constraint's. */
        w->asm_out_types[i] = out_ops[i].type ? out_ops[i].type : outs[i].type;
      }
    }
  }

  /* Free scratch func storage. The parsed insns have been copied into the
   * WIR payload. Each buffer is released with its capacity, not its count. */
  if (scratch.params)
    h->free(h, scratch.params, sizeof(WasmValType) * scratch.cap_params);
  if (scratch.locals)
    h->free(h, scratch.locals, sizeof(WasmValType) * scratch.cap_locals);
  if (scratch.insns)
    h->free(h, scratch.insns, sizeof(WasmInsn) * scratch.cap_insns);
}
   2027 
   2028 void wasm_file_scope_asm(CGTarget* tg, const char* src, size_t len) {
   2029   WTarget* t = (WTarget*)tg;
   2030   (void)src;
   2031   (void)len;
   2032   compiler_panic(t->c, cur_loc(t),
   2033                  "wasm target: file-scope asm not yet supported");
   2034 }
   2035 
   2036 void wasm_set_loc(CGTarget* tg, SrcLoc loc) {
   2037   WTarget* t = (WTarget*)tg;
   2038   /* No debug info in v1, but we stash the most recent loc so cur_loc /
   2039    * diagnostics attribute to the actual statement rather than the
   2040    * function-definition location. */
   2041   t->cur_stmt_loc = loc;
   2042 }
   2043 
   2044 void wasm_emit_unreachable(WTarget* t) {
   2045   if (t->dead) return;
   2046   WIR* w = wir_push(t);
   2047   w->op = WIR_UNREACHABLE;
   2048   t->dead = 1;
   2049 }
   2050 
   2051 /* Control terminator (the C __builtin_unreachable point): emit the Wasm
   2052  * `unreachable` opcode, which traps if reached. Ends the current block. */
   2053 void wasm_unreachable(CGTarget* tg) { wasm_emit_unreachable((WTarget*)tg); }
   2054 
   2055 /* -----------------------------------------------------------------
   2056  * WIR -> WasmFunc lowering
   2057  * ----------------------------------------------------------------- */
   2058 
   2059 static void emit_insn(WTarget* t, WasmInsnKind k, i64 imm) {
   2060   wasm_func_add_insn(t->c, t->module, t->cur_func, k, imm);
   2061 }
   2062 static void emit_fp(WTarget* t, WasmInsnKind k, double v) {
   2063   wasm_func_add_fp_insn(t->c, t->module, t->cur_func, k, v);
   2064 }
   2065 
   2066 /* Push an operand onto the wasm value stack. */
   2067 static void emit_push_operand_reg(WTarget* t, Reg r) {
   2068   if (r == REG_NONE) wfail(t, "wasm: push of REG_NONE");
   2069   /* The reg must already have a local. */
   2070   if (r >= t->reg_cap || t->reg_to_local[r] == 0xffffffffu) {
   2071     wfail(t, "wasm: reg %u used before being defined", (unsigned)r);
   2072   }
   2073   emit_insn(t, WASM_INSN_LOCAL_GET, (i64)t->reg_to_local[r]);
   2074 }
   2075 
   2076 static WasmValType type_valtype(WTarget* t, KitCgTypeId ty) {
   2077   return valtype_for_type(t, ty);
   2078 }
   2079 
   2080 static void emit_push_imm(WTarget* t, WasmValType vt, i64 imm) {
   2081   WasmInsnKind k =
   2082       (vt == WASM_VAL_I64) ? WASM_INSN_I64_CONST : WASM_INSN_I32_CONST;
   2083   emit_insn(t, k, imm);
   2084 }
   2085 
   2086 static u32 memarg_align_log2(u32 align, u32 width);
   2087 static WasmInsnKind load_kind_for(WTarget* t, KitCgTypeId ty, MemAccess ma);
   2088 
   2089 static void emit_push_operand(WTarget* t, u32 kind, i64 imm, Reg r,
   2090                               KitCgTypeId ty) {
   2091   if (kind == WOP_IMM) {
   2092     WasmValType vt = type_valtype(t, ty);
   2093     if (vt == WASM_VAL_F32 || vt == WASM_VAL_F64) {
   2094       wfail(t, "wasm: float immediate operand not supported");
   2095     }
   2096     emit_push_imm(t, vt, imm);
   2097   } else if (kind == WOP_LOCAL) {
   2098     FrameSlot fs = (FrameSlot)imm;
   2099     WSlot* s = slot_for(t, fs);
   2100     if (s->kind == W_SLOT_LOCAL) {
   2101       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)s->wasm_local);
   2102     } else {
   2103       MemAccess ma;
   2104       memset(&ma, 0, sizeof ma);
   2105       ma.type = ty;
   2106       ma.size = (u32)abi_cg_sizeof(t->c->abi, ty);
   2107       ma.align = s->align;
   2108       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)t->frame_base_local);
   2109       WasmInsnKind k = load_kind_for(t, ty, ma);
   2110       wasm_func_add_mem_insn(
   2111           t->c, t->module, t->cur_func, k,
   2112           memarg_align_log2(ma.align, wasm_mem_width((uint8_t)k)),
   2113           s->frame_offset, 0);
   2114     }
   2115   } else {
   2116     emit_push_operand_reg(t, r);
   2117   }
   2118 }
   2119 
   2120 static void emit_local_set(WTarget* t, Reg dst, KitCgTypeId ty, RegClass cls) {
   2121   u32 idx = reg_local(t, dst, ty, cls);
   2122   emit_insn(t, WASM_INSN_LOCAL_SET, (i64)idx);
   2123 }
   2124 
   2125 static u32 memarg_align_log2(u32 align, u32 width) {
   2126   u32 a = align ? align : width;
   2127   u32 lg = 0;
   2128   if (a > width) a = width;
   2129   while (a > 1u) {
   2130     a >>= 1u;
   2131     lg++;
   2132   }
   2133   return lg;
   2134 }
   2135 
   2136 static WasmInsnKind load_kind_for(WTarget* t, KitCgTypeId ty, MemAccess ma) {
   2137   WasmValType vt = type_valtype(t, ty);
   2138   u32 size = ma.size ? ma.size : (u32)abi_cg_sizeof(t->c->abi, ty);
   2139   if (vt == WASM_VAL_F32) return WASM_INSN_F32_LOAD;
   2140   if (vt == WASM_VAL_F64) return WASM_INSN_F64_LOAD;
   2141   if (vt == WASM_VAL_I64) {
   2142     if (size == 1) return WASM_INSN_I64_LOAD8_U;
   2143     if (size == 2) return WASM_INSN_I64_LOAD16_U;
   2144     if (size == 4) return WASM_INSN_I64_LOAD32_U;
   2145     return WASM_INSN_I64_LOAD;
   2146   }
   2147   if (size == 1) return WASM_INSN_I32_LOAD8_U;
   2148   if (size == 2) return WASM_INSN_I32_LOAD16_U;
   2149   return WASM_INSN_I32_LOAD;
   2150 }
   2151 
   2152 static WasmInsnKind store_kind_for(WTarget* t, KitCgTypeId ty, MemAccess ma) {
   2153   WasmValType vt = type_valtype(t, ty);
   2154   u32 size = ma.size ? ma.size : (u32)abi_cg_sizeof(t->c->abi, ty);
   2155   if (vt == WASM_VAL_F32) return WASM_INSN_F32_STORE;
   2156   if (vt == WASM_VAL_F64) return WASM_INSN_F64_STORE;
   2157   if (vt == WASM_VAL_I64) {
   2158     if (size == 1) return WASM_INSN_I64_STORE8;
   2159     if (size == 2) return WASM_INSN_I64_STORE16;
   2160     if (size == 4) return WASM_INSN_I64_STORE32;
   2161     return WASM_INSN_I64_STORE;
   2162   }
   2163   if (size == 1) return WASM_INSN_I32_STORE8;
   2164   if (size == 2) return WASM_INSN_I32_STORE16;
   2165   return WASM_INSN_I32_STORE;
   2166 }
   2167 
   2168 /* Atomic op selection. Wasm threads gives natural-width atomic load/store for
   2169  * i32/i64 (with 8/16/32 subword variants) but only full-width (i32/i64) RMW
   2170  * and cmpxchg in kit's wasm core. Sub-word RMW/cmpxchg therefore diagnose
   2171  * rather than silently widening. */
   2172 static WasmInsnKind atomic_load_kind_for(WTarget* t, KitCgTypeId ty,
   2173                                          MemAccess ma) {
   2174   WasmValType vt = type_valtype(t, ty);
   2175   u32 size = ma.size ? ma.size : (u32)abi_cg_sizeof(t->c->abi, ty);
   2176   if (vt == WASM_VAL_F32 || vt == WASM_VAL_F64)
   2177     wfail(t,
   2178           "wasm target: atomic load of floating-point value is not "
   2179           "representable in wasm threads");
   2180   if (vt == WASM_VAL_I64) {
   2181     if (size == 1) return WASM_INSN_I64_ATOMIC_LOAD8_U;
   2182     if (size == 2) return WASM_INSN_I64_ATOMIC_LOAD16_U;
   2183     if (size == 4) return WASM_INSN_I64_ATOMIC_LOAD32_U;
   2184     if (size == 8) return WASM_INSN_I64_ATOMIC_LOAD;
   2185     wfail(t, "wasm: atomic load i64 size %u not supported", size);
   2186   }
   2187   if (size == 1) return WASM_INSN_I32_ATOMIC_LOAD8_U;
   2188   if (size == 2) return WASM_INSN_I32_ATOMIC_LOAD16_U;
   2189   if (size == 4) return WASM_INSN_I32_ATOMIC_LOAD;
   2190   wfail(t, "wasm: atomic load i32 size %u not supported", size);
   2191 }
   2192 
   2193 static WasmInsnKind atomic_store_kind_for(WTarget* t, KitCgTypeId ty,
   2194                                           MemAccess ma) {
   2195   WasmValType vt = type_valtype(t, ty);
   2196   u32 size = ma.size ? ma.size : (u32)abi_cg_sizeof(t->c->abi, ty);
   2197   if (vt == WASM_VAL_F32 || vt == WASM_VAL_F64)
   2198     wfail(t,
   2199           "wasm target: atomic store of floating-point value is not "
   2200           "representable in wasm threads");
   2201   if (vt == WASM_VAL_I64) {
   2202     if (size == 1) return WASM_INSN_I64_ATOMIC_STORE8;
   2203     if (size == 2) return WASM_INSN_I64_ATOMIC_STORE16;
   2204     if (size == 4) return WASM_INSN_I64_ATOMIC_STORE32;
   2205     if (size == 8) return WASM_INSN_I64_ATOMIC_STORE;
   2206     wfail(t, "wasm: atomic store i64 size %u not supported", size);
   2207   }
   2208   if (size == 1) return WASM_INSN_I32_ATOMIC_STORE8;
   2209   if (size == 2) return WASM_INSN_I32_ATOMIC_STORE16;
   2210   if (size == 4) return WASM_INSN_I32_ATOMIC_STORE;
   2211   wfail(t, "wasm: atomic store i32 size %u not supported", size);
   2212 }
   2213 
   2214 static WasmInsnKind atomic_rmw_kind_for(WTarget* t, KitCgAtomicOp op,
   2215                                         KitCgTypeId ty, MemAccess ma) {
   2216   WasmValType vt = type_valtype(t, ty);
   2217   u32 size = ma.size ? ma.size : (u32)abi_cg_sizeof(t->c->abi, ty);
   2218   int is64 = (vt == WASM_VAL_I64);
   2219   if (vt == WASM_VAL_F32 || vt == WASM_VAL_F64)
   2220     wfail(t,
   2221           "wasm target: atomic RMW on floating-point value is not "
   2222           "representable in wasm threads");
   2223   if (!(size == 4 || size == 8) || (is64 && size != 8) ||
   2224       (!is64 && size != 4)) {
   2225     wfail(t,
   2226           "wasm target: atomic RMW size %u not yet supported (only "
   2227           "full-width i32 and i64 atomic RMW are wired)",
   2228           size);
   2229   }
   2230   switch (op) {
   2231     case KIT_CG_ATOMIC_ADD:
   2232       return is64 ? WASM_INSN_I64_ATOMIC_RMW_ADD : WASM_INSN_I32_ATOMIC_RMW_ADD;
   2233     case KIT_CG_ATOMIC_SUB:
   2234       return is64 ? WASM_INSN_I64_ATOMIC_RMW_SUB : WASM_INSN_I32_ATOMIC_RMW_SUB;
   2235     case KIT_CG_ATOMIC_AND:
   2236       return is64 ? WASM_INSN_I64_ATOMIC_RMW_AND : WASM_INSN_I32_ATOMIC_RMW_AND;
   2237     case KIT_CG_ATOMIC_OR:
   2238       return is64 ? WASM_INSN_I64_ATOMIC_RMW_OR : WASM_INSN_I32_ATOMIC_RMW_OR;
   2239     case KIT_CG_ATOMIC_XOR:
   2240       return is64 ? WASM_INSN_I64_ATOMIC_RMW_XOR : WASM_INSN_I32_ATOMIC_RMW_XOR;
   2241     case KIT_CG_ATOMIC_XCHG:
   2242       return is64 ? WASM_INSN_I64_ATOMIC_RMW_XCHG
   2243                   : WASM_INSN_I32_ATOMIC_RMW_XCHG;
   2244     case KIT_CG_ATOMIC_NAND:
   2245       wfail(t, "wasm target: atomic NAND has no native wasm-threads opcode");
   2246   }
   2247   wfail(t, "wasm: unsupported atomic RMW op %d", (int)op);
   2248 }
   2249 
   2250 static WasmInsnKind atomic_cmpxchg_kind_for(WTarget* t, KitCgTypeId ty,
   2251                                             MemAccess ma) {
   2252   WasmValType vt = type_valtype(t, ty);
   2253   u32 size = ma.size ? ma.size : (u32)abi_cg_sizeof(t->c->abi, ty);
   2254   if (vt == WASM_VAL_F32 || vt == WASM_VAL_F64)
   2255     wfail(t,
   2256           "wasm target: atomic cmpxchg on floating-point value is not "
   2257           "representable in wasm threads");
   2258   if (vt == WASM_VAL_I64) {
   2259     if (size != 8)
   2260       wfail(t, "wasm target: atomic cmpxchg i64 size %u not yet supported",
   2261             size);
   2262     return WASM_INSN_I64_ATOMIC_RMW_CMPXCHG;
   2263   }
   2264   if (size != 4)
   2265     wfail(t, "wasm target: atomic cmpxchg i32 size %u not yet supported", size);
   2266   return WASM_INSN_I32_ATOMIC_RMW_CMPXCHG;
   2267 }
   2268 
   2269 /* Look up (or assign) `sym`'s slot in the funcref table. Returned index is
   2270  * the wasm table index (>= 1, with 0 reserved as the null/trap slot). */
   2271 static u32 func_table_index_for(WTarget* t, ObjSymId sym) {
   2272   Heap* h = t->c->ctx->heap;
   2273   for (u32 i = 0; i < t->func_table_count; ++i) {
   2274     if (t->func_table[i] == sym) return i + 1u;
   2275   }
   2276   if (t->func_table_count == t->func_table_cap) {
   2277     u32 nc = t->func_table_cap ? t->func_table_cap * 2u : 8u;
   2278     void* p = h->realloc(h, t->func_table, sizeof(ObjSymId) * t->func_table_cap,
   2279                          sizeof(ObjSymId) * nc, _Alignof(ObjSymId));
   2280     if (!p) wfail(t, "wasm: out of memory");
   2281     t->func_table = (ObjSymId*)p;
   2282     t->func_table_cap = nc;
   2283   }
   2284   t->func_table[t->func_table_count] = sym;
   2285   t->has_func_table = 1;
   2286   return ++t->func_table_count; /* slot 0 reserved; first sym -> index 1 */
   2287 }
   2288 
   2289 /* Defer function-pointer materialization to wasm_materialize_functable.
   2290  * Emits `i32.const 0` and queues a WFuncTableFixup keyed by the placeholder's
   2291  * (cur_func_idx, ninsns-1). */
   2292 static void queue_func_table_fixup(WTarget* t, ObjSymId sym) {
   2293   Heap* h = t->c->ctx->heap;
   2294   if (!t->cur_func) wfail(t, "wasm: function address outside a function");
   2295   /* Ensure the function gets a slot and force the WasmFunc shell to exist so
   2296    * the table's element segment can resolve its wasm-func index later. */
   2297   (void)func_table_index_for(t, sym);
   2298   (void)sym_to_wasm_func(t, sym, NULL);
   2299   emit_insn(t, WASM_INSN_I32_CONST, 0);
   2300   if (t->func_table_fixups_count == t->func_table_fixups_cap) {
   2301     u32 nc = t->func_table_fixups_cap ? t->func_table_fixups_cap * 2u : 16u;
   2302     void* p =
   2303         h->realloc(h, t->func_table_fixups,
   2304                    sizeof(WFuncTableFixup) * t->func_table_fixups_cap,
   2305                    sizeof(WFuncTableFixup) * nc, _Alignof(WFuncTableFixup));
   2306     if (!p) wfail(t, "wasm: out of memory");
   2307     t->func_table_fixups = (WFuncTableFixup*)p;
   2308     t->func_table_fixups_cap = nc;
   2309   }
   2310   WFuncTableFixup* fx = &t->func_table_fixups[t->func_table_fixups_count++];
   2311   fx->wasm_func_idx = t->cur_func_idx;
   2312   fx->insn_idx = t->cur_func->ninsns - 1u;
   2313   fx->sym = sym;
   2314 }
   2315 
   2316 /* Defer symbol-address resolution to wasm_materialize_data. Emits an
   2317  * i32.const placeholder into the current function and queues a WSymFixup
   2318  * keyed by (cur_func_idx, ninsns-1). The compact section layout is only
   2319  * known once every section's final size is settled, which doesn't happen
   2320  * until finalize. */
   2321 static void queue_symbol_addr_fixup(WTarget* t, ObjSymId sym, i64 addend) {
   2322   Heap* h = t->c->ctx->heap;
   2323   const ObjSym* os = obj_symbol_get(t->obj, sym);
   2324   /* Function-symbol addresses route through the funcref table, not linear
   2325    * memory. CG occasionally takes the address of an extern function before
   2326    * the function body is seen (forward declarations, indirect-call
   2327    * setup); the SK_FUNC kind is set by the frontend at sym creation. */
   2328   if (os && os->kind == SK_FUNC) {
   2329     if (addend != 0)
   2330       wfail(t, "wasm: nonzero addend on function-pointer reference");
   2331     queue_func_table_fixup(t, sym);
   2332     return;
   2333   }
   2334   if (!os)
   2335     wfail(t, "wasm target: address of unresolved symbol not yet implemented");
   2336   if (os->section_id == OBJ_SEC_NONE && os->kind != SK_COMMON)
   2337     wfail(t, "wasm target: address of undefined symbol not yet implemented");
   2338   /* SK_COMMON falls through: apply_sym_fixups allocates a BSS-style base for
   2339    * it in wasm_materialize_data and patches the i32.const at finalize. */
   2340   if (addend < INT32_MIN || addend > INT32_MAX)
   2341     wfail(t, "wasm: symbol addend out of range");
   2342   if (!t->cur_func) wfail(t, "wasm: symbol address outside a function");
   2343   emit_insn(t, WASM_INSN_I32_CONST, 0);
   2344   if (t->sym_fixups_count == t->sym_fixups_cap) {
   2345     u32 nc = t->sym_fixups_cap ? t->sym_fixups_cap * 2u : 16u;
   2346     void* p =
   2347         h->realloc(h, t->sym_fixups, sizeof(WSymFixup) * t->sym_fixups_cap,
   2348                    sizeof(WSymFixup) * nc, _Alignof(WSymFixup));
   2349     if (!p) wfail(t, "wasm: out of memory");
   2350     t->sym_fixups = (WSymFixup*)p;
   2351     t->sym_fixups_cap = nc;
   2352   }
   2353   WSymFixup* fx = &t->sym_fixups[t->sym_fixups_count++];
   2354   fx->wasm_func_idx = t->cur_func_idx;
   2355   fx->insn_idx = t->cur_func->ninsns - 1u;
   2356   fx->sym = sym;
   2357   fx->addend = addend;
   2358 }
   2359 
   2360 /* Push the value of an OPK_INDIRECT base/index component. The CG defers loading
   2361  * the pointer value of an address-taken (frame-resident) pointer local to the
   2362  * backend: the deref of such a local arrives as an OPK_INDIRECT whose base
   2363  * names the local itself, not a materialized register (see
   2364  * fold_ea_into_operand, and native_direct_target's nd_cache_reg_for, which
   2365  * loads it from the home). In the wasm backend each id is either a register
   2366  * (reg_to_local set) or a frame slot, never both — so dispatch on that: a
   2367  * register is fetched directly; a frame-resident local is read from its home
   2368  * like any other WOP_LOCAL operand. */
   2369 static void emit_push_addr_component(WTarget* t, Reg id) {
   2370   if (id < t->reg_cap && t->reg_to_local[id] != 0xffffffffu) {
   2371     emit_push_operand_reg(t, id);
   2372   } else {
   2373     WSlot* s = slot_for(t, id);
   2374     emit_push_operand(t, WOP_LOCAL, (i64)id, REG_NONE, s->type);
   2375   }
   2376 }
   2377 
   2378 /* Value type of an indirect component, whether it lives in a register or a
   2379  * frame slot (used to decide i64->i32 address narrowing). */
   2380 static WasmValType addr_component_valtype(WTarget* t, Reg id) {
   2381   if (id < t->reg_cap && t->reg_to_local[id] != 0xffffffffu && t->reg_type[id])
   2382     return type_valtype(t, t->reg_type[id]);
   2383   return type_valtype(t, slot_for(t, id)->type);
   2384 }
   2385 
   2386 static void emit_addr_operand(WTarget* t, Operand addr, uint64_t* offset_out) {
   2387   *offset_out = 0;
   2388   if (addr.kind == OPK_LOCAL) {
   2389     WSlot* s = slot_for(t, addr.v.frame_slot);
   2390     if (s->kind != W_SLOT_STACK)
   2391       wfail(t, "wasm: address of non-addressable local");
   2392     emit_insn(t, WASM_INSN_LOCAL_GET, (i64)t->frame_base_local);
   2393     *offset_out = s->frame_offset;
   2394     return;
   2395   }
   2396   if (addr.kind == OPK_INDIRECT) {
   2397     emit_push_addr_component(t, addr.v.ind.base);
   2398     if (addr.v.ind.index != REG_NONE) {
   2399       emit_push_addr_component(t, addr.v.ind.index);
   2400       if (addr_component_valtype(t, addr.v.ind.index) == WASM_VAL_I64) {
   2401         emit_insn(t, WASM_INSN_I32_WRAP_I64, 0);
   2402       }
   2403       if (addr.v.ind.log2_scale != 0) {
   2404         emit_insn(t, WASM_INSN_I32_CONST, (i64)addr.v.ind.log2_scale);
   2405         emit_insn(t, WASM_INSN_I32_SHL, 0);
   2406       }
   2407       emit_insn(t, WASM_INSN_I32_ADD, 0);
   2408     }
   2409     if (addr.v.ind.ofs >= 0) {
   2410       *offset_out = (uint32_t)addr.v.ind.ofs;
   2411     } else {
   2412       emit_insn(t, WASM_INSN_I32_CONST, (i64)addr.v.ind.ofs);
   2413       emit_insn(t, WASM_INSN_I32_ADD, 0);
   2414     }
   2415     return;
   2416   }
   2417   if (addr.kind == OPK_GLOBAL) {
   2418     queue_symbol_addr_fixup(t, addr.v.global.sym, addr.v.global.addend);
   2419     return;
   2420   }
   2421   if (addr.kind == OPK_REG) {
   2422     /* An i32 address already materialized in a register: just push it. */
   2423     emit_push_operand_reg(t, addr.v.reg);
   2424     return;
   2425   }
   2426   wfail(t, "wasm: unsupported address operand kind %u", (unsigned)addr.kind);
   2427 }
   2428 
   2429 /* Push a complete i32 address value (folding any positive offset into the base
   2430  * via i32.add). Used by bulk-memory ops (memory.copy / memory.fill) which take
   2431  * the address as a stack operand and carry no memarg offset. */
   2432 static void emit_push_addr_value(WTarget* t, Operand addr) {
   2433   uint64_t off;
   2434   emit_addr_operand(t, addr, &off);
   2435   if (off != 0) {
   2436     emit_insn(t, WASM_INSN_I32_CONST, (i64)(uint32_t)off);
   2437     emit_insn(t, WASM_INSN_I32_ADD, 0);
   2438   }
   2439 }
   2440 
   2441 static void emit_load_addr(WTarget* t, Operand addr, KitCgTypeId ty,
   2442                            MemAccess ma) {
   2443   uint64_t offset;
   2444   WasmInsnKind k = load_kind_for(t, ty, ma);
   2445   u32 width = wasm_mem_width((uint8_t)k);
   2446   emit_addr_operand(t, addr, &offset);
   2447   wasm_func_add_mem_insn(t->c, t->module, t->cur_func, k,
   2448                          memarg_align_log2(ma.align, width), offset, 0);
   2449 }
   2450 
   2451 static void emit_store_addr(WTarget* t, Operand addr, KitCgTypeId ty,
   2452                             Operand src, MemAccess ma, u32 src_kind,
   2453                             i64 src_imm, Reg src_reg) {
   2454   uint64_t offset;
   2455   WasmInsnKind k = store_kind_for(t, ty, ma);
   2456   u32 width = wasm_mem_width((uint8_t)k);
   2457   emit_addr_operand(t, addr, &offset);
   2458   if (src_kind == WOP_IMM) {
   2459     emit_push_imm(t, type_valtype(t, ty), src_imm);
   2460   } else if (src_kind == WOP_WASM_LOCAL) {
   2461     emit_insn(t, WASM_INSN_LOCAL_GET, src_imm);
   2462   } else if (src.kind == OPK_IMM) {
   2463     emit_push_imm(t, type_valtype(t, ty), src.v.imm);
   2464   } else if (src_kind == WOP_LOCAL) {
   2465     emit_push_operand(t, WOP_LOCAL, src_imm, REG_NONE, ty);
   2466   } else {
   2467     emit_push_operand_reg(t, src_reg);
   2468   }
   2469   wasm_func_add_mem_insn(t->c, t->module, t->cur_func, k,
   2470                          memarg_align_log2(ma.align, width), offset, 0);
   2471 }
   2472 
   2473 /* Map (BinOp, valtype) to wasm opcode. */
   2474 static WasmInsnKind binop_kind(WTarget* t, BinOp op, WasmValType vt) {
   2475   switch (op) {
   2476     case BO_IADD:
   2477       return vt == WASM_VAL_I64 ? WASM_INSN_I64_ADD : WASM_INSN_I32_ADD;
   2478     case BO_ISUB:
   2479       return vt == WASM_VAL_I64 ? WASM_INSN_I64_SUB : WASM_INSN_I32_SUB;
   2480     case BO_IMUL:
   2481       return vt == WASM_VAL_I64 ? WASM_INSN_I64_MUL : WASM_INSN_I32_MUL;
   2482     case BO_SDIV:
   2483       return vt == WASM_VAL_I64 ? WASM_INSN_I64_DIV_S : WASM_INSN_I32_DIV_S;
   2484     case BO_UDIV:
   2485       return vt == WASM_VAL_I64 ? WASM_INSN_I64_DIV_U : WASM_INSN_I32_DIV_U;
   2486     case BO_SREM:
   2487       return vt == WASM_VAL_I64 ? WASM_INSN_I64_REM_S : WASM_INSN_I32_REM_S;
   2488     case BO_UREM:
   2489       return vt == WASM_VAL_I64 ? WASM_INSN_I64_REM_U : WASM_INSN_I32_REM_U;
   2490     case BO_AND:
   2491       return vt == WASM_VAL_I64 ? WASM_INSN_I64_AND : WASM_INSN_I32_AND;
   2492     case BO_OR:
   2493       return vt == WASM_VAL_I64 ? WASM_INSN_I64_OR : WASM_INSN_I32_OR;
   2494     case BO_XOR:
   2495       return vt == WASM_VAL_I64 ? WASM_INSN_I64_XOR : WASM_INSN_I32_XOR;
   2496     case BO_SHL:
   2497       return vt == WASM_VAL_I64 ? WASM_INSN_I64_SHL : WASM_INSN_I32_SHL;
   2498     case BO_SHR_S:
   2499       return vt == WASM_VAL_I64 ? WASM_INSN_I64_SHR_S : WASM_INSN_I32_SHR_S;
   2500     case BO_SHR_U:
   2501       return vt == WASM_VAL_I64 ? WASM_INSN_I64_SHR_U : WASM_INSN_I32_SHR_U;
   2502     case BO_FADD:
   2503       return vt == WASM_VAL_F64 ? WASM_INSN_F64_ADD : WASM_INSN_F32_ADD;
   2504     case BO_FSUB:
   2505       return vt == WASM_VAL_F64 ? WASM_INSN_F64_SUB : WASM_INSN_F32_SUB;
   2506     case BO_FMUL:
   2507       return vt == WASM_VAL_F64 ? WASM_INSN_F64_MUL : WASM_INSN_F32_MUL;
   2508     case BO_FDIV:
   2509       return vt == WASM_VAL_F64 ? WASM_INSN_F64_DIV : WASM_INSN_F32_DIV;
   2510   }
   2511   wfail(t, "wasm: unsupported binop %d", (int)op);
   2512 }
   2513 
   2514 static WasmInsnKind cmp_kind(WTarget* t, CmpOp op, WasmValType vt) {
   2515   int is64 = (vt == WASM_VAL_I64);
   2516   switch (op) {
   2517     case CMP_EQ:
   2518       return is64 ? WASM_INSN_I64_EQ : WASM_INSN_I32_EQ;
   2519     case CMP_NE:
   2520       return is64 ? WASM_INSN_I64_NE : WASM_INSN_I32_NE;
   2521     case CMP_LT_S:
   2522       return is64 ? WASM_INSN_I64_LT_S : WASM_INSN_I32_LT_S;
   2523     case CMP_LE_S:
   2524       return is64 ? WASM_INSN_I64_LE_S : WASM_INSN_I32_LE_S;
   2525     case CMP_GT_S:
   2526       return is64 ? WASM_INSN_I64_GT_S : WASM_INSN_I32_GT_S;
   2527     case CMP_GE_S:
   2528       return is64 ? WASM_INSN_I64_GE_S : WASM_INSN_I32_GE_S;
   2529     case CMP_LT_U:
   2530       return is64 ? WASM_INSN_I64_LT_U : WASM_INSN_I32_LT_U;
   2531     case CMP_LE_U:
   2532       return is64 ? WASM_INSN_I64_LE_U : WASM_INSN_I32_LE_U;
   2533     case CMP_GT_U:
   2534       return is64 ? WASM_INSN_I64_GT_U : WASM_INSN_I32_GT_U;
   2535     case CMP_GE_U:
   2536       return is64 ? WASM_INSN_I64_GE_U : WASM_INSN_I32_GE_U;
   2537     /* FP compares are lowered by emit_fp_cmp (they may need multiple wasm
   2538      * instructions) and never reach cmp_kind. Listed so -Wswitch stays useful.
   2539      */
   2540     case CMP_OEQ_F:
   2541     case CMP_ONE_F:
   2542     case CMP_OLT_F:
   2543     case CMP_OLE_F:
   2544     case CMP_OGT_F:
   2545     case CMP_OGE_F:
   2546     case CMP_UEQ_F:
   2547     case CMP_UNE_F:
   2548     case CMP_ULT_F:
   2549     case CMP_ULE_F:
   2550     case CMP_UGT_F:
   2551     case CMP_UGE_F:
   2552       break;
   2553   }
   2554   wfail(t, "wasm: unsupported cmp %d", (int)op);
   2555 }
   2556 
   2557 /* Push both compare operands (a then b) onto the wasm stack. */
   2558 static void push_cmp_operands(WTarget* t, WIR* w, KitCgTypeId opty) {
   2559   emit_push_operand(t, w->imm_kind, w->imm_a, w->a, opty);
   2560   emit_push_operand(t, w->imm_kind_b, w->imm_b, w->b, opty);
   2561 }
   2562 
   2563 /* Lower an FP compare to the wasm stack, leaving an i32 0/1 result. wasm's
   2564  * f.eq/f.lt/f.le/f.gt/f.ge are ordered (false on NaN) and f.ne is unordered
   2565  * (true on NaN), so the 12 IEEE predicates compose from those plus i32.eqz /
   2566  * i32.or, using unordered-R == !(ordered-not-R). ONE/UEQ need both operands
   2567  * twice, so push_cmp_operands runs again for the second relation. */
   2568 static void emit_fp_cmp(WTarget* t, CmpOp op, WIR* w, KitCgTypeId opty) {
   2569   int d = (type_valtype(t, opty) == WASM_VAL_F64);
   2570   WasmInsnKind EQ = d ? WASM_INSN_F64_EQ : WASM_INSN_F32_EQ;
   2571   WasmInsnKind NE = d ? WASM_INSN_F64_NE : WASM_INSN_F32_NE;
   2572   WasmInsnKind LT = d ? WASM_INSN_F64_LT : WASM_INSN_F32_LT;
   2573   WasmInsnKind LE = d ? WASM_INSN_F64_LE : WASM_INSN_F32_LE;
   2574   WasmInsnKind GT = d ? WASM_INSN_F64_GT : WASM_INSN_F32_GT;
   2575   WasmInsnKind GE = d ? WASM_INSN_F64_GE : WASM_INSN_F32_GE;
   2576   switch (op) {
   2577     case CMP_OEQ_F:
   2578       push_cmp_operands(t, w, opty);
   2579       emit_insn(t, EQ, 0);
   2580       return;
   2581     case CMP_UNE_F:
   2582       push_cmp_operands(t, w, opty);
   2583       emit_insn(t, NE, 0);
   2584       return;
   2585     case CMP_OLT_F:
   2586       push_cmp_operands(t, w, opty);
   2587       emit_insn(t, LT, 0);
   2588       return;
   2589     case CMP_OLE_F:
   2590       push_cmp_operands(t, w, opty);
   2591       emit_insn(t, LE, 0);
   2592       return;
   2593     case CMP_OGT_F:
   2594       push_cmp_operands(t, w, opty);
   2595       emit_insn(t, GT, 0);
   2596       return;
   2597     case CMP_OGE_F:
   2598       push_cmp_operands(t, w, opty);
   2599       emit_insn(t, GE, 0);
   2600       return;
   2601     case CMP_UGE_F: /* !(OLT) */
   2602       push_cmp_operands(t, w, opty);
   2603       emit_insn(t, LT, 0);
   2604       emit_insn(t, WASM_INSN_I32_EQZ, 0);
   2605       return;
   2606     case CMP_UGT_F: /* !(OLE) */
   2607       push_cmp_operands(t, w, opty);
   2608       emit_insn(t, LE, 0);
   2609       emit_insn(t, WASM_INSN_I32_EQZ, 0);
   2610       return;
   2611     case CMP_ULE_F: /* !(OGT) */
   2612       push_cmp_operands(t, w, opty);
   2613       emit_insn(t, GT, 0);
   2614       emit_insn(t, WASM_INSN_I32_EQZ, 0);
   2615       return;
   2616     case CMP_ULT_F: /* !(OGE) */
   2617       push_cmp_operands(t, w, opty);
   2618       emit_insn(t, GE, 0);
   2619       emit_insn(t, WASM_INSN_I32_EQZ, 0);
   2620       return;
   2621     case CMP_ONE_F: /* ordered & !=: (a<b) | (a>b) */
   2622       push_cmp_operands(t, w, opty);
   2623       emit_insn(t, LT, 0);
   2624       push_cmp_operands(t, w, opty);
   2625       emit_insn(t, GT, 0);
   2626       emit_insn(t, WASM_INSN_I32_OR, 0);
   2627       return;
   2628     case CMP_UEQ_F: /* unordered | ==: !((a<b) | (a>b)) */
   2629       push_cmp_operands(t, w, opty);
   2630       emit_insn(t, LT, 0);
   2631       push_cmp_operands(t, w, opty);
   2632       emit_insn(t, GT, 0);
   2633       emit_insn(t, WASM_INSN_I32_OR, 0);
   2634       emit_insn(t, WASM_INSN_I32_EQZ, 0);
   2635       return;
   2636     default:
   2637       wfail(t, "wasm: unsupported fp cmp %d", (int)op);
   2638   }
   2639 }
   2640 
   2641 static void emit_convert(WTarget* t, ConvKind ck, WasmValType src,
   2642                          WasmValType dst, u32 sw, u32 dw) {
   2643   (void)dw;
   2644   /* Integer sign/zero extension. Sub-i32 logical widths (i8/i16) share the i32
   2645    * valtype, so a "same valtype" SEXT/ZEXT is NOT a no-op — the high bits must
   2646    * be filled per the source's logical width (sw). The CG IR keeps narrow
   2647    * immediates as truncated bit patterns, so without this an i8 value like
   2648    * (signed char)-128 reads back as 128. */
   2649   if (ck == CV_SEXT && src != WASM_VAL_F32 && src != WASM_VAL_F64) {
   2650     if (src == WASM_VAL_I32) {
   2651       if (sw == 8u)
   2652         emit_insn(t, WASM_INSN_I32_EXTEND8_S, 0);
   2653       else if (sw == 16u)
   2654         emit_insn(t, WASM_INSN_I32_EXTEND16_S, 0);
   2655     } else {
   2656       if (sw == 8u)
   2657         emit_insn(t, WASM_INSN_I64_EXTEND8_S, 0);
   2658       else if (sw == 16u)
   2659         emit_insn(t, WASM_INSN_I64_EXTEND16_S, 0);
   2660       else if (sw == 32u)
   2661         emit_insn(t, WASM_INSN_I64_EXTEND32_S, 0);
   2662     }
   2663     if (src == WASM_VAL_I32 && dst == WASM_VAL_I64)
   2664       emit_insn(t, WASM_INSN_I64_EXTEND_I32_S, 0);
   2665     else if (src == WASM_VAL_I64 && dst == WASM_VAL_I32)
   2666       emit_insn(t, WASM_INSN_I32_WRAP_I64, 0);
   2667     return;
   2668   }
   2669   if (ck == CV_ZEXT && src != WASM_VAL_F32 && src != WASM_VAL_F64) {
   2670     if (src == WASM_VAL_I32) {
   2671       if (sw > 0u && sw < 32u) {
   2672         emit_insn(t, WASM_INSN_I32_CONST, (i64)(((u32)1 << sw) - 1u));
   2673         emit_insn(t, WASM_INSN_I32_AND, 0);
   2674       }
   2675       if (dst == WASM_VAL_I64) emit_insn(t, WASM_INSN_I64_EXTEND_I32_U, 0);
   2676     } else {
   2677       if (sw > 0u && sw < 64u) {
   2678         emit_push_imm(t, WASM_VAL_I64, (i64)(((u64)1 << sw) - 1u));
   2679         emit_insn(t, WASM_INSN_I64_AND, 0);
   2680       }
   2681       if (dst == WASM_VAL_I32) emit_insn(t, WASM_INSN_I32_WRAP_I64, 0);
   2682     }
   2683     return;
   2684   }
   2685   if (src == dst && (ck == CV_BITCAST || ck == CV_TRUNC)) {
   2686     /* No-op conversion. */
   2687     return;
   2688   }
   2689   if (ck == CV_BITCAST) {
   2690     if (src == WASM_VAL_I32 && dst == WASM_VAL_F32) {
   2691       emit_insn(t, WASM_INSN_F32_REINTERPRET_I32, 0);
   2692       return;
   2693     }
   2694     if (src == WASM_VAL_F32 && dst == WASM_VAL_I32) {
   2695       emit_insn(t, WASM_INSN_I32_REINTERPRET_F32, 0);
   2696       return;
   2697     }
   2698     if (src == WASM_VAL_I64 && dst == WASM_VAL_F64) {
   2699       emit_insn(t, WASM_INSN_F64_REINTERPRET_I64, 0);
   2700       return;
   2701     }
   2702     if (src == WASM_VAL_F64 && dst == WASM_VAL_I64) {
   2703       emit_insn(t, WASM_INSN_I64_REINTERPRET_F64, 0);
   2704       return;
   2705     }
   2706     /* Width-changing ptr/int bitcasts: kit_cg_ptr_to_int and
   2707      * kit_cg_int_to_ptr route through CV_BITCAST, and on wasm32 a pointer
   2708      * is i32 while the frontend integer side may be i64. Lower as
   2709      * wrap/extend (zero-extend; pointers are non-negative addresses). */
   2710     if (src == WASM_VAL_I64 && dst == WASM_VAL_I32) {
   2711       emit_insn(t, WASM_INSN_I32_WRAP_I64, 0);
   2712       return;
   2713     }
   2714     if (src == WASM_VAL_I32 && dst == WASM_VAL_I64) {
   2715       emit_insn(t, WASM_INSN_I64_EXTEND_I32_U, 0);
   2716       return;
   2717     }
   2718     wfail(t, "wasm: unsupported bitcast");
   2719   }
   2720   if (ck == CV_TRUNC) {
   2721     if (src == WASM_VAL_I64 && dst == WASM_VAL_I32) {
   2722       emit_insn(t, WASM_INSN_I32_WRAP_I64, 0);
   2723       return;
   2724     }
   2725   }
   2726   if (ck == CV_FEXT && src == WASM_VAL_F32 && dst == WASM_VAL_F64) {
   2727     emit_insn(t, WASM_INSN_F64_PROMOTE_F32, 0);
   2728     return;
   2729   }
   2730   if (ck == CV_FTRUNC && src == WASM_VAL_F64 && dst == WASM_VAL_F32) {
   2731     emit_insn(t, WASM_INSN_F32_DEMOTE_F64, 0);
   2732     return;
   2733   }
   2734   if (ck == CV_ITOF_S) {
   2735     if (src == WASM_VAL_I32 && dst == WASM_VAL_F32) {
   2736       emit_insn(t, WASM_INSN_F32_CONVERT_I32_S, 0);
   2737       return;
   2738     }
   2739     if (src == WASM_VAL_I32 && dst == WASM_VAL_F64) {
   2740       emit_insn(t, WASM_INSN_F64_CONVERT_I32_S, 0);
   2741       return;
   2742     }
   2743     if (src == WASM_VAL_I64 && dst == WASM_VAL_F32) {
   2744       emit_insn(t, WASM_INSN_F32_CONVERT_I64_S, 0);
   2745       return;
   2746     }
   2747     if (src == WASM_VAL_I64 && dst == WASM_VAL_F64) {
   2748       emit_insn(t, WASM_INSN_F64_CONVERT_I64_S, 0);
   2749       return;
   2750     }
   2751   }
   2752   if (ck == CV_ITOF_U) {
   2753     if (src == WASM_VAL_I32 && dst == WASM_VAL_F32) {
   2754       emit_insn(t, WASM_INSN_F32_CONVERT_I32_U, 0);
   2755       return;
   2756     }
   2757     if (src == WASM_VAL_I32 && dst == WASM_VAL_F64) {
   2758       emit_insn(t, WASM_INSN_F64_CONVERT_I32_U, 0);
   2759       return;
   2760     }
   2761     if (src == WASM_VAL_I64 && dst == WASM_VAL_F32) {
   2762       emit_insn(t, WASM_INSN_F32_CONVERT_I64_U, 0);
   2763       return;
   2764     }
   2765     if (src == WASM_VAL_I64 && dst == WASM_VAL_F64) {
   2766       emit_insn(t, WASM_INSN_F64_CONVERT_I64_U, 0);
   2767       return;
   2768     }
   2769   }
   2770   if (ck == CV_FTOI_S) {
   2771     if (src == WASM_VAL_F32 && dst == WASM_VAL_I32) {
   2772       emit_insn(t, WASM_INSN_I32_TRUNC_F32_S, 0);
   2773       return;
   2774     }
   2775     if (src == WASM_VAL_F64 && dst == WASM_VAL_I32) {
   2776       emit_insn(t, WASM_INSN_I32_TRUNC_F64_S, 0);
   2777       return;
   2778     }
   2779     if (src == WASM_VAL_F32 && dst == WASM_VAL_I64) {
   2780       emit_insn(t, WASM_INSN_I64_TRUNC_F32_S, 0);
   2781       return;
   2782     }
   2783     if (src == WASM_VAL_F64 && dst == WASM_VAL_I64) {
   2784       emit_insn(t, WASM_INSN_I64_TRUNC_F64_S, 0);
   2785       return;
   2786     }
   2787   }
   2788   if (ck == CV_FTOI_U) {
   2789     if (src == WASM_VAL_F32 && dst == WASM_VAL_I32) {
   2790       emit_insn(t, WASM_INSN_I32_TRUNC_F32_U, 0);
   2791       return;
   2792     }
   2793     if (src == WASM_VAL_F64 && dst == WASM_VAL_I32) {
   2794       emit_insn(t, WASM_INSN_I32_TRUNC_F64_U, 0);
   2795       return;
   2796     }
   2797     if (src == WASM_VAL_F32 && dst == WASM_VAL_I64) {
   2798       emit_insn(t, WASM_INSN_I64_TRUNC_F32_U, 0);
   2799       return;
   2800     }
   2801     if (src == WASM_VAL_F64 && dst == WASM_VAL_I64) {
   2802       emit_insn(t, WASM_INSN_I64_TRUNC_F64_U, 0);
   2803       return;
   2804     }
   2805   }
   2806   wfail(t, "wasm: unsupported convert kind %d (%d -> %d)", (int)ck, (int)src,
   2807         (int)dst);
   2808 }
   2809 
/* During lowering we keep a running active-scope stack so we can compute
 * br depths. */
typedef struct LoweringScope {
  /* Scope identifier; br_to_label matches it against a label's scope_id. */
  u32 id;
  /* Scope kind tag. NOTE(review): presumably a SCOPE_* value such as
   * SCOPE_LOOP / SCOPE_BLOCK; its consumers are outside this chunk. */
  u8 kind;
  /* Depth at which break/continue targets are reached. br_to_label turns
   * these into a relative branch depth as cur_depth - break_depth (or
   * cur_depth - cont_depth). */
  u32 break_depth;
  u32 cont_depth;
} LoweringScope;

/* Per-lowering bookkeeping: the target being emitted into plus the stack of
 * currently open scopes. */
typedef struct LoweringState {
  /* Target the lowering emits into; also used for diagnostics (wfail). */
  WTarget* t;
  /* Bounded by the deepest synthetic + CG scope nesting we'll emit.
   * Switch islands wrap one block per case, so the limit is roughly
   * (max cases + max user nesting). 1024 leaves room for very wide
   * switches without forcing future per-case-count caps. */
  LoweringScope stack[1024];
  /* Number of live entries in `stack`; br_to_label searches from
   * stack[nstack - 1] down to stack[0], i.e. innermost scope first. */
  u32 nstack;
  /* Current nesting depth, the minuend when computing relative br depths. */
  u32 cur_depth;
} LoweringState;
   2830 
   2831 static u32 br_to_label(LoweringState* L, Label l) {
   2832   WLabel* lbl = lookup_label(L->t, l);
   2833   if (!lbl) wfail(L->t, "wasm: br to unknown label");
   2834   if (lbl->kind == WLBL_SCOPE_BREAK) {
   2835     for (u32 i = L->nstack; i > 0; --i) {
   2836       if (L->stack[i - 1u].id == lbl->scope_id) {
   2837         return L->cur_depth - L->stack[i - 1u].break_depth;
   2838       }
   2839     }
   2840     wfail(L->t, "wasm: br to break label of inactive scope");
   2841   }
   2842   if (lbl->kind == WLBL_SCOPE_CONT) {
   2843     for (u32 i = L->nstack; i > 0; --i) {
   2844       if (L->stack[i - 1u].id == lbl->scope_id) {
   2845         return L->cur_depth - L->stack[i - 1u].cont_depth;
   2846       }
   2847     }
   2848     wfail(L->t, "wasm: br to continue label of inactive scope");
   2849   }
   2850   /* wasm_structurize wraps every reachable forward label in a synthetic
   2851    * SCOPE_BLOCK (forward goto) or SCOPE_LOOP (backward goto), and
   2852    * unroll_switch_islands reorders the WIR so switch case labels are
   2853    * forward refs from WIR_SWITCH. Arriving here means the structurer
   2854    * missed a shape — a bug, not a feature gap. */
   2855   wfail(L->t,
   2856         "wasm: br to free label whose synthetic scope was not "
   2857         "emitted; structurer bug");
   2858 }
   2859 
   2860 static i64 wasm_switch_sign_extend(u64 v, u32 width) {
   2861   if (width == 0u || width >= 64u) return (i64)v;
   2862   {
   2863     u64 bit = 1ull << (width - 1u);
   2864     u64 mask = (1ull << width) - 1u;
   2865     v &= mask;
   2866     return (i64)((v ^ bit) - bit);
   2867   }
   2868 }
   2869 
   2870 static int wasm_switch_extents(WTarget* t, const WIR* w, i64* out_vmin,
   2871                                u64* out_span) {
   2872   u32 width;
   2873   i64 vmin = INT64_MAX;
   2874   i64 vmax = INT64_MIN;
   2875   if (w->switch_ncases == 0) return 0;
   2876   width = kit_cg_type_int_width((KitCompiler*)t->c, w->type);
   2877   if (!width || width > 64u) return 0;
   2878   for (u32 i = 0; i < w->switch_ncases; ++i) {
   2879     i64 vi = wasm_switch_sign_extend(w->switch_cases[i].value, width);
   2880     if (vi < vmin) vmin = vi;
   2881     if (vi > vmax) vmax = vi;
   2882   }
   2883   if (vmax < vmin) return 0;
   2884   {
   2885     u64 delta = (u64)vmax - (u64)vmin;
   2886     if (delta == UINT64_MAX) return 0;
   2887     *out_span = delta + 1u;
   2888   }
   2889   *out_vmin = vmin;
   2890   return 1;
   2891 }
   2892 
   2893 static void emit_br_table(WTarget* t, const u32* targets, u32 ntargets) {
   2894   WasmInsn* in;
   2895   if (ntargets == 0)
   2896     wfail(t, "wasm: br_table needs at least the default target");
   2897   wasm_func_add_insn(t->c, t->module, t->cur_func, WASM_INSN_BR_TABLE, 0);
   2898   in = &t->cur_func->insns[t->cur_func->ninsns - 1u];
   2899   wasm_insn_set_targets(t->c, t->module, in, targets, ntargets);
   2900 }
   2901 
   2902 /* A switch lowers to a dense br_table when its case values span a range that
   2903  * isn't pathologically sparse relative to the number of cases; otherwise an
   2904  * `eq`/`br_if` comparison chain. Small ranges always take the table (cheap
   2905  * either way); larger ranges only when at least ~half the table slots carry a
   2906  * real case, so a sparse switch (e.g. `case 0`, `case 1000000`) doesn't
   2907  * materialize a giant mostly-default table. There is no range-splitting yet,
   2908  * so a switch that fails this test is a linear scan. */
   2909 static int switch_use_br_table(const WIR* w, u64 span) {
   2910   if (span <= 64u) return 1;
   2911   return span <= (u64)w->switch_ncases * 2u;
   2912 }
   2913 
   2914 static void emit_switch_br_table(WTarget* t, LoweringState* L, const WIR* w) {
   2915   i64 vmin;
   2916   u64 span;
   2917   u32* targets;
   2918   Label* labels;
   2919   u32 ntargets;
   2920   WasmValType vt;
   2921   Heap* h = t->c->ctx->heap;
   2922 
   2923   if (w->switch_ncases == 0) {
   2924     emit_insn(t, WASM_INSN_BR, (i64)br_to_label(L, w->labels[0]));
   2925     return;
   2926   }
   2927   if (!wasm_switch_extents(t, w, &vmin, &span))
   2928     wfail(t, "wasm: unsupported switch selector type");
   2929   vt = type_valtype(t, w->type);
   2930   if (vt != WASM_VAL_I32 && vt != WASM_VAL_I64)
   2931     wfail(t, "wasm: switch selector must be integer");
   2932   if (!switch_use_br_table(w, span)) {
   2933     for (u32 i = 0; i < w->switch_ncases; ++i) {
   2934       u32 width = kit_cg_type_int_width((KitCompiler*)t->c, w->type);
   2935       i64 vi = wasm_switch_sign_extend(w->switch_cases[i].value, width);
   2936       emit_push_operand(t, w->imm_kind, w->imm_a, w->a, w->type);
   2937       emit_push_imm(t, vt, vi);
   2938       emit_insn(t, vt == WASM_VAL_I64 ? WASM_INSN_I64_EQ : WASM_INSN_I32_EQ, 0);
   2939       emit_insn(t, WASM_INSN_BR_IF,
   2940                 (i64)br_to_label(L, w->switch_cases[i].label));
   2941     }
   2942     emit_insn(t, WASM_INSN_BR, (i64)br_to_label(L, w->labels[0]));
   2943     return;
   2944   }
   2945 
   2946   /* Dense table: one slot per value in [vmin, vmin+span), default-filled, with
   2947    * the default appended as the trailing out-of-range target. */
   2948   ntargets = (u32)span + 1u;
   2949   labels = (Label*)h->alloc(h, sizeof(Label) * span, _Alignof(Label));
   2950   targets = (u32*)h->alloc(h, sizeof(u32) * ntargets, _Alignof(u32));
   2951   if (!labels || !targets) wfail(t, "wasm: out of memory for switch table");
   2952   for (u64 i = 0; i < span; ++i) labels[i] = w->labels[0];
   2953   for (u32 i = 0; i < w->switch_ncases; ++i) {
   2954     u32 width = kit_cg_type_int_width((KitCompiler*)t->c, w->type);
   2955     i64 vi = wasm_switch_sign_extend(w->switch_cases[i].value, width);
   2956     u64 slot = (u64)(vi - vmin);
   2957     if (slot >= span) wfail(t, "wasm: switch case outside span");
   2958     labels[slot] = w->switch_cases[i].label;
   2959   }
   2960 
   2961   emit_push_operand(t, w->imm_kind, w->imm_a, w->a, w->type);
   2962   if (vmin != 0) {
   2963     emit_push_imm(t, vt, vmin);
   2964     emit_insn(t, vt == WASM_VAL_I64 ? WASM_INSN_I64_SUB : WASM_INSN_I32_SUB, 0);
   2965   }
   2966   if (vt == WASM_VAL_I64) emit_insn(t, WASM_INSN_I32_WRAP_I64, 0);
   2967 
   2968   for (u32 i = 0; i < (u32)span; ++i) targets[i] = br_to_label(L, labels[i]);
   2969   targets[ntargets - 1u] = br_to_label(L, w->labels[0]);
   2970   emit_br_table(t, targets, ntargets);
   2971   h->free(h, targets, sizeof(u32) * ntargets);
   2972   h->free(h, labels, sizeof(Label) * span);
   2973 }
   2974 
   2975 /* -----------------------------------------------------------------
   2976  * Intrinsics (bit ops / bswap / overflow arith)
   2977  *
   2978  * MEMCPY/MEMMOVE/MEMSET don't appear here: the recorder funnels them
   2979  * into WIR_COPY_BYTES / WIR_SET_BYTES which already lower to
   2980  * memory.copy / memory.fill. Hints (PREFETCH/EXPECT/ASSUME_ALIGNED)
   2981  * also don't reach the linearizer — the recorder either drops them or
   2982  * emits a plain copy. ----------------------------------------------- */
   2983 
   2984 static void emit_intrinsic_bit_op(WTarget* t, const WIR* w) {
   2985   /* clz/ctz/popcount instruction width follows the operand (type2), not the
   2986    * i32 result. i64 forms produce an i64 count that we wrap to the i32 dst. */
   2987   WasmValType vt = type_valtype(t, w->type2 ? w->type2 : w->type);
   2988   WasmValType dvt = type_valtype(t, w->type);
   2989   WasmInsnKind op;
   2990   switch ((IntrinKind)w->cgop) {
   2991     case INTRIN_CLZ:
   2992       op = (vt == WASM_VAL_I64) ? WASM_INSN_I64_CLZ : WASM_INSN_I32_CLZ;
   2993       break;
   2994     case INTRIN_CTZ:
   2995       op = (vt == WASM_VAL_I64) ? WASM_INSN_I64_CTZ : WASM_INSN_I32_CTZ;
   2996       break;
   2997     case INTRIN_POPCOUNT:
   2998       op = (vt == WASM_VAL_I64) ? WASM_INSN_I64_POPCNT : WASM_INSN_I32_POPCNT;
   2999       break;
   3000     default:
   3001       wfail(t, "wasm: unexpected bit-op intrinsic %d", (int)w->cgop);
   3002       return;
   3003   }
   3004   emit_push_operand_reg(t, w->a);
   3005   emit_insn(t, op, 0);
   3006   if (vt == WASM_VAL_I64 && dvt == WASM_VAL_I32)
   3007     emit_insn(t, WASM_INSN_I32_WRAP_I64, 0);
   3008   emit_local_set(t, w->dst, w->type, (RegClass)w->cls);
   3009 }
   3010 
   3011 static void emit_intrinsic_bswap(WTarget* t, const WIR* w) {
   3012   /* Width-by-type: the recorded result type fixes the byte width. */
   3013   u32 width = (u32)abi_cg_sizeof(t->c->abi, w->type);
   3014   if (width <= 4) {
   3015     /* Both 16- and 32-bit forms operate over i32. The 16-bit form only
   3016      * touches the low 16 bits; any extra high bits in the input are
   3017      * discarded by the AND mask. */
   3018     u32 tmp = add_wasm_local(t, WASM_VAL_I32);
   3019     emit_push_operand_reg(t, w->a);
   3020     emit_insn(t, WASM_INSN_LOCAL_SET, (i64)tmp);
   3021     if (width <= 2) {
   3022       /* (x & 0xff) << 8 */
   3023       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)tmp);
   3024       emit_insn(t, WASM_INSN_I32_CONST, 0xff);
   3025       emit_insn(t, WASM_INSN_I32_AND, 0);
   3026       emit_insn(t, WASM_INSN_I32_CONST, 8);
   3027       emit_insn(t, WASM_INSN_I32_SHL, 0);
   3028       /* (x >> 8) & 0xff */
   3029       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)tmp);
   3030       emit_insn(t, WASM_INSN_I32_CONST, 8);
   3031       emit_insn(t, WASM_INSN_I32_SHR_U, 0);
   3032       emit_insn(t, WASM_INSN_I32_CONST, 0xff);
   3033       emit_insn(t, WASM_INSN_I32_AND, 0);
   3034       emit_insn(t, WASM_INSN_I32_OR, 0);
   3035     } else {
   3036       /* Four-byte shuffle. */
   3037       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)tmp);
   3038       emit_insn(t, WASM_INSN_I32_CONST, 24);
   3039       emit_insn(t, WASM_INSN_I32_SHR_U, 0);
   3040       emit_insn(t, WASM_INSN_I32_CONST, 0xff);
   3041       emit_insn(t, WASM_INSN_I32_AND, 0);
   3042 
   3043       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)tmp);
   3044       emit_insn(t, WASM_INSN_I32_CONST, 8);
   3045       emit_insn(t, WASM_INSN_I32_SHR_U, 0);
   3046       emit_insn(t, WASM_INSN_I32_CONST, 0xff00);
   3047       emit_insn(t, WASM_INSN_I32_AND, 0);
   3048       emit_insn(t, WASM_INSN_I32_OR, 0);
   3049 
   3050       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)tmp);
   3051       emit_insn(t, WASM_INSN_I32_CONST, 8);
   3052       emit_insn(t, WASM_INSN_I32_SHL, 0);
   3053       emit_insn(t, WASM_INSN_I32_CONST, 0xff0000);
   3054       emit_insn(t, WASM_INSN_I32_AND, 0);
   3055       emit_insn(t, WASM_INSN_I32_OR, 0);
   3056 
   3057       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)tmp);
   3058       emit_insn(t, WASM_INSN_I32_CONST, 24);
   3059       emit_insn(t, WASM_INSN_I32_SHL, 0);
   3060       emit_insn(t, WASM_INSN_I32_OR, 0);
   3061     }
   3062     emit_local_set(t, w->dst, w->type, (RegClass)w->cls);
   3063     return;
   3064   }
   3065   /* 8-byte form: byte reverse over i64. */
   3066   u32 tmp = add_wasm_local(t, WASM_VAL_I64);
   3067   emit_push_operand_reg(t, w->a);
   3068   emit_insn(t, WASM_INSN_LOCAL_SET, (i64)tmp);
   3069   for (int i = 0; i < 8; ++i) {
   3070     emit_insn(t, WASM_INSN_LOCAL_GET, (i64)tmp);
   3071     if (i > 0) {
   3072       emit_insn(t, WASM_INSN_I64_CONST, (i64)(i * 8));
   3073       emit_insn(t, WASM_INSN_I64_SHR_U, 0);
   3074     }
   3075     emit_insn(t, WASM_INSN_I64_CONST, 0xff);
   3076     emit_insn(t, WASM_INSN_I64_AND, 0);
   3077     int shift = (7 - i) * 8;
   3078     if (shift > 0) {
   3079       emit_insn(t, WASM_INSN_I64_CONST, (i64)shift);
   3080       emit_insn(t, WASM_INSN_I64_SHL, 0);
   3081     }
   3082     if (i > 0) emit_insn(t, WASM_INSN_I64_OR, 0);
   3083   }
   3084   emit_local_set(t, w->dst, w->type, (RegClass)w->cls);
   3085 }
   3086 
   3087 static void emit_intrinsic_overflow(WTarget* t, const WIR* w) {
   3088   IntrinKind k = (IntrinKind)w->cgop;
   3089   WasmValType vt = type_valtype(t, w->type);
   3090   WasmInsnKind k_add =
   3091       (vt == WASM_VAL_I64) ? WASM_INSN_I64_ADD : WASM_INSN_I32_ADD;
   3092   WasmInsnKind k_sub =
   3093       (vt == WASM_VAL_I64) ? WASM_INSN_I64_SUB : WASM_INSN_I32_SUB;
   3094   WasmInsnKind k_and =
   3095       (vt == WASM_VAL_I64) ? WASM_INSN_I64_AND : WASM_INSN_I32_AND;
   3096   WasmInsnKind k_xor =
   3097       (vt == WASM_VAL_I64) ? WASM_INSN_I64_XOR : WASM_INSN_I32_XOR;
   3098   WasmInsnKind k_shr_u =
   3099       (vt == WASM_VAL_I64) ? WASM_INSN_I64_SHR_U : WASM_INSN_I32_SHR_U;
   3100   WasmInsnKind k_lt_u =
   3101       (vt == WASM_VAL_I64) ? WASM_INSN_I64_LT_U : WASM_INSN_I32_LT_U;
   3102   WasmInsnKind k_const =
   3103       (vt == WASM_VAL_I64) ? WASM_INSN_I64_CONST : WASM_INSN_I32_CONST;
   3104   KitCgTypeId bool_ty = builtin_id(KIT_CG_BUILTIN_BOOL);
   3105 
   3106   /* Stash both operands in scratch locals so each side of the expansion can
   3107    * re-load them without re-evaluating immediates or relying on the wasm
   3108    * value stack shape. */
   3109   u32 a_loc = add_wasm_local(t, vt);
   3110   u32 b_loc = add_wasm_local(t, vt);
   3111   emit_push_operand(t, w->imm_kind, w->imm_a, w->a, w->type);
   3112   emit_insn(t, WASM_INSN_LOCAL_SET, (i64)a_loc);
   3113   emit_push_operand(t, w->imm_kind_b, w->imm_b, w->b, w->type);
   3114   emit_insn(t, WASM_INSN_LOCAL_SET, (i64)b_loc);
   3115 
   3116   switch (k) {
   3117     case INTRIN_UADD_OVERFLOW:
   3118       /* r = a + b; ovf = (r <u a) */
   3119       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)a_loc);
   3120       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)b_loc);
   3121       emit_insn(t, k_add, 0);
   3122       emit_local_set(t, w->dst, w->type, (RegClass)w->cls);
   3123       emit_push_operand_reg(t, w->dst);
   3124       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)a_loc);
   3125       emit_insn(t, k_lt_u, 0);
   3126       emit_local_set(t, w->dst2, bool_ty, RC_INT);
   3127       break;
   3128     case INTRIN_USUB_OVERFLOW:
   3129       /* r = a - b; ovf = (a <u b) */
   3130       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)a_loc);
   3131       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)b_loc);
   3132       emit_insn(t, k_sub, 0);
   3133       emit_local_set(t, w->dst, w->type, (RegClass)w->cls);
   3134       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)a_loc);
   3135       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)b_loc);
   3136       emit_insn(t, k_lt_u, 0);
   3137       emit_local_set(t, w->dst2, bool_ty, RC_INT);
   3138       break;
   3139     case INTRIN_SADD_OVERFLOW:
   3140       /* r = a + b; ovf = ((r ^ a) & (r ^ b)) >>u (W-1) */
   3141       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)a_loc);
   3142       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)b_loc);
   3143       emit_insn(t, k_add, 0);
   3144       emit_local_set(t, w->dst, w->type, (RegClass)w->cls);
   3145       emit_push_operand_reg(t, w->dst);
   3146       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)a_loc);
   3147       emit_insn(t, k_xor, 0);
   3148       emit_push_operand_reg(t, w->dst);
   3149       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)b_loc);
   3150       emit_insn(t, k_xor, 0);
   3151       emit_insn(t, k_and, 0);
   3152       emit_insn(t, k_const, (i64)(vt == WASM_VAL_I64 ? 63 : 31));
   3153       emit_insn(t, k_shr_u, 0);
   3154       if (vt == WASM_VAL_I64) emit_insn(t, WASM_INSN_I32_WRAP_I64, 0);
   3155       emit_local_set(t, w->dst2, bool_ty, RC_INT);
   3156       break;
   3157     case INTRIN_SSUB_OVERFLOW:
   3158       /* r = a - b; ovf = ((a ^ b) & (a ^ r)) >>u (W-1) */
   3159       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)a_loc);
   3160       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)b_loc);
   3161       emit_insn(t, k_sub, 0);
   3162       emit_local_set(t, w->dst, w->type, (RegClass)w->cls);
   3163       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)a_loc);
   3164       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)b_loc);
   3165       emit_insn(t, k_xor, 0);
   3166       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)a_loc);
   3167       emit_push_operand_reg(t, w->dst);
   3168       emit_insn(t, k_xor, 0);
   3169       emit_insn(t, k_and, 0);
   3170       emit_insn(t, k_const, (i64)(vt == WASM_VAL_I64 ? 63 : 31));
   3171       emit_insn(t, k_shr_u, 0);
   3172       if (vt == WASM_VAL_I64) emit_insn(t, WASM_INSN_I32_WRAP_I64, 0);
   3173       emit_local_set(t, w->dst2, bool_ty, RC_INT);
   3174       break;
   3175     case INTRIN_UMUL_OVERFLOW: {
   3176       /* i32 only (i64 rejected in recorder). Widen to i64, multiply,
   3177        * low 32 = result, ovf = (wide >> 32) != 0. */
   3178       u32 wide = add_wasm_local(t, WASM_VAL_I64);
   3179       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)a_loc);
   3180       emit_insn(t, WASM_INSN_I64_EXTEND_I32_U, 0);
   3181       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)b_loc);
   3182       emit_insn(t, WASM_INSN_I64_EXTEND_I32_U, 0);
   3183       emit_insn(t, WASM_INSN_I64_MUL, 0);
   3184       emit_insn(t, WASM_INSN_LOCAL_SET, (i64)wide);
   3185 
   3186       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)wide);
   3187       emit_insn(t, WASM_INSN_I32_WRAP_I64, 0);
   3188       emit_local_set(t, w->dst, w->type, (RegClass)w->cls);
   3189 
   3190       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)wide);
   3191       emit_insn(t, WASM_INSN_I64_CONST, 32);
   3192       emit_insn(t, WASM_INSN_I64_SHR_U, 0);
   3193       emit_insn(t, WASM_INSN_I64_CONST, 0);
   3194       emit_insn(t, WASM_INSN_I64_NE, 0);
   3195       emit_local_set(t, w->dst2, bool_ty, RC_INT);
   3196       break;
   3197     }
   3198     case INTRIN_SMUL_OVERFLOW: {
   3199       /* i32 only. Sign-extend, multiply, low 32 = result, ovf if
   3200        * sext(result) != wide product. */
   3201       u32 wide = add_wasm_local(t, WASM_VAL_I64);
   3202       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)a_loc);
   3203       emit_insn(t, WASM_INSN_I64_EXTEND_I32_S, 0);
   3204       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)b_loc);
   3205       emit_insn(t, WASM_INSN_I64_EXTEND_I32_S, 0);
   3206       emit_insn(t, WASM_INSN_I64_MUL, 0);
   3207       emit_insn(t, WASM_INSN_LOCAL_SET, (i64)wide);
   3208 
   3209       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)wide);
   3210       emit_insn(t, WASM_INSN_I32_WRAP_I64, 0);
   3211       emit_local_set(t, w->dst, w->type, (RegClass)w->cls);
   3212 
   3213       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)wide);
   3214       emit_push_operand_reg(t, w->dst);
   3215       emit_insn(t, WASM_INSN_I64_EXTEND_I32_S, 0);
   3216       emit_insn(t, WASM_INSN_I64_NE, 0);
   3217       emit_local_set(t, w->dst2, bool_ty, RC_INT);
   3218       break;
   3219     }
   3220     default:
   3221       wfail(t, "wasm: overflow intrinsic dispatch reached default (%d)",
   3222             (int)k);
   3223   }
   3224 }
   3225 
   3226 static void emit_intrinsic(WTarget* t, const WIR* w) {
   3227   IntrinKind k = (IntrinKind)w->cgop;
   3228   switch (k) {
   3229     case INTRIN_CLZ:
   3230     case INTRIN_CTZ:
   3231     case INTRIN_POPCOUNT:
   3232       emit_intrinsic_bit_op(t, w);
   3233       return;
   3234     case INTRIN_BSWAP:
   3235       emit_intrinsic_bswap(t, w);
   3236       return;
   3237     case INTRIN_SADD_OVERFLOW:
   3238     case INTRIN_UADD_OVERFLOW:
   3239     case INTRIN_SSUB_OVERFLOW:
   3240     case INTRIN_USUB_OVERFLOW:
   3241     case INTRIN_SMUL_OVERFLOW:
   3242     case INTRIN_UMUL_OVERFLOW:
   3243       emit_intrinsic_overflow(t, w);
   3244       return;
   3245     default:
   3246       wfail(t, "wasm: unexpected intrinsic kind %d in linearizer", (int)k);
   3247   }
   3248 }
   3249 
   3250 static void linearize_range(WTarget* t, LoweringState* L, u32 start, u32 end);
   3251 
#if 0 /* Switch-island matcher: replaced by wasm_structurize's          \
       * unroll_switch_islands, which reorders the WIR in-place so case \
       * labels become forward refs handled by the general structurer. */
/* Disabled code: everything up to the matching #endif is compiled out and is
 * kept only for reference. */

/* Linear membership test: returns 1 when `l` occurs among the first `nlabels`
 * entries of `labels`, 0 otherwise. */
static int label_in_list(Label l, const Label* labels, u32 nlabels) {
  for (u32 i = 0; i < nlabels; ++i) {
    if (labels[i] == l) return 1;
  }
  return 0;
}

/* Try to lower a "switch island" starting at the jump at t->wir[*ip].
 *
 * The recognised layout is: a jump to a forward "dispatch" label placed later
 * in the WIR, case bodies located between the jump and that label, and the
 * first WIR_SWITCH found after the dispatch label. On a match the dispatch
 * code and the switch are emitted first, inside one wasm block per body
 * label, followed by the body segments in WIR order; *ip is advanced to the
 * switch's index and 1 is returned. Returns 0, emitting nothing, when the
 * shape is not recognised. Each label set is capped at 64 entries; exceeding
 * a cap is reported through wfail. */
static int try_linearize_switch_island(WTarget* t, LoweringState* L, u32* ip) {
  WIR* jump = &t->wir[*ip];
  WLabel* dispatch = lookup_label(t, jump->labels[0]);
  u32 dispatch_i;
  u32 switch_i = UINT32_MAX;
  WIR* sw;
  Label target_labels[64];
  Label body_labels[64];
  Label end_labels[64];
  u32 ntarget_labels = 0;
  u32 nbody_labels = 0;
  u32 nend_labels = 0;
  u32 end_remap_mark;
  u32 case_remap_mark;
  Label synthetic_end = LABEL_NONE;

  /* The jump must target a placed forward label located after the jump, and
   * the WIR entry at that index must be the label itself. */
  if (!dispatch || dispatch->kind != WLBL_FORWARD || !dispatch->placed ||
      dispatch->wir_index <= *ip)
    return 0;
  dispatch_i = dispatch->wir_index;
  if (dispatch_i >= t->nwir || t->wir[dispatch_i].op != WIR_LABEL)
    return 0;
  /* Locate the first switch after the dispatch label. */
  for (u32 i = dispatch_i + 1u; i < t->nwir; ++i) {
    if (t->wir[i].op == WIR_SWITCH) {
      switch_i = i;
      break;
    }
  }
  if (switch_i == UINT32_MAX) return 0;

  /* Gather the distinct labels the switch can branch to: every case label,
   * plus the default when it is set. */
  sw = &t->wir[switch_i];
  for (u32 i = 0; i < sw->switch_ncases; ++i) {
    Label l = sw->switch_cases[i].label;
    if (!label_in_list(l, target_labels, ntarget_labels)) {
      if (ntarget_labels >= 64u) wfail(t, "wasm: too many switch targets");
      target_labels[ntarget_labels++] = l;
    }
  }
  if (sw->labels[0] != LABEL_NONE &&
      !label_in_list(sw->labels[0], target_labels, ntarget_labels)) {
    if (ntarget_labels >= 64u) wfail(t, "wasm: too many switch targets");
    target_labels[ntarget_labels++] = sw->labels[0];
  }

  /* Body labels: switch targets that are placed between the jump and the
   * dispatch label, recorded in WIR order. */
  for (u32 i = *ip + 1u; i < dispatch_i; ++i) {
    if (t->wir[i].op != WIR_LABEL) continue;
    Label l = t->wir[i].labels[0];
    if (label_in_list(l, target_labels, ntarget_labels) &&
        !label_in_list(l, body_labels, nbody_labels)) {
      if (nbody_labels >= 64u) wfail(t, "wasm: too many switch body labels");
      body_labels[nbody_labels++] = l;
    }
  }
  /* Exit labels: targets of jumps / compare-branches inside the bodies that
   * are not body labels and resolve to placed forward labels located after
   * the switch. */
  for (u32 i = *ip + 1u; i < dispatch_i; ++i) {
    Label l = LABEL_NONE;
    WLabel* lbl;
    if (t->wir[i].op == WIR_JUMP || t->wir[i].op == WIR_CMP_BRANCH) {
      l = t->wir[i].labels[0];
    }
    if (l == LABEL_NONE || label_in_list(l, body_labels, nbody_labels))
      continue;
    lbl = lookup_label(t, l);
    if (!lbl || lbl->kind != WLBL_FORWARD || !lbl->placed ||
        lbl->wir_index <= switch_i)
      continue;
    if (!label_in_list(l, end_labels, nend_labels)) {
      if (nend_labels >= 64u) wfail(t, "wasm: too many switch exit labels");
      end_labels[nend_labels++] = l;
    }
  }
  if (nbody_labels == 0) return 0;

  /* A default that is not one of the bodies is treated as leaving the
   * island: it is mapped onto the outermost block below. */
  if (!label_in_list(sw->labels[0], body_labels, nbody_labels))
    synthetic_end = sw->labels[0];

  /* Outermost block: branching to it leaves the whole island. */
  emit_insn(t, WASM_INSN_BLOCK, 0);
  L->cur_depth++;
  end_remap_mark = L->nremaps;
  if (synthetic_end != LABEL_NONE)
    lowering_push_remap(L, synthetic_end, L->cur_depth);
  for (u32 i = 0; i < nend_labels; ++i) {
    lowering_push_remap(L, end_labels[i], L->cur_depth);
  }

  /* One nested block per body label, opened in reverse order so the first
   * body's block is the innermost one. */
  for (u32 ri = nbody_labels; ri > 0; --ri) {
    Label l = body_labels[ri - 1u];
    emit_insn(t, WASM_INSN_BLOCK, 0);
    L->cur_depth++;
    lowering_push_remap(L, l, L->cur_depth);
  }

  /* Emit the dispatch code and the switch inside the innermost block, then
   * drop the per-body remaps while keeping the exit remaps pushed above. */
  linearize_range(t, L, dispatch_i + 1u, switch_i);
  case_remap_mark = end_remap_mark + nend_labels +
                    (synthetic_end != LABEL_NONE ? 1u : 0u);
  emit_switch_br_table(t, L, sw);
  lowering_pop_remaps(L, case_remap_mark);

  /* Close each body's block and emit that body right after it. A segment
   * runs from just past its label to the next body label, or to the dispatch
   * label for the last body. */
  for (u32 bi = 0; bi < nbody_labels; ++bi) {
    u32 seg_start;
    u32 seg_end = dispatch_i;
    WLabel* lbl = lookup_label(t, body_labels[bi]);
    if (!lbl) wfail(t, "wasm: switch body label disappeared");
    emit_insn(t, WASM_INSN_END, 0);
    L->cur_depth--;
    seg_start = lbl->wir_index + 1u;
    if (bi + 1u < nbody_labels) {
      WLabel* next = lookup_label(t, body_labels[bi + 1u]);
      if (!next) wfail(t, "wasm: switch body label disappeared");
      seg_end = next->wir_index;
    }
    linearize_range(t, L, seg_start, seg_end);
  }

  /* Close the outermost block, drop the exit remaps, and report the switch's
   * index back through *ip. */
  emit_insn(t, WASM_INSN_END, 0);
  L->cur_depth--;
  lowering_pop_remaps(L, end_remap_mark);
  *ip = switch_i;
  return 1;
}
#endif
   3382 
   3383 static void linearize_range(WTarget* t, LoweringState* L, u32 start, u32 end) {
   3384   for (u32 i = start; i < end; ++i) {
   3385     WIR* w = &t->wir[i];
   3386     switch (w->op) {
   3387       case WIR_LOAD_IMM: {
   3388         WasmValType vt = type_valtype(t, w->type);
   3389         emit_push_imm(t, vt, w->imm);
   3390         emit_local_set(t, w->dst, w->type, (RegClass)w->cls);
   3391         break;
   3392       }
   3393       case WIR_LOAD_CONST_F: {
   3394         WasmValType vt = type_valtype(t, w->type);
   3395         emit_fp(t,
   3396                 vt == WASM_VAL_F64 ? WASM_INSN_F64_CONST : WASM_INSN_F32_CONST,
   3397                 w->fp_imm);
   3398         emit_local_set(t, w->dst, w->type, (RegClass)w->cls);
   3399         break;
   3400       }
   3401       case WIR_COPY: {
   3402         emit_push_operand_reg(t, w->a);
   3403         emit_local_set(t, w->dst, w->type, (RegClass)w->cls);
   3404         break;
   3405       }
   3406       case WIR_BINOP: {
   3407         emit_push_operand(t, w->imm_kind, w->imm_a, w->a, w->type);
   3408         emit_push_operand(t, w->imm_kind_b, w->imm_b, w->b, w->type);
   3409         WasmValType vt = type_valtype(t, w->type);
   3410         emit_insn(t, binop_kind(t, (BinOp)w->cgop, vt), 0);
   3411         emit_local_set(t, w->dst, w->type, (RegClass)w->cls);
   3412         break;
   3413       }
   3414       case WIR_UNOP: {
   3415         WasmValType vt = type_valtype(t, w->type);
   3416         switch ((UnOp)w->cgop) {
   3417           case UO_NEG: {
   3418             /* 0 - a */
   3419             emit_push_imm(t, vt, 0);
   3420             emit_push_operand(t, w->imm_kind, w->imm_a, w->a, w->type);
   3421             emit_insn(
   3422                 t, vt == WASM_VAL_I64 ? WASM_INSN_I64_SUB : WASM_INSN_I32_SUB,
   3423                 0);
   3424             break;
   3425           }
   3426           case UO_FNEG: {
   3427             emit_push_operand(t, w->imm_kind, w->imm_a, w->a, w->type);
   3428             emit_insn(
   3429                 t, vt == WASM_VAL_F64 ? WASM_INSN_F64_NEG : WASM_INSN_F32_NEG,
   3430                 0);
   3431             break;
   3432           }
   3433           case UO_BNOT: {
   3434             /* a XOR -1 */
   3435             emit_push_operand(t, w->imm_kind, w->imm_a, w->a, w->type);
   3436             emit_push_imm(t, vt, -1);
   3437             emit_insn(
   3438                 t, vt == WASM_VAL_I64 ? WASM_INSN_I64_XOR : WASM_INSN_I32_XOR,
   3439                 0);
   3440             break;
   3441           }
   3442           case UO_NOT: {
   3443             /* a == 0 — i{32,64}.eqz always produces an i32 0/1. When the CG
   3444              * destination is i64 (e.g. !x where x was zext'd to i64 before the
   3445              * negation), widen the i32 boolean back to i64 so the following
   3446              * local.set is well-typed. */
   3447             emit_push_operand(t, w->imm_kind, w->imm_a, w->a, w->type);
   3448             emit_insn(
   3449                 t, vt == WASM_VAL_I64 ? WASM_INSN_I64_EQZ : WASM_INSN_I32_EQZ,
   3450                 0);
   3451             if (vt == WASM_VAL_I64)
   3452               emit_insn(t, WASM_INSN_I64_EXTEND_I32_U, 0);
   3453             break;
   3454           }
   3455         }
   3456         emit_local_set(t, w->dst, w->type, (RegClass)w->cls);
   3457         break;
   3458       }
   3459       case WIR_CMP: {
   3460         CmpOp cop = (CmpOp)w->cgop;
   3461         if (cop >= CMP_OEQ_F) {
   3462           emit_fp_cmp(t, cop, w, w->type2);
   3463         } else {
   3464           push_cmp_operands(t, w, w->type2);
   3465           emit_insn(t, cmp_kind(t, cop, type_valtype(t, w->type2)), 0);
   3466         }
   3467         /* cmp result is i32 (0/1). dst type may be wider — but cg generally
   3468          * stores cmp results into i32. */
   3469         emit_local_set(t, w->dst, w->type, (RegClass)w->cls);
   3470         break;
   3471       }
   3472       case WIR_CONVERT: {
   3473         WasmValType src = type_valtype(t, w->type2);
   3474         WasmValType dst = type_valtype(t, w->type);
   3475         u32 sw = kit_cg_type_int_width((KitCompiler*)t->c, w->type2);
   3476         u32 dw = kit_cg_type_int_width((KitCompiler*)t->c, w->type);
   3477         emit_push_operand(t, w->imm_kind, w->imm_a, w->a, w->type2);
   3478         emit_convert(t, (ConvKind)w->cgop, src, dst, sw, dw);
   3479         emit_local_set(t, w->dst, w->type, (RegClass)w->cls);
   3480         break;
   3481       }
   3482       case WIR_CALL:
   3483       case WIR_CALL_INDIRECT: {
   3484         /* Tail calls tear down the caller's wasm frame (return_call /
   3485          * return_call_indirect have polymorphic-unreachable type after the
   3486          * call). Mirror the WIR_RET linear-stack epilogue before pushing
   3487          * args so the linear-memory stack frame is released. Operands
   3488          * below come from wasm locals (incoming params or reg-locals),
   3489          * not from the linear stack we just freed. Variadic tail calls are
   3490          * rejected upstream; sret tail calls forward the incoming pointer. */
   3491         if (w->call_tail && t->has_stack_frame) {
   3492           emit_insn(t, WASM_INSN_LOCAL_GET, (i64)t->frame_saved_sp_local);
   3493           emit_insn(t, WASM_INSN_GLOBAL_SET, (i64)t->stack_pointer_global);
   3494         }
   3495         if (w->call_has_sret) {
   3496           if (w->call_tail) {
   3497             /* Forward this function's own incoming sret pointer: the callee
   3498              * writes the same buffer (in our caller's frame, which outlives
   3499              * the sibling call) and return_calls back. The pointer is a wasm
   3500              * local, unaffected by the linear-frame teardown above. */
   3501             if (t->sret_param_local == 0xffffffffu)
   3502               wfail(t, "wasm: sret tail call without an incoming sret pointer");
   3503             emit_insn(t, WASM_INSN_LOCAL_GET, (i64)t->sret_param_local);
   3504           } else {
   3505             /* Push sret pointer (address of caller-allocated buffer). */
   3506             uint64_t off;
   3507             emit_addr_operand(t, w->call_sret_addr, &off);
   3508             if (off) {
   3509               emit_insn(t, WASM_INSN_I32_CONST, (i64)off);
   3510               emit_insn(t, WASM_INSN_I32_ADD, 0);
   3511             }
   3512           }
   3513         }
   3514         for (u32 a = 0; a < w->call_narg; ++a) {
   3515           if (w->call_arg_kinds[a] == WOP_REG) {
   3516             emit_push_operand_reg(t, w->call_args[a]);
   3517           } else if (w->call_arg_kinds[a] == WOP_IMM) {
   3518             WasmValType vt = type_valtype(t, w->call_arg_types[a]);
   3519             emit_push_imm(t, vt, w->call_arg_imms[a]);
   3520           } else if (w->call_arg_kinds[a] == WOP_ADDR) {
   3521             uint64_t off;
   3522             emit_addr_operand(t, w->call_arg_addrs[a], &off);
   3523             if (off) {
   3524               emit_insn(t, WASM_INSN_I32_CONST, (i64)off);
   3525               emit_insn(t, WASM_INSN_I32_ADD, 0);
   3526             }
   3527           } else {
   3528             wfail(t, "wasm: bad call-arg kind %u", w->call_arg_kinds[a]);
   3529           }
   3530         }
   3531         /* Variadic packing. Each variadic arg occupies an 8-byte slot in a
   3532          * caller-allocated linear-memory buffer; the buffer's address is
   3533          * passed as the hidden trailing i32. We save __stack_pointer to a
   3534          * scratch local before allocating the buffer and restore it after
   3535          * the call returns, so a variadic call in a loop doesn't grow the
   3536          * linear stack. See wasm_va_start / wasm_va_arg for the callee side.
   3537          */
   3538         if (w->call_variadic) {
   3539           if (w->call_nvar == 0u) {
   3540             /* No varargs: still pass a hidden i32. NULL is fine — the callee
   3541              * must not deref va_list without a matching @va_arg, which a
   3542              * well-typed program won't do. */
   3543             emit_insn(t, WASM_INSN_I32_CONST, 0);
   3544           } else {
   3545             ensure_stack_pointer(t);
   3546             if (t->varcall_saved_sp_local == 0xffffffffu)
   3547               t->varcall_saved_sp_local = add_wasm_local(t, WASM_VAL_I32);
   3548             if (t->varcall_buf_local == 0xffffffffu)
   3549               t->varcall_buf_local = add_wasm_local(t, WASM_VAL_I32);
   3550             u32 buf_size = w->call_nvar * 8u;
   3551             /* Save SP, allocate aligned buffer, set SP = buf. */
   3552             emit_insn(t, WASM_INSN_GLOBAL_GET, (i64)t->stack_pointer_global);
   3553             emit_insn(t, WASM_INSN_LOCAL_TEE, (i64)t->varcall_saved_sp_local);
   3554             emit_insn(t, WASM_INSN_I32_CONST, (i64)buf_size);
   3555             emit_insn(t, WASM_INSN_I32_SUB, 0);
   3556             emit_insn(t, WASM_INSN_I32_CONST, -(i64)8);
   3557             emit_insn(t, WASM_INSN_I32_AND, 0);
   3558             emit_insn(t, WASM_INSN_LOCAL_TEE, (i64)t->varcall_buf_local);
   3559             emit_insn(t, WASM_INSN_GLOBAL_SET, (i64)t->stack_pointer_global);
   3560             /* Pack each variadic arg at offset i*8. Store width is the
   3561              * value's natural width (i32/i64/f32/f64); the unused high
   3562              * bytes of i32/f32 slots are left as whatever __stack_pointer
   3563              * pointed at, which @va_arg won't read for those slots. */
   3564             for (u32 v = 0; v < w->call_nvar; ++v) {
   3565               KitCgTypeId vty = w->call_var_types[v];
   3566               WasmValType vvt = type_valtype(t, vty);
   3567               WasmInsnKind store_op;
   3568               u32 width;
   3569               if (vvt == WASM_VAL_I64) {
   3570                 store_op = WASM_INSN_I64_STORE;
   3571                 width = 8u;
   3572               } else if (vvt == WASM_VAL_F32) {
   3573                 store_op = WASM_INSN_F32_STORE;
   3574                 width = 4u;
   3575               } else if (vvt == WASM_VAL_F64) {
   3576                 store_op = WASM_INSN_F64_STORE;
   3577                 width = 8u;
   3578               } else {
   3579                 store_op = WASM_INSN_I32_STORE;
   3580                 width = 4u;
   3581               }
   3582               emit_insn(t, WASM_INSN_LOCAL_GET, (i64)t->varcall_buf_local);
   3583               if (w->call_var_kinds[v] == WOP_REG) {
   3584                 emit_push_operand_reg(t, w->call_var_regs[v]);
   3585               } else if (w->call_var_kinds[v] == WOP_IMM) {
   3586                 if (vvt == WASM_VAL_F32 || vvt == WASM_VAL_F64)
   3587                   wfail(t, "wasm: float immediate variadic arg unsupported");
   3588                 emit_push_imm(t, vvt, w->call_var_imms[v]);
   3589               } else {
   3590                 wfail(t, "wasm: bad variadic-arg kind %u",
   3591                       w->call_var_kinds[v]);
   3592               }
   3593               wasm_func_add_mem_insn(t->c, t->module, t->cur_func, store_op,
   3594                                      memarg_align_log2(width, width),
   3595                                      (u64)(v * 8u), 0u);
   3596             }
   3597             /* Push buf addr as hidden last arg. */
   3598             emit_insn(t, WASM_INSN_LOCAL_GET, (i64)t->varcall_buf_local);
   3599           }
   3600         }
   3601         if (w->op == WIR_CALL_INDIRECT) {
   3602           /* Callee: push the i32 table index. */
   3603           emit_push_operand_reg(t, w->a);
   3604           /* call_indirect / return_call_indirect both encode
   3605            * (typeidx, tableidx). The encoder reads `imm` as typeidx and
   3606            * `align` as tableidx. */
   3607           wasm_func_add_insn(t->c, t->module, t->cur_func,
   3608                              w->call_tail ? WASM_INSN_RETURN_CALL_INDIRECT
   3609                                           : WASM_INSN_CALL_INDIRECT,
   3610                              w->imm);
   3611           t->cur_func->insns[t->cur_func->ninsns - 1u].align = 0u;
   3612         } else {
   3613           u32 idx = sym_to_wasm_func(t, w->call_sym, NULL);
   3614           emit_insn(t, w->call_tail ? WASM_INSN_RETURN_CALL : WASM_INSN_CALL,
   3615                     (i64)idx);
   3616         }
   3617         /* Tail calls never return to this function: the operand stack is
   3618          * polymorphic-unreachable after return_call*, so writing dst or
   3619          * restoring the variadic stack pointer would be dead and would
   3620          * also corrupt stack typing. (Variadic tail calls are rejected
   3621          * upstream, so the variadic SP-restore guard is defensive.) */
   3622         if (!w->call_tail) {
   3623           if (w->dst != REG_NONE) {
   3624             emit_local_set(t, w->dst, w->type, (RegClass)w->cls);
   3625           }
   3626           /* Restore SP after variadic call so loop-resident variadic calls
   3627            * don't accumulate stack usage. Done after stashing the return
   3628            * value into its local. */
   3629           if (w->call_variadic && w->call_nvar) {
   3630             emit_insn(t, WASM_INSN_LOCAL_GET, (i64)t->varcall_saved_sp_local);
   3631             emit_insn(t, WASM_INSN_GLOBAL_SET, (i64)t->stack_pointer_global);
   3632           }
   3633         }
   3634         break;
   3635       }
   3636       case WIR_RET: {
   3637         if (w->cgop == 1) {
   3638           /* Aggregate sret return: memcpy w->addr -> *sret_param, then
   3639            * void return. The sret pointer was the hidden first wasm param.
   3640            * NOTE: this still uses a byte loop rather than memory.copy so
   3641            * the produced module remains loadable by the kit runtime
   3642            * before the wasm-core default-feature change lands. The path
   3643            * will collapse to memory.copy once the core's default feature
   3644            * set includes WASM_FEATURE_BULK_MEMORY (subagent A). */
   3645           if (t->sret_param_local == 0xffffffffu)
   3646             wfail(t, "wasm: sret return without hidden sret param");
   3647           for (u32 n = 0; n < w->agg.size; ++n) {
   3648             /* Push destination address (sret_ptr) onto stack. The memarg
   3649              * offset on the i32.store8 carries the per-byte offset. */
   3650             emit_insn(t, WASM_INSN_LOCAL_GET, (i64)t->sret_param_local);
   3651             /* Load src byte at (w->addr + n). */
   3652             Operand src = w->addr;
   3653             if (src.kind == OPK_INDIRECT)
   3654               src.v.ind.ofs += (i32)n;
   3655             else if (src.kind == OPK_GLOBAL)
   3656               src.v.global.addend += n;
   3657             uint64_t src_off;
   3658             emit_addr_operand(t, src, &src_off);
   3659             if (src.kind == OPK_LOCAL) src_off += n;
   3660             wasm_func_add_mem_insn(t->c, t->module, t->cur_func,
   3661                                    WASM_INSN_I32_LOAD8_U, 0, src_off, 0);
   3662             wasm_func_add_mem_insn(t->c, t->module, t->cur_func,
   3663                                    WASM_INSN_I32_STORE8, 0, n, 0);
   3664           }
   3665         } else if (w->dst != REG_NONE)
   3666           emit_push_operand_reg(t, w->dst);
   3667         else if (w->imm_kind == 1) {
   3668           WasmValType vt = type_valtype(t, w->type);
   3669           emit_push_imm(t, vt, w->imm_a);
   3670         }
   3671         if (t->has_stack_frame) {
   3672           emit_insn(t, WASM_INSN_LOCAL_GET, (i64)t->frame_saved_sp_local);
   3673           emit_insn(t, WASM_INSN_GLOBAL_SET, (i64)t->stack_pointer_global);
   3674         }
   3675         emit_insn(t, WASM_INSN_RETURN, 0);
   3676         break;
   3677       }
   3678       case WIR_UNREACHABLE: {
   3679         emit_insn(t, WASM_INSN_UNREACHABLE, 0);
   3680         break;
   3681       }
   3682       case WIR_LOAD_LOCAL: {
   3683         u32 wli = (u32)w->imm;
   3684         emit_insn(t, WASM_INSN_LOCAL_GET, (i64)wli);
   3685         emit_local_set(t, w->dst, w->type, (RegClass)w->cls);
   3686         break;
   3687       }
   3688       case WIR_STORE_LOCAL: {
   3689         u32 wli = (u32)w->imm;
   3690         if (w->imm_kind == 1) {
   3691           WasmValType vt = type_valtype(t, w->type);
   3692           emit_push_imm(t, vt, w->imm_a);
   3693         } else {
   3694           emit_push_operand_reg(t, w->a);
   3695         }
   3696         emit_insn(t, WASM_INSN_LOCAL_SET, (i64)wli);
   3697         break;
   3698       }
   3699       case WIR_LOAD_MEM: {
   3700         emit_load_addr(t, w->addr, w->type, w->mem);
   3701         emit_local_set(t, w->dst, w->type, (RegClass)w->cls);
   3702         break;
   3703       }
   3704       case WIR_STORE_MEM: {
   3705         Operand src;
   3706         memset(&src, 0, sizeof src);
   3707         src.kind = w->imm_kind == WOP_IMM ? OPK_IMM : OPK_REG;
   3708         src.type = w->type;
   3709         if (src.kind == OPK_IMM)
   3710           src.v.imm = w->imm_a;
   3711         else
   3712           src.v.reg = w->a;
   3713         emit_store_addr(t, w->addr, w->type, src, w->mem, w->imm_kind, w->imm_a,
   3714                         w->a);
   3715         break;
   3716       }
   3717       case WIR_ADDR_OF: {
   3718         uint64_t offset;
   3719         emit_addr_operand(t, w->addr, &offset);
   3720         if (offset) {
   3721           emit_insn(t, WASM_INSN_I32_CONST, (i64)offset);
   3722           emit_insn(t, WASM_INSN_I32_ADD, 0);
   3723         }
   3724         emit_local_set(t, w->dst, w->type, (RegClass)w->cls);
   3725         break;
   3726       }
   3727       case WIR_ALLOCA: {
   3728         u32 align = (u32)w->imm;
   3729         emit_insn(t, WASM_INSN_GLOBAL_GET, (i64)t->stack_pointer_global);
   3730         emit_push_operand(t, w->imm_kind, w->imm_a, w->a,
   3731                           w->type2 ? w->type2 : builtin_id(KIT_CG_BUILTIN_I32));
   3732         if (w->type2 && type_valtype(t, w->type2) == WASM_VAL_I64)
   3733           emit_insn(t, WASM_INSN_I32_WRAP_I64, 0);
   3734         emit_insn(t, WASM_INSN_I32_SUB, 0);
   3735         if (align > 1u) {
   3736           emit_insn(t, WASM_INSN_I32_CONST, -(i64)align);
   3737           emit_insn(t, WASM_INSN_I32_AND, 0);
   3738         }
   3739         emit_insn(t, WASM_INSN_LOCAL_TEE,
   3740                   (i64)reg_local(t, w->dst, w->type, (RegClass)w->cls));
   3741         emit_insn(t, WASM_INSN_GLOBAL_SET, (i64)t->stack_pointer_global);
   3742         break;
   3743       }
   3744       case WIR_COPY_BYTES: {
   3745         /* memory.copy: stack = dst_addr, src_addr, n; both memidx fields = 0.
   3746          */
   3747         Operand src_addr;
   3748         if (w->imm_kind != WOP_REG)
   3749           wfail(t, "wasm: copy_bytes source must be a register pointer");
   3750         memset(&src_addr, 0, sizeof src_addr);
   3751         src_addr.kind = OPK_INDIRECT;
   3752         src_addr.type = w->addr.type;
   3753         src_addr.v.ind.base = w->a;
   3754         src_addr.v.ind.index = REG_NONE;
   3755         src_addr.v.ind.log2_scale = 0;
   3756         src_addr.v.ind.ofs = 0;
   3757         if (w->agg.size == 0) break;
   3758         emit_push_addr_value(t, w->addr);
   3759         emit_push_addr_value(t, src_addr);
   3760         emit_insn(t, WASM_INSN_I32_CONST, (i64)(uint32_t)w->agg.size);
   3761         wasm_func_add_insn(t->c, t->module, t->cur_func, WASM_INSN_MEMORY_COPY,
   3762                            0);
   3763         /* dst memidx = 0, src memidx = 0 (kit-cc single-memory module). */
   3764         t->cur_func->insns[t->cur_func->ninsns - 1u].memidx = 0;
   3765         t->cur_func->insns[t->cur_func->ninsns - 1u].aux_idx = 0;
   3766         break;
   3767       }
   3768       case WIR_SET_BYTES: {
   3769         /* memory.fill: stack = dst_addr, value_i32, n; memidx = 0. */
   3770         if (w->imm_kind != WOP_IMM)
   3771           wfail(t, "wasm: set_bytes value must be immediate in v1");
   3772         if (w->agg.size == 0) break;
   3773         emit_push_addr_value(t, w->addr);
   3774         emit_insn(t, WASM_INSN_I32_CONST, (i64)(w->imm_a & 0xff));
   3775         emit_insn(t, WASM_INSN_I32_CONST, (i64)(uint32_t)w->agg.size);
   3776         wasm_func_add_insn(t->c, t->module, t->cur_func, WASM_INSN_MEMORY_FILL,
   3777                            0);
   3778         t->cur_func->insns[t->cur_func->ninsns - 1u].memidx = 0;
   3779         break;
   3780       }
   3781       case WIR_ATOMIC_LOAD: {
   3782         WasmInsnKind k = atomic_load_kind_for(t, w->type, w->mem);
   3783         u32 width = wasm_mem_width((uint8_t)k);
   3784         emit_push_operand_reg(t, w->a);
   3785         wasm_func_add_mem_insn(t->c, t->module, t->cur_func, k,
   3786                                memarg_align_log2(w->mem.align, width), 0, 0);
   3787         emit_local_set(t, w->dst, w->type, (RegClass)w->cls);
   3788         break;
   3789       }
   3790       case WIR_ATOMIC_STORE: {
   3791         WasmInsnKind k = atomic_store_kind_for(t, w->type, w->mem);
   3792         u32 width = wasm_mem_width((uint8_t)k);
   3793         emit_push_operand_reg(t, w->a);
   3794         emit_push_operand(t, w->imm_kind_b, w->imm_b, w->b, w->type);
   3795         wasm_func_add_mem_insn(t->c, t->module, t->cur_func, k,
   3796                                memarg_align_log2(w->mem.align, width), 0, 0);
   3797         break;
   3798       }
   3799       case WIR_ATOMIC_RMW: {
   3800         if ((KitCgAtomicOp)w->cgop == KIT_CG_ATOMIC_NAND) {
   3801           /* wasm-threads has no atomic.rmw.nand. Expand to a cmpxchg retry
   3802            * loop computing desired = ~(old & val):
   3803            *   loop
   3804            *     old = atomic.load(addr)            ; tee into old_local
   3805            *     desired = (old & val) ^ -1
   3806            *     got = atomic.rmw.cmpxchg(addr, old, desired)
   3807            *     br_if loop  (got != old)           ; lost the race, retry
   3808            *   end
   3809            *   dst = old_local                      ; fetch returns prior value
   3810            */
   3811           WasmValType vt = type_valtype(t, w->type);
   3812           WasmInsnKind load_k = atomic_load_kind_for(t, w->type, w->mem);
   3813           WasmInsnKind cas_k = atomic_cmpxchg_kind_for(t, w->type, w->mem);
   3814           u32 load_w = wasm_mem_width((uint8_t)load_k);
   3815           u32 cas_w = wasm_mem_width((uint8_t)cas_k);
   3816           int is64 = (vt == WASM_VAL_I64);
   3817           u32 old_local = add_wasm_local(t, vt);
   3818           emit_insn(t, WASM_INSN_LOOP, 0);
   3819           /* addr (cmpxchg arg0) */
   3820           emit_push_operand_reg(t, w->a);
   3821           /* expected = atomic.load(addr), tee into old_local */
   3822           emit_push_operand_reg(t, w->a);
   3823           wasm_func_add_mem_insn(t->c, t->module, t->cur_func, load_k,
   3824                                  memarg_align_log2(w->mem.align, load_w), 0, 0);
   3825           emit_insn(t, WASM_INSN_LOCAL_TEE, (i64)old_local);
   3826           /* desired = (old & val) ^ -1 */
   3827           emit_insn(t, WASM_INSN_LOCAL_GET, (i64)old_local);
   3828           emit_push_operand(t, w->imm_kind_b, w->imm_b, w->b, w->type);
   3829           emit_insn(t, is64 ? WASM_INSN_I64_AND : WASM_INSN_I32_AND, 0);
   3830           emit_push_imm(t, vt, -1);
   3831           emit_insn(t, is64 ? WASM_INSN_I64_XOR : WASM_INSN_I32_XOR, 0);
   3832           /* cmpxchg -> value previously in memory */
   3833           wasm_func_add_mem_insn(t->c, t->module, t->cur_func, cas_k,
   3834                                  memarg_align_log2(w->mem.align, cas_w), 0, 0);
   3835           /* retry if memory had changed (got != expected) */
   3836           emit_insn(t, WASM_INSN_LOCAL_GET, (i64)old_local);
   3837           emit_insn(t, is64 ? WASM_INSN_I64_NE : WASM_INSN_I32_NE, 0);
   3838           emit_insn(t, WASM_INSN_BR_IF, 0); /* 0 = innermost loop */
   3839           emit_insn(t, WASM_INSN_END, 0);
   3840           emit_insn(t, WASM_INSN_LOCAL_GET, (i64)old_local);
   3841           emit_local_set(t, w->dst, w->type, (RegClass)w->cls);
   3842           break;
   3843         }
   3844         WasmInsnKind k =
   3845             atomic_rmw_kind_for(t, (KitCgAtomicOp)w->cgop, w->type, w->mem);
   3846         u32 width = wasm_mem_width((uint8_t)k);
   3847         emit_push_operand_reg(t, w->a);
   3848         emit_push_operand(t, w->imm_kind_b, w->imm_b, w->b, w->type);
   3849         wasm_func_add_mem_insn(t->c, t->module, t->cur_func, k,
   3850                                memarg_align_log2(w->mem.align, width), 0, 0);
   3851         emit_local_set(t, w->dst, w->type, (RegClass)w->cls);
   3852         break;
   3853       }
   3854       case WIR_ATOMIC_CAS: {
   3855         WasmInsnKind k = atomic_cmpxchg_kind_for(t, w->type, w->mem);
   3856         WasmValType vt = type_valtype(t, w->type);
   3857         u32 width = wasm_mem_width((uint8_t)k);
   3858         /* Save expected into a fresh wasm local before consuming inputs. CG
   3859          * may reuse one of (addr, expected, desired) regs for prior or ok;
   3860          * reg_local() for w->dst/w->dst2 would then rebind that reg's local
   3861          * mid-stream, and re-pushing expected via the (now-stale) mapping
   3862          * would read an uninitialized local. The temp sidesteps that. */
   3863         u32 saved_expected = add_wasm_local(t, vt);
   3864         /* push addr; expected (tee into saved-expected, leaves on stack);
   3865          * desired; cmpxchg -> prior on stack. */
   3866         emit_push_operand_reg(t, w->a);
   3867         emit_push_operand(t, w->imm_kind_b, w->imm_b, w->b, w->type);
   3868         emit_insn(t, WASM_INSN_LOCAL_TEE, (i64)saved_expected);
   3869         emit_push_operand(t, w->imm_kind_c, w->imm_c, w->op_c, w->type);
   3870         wasm_func_add_mem_insn(t->c, t->module, t->cur_func, k,
   3871                                memarg_align_log2(w->mem.align, width), 0, 0);
   3872         /* All input regs have been consumed; safe to rebind. */
   3873         u32 prior_local = reg_local(t, w->dst, w->type, (RegClass)w->cls);
   3874         emit_insn(t, WASM_INSN_LOCAL_TEE, (i64)prior_local);
   3875         emit_insn(t, WASM_INSN_LOCAL_GET, (i64)saved_expected);
   3876         emit_insn(t, vt == WASM_VAL_I64 ? WASM_INSN_I64_EQ : WASM_INSN_I32_EQ,
   3877                   0);
   3878         emit_local_set(t, w->dst2,
   3879                        w->type2 ? w->type2 : builtin_id(KIT_CG_BUILTIN_BOOL),
   3880                        RC_INT);
   3881         break;
   3882       }
   3883       case WIR_FENCE: {
   3884         emit_insn(t, WASM_INSN_ATOMIC_FENCE, 0);
   3885         break;
   3886       }
   3887       case WIR_JUMP: {
   3888         u32 d = br_to_label(L, w->labels[0]);
   3889         emit_insn(t, WASM_INSN_BR, (i64)d);
   3890         break;
   3891       }
   3892       case WIR_CMP_BRANCH: {
   3893         CmpOp cop = (CmpOp)w->cgop;
   3894         if (cop >= CMP_OEQ_F) {
   3895           emit_fp_cmp(t, cop, w, w->type);
   3896         } else {
   3897           push_cmp_operands(t, w, w->type);
   3898           emit_insn(t, cmp_kind(t, cop, type_valtype(t, w->type)), 0);
   3899         }
   3900         u32 d = br_to_label(L, w->labels[0]);
   3901         emit_insn(t, WASM_INSN_BR_IF, (i64)d);
   3902         break;
   3903       }
   3904       case WIR_SWITCH: {
   3905         emit_switch_br_table(t, L, w);
   3906         break;
   3907       }
   3908       case WIR_SCOPE_OPEN: {
   3909         if (L->nstack >= 1024u)
   3910           wfail(t, "wasm: scope nesting too deep (max 1024)");
   3911         LoweringScope* s = &L->stack[L->nstack++];
   3912         s->id = w->scope_id;
   3913         s->kind = w->cgop;
   3914         if (w->cgop == SCOPE_LOOP) {
   3915           /* (block (loop ...)); inside the body:
   3916            *   br to loop top (cur_depth+1) = continue
   3917            *   br to past block (cur_depth) = break (one more level out) */
   3918           emit_insn(t, WASM_INSN_BLOCK, 0);
   3919           L->cur_depth++;
   3920           s->break_depth = L->cur_depth; /* `br N` lands AFTER block */
   3921           emit_insn(t, WASM_INSN_LOOP, 0);
   3922           L->cur_depth++;
   3923           s->cont_depth = L->cur_depth; /* `br N` lands at LOOP top */
   3924         } else if (w->cgop == SCOPE_BLOCK) {
   3925           emit_insn(t, WASM_INSN_BLOCK, 0);
   3926           L->cur_depth++;
   3927           s->break_depth = L->cur_depth;
   3928           s->cont_depth = L->cur_depth; /* unused */
   3929         } else {
   3930           wfail(t, "wasm: unknown scope kind %d", (int)w->cgop);
   3931         }
   3932         break;
   3933       }
   3934       case WIR_SCOPE_CLOSE: {
   3935         if (L->nstack == 0) wfail(t, "wasm: scope_close without open scope");
   3936         LoweringScope* s = &L->stack[L->nstack - 1u];
   3937         if (s->kind == SCOPE_LOOP) {
   3938           emit_insn(t, WASM_INSN_END, 0); /* close loop */
   3939           L->cur_depth--;
   3940           emit_insn(t, WASM_INSN_END, 0); /* close outer block */
   3941           L->cur_depth--;
   3942         } else {
   3943           emit_insn(t, WASM_INSN_END, 0);
   3944           L->cur_depth--;
   3945         }
   3946         L->nstack--;
   3947         break;
   3948       }
   3949       case WIR_VA_START: {
   3950         /* *ap_addr = va_ptr_param_local */
   3951         emit_push_addr_value(t, w->addr);
   3952         emit_insn(t, WASM_INSN_LOCAL_GET, (i64)t->va_ptr_param_local);
   3953         wasm_func_add_mem_insn(t->c, t->module, t->cur_func,
   3954                                WASM_INSN_I32_STORE, 2u, 0u, 0u);
   3955         break;
   3956       }
   3957       case WIR_VA_ARG: {
   3958         if (t->va_arg_tmp_addr_local == 0xffffffffu)
   3959           t->va_arg_tmp_addr_local = add_wasm_local(t, WASM_VAL_I32);
   3960         WasmValType vt = type_valtype(t, w->type);
   3961         WasmInsnKind load_op;
   3962         u32 width;
   3963         if (vt == WASM_VAL_I64) {
   3964           load_op = WASM_INSN_I64_LOAD;
   3965           width = 8u;
   3966         } else if (vt == WASM_VAL_F32) {
   3967           load_op = WASM_INSN_F32_LOAD;
   3968           width = 4u;
   3969         } else if (vt == WASM_VAL_F64) {
   3970           load_op = WASM_INSN_F64_LOAD;
   3971           width = 8u;
   3972         } else {
   3973           load_op = WASM_INSN_I32_LOAD;
   3974           width = 4u;
   3975         }
   3976         /* Load T from current *ap and stash into dst. */
   3977         emit_push_addr_value(t, w->addr);
   3978         emit_insn(t, WASM_INSN_LOCAL_TEE, (i64)t->va_arg_tmp_addr_local);
   3979         wasm_func_add_mem_insn(t->c, t->module, t->cur_func, WASM_INSN_I32_LOAD,
   3980                                2u, 0u, 0u);
   3981         wasm_func_add_mem_insn(t->c, t->module, t->cur_func, load_op,
   3982                                memarg_align_log2(width, width), 0u, 0u);
   3983         emit_local_set(t, w->dst, w->type, (RegClass)w->cls);
   3984         /* Advance: *ap = *ap + 8. */
   3985         emit_insn(t, WASM_INSN_LOCAL_GET, (i64)t->va_arg_tmp_addr_local);
   3986         emit_insn(t, WASM_INSN_LOCAL_GET, (i64)t->va_arg_tmp_addr_local);
   3987         wasm_func_add_mem_insn(t->c, t->module, t->cur_func, WASM_INSN_I32_LOAD,
   3988                                2u, 0u, 0u);
   3989         emit_insn(t, WASM_INSN_I32_CONST, (i64)8);
   3990         emit_insn(t, WASM_INSN_I32_ADD, 0);
   3991         wasm_func_add_mem_insn(t->c, t->module, t->cur_func,
   3992                                WASM_INSN_I32_STORE, 2u, 0u, 0u);
   3993         break;
   3994       }
   3995       case WIR_VA_COPY: {
   3996         /* *dst_ap = *src_ap (single i32). */
   3997         emit_push_addr_value(t, w->addr);
   3998         emit_push_addr_value(t, w->call_sret_addr);
   3999         wasm_func_add_mem_insn(t->c, t->module, t->cur_func, WASM_INSN_I32_LOAD,
   4000                                2u, 0u, 0u);
   4001         wasm_func_add_mem_insn(t->c, t->module, t->cur_func,
   4002                                WASM_INSN_I32_STORE, 2u, 0u, 0u);
   4003         break;
   4004       }
   4005       case WIR_INTRINSIC: {
   4006         emit_intrinsic(t, w);
   4007         break;
   4008       }
   4009       case WIR_ASM_BLOCK: {
   4010         Heap* h_blk = t->c->ctx->heap;
   4011         u32 nin = w->asm_nin;
   4012         u32 nout = w->asm_nout;
   4013         u32* in_locals = NULL;
   4014         u32* out_locals = NULL;
   4015         if (nin) {
   4016           in_locals =
   4017               (u32*)h_blk->alloc(h_blk, sizeof(u32) * nin, _Alignof(u32));
   4018           if (!in_locals) wfail(t, "wasm: out of memory");
   4019           /* defer per-input allocation until after output locals are known
   4020            * so numeric tieback ("+r", "0".."9") can share. */
   4021         }
   4022         if (nout) {
   4023           out_locals =
   4024               (u32*)h_blk->alloc(h_blk, sizeof(u32) * nout, _Alignof(u32));
   4025           if (!out_locals) wfail(t, "wasm: out of memory");
   4026           for (u32 i = 0; i < nout; ++i)
   4027             out_locals[i] =
   4028                 add_wasm_local(t, valtype_for_type(t, w->asm_out_types[i]));
   4029         }
   4030         if (nin) {
   4031           for (u32 i = 0; i < nin; ++i) {
   4032             i32 share = w->asm_in_share_out[i];
   4033             if (share >= 0 && (u32)share < nout) {
   4034               in_locals[i] = out_locals[share];
   4035             } else {
   4036               in_locals[i] =
   4037                   add_wasm_local(t, valtype_for_type(t, w->asm_in_types[i]));
   4038             }
   4039           }
   4040         }
   4041         /* Input materialization: push source operand, then for OPK_INDIRECT
   4042          * inputs ("m" constraint with displacement) splice in
   4043          * `i32.const ofs; i32.add` so the input local holds base+ofs.
   4044          * Finally local.set into the input's local (which may be a shared
   4045          * output local). */
   4046         for (u32 i = 0; i < nin; ++i) {
   4047           emit_push_operand(t, w->asm_in_kinds[i], w->asm_in_imms[i],
   4048                             w->asm_in_regs[i], w->asm_in_types[i]);
   4049           if (w->asm_in_kinds[i] == WOP_REG && w->asm_in_imms[i] != 0) {
   4050             emit_push_imm(t, WASM_VAL_I32, w->asm_in_imms[i]);
   4051             emit_insn(t, WASM_INSN_I32_ADD, 0);
   4052           }
   4053           emit_insn(t, WASM_INSN_LOCAL_SET, (i64)in_locals[i]);
   4054         }
   4055         /* Splice body, remapping local indices < nin+nout to the actual
   4056          * wasm local table. */
   4057         for (u32 i = 0; i < w->raw_ninsns; ++i) {
   4058           WasmInsn in = w->raw_insns[i];
   4059           if (in.kind == WASM_INSN_LOCAL_GET ||
   4060               in.kind == WASM_INSN_LOCAL_SET ||
   4061               in.kind == WASM_INSN_LOCAL_TEE) {
   4062             if (in.imm >= 0 && (u64)in.imm < (u64)nin)
   4063               in.imm = (i64)in_locals[in.imm];
   4064             else if (in.imm >= (i64)nin && (u64)in.imm < (u64)(nin + nout))
   4065               in.imm = (i64)out_locals[in.imm - (i64)nin];
   4066           }
   4067           t->module->current_loc = in.loc;
   4068           wasm_func_add_insn(t->c, t->module, t->cur_func,
   4069                              (WasmInsnKind)in.kind, in.imm);
   4070           t->cur_func->insns[t->cur_func->ninsns - 1u] = in;
   4071         }
   4072         /* Output extraction: copy each output local into the destination
   4073          * Reg's wasm local. */
   4074         for (u32 i = 0; i < nout; ++i) {
   4075           WasmValType ovt = valtype_for_type(t, w->asm_out_types[i]);
   4076           RegClass cls =
   4077               (ovt == WASM_VAL_F32 || ovt == WASM_VAL_F64) ? RC_FP : RC_INT;
   4078           emit_insn(t, WASM_INSN_LOCAL_GET, (i64)out_locals[i]);
   4079           emit_local_set(t, w->asm_out_regs[i], w->asm_out_types[i], cls);
   4080         }
   4081         if (in_locals) h_blk->free(h_blk, in_locals, sizeof(u32) * nin);
   4082         if (out_locals) h_blk->free(h_blk, out_locals, sizeof(u32) * nout);
   4083         break;
   4084       }
   4085       case WIR_LABEL: {
   4086         break;
   4087       }
   4088     }
   4089   }
   4090 }
   4091 
   4092 static void linearize(WTarget* t) {
   4093   LoweringState L;
   4094   /* Rewrite WIR so every free label is bound to a synthetic SCOPE_BLOCK
   4095    * (forward goto) or SCOPE_LOOP (backward goto). After this, the only
   4096    * remaining free labels are switch-island participants, which the
   4097    * try_linearize_switch_island fast path inside linearize_range handles. */
   4098   wasm_structurize(t);
   4099   memset(&L, 0, sizeof L);
   4100   L.t = t;
   4101 
   4102   if (t->has_stack_frame) {
   4103     t->frame_saved_sp_local = add_wasm_local(t, WASM_VAL_I32);
   4104     t->frame_base_local = add_wasm_local(t, WASM_VAL_I32);
   4105     emit_insn(t, WASM_INSN_GLOBAL_GET, (i64)t->stack_pointer_global);
   4106     emit_insn(t, WASM_INSN_LOCAL_TEE, (i64)t->frame_saved_sp_local);
   4107     if (t->frame_size) {
   4108       emit_insn(t, WASM_INSN_I32_CONST,
   4109                 (i64)align_to_u32(t->frame_size, t->frame_align));
   4110       emit_insn(t, WASM_INSN_I32_SUB, 0);
   4111     }
   4112     emit_insn(t, WASM_INSN_LOCAL_TEE, (i64)t->frame_base_local);
   4113     emit_insn(t, WASM_INSN_GLOBAL_SET, (i64)t->stack_pointer_global);
   4114   }
   4115 
   4116   /* Byval copy-in: for each ABI_ARG_INDIRECT param, copy the aggregate from
   4117    * the caller's pointer into the callee's stack-frame buffer so callee
   4118    * mutations are isolated (wasm32 BasicCABI). Byte-by-byte for v1; can be
   4119    * promoted to wider chunks later. */
   4120   for (u32 i = 0; i < t->nbyval_copies; ++i) {
   4121     const WByvalCopy* bc = &t->byval_copies[i];
   4122     const WSlot* s = &t->slots[bc->dst_slot_id];
   4123     for (u32 n = 0; n < s->size; ++n) {
   4124       /* dst: frame_base */
   4125       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)t->frame_base_local);
   4126       /* src byte: i32.load8_u (ptr_local) offset=n */
   4127       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)bc->ptr_wasm_local);
   4128       wasm_func_add_mem_insn(t->c, t->module, t->cur_func,
   4129                              WASM_INSN_I32_LOAD8_U, 0, n, 0);
   4130       wasm_func_add_mem_insn(t->c, t->module, t->cur_func, WASM_INSN_I32_STORE8,
   4131                              0, s->frame_offset + n, 0);
   4132     }
   4133   }
   4134 
   4135   linearize_range(t, &L, 0, t->nwir);
   4136   if (L.nstack != 0)
   4137     wfail(t, "wasm: function ended with %u open scopes", L.nstack);
   4138   /* If the body's last real WIR is a terminator (return / br / switch /
   4139    * unreachable / tail call) buried inside nested blocks, kit's wasm
   4140    * validator does not propagate the unreachable flag across enclosing
   4141    * ENDs and would complain about a missing result at the implicit
   4142    * function exit. Emit an explicit trailing `unreachable` so control[0]
   4143    * is marked unreachable independent of the validator's propagation
   4144    * rules. Also: when the terminator is a tail call we've already
   4145    * emitted the linear-stack SP restore inline (see WIR_CALL handler),
   4146    * and the function never reaches the post-body epilogue at runtime —
   4147    * skip it to avoid emitting dead GLOBAL_GET/GLOBAL_SET pairs after the
   4148    * return_call. */
   4149   int last_is_tail_call = 0;
   4150   {
   4151     int needs_unreachable = 0;
   4152     for (u32 i = t->nwir; i > 0; --i) {
   4153       WIR* w = &t->wir[i - 1u];
   4154       if (w->op == WIR_LABEL || w->op == WIR_SCOPE_OPEN ||
   4155           w->op == WIR_SCOPE_CLOSE)
   4156         continue;
   4157       if (w->op == WIR_RET || w->op == WIR_JUMP || w->op == WIR_SWITCH ||
   4158           w->op == WIR_UNREACHABLE) {
   4159         needs_unreachable = 1;
   4160       } else if ((w->op == WIR_CALL || w->op == WIR_CALL_INDIRECT) &&
   4161                  w->call_tail) {
   4162         needs_unreachable = 1;
   4163         last_is_tail_call = 1;
   4164       }
   4165       break;
   4166     }
   4167     if (needs_unreachable) emit_insn(t, WASM_INSN_UNREACHABLE, 0);
   4168   }
   4169   if (t->has_stack_frame && !t->dead && !last_is_tail_call) {
   4170     emit_insn(t, WASM_INSN_LOCAL_GET, (i64)t->frame_saved_sp_local);
   4171     emit_insn(t, WASM_INSN_GLOBAL_SET, (i64)t->stack_pointer_global);
   4172   }
   4173 }
   4174 
   4175 void wasm_func_end(CGTarget* tg) {
   4176   WTarget* t = (WTarget*)tg;
   4177   if (!t->cur_func) return;
   4178   /* Linearize WIR into the WasmFunc body. */
   4179   linearize(t);
   4180   t->cur_fn_desc = NULL;
   4181   t->cur_func = NULL;
   4182   /* Free per-function WIR arg arrays. */
   4183   Heap* h = t->c->ctx->heap;
   4184   for (u32 i = 0; i < t->nwir; ++i) {
   4185     WIR* w = &t->wir[i];
   4186     if (w->call_args) {
   4187       h->free(h, w->call_args, sizeof(Reg) * w->call_narg);
   4188       h->free(h, w->call_arg_imms, sizeof(i64) * w->call_narg);
   4189       h->free(h, w->call_arg_kinds, w->call_narg);
   4190       h->free(h, w->call_arg_types, sizeof(KitCgTypeId) * w->call_narg);
   4191       if (w->call_arg_addrs)
   4192         h->free(h, w->call_arg_addrs, sizeof(Operand) * w->call_narg);
   4193       w->call_args = NULL;
   4194       w->call_arg_imms = NULL;
   4195       w->call_arg_kinds = NULL;
   4196       w->call_arg_types = NULL;
   4197       w->call_arg_addrs = NULL;
   4198     }
   4199     if (w->switch_cases) {
   4200       h->free(h, w->switch_cases, sizeof(CGSwitchCase) * w->switch_ncases);
   4201       w->switch_cases = NULL;
   4202       w->switch_ncases = 0;
   4203     }
   4204     if (w->raw_insns) {
   4205       h->free(h, w->raw_insns, sizeof(WasmInsn) * w->raw_ninsns);
   4206       w->raw_insns = NULL;
   4207       w->raw_ninsns = 0;
   4208     }
   4209     if (w->asm_in_kinds) {
   4210       h->free(h, w->asm_in_kinds, w->asm_nin);
   4211       h->free(h, w->asm_in_imms, sizeof(i64) * w->asm_nin);
   4212       h->free(h, w->asm_in_regs, sizeof(Reg) * w->asm_nin);
   4213       h->free(h, w->asm_in_types, sizeof(KitCgTypeId) * w->asm_nin);
   4214       h->free(h, w->asm_in_share_out, sizeof(i32) * w->asm_nin);
   4215       w->asm_in_kinds = NULL;
   4216       w->asm_in_imms = NULL;
   4217       w->asm_in_regs = NULL;
   4218       w->asm_in_types = NULL;
   4219       w->asm_in_share_out = NULL;
   4220       w->asm_nin = 0;
   4221     }
   4222     if (w->asm_out_regs) {
   4223       h->free(h, w->asm_out_regs, sizeof(Reg) * w->asm_nout);
   4224       h->free(h, w->asm_out_types, sizeof(KitCgTypeId) * w->asm_nout);
   4225       w->asm_out_regs = NULL;
   4226       w->asm_out_types = NULL;
   4227       w->asm_nout = 0;
   4228     }
   4229   }
   4230   t->nwir = 0;
   4231 }
   4232 
   4233 /* CGTarget alias hook. cg/session.c has already shared (section_id, value)
   4234  * between alias_sym and target_sym at the ObjBuilder layer, which covers
   4235  * data aliases (apply_sym_fixups reads section_id/value directly off the
   4236  * ObjSym). Function aliases need extra wiring: the wasm function payload
   4237  * lives in a target-side side-table (sym_to_func), not in obj sections,
   4238  * and the alias's external linker name needs its own WasmExport entry. */
   4239 void wasm_alias(CGTarget* tg, ObjSymId alias_sym, ObjSymId target_sym,
   4240                 KitCgTypeId type) {
   4241   WTarget* t = (WTarget*)tg;
   4242   const ObjSym* tsym;
   4243   const ObjSym* asym;
   4244   (void)type;
   4245   if (t->dead) return;
   4246   /* Aliases are processed before any function body is emitted, so the module
   4247    * may not exist yet; sym_to_wasm_func / wasm_add_export both need it. */
   4248   ensure_module(t);
   4249   tsym = obj_symbol_get(t->obj, target_sym);
   4250   if (!tsym) wfail(t, "wasm: alias against unknown target symbol");
   4251   if (tsym->kind == SK_FUNC) {
   4252     /* Mirror sym_to_func so any later WIR_CALL against the alias resolves
   4253      * to the target's wasm function index. */
   4254     u32 idx = sym_to_wasm_func(t, target_sym, NULL);
   4255     if (alias_sym >= t->sym_to_func_cap) {
   4256       Heap* h = t->c->ctx->heap;
   4257       u32 nc = t->sym_to_func_cap ? t->sym_to_func_cap : 16u;
   4258       while (nc <= alias_sym) nc *= 2u;
   4259       u32* p =
   4260           (u32*)h->realloc(h, t->sym_to_func, sizeof(u32) * t->sym_to_func_cap,
   4261                            sizeof(u32) * nc, _Alignof(u32));
   4262       if (!p) wfail(t, "wasm: out of memory");
   4263       for (u32 i = t->sym_to_func_cap; i < nc; ++i) p[i] = 0;
   4264       t->sym_to_func = p;
   4265       t->sym_to_func_cap = nc;
   4266     }
   4267     t->sym_to_func[alias_sym] = idx + 1u;
   4268     /* Export under the alias's linker name when non-local. Mirrors the
   4269      * export logic at the end of wasm_func_begin. */
   4270     asym = obj_symbol_get(t->obj, alias_sym);
   4271     if (asym && asym->bind != SB_LOCAL) {
   4272       const char* name = pool_sym_cstr(t->c->global, asym->name, NULL);
   4273       if (name && *name) {
   4274         Heap* h = t->c->ctx->heap;
   4275         size_t nlen = strlen(name);
   4276         char* exp_name = (char*)h->alloc(h, nlen + 1u, 1);
   4277         WasmExport* e;
   4278         memcpy(exp_name, name, nlen + 1u);
   4279         e = wasm_add_export(t->c, t->module);
   4280         e->name = exp_name;
   4281         e->kind = 0; /* function export */
   4282         e->index = idx;
   4283       }
   4284     }
   4285     return;
   4286   }
   4287   if (tsym->kind == SK_OBJ) {
   4288     /* Data aliases: obj_symbol_define has already shared (section_id,
   4289      * value), and apply_sym_fixups reads those directly. Nothing more
   4290      * to do here — but diagnose if the target hasn't been defined yet
   4291      * (it would produce a bogus address at finalize). */
   4292     if (tsym->section_id == OBJ_SEC_NONE) {
   4293       wfail(t, "wasm: data alias against undefined target symbol");
   4294     }
   4295     return;
   4296   }
   4297   wfail(t, "wasm target: alias of symbol kind %u not yet supported",
   4298         (unsigned)tsym->kind);
   4299 }
   4300 
   4301 /* Assign each SF_ALLOC (non-EXEC) ObjBuilder section a compact base in
   4302  * linear memory. Walks sections in id order so the layout is deterministic.
   4303  * Each base is aligned to the section's required alignment and lives in
   4304  * t->section_base[sid]. Returns the next unused offset (end of data image). */
   4305 static u32 assign_section_bases(WTarget* t) {
   4306   Heap* h = t->c->ctx->heap;
   4307   u32 nsec = obj_section_count(t->obj);
   4308   if (nsec > t->section_base_cap) {
   4309     u32 nc = t->section_base_cap ? t->section_base_cap : 4u;
   4310     while (nc < nsec) nc *= 2u;
   4311     void* p = h->realloc(h, t->section_base, sizeof(u32) * t->section_base_cap,
   4312                          sizeof(u32) * nc, _Alignof(u32));
   4313     if (!p) wfail(t, "wasm: out of memory");
   4314     t->section_base = (u32*)p;
   4315     for (u32 i = t->section_base_cap; i < nc; ++i)
   4316       t->section_base[i] = 0xFFFFFFFFu;
   4317     t->section_base_cap = nc;
   4318   }
   4319   u32 next = WASM_DATA_NULL_GUARD;
   4320   for (ObjSecId sid = 0; sid < nsec; ++sid) t->section_base[sid] = 0xFFFFFFFFu;
   4321   for (ObjSecId sid = 1; sid < nsec; ++sid) {
   4322     const Section* s = obj_section_get(t->obj, sid);
   4323     if (!s || s->removed || !(s->flags & SF_ALLOC) || s->flags & SF_EXEC)
   4324       continue;
   4325     u32 align = s->align ? s->align : 1u;
   4326     if (align < 1u) align = 1u;
   4327     next = align_to_u32(next, align);
   4328     t->section_base[sid] = next;
   4329     u32 sz = (s->kind == SEC_BSS || s->sem == SSEM_NOBITS)
   4330                  ? s->bss_size
   4331                  : (u32)s->bytes.total;
   4332     if (sz > UINT32_MAX - next) wfail(t, "wasm: linear memory image too large");
   4333     next += sz;
   4334   }
   4335   return next;
   4336 }
   4337 
   4338 /* Patch a single i32/i64 value into the linear-memory image buffer at
   4339  * `offset`. Wasm is little-endian. */
   4340 static void mem_write_le(u8* buf, u32 offset, u64 value, u32 width) {
   4341   for (u32 i = 0; i < width; ++i) buf[offset + i] = (u8)(value >> (i * 8u));
   4342 }
   4343 
   4344 /* Allocate aligned BSS-style space in linear memory for every SK_COMMON
   4345  * symbol the ObjBuilder knows about. Called after assign_section_bases so
   4346  * common storage sits past the last SF_ALLOC section. Records the assigned
   4347  * base in t->common_base[id]; returns the next free cursor. */
   4348 static u32 assign_common_bases(WTarget* t, u32 next) {
   4349   Heap* h = t->c->ctx->heap;
   4350   ObjSymIter* it = obj_symiter_new(t->obj);
   4351   ObjSymEntry e;
   4352   while (obj_symiter_next(it, &e)) {
   4353     const ObjSym* os = e.sym;
   4354     if (!os || os->removed) continue;
   4355     if (os->kind != SK_COMMON) continue;
   4356     u32 align = os->common_align ? (u32)os->common_align : 1u;
   4357     if (align < 1u) align = 1u;
   4358     if (e.id >= t->common_base_cap) {
   4359       u32 nc = t->common_base_cap ? t->common_base_cap : 8u;
   4360       while (nc <= e.id) nc *= 2u;
   4361       void* p = h->realloc(h, t->common_base, sizeof(u32) * t->common_base_cap,
   4362                            sizeof(u32) * nc, _Alignof(u32));
   4363       if (!p) wfail(t, "wasm: out of memory");
   4364       t->common_base = (u32*)p;
   4365       for (u32 i = t->common_base_cap; i < nc; ++i)
   4366         t->common_base[i] = 0xFFFFFFFFu;
   4367       t->common_base_cap = nc;
   4368     }
   4369     next = align_to_u32(next, align);
   4370     t->common_base[e.id] = next;
   4371     u32 sz = (u32)os->size;
   4372     if (sz > UINT32_MAX - next)
   4373       wfail(t, "wasm: linear memory image too large (common symbols)");
   4374     next += sz;
   4375   }
   4376   obj_symiter_free(it);
   4377   return next;
   4378 }
   4379 
   4380 /* Resolve `sym + addend` to a linear-memory address. Handles both
   4381  * section-defined symbols (via t->section_base[sym->section_id]) and
   4382  * common symbols (via t->common_base[sym]). Returns 0 and sets *ok=0 if
   4383  * the symbol can't be resolved here; callers diagnose. */
   4384 static u32 wasm_sym_linear_addr(WTarget* t, ObjSymId sym, i64 addend, int* ok) {
   4385   const ObjSym* os = obj_symbol_get(t->obj, sym);
   4386   *ok = 0;
   4387   if (!os) return 0;
   4388   if (os->kind == SK_COMMON) {
   4389     if (sym >= t->common_base_cap || t->common_base[sym] == 0xFFFFFFFFu)
   4390       return 0;
   4391     *ok = 1;
   4392     return t->common_base[sym] + (u32)addend;
   4393   }
   4394   if (os->section_id == OBJ_SEC_NONE) return 0;
   4395   if (os->section_id >= t->section_base_cap ||
   4396       t->section_base[os->section_id] == 0xFFFFFFFFu)
   4397     return 0;
   4398   *ok = 1;
   4399   return t->section_base[os->section_id] + (u32)os->value + (u32)addend;
   4400 }
   4401 
   4402 /* Apply each ObjBuilder relocation to the linear-memory image. Only
   4403  * absolute (R_ABS32/R_ABS64) relocations are supported for now; PC-relative
   4404  * and other kinds diagnose. */
   4405 static void apply_data_relocs(WTarget* t, u8* mem) {
   4406   u32 ntotal = obj_reloc_total(t->obj);
   4407   for (u32 i = 0; i < ntotal; ++i) {
   4408     const Reloc* r = obj_reloc_at(t->obj, i);
   4409     if (!r || r->removed) continue;
   4410     if (r->section_id == OBJ_SEC_NONE) continue;
   4411     if (r->section_id >= t->section_base_cap ||
   4412         t->section_base[r->section_id] == 0xFFFFFFFFu)
   4413       continue;
   4414     const Section* rs = obj_section_get(t->obj, r->section_id);
   4415     if (!rs || rs->flags & SF_EXEC) continue;
   4416     const ObjSym* tos = obj_symbol_get(t->obj, r->sym);
   4417     if (!tos)
   4418       wfail(t, "wasm: data relocation against unresolved symbol not supported");
   4419     /* Function-symbol references in data sections (e.g. a static vtable
   4420      * `static fn_t v = &foo;`) resolve to wasm function-table indices, not
   4421      * linear-memory addresses. The funcref table is built before
   4422      * apply_data_relocs runs, so the index is already known. */
   4423     u32 width;
   4424     u64 value;
   4425     if (tos->kind == SK_FUNC) {
   4426       if (r->kind != R_ABS32)
   4427         wfail(t,
   4428               "wasm: function-pointer data relocation kind %u not supported "
   4429               "(only R_ABS32 on wasm32 target)",
   4430               (unsigned)r->kind);
   4431       if (r->addend != 0)
   4432         wfail(t, "wasm: nonzero addend on function-pointer data relocation");
   4433       u32 tbl_idx = func_table_index_for(t, r->sym);
   4434       width = 4;
   4435       value = (u64)tbl_idx;
   4436       u32 dst_off = t->section_base[r->section_id] + r->offset;
   4437       mem_write_le(mem, dst_off, value, width);
   4438       continue;
   4439     }
   4440     if (tos->section_id == OBJ_SEC_NONE && tos->kind != SK_COMMON)
   4441       wfail(t, "wasm: data relocation against unresolved symbol not supported");
   4442     {
   4443       int ok = 0;
   4444       /* The addend is already added by `value = sym_addr + r->addend` below;
   4445        * pass 0 here so we don't double-count. */
   4446       u32 sym_addr = wasm_sym_linear_addr(t, r->sym, 0, &ok);
   4447       if (!ok)
   4448         wfail(t,
   4449               "wasm: data relocation target symbol has no linear-memory "
   4450               "address");
   4451       switch (r->kind) {
   4452         case R_ABS32:
   4453           width = 4;
   4454           value = (u64)(u32)((i64)sym_addr + r->addend);
   4455           break;
   4456         case R_ABS64:
   4457           wfail(t,
   4458                 "wasm: R_ABS64 data relocation not supported on wasm32 target");
   4459         default:
   4460           wfail(t, "wasm: unsupported data relocation kind %u",
   4461                 (unsigned)r->kind);
   4462       }
   4463     }
   4464     u32 dst_off = t->section_base[r->section_id] + r->offset;
   4465     mem_write_le(mem, dst_off, value, width);
   4466   }
   4467 }
   4468 
   4469 /* Walk the deferred WSymFixup queue and patch the placeholder i32.const
   4470  * imm in each WasmFunc.insns[] with the resolved absolute address. */
   4471 static void apply_sym_fixups(WTarget* t) {
   4472   for (u32 i = 0; i < t->sym_fixups_count; ++i) {
   4473     WSymFixup fx = t->sym_fixups[i];
   4474     int ok = 0;
   4475     u32 addr = wasm_sym_linear_addr(t, fx.sym, fx.addend, &ok);
   4476     if (!ok) wfail(t, "wasm: deferred symbol fixup against unresolved symbol");
   4477     WasmFunc* f = &t->module->funcs[fx.wasm_func_idx];
   4478     if (fx.insn_idx >= f->ninsns)
   4479       wfail(t, "wasm: deferred symbol fixup insn_idx out of range");
   4480     f->insns[fx.insn_idx].imm = (i64)addr;
   4481   }
   4482 }
   4483 
   4484 static void wasm_materialize_data(WTarget* t) {
   4485   if (!t->has_memory) {
   4486     /* No linear memory was needed by any function body or addr_of, so
   4487      * symbol fixups should be empty by construction. */
   4488     return;
   4489   }
   4490   u32 image_end = assign_section_bases(t);
   4491   image_end = assign_common_bases(t, image_end);
   4492   u32 stack_size = t->has_stack_pointer ? t->stack_size : 0u;
   4493   u32 image = image_end ? align_to_u32(image_end, 16u) : WASM_DATA_NULL_GUARD;
   4494   if (image > UINT32_MAX - stack_size)
   4495     wfail(t, "wasm: linear memory image too large");
   4496   /* Build a single active data segment covering 0..image. Passive segments
   4497    * + memory.init would be needed for multi-TU linking; single-TU output
   4498    * stays with the simpler shape. */
   4499   WasmDataSegment* seg = NULL;
   4500   if (image) {
   4501     seg = wasm_add_data(t->c, t->module);
   4502     seg->mode = WASM_SEG_ACTIVE;
   4503     seg->memidx = 0;
   4504     seg->offset = 0;
   4505     wasm_data_set_bytes(t->c, t->module, seg, NULL, (u64)image);
   4506   }
   4507   u32 nsec = obj_section_count(t->obj);
   4508   for (ObjSecId sid = 1; sid < nsec; ++sid) {
   4509     const Section* s = obj_section_get(t->obj, sid);
   4510     if (!s || s->removed || !(s->flags & SF_ALLOC) || s->flags & SF_EXEC)
   4511       continue;
   4512     if (s->kind == SEC_BSS || s->sem == SSEM_NOBITS || !s->bytes.total)
   4513       continue;
   4514     buf_flatten(&s->bytes, seg->bytes + t->section_base[sid]);
   4515   }
   4516   if (seg) apply_data_relocs(t, seg->bytes);
   4517   apply_sym_fixups(t);
   4518   t->data_end = image;
   4519   u32 stack_top = (u32)align_to_u32(image + stack_size, 16u);
   4520   t->module->memories[0].min_pages = (stack_top + 65535u) / 65536u;
   4521   /* Shared memory requires has_max and max >= min. ensure_shared_memory set a
   4522    * provisional wasm32-ceiling cap (65536 pages = 4 GiB); now that the final
   4523    * layout is known, tighten max down to min so the module declares a snug,
   4524    * fixed shared memory. The backend never emits memory.grow, so the memory is
   4525    * non-growable regardless, and a 4 GiB declared max would otherwise force an
   4526    * embedder (e.g. `kit run`) to reserve the full ceiling up front. */
   4527   if (t->module->memories[0].shared) {
   4528     t->module->memories[0].has_max = 1;
   4529     t->module->memories[0].max_pages = t->module->memories[0].min_pages;
   4530   }
   4531   if (t->has_stack_pointer && t->stack_pointer_global < t->module->nglobals) {
   4532     t->module->globals[t->stack_pointer_global].init.imm = stack_top;
   4533   }
   4534 }
   4535 
   4536 /* Static-data initializers (e.g. `static fn_t v[] = {&foo, &bar};`) go
   4537  * through ObjBuilder relocations rather than wasm_addr_of, so they never
   4538  * touch queue_func_table_fixup. Scan the reloc table once before building
   4539  * the funcref table so every function whose address is referenced from data
   4540  * also gets a table slot. apply_data_relocs then patches the linear-memory
   4541  * image with the assigned index. */
   4542 static void wasm_collect_func_data_refs(WTarget* t) {
   4543   u32 ntotal = obj_reloc_total(t->obj);
   4544   for (u32 i = 0; i < ntotal; ++i) {
   4545     const Reloc* r = obj_reloc_at(t->obj, i);
   4546     const ObjSym* tos;
   4547     if (!r || r->removed) continue;
   4548     if (r->section_id == OBJ_SEC_NONE) continue;
   4549     {
   4550       const Section* rs = obj_section_get(t->obj, r->section_id);
   4551       if (!rs || rs->flags & SF_EXEC) continue; /* code-section relocs */
   4552     }
   4553     tos = obj_symbol_get(t->obj, r->sym);
   4554     if (!tos || tos->kind != SK_FUNC) continue;
   4555     (void)func_table_index_for(t, r->sym);
   4556     (void)sym_to_wasm_func(t, r->sym, NULL);
   4557   }
   4558 }
   4559 
   4560 /* Build the single funcref table and its active element segment, then patch
   4561  * every queued WFuncTableFixup's placeholder `i32.const 0` with the assigned
   4562  * table index. Slot 0 stays reserved (call_indirect through index 0 traps on
   4563  * the type check), so the first recorded function lands at index 1. Each
   4564  * WasmElemSegment caps its funcs array at 64 entries; we chunk across
   4565  * multiple segments when the address-taken set is larger. */
   4566 static void wasm_materialize_functable(WTarget* t) {
   4567   wasm_collect_func_data_refs(t);
   4568   if (!t->has_func_table || t->func_table_count == 0) return;
   4569   ensure_module(t);
   4570   /* Table: non-growable, sized to hold the reserved null slot plus every
   4571    * assigned entry. */
   4572   WasmTable* tbl = wasm_add_table(t->c, t->module);
   4573   tbl->elem_type = WASM_VAL_FUNCREF;
   4574   tbl->min = 1u + t->func_table_count;
   4575   tbl->max = tbl->min;
   4576   tbl->has_max = 1;
   4577   /* Active element segment populates table 0 starting at offset 1 (slot 0
   4578    * stays null). Element segments are now heap-grown — no chunking needed. */
   4579   {
   4580     WasmElemSegment* seg = wasm_add_elem(t->c, t->module);
   4581     seg->mode = WASM_SEG_ACTIVE;
   4582     seg->elem_type = WASM_VAL_FUNCREF;
   4583     seg->tableidx = 0;
   4584     seg->offset = 1;
   4585     for (u32 i = 0; i < t->func_table_count; ++i) {
   4586       ObjSymId sym = t->func_table[i];
   4587       wasm_elem_push_func(t->c, t->module, seg, sym_to_wasm_func(t, sym, NULL));
   4588     }
   4589   }
   4590   /* Patch placeholders. */
   4591   for (u32 i = 0; i < t->func_table_fixups_count; ++i) {
   4592     WFuncTableFixup fx = t->func_table_fixups[i];
   4593     u32 tbl_idx = func_table_index_for(t, fx.sym);
   4594     WasmFunc* f = &t->module->funcs[fx.wasm_func_idx];
   4595     if (fx.insn_idx >= f->ninsns)
   4596       wfail(t, "wasm: function-pointer fixup insn_idx out of range");
   4597     f->insns[fx.insn_idx].imm = (i64)tbl_idx;
   4598   }
   4599 }
   4600 
   4601 /* Wasm requires every import to occupy a lower function index than any
   4602  * defined function. The backend, however, allocates a WasmFunc for any
   4603  * direct-call target in walk order — so a defined function may end up at a
   4604  * lower array index than an import created later by promote_import_func.
   4605  * Reorder m->funcs so all imports precede all definitions, then walk every
   4606  * function-index reference in the module and apply the old->new mapping. */
   4607 static void wasm_reorder_funcs_imports_first(WTarget* t) {
   4608   WasmModule* m = t->module;
   4609   if (!m || m->nfuncs == 0) return;
   4610   Heap* h = m->heap;
   4611   u32 n = m->nfuncs;
   4612   /* Quick check: bail out if imports are already before all definitions. */
   4613   int seen_def = 0;
   4614   int needs_reorder = 0;
   4615   for (u32 i = 0; i < n; ++i) {
   4616     if (m->funcs[i].is_import) {
   4617       if (seen_def) {
   4618         needs_reorder = 1;
   4619         break;
   4620       }
   4621     } else {
   4622       seen_def = 1;
   4623     }
   4624   }
   4625   if (!needs_reorder) return;
   4626   u32* old_to_new = (u32*)h->alloc(h, sizeof(u32) * n, _Alignof(u32));
   4627   WasmFunc* new_funcs =
   4628       (WasmFunc*)h->alloc(h, sizeof(WasmFunc) * n, _Alignof(WasmFunc));
   4629   if (!old_to_new || !new_funcs) wfail(t, "wasm: out of memory");
   4630   u32 w_idx = 0;
   4631   for (u32 i = 0; i < n; ++i) {
   4632     if (m->funcs[i].is_import) {
   4633       new_funcs[w_idx] = m->funcs[i];
   4634       old_to_new[i] = w_idx++;
   4635     }
   4636   }
   4637   for (u32 i = 0; i < n; ++i) {
   4638     if (!m->funcs[i].is_import) {
   4639       new_funcs[w_idx] = m->funcs[i];
   4640       old_to_new[i] = w_idx++;
   4641     }
   4642   }
   4643   /* Swap arrays. Old buffer is freed via the module's heap-tracked
   4644    * realloc bookkeeping when wasm_module_free runs; we just overwrite the
   4645    * pointer + length here. */
   4646   h->free(h, m->funcs, sizeof(WasmFunc) * m->cap_funcs);
   4647   m->funcs = new_funcs;
   4648   m->cap_funcs = n;
   4649   /* Remap every funcidx-bearing slot in the module. */
   4650   for (u32 fi = 0; fi < n; ++fi) {
   4651     WasmFunc* f = &m->funcs[fi];
   4652     for (u32 j = 0; j < f->ninsns; ++j) {
   4653       WasmInsn* in = &f->insns[j];
   4654       if (in->kind == WASM_INSN_CALL || in->kind == WASM_INSN_RETURN_CALL ||
   4655           in->kind == WASM_INSN_REF_FUNC) {
   4656         u32 old = (u32)in->imm;
   4657         if (old < n) in->imm = (int64_t)old_to_new[old];
   4658       }
   4659     }
   4660   }
   4661   for (u32 i = 0; i < m->nexports; ++i) {
   4662     if (m->exports[i].kind == 0u && m->exports[i].index < n)
   4663       m->exports[i].index = old_to_new[m->exports[i].index];
   4664   }
   4665   for (u32 i = 0; i < m->nelems; ++i) {
   4666     WasmElemSegment* seg = &m->elems[i];
   4667     for (u32 j = 0; j < seg->nfuncs; ++j) {
   4668       if (seg->funcs[j] < n) seg->funcs[j] = old_to_new[seg->funcs[j]];
   4669     }
   4670   }
   4671   if (m->has_start && m->start_func < n)
   4672     m->start_func = old_to_new[m->start_func];
   4673   /* Update the backend's sym_to_func reverse map so any post-finalize lookups
   4674    * (e.g. data-reloc fixups) resolve to the new indices. The map stores
   4675    * idx+1 so 0 = "unassigned"; preserve that convention. */
   4676   for (ObjSymId sym = 0; sym < t->sym_to_func_cap; ++sym) {
   4677     if (t->sym_to_func[sym]) {
   4678       u32 old = t->sym_to_func[sym] - 1u;
   4679       if (old < n) t->sym_to_func[sym] = old_to_new[old] + 1u;
   4680     }
   4681   }
   4682   h->free(h, old_to_new, sizeof(u32) * n);
   4683 }
   4684 
   4685 /* Export the module's linear memory under the conventional name "memory" so
   4686  * standard runtimes (browser/wasmtime/wasmer/Node) can find it. Only emits
   4687  * when the module has at least one defined (non-import) memory and no
   4688  * memory export already exists. */
   4689 static void wasm_export_memory(WTarget* t) {
   4690   WasmModule* m = t->module;
   4691   if (!m) return;
   4692   ensure_linear_memory(t);
   4693   m = t->module;
   4694   /* Find the first defined (non-import) memory. */
   4695   u32 mem_idx = 0;
   4696   int found = 0;
   4697   for (u32 i = 0; i < m->nmemories; ++i) {
   4698     if (!m->memories[i].is_import) {
   4699       mem_idx = i;
   4700       found = 1;
   4701       break;
   4702     }
   4703   }
   4704   if (!found) return;
   4705   /* Skip if the user already added a memory export (e.g. via the WAT path
   4706    * or future explicit-export hook). */
   4707   for (u32 i = 0; i < m->nexports; ++i) {
   4708     if (m->exports[i].kind == 2u && m->exports[i].index == mem_idx) return;
   4709   }
   4710   Heap* h = t->c->ctx->heap;
   4711   WasmExport* e = wasm_add_export(t->c, m);
   4712   static const char kName[] = "memory";
   4713   char* dup = (char*)h->alloc(h, sizeof(kName), 1);
   4714   if (!dup) wfail(t, "wasm: out of memory");
   4715   memcpy(dup, kName, sizeof(kName));
   4716   e->name = dup;
   4717   e->kind = 2u; /* memory export */
   4718   e->index = mem_idx;
   4719 }
   4720 
   4721 /* Diagnose any WasmFunc that has neither a body nor import status — that's a
   4722  * declaration with no definition, e.g. a function-pointer reference to an
   4723  * extern whose call site never appeared (so we never saw an ABI to synthesize
   4724  * an import signature from). Emitting such a function would produce a
   4725  * malformed module. Diagnose by sym name so users can fix the source. */
   4726 static void wasm_diagnose_unresolved_funcs(WTarget* t) {
   4727   if (!t->module) return;
   4728   for (ObjSymId sym = 1; sym < t->sym_to_func_cap; ++sym) {
   4729     if (!t->sym_to_func[sym]) continue;
   4730     u32 idx = t->sym_to_func[sym] - 1u;
   4731     if (idx >= t->module->nfuncs) continue;
   4732     WasmFunc* f = &t->module->funcs[idx];
   4733     if (f->is_import) continue;
   4734     if (f->ninsns != 0) continue;
   4735     const ObjSym* os = obj_symbol_get(t->obj, sym);
   4736     if (!os || os->section_id != OBJ_SEC_NONE) continue;
   4737     const char* name = pool_sym_cstr(t->c->global, os->name, NULL);
   4738     wfail(t,
   4739           "wasm: undefined function '%s' has its address taken but no direct "
   4740           "call was seen — cannot synthesize import signature; add a direct "
   4741           "call or annotate the declaration",
   4742           name ? name : "(anonymous)");
   4743   }
   4744 }
   4745 
   4746 void wasm_finalize(CGTarget* tg) {
   4747   WTarget* t = (WTarget*)tg;
   4748   wasm_materialize_functable(t);
   4749   wasm_materialize_data(t);
   4750   if (t->module) {
   4751     wasm_diagnose_unresolved_funcs(t);
   4752     wasm_export_memory(t);
   4753     wasm_reorder_funcs_imports_first(t);
   4754   }
   4755   /* WasmModule remains attached to ObjBuilder via OBJ_EXT_WASM; emit_wasm
   4756    * flushes it. */
   4757 }
   4758 
   4759 static void wasm_module_freefn(Compiler* c, void* p) {
   4760   (void)c;
   4761   WasmModule* m = (WasmModule*)p;
   4762   Heap* h = m->heap;
   4763   wasm_module_free(m);
   4764   h->free(h, m, sizeof *m);
   4765 }
   4766 
   4767 WTarget* wasm_emit_target_new(Compiler* c, ObjBuilder* o, MCEmitter* mc) {
   4768   Heap* h;
   4769   WTarget* t;
   4770   if (!c) return NULL;
   4771   h = (Heap*)c->ctx->heap;
   4772   t = (WTarget*)h->alloc(h, sizeof *t, _Alignof(WTarget));
   4773   if (!t) return NULL;
   4774   memset(t, 0, sizeof *t);
   4775   t->base.c = c;
   4776   t->base.obj = o;
   4777   t->c = c;
   4778   t->obj = o;
   4779   (void)mc;
   4780   return t;
   4781 }
   4782 
   4783 void wasm_destroy(CGTarget* tg) {
   4784   WTarget* t = (WTarget*)tg;
   4785   Heap* h = t->c->ctx->heap;
   4786   if (t->reg_to_local) h->free(h, t->reg_to_local, sizeof(u32) * t->reg_cap);
   4787   if (t->reg_type) h->free(h, t->reg_type, sizeof(KitCgTypeId) * t->reg_cap);
   4788   if (t->reg_cls) h->free(h, t->reg_cls, t->reg_cap);
   4789   if (t->wir) h->free(h, t->wir, sizeof(WIR) * t->wir_cap);
   4790   if (t->labels) h->free(h, t->labels, sizeof(WLabel) * t->labels_cap);
   4791   if (t->slots) h->free(h, t->slots, sizeof(WSlot) * t->slots_cap);
   4792   if (t->param_local_idx)
   4793     h->free(h, t->param_local_idx, sizeof(u32) * t->param_local_idx_cap);
   4794   if (t->byval_copies)
   4795     h->free(h, t->byval_copies, sizeof(WByvalCopy) * t->byval_copies_cap);
   4796   if (t->sym_to_func)
   4797     h->free(h, t->sym_to_func, sizeof(u32) * t->sym_to_func_cap);
   4798   if (t->funcs) h->free(h, t->funcs, sizeof(WFunc) * t->funcs_cap);
   4799   if (t->section_base)
   4800     h->free(h, t->section_base, sizeof(u32) * t->section_base_cap);
   4801   if (t->common_base)
   4802     h->free(h, t->common_base, sizeof(u32) * t->common_base_cap);
   4803   if (t->sym_fixups)
   4804     h->free(h, t->sym_fixups, sizeof(WSymFixup) * t->sym_fixups_cap);
   4805   if (t->func_table)
   4806     h->free(h, t->func_table, sizeof(ObjSymId) * t->func_table_cap);
   4807   if (t->func_table_fixups)
   4808     h->free(h, t->func_table_fixups,
   4809             sizeof(WFuncTableFixup) * t->func_table_fixups_cap);
   4810   h->free(h, t, sizeof *t);
   4811 }
   4812 
   4813 /* -----------------------------------------------------------------
   4814  * Module bootstrap: attach a WasmModule to the ObjBuilder so emit_wasm
   4815  * can find it. Lazily on first func_begin.
   4816  * ----------------------------------------------------------------- */
   4817 
   4818 static struct WasmModule* ensure_module(WTarget* t) {
   4819   if (t->module) return t->module;
   4820   Heap* h = t->c->ctx->heap;
   4821   WasmModule* m = (WasmModule*)h->alloc(h, sizeof *m, _Alignof(WasmModule));
   4822   if (!m) wfail(t, "wasm: out of memory");
   4823   wasm_module_init(m, h);
   4824   /* kit-produced modules always declare bulk-memory support: WIR_COPY_BYTES
   4825    * / WIR_SET_BYTES lower to memory.copy / memory.fill unconditionally, and
   4826    * the sret-return path emits memory.copy too. */
   4827   m->features |= WASM_FEATURE_BULK_MEMORY;
   4828   t->module = m;
   4829   obj_ext_set(t->obj, OBJ_EXT_WASM, m, wasm_module_freefn);
   4830   return m;
   4831 }