kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

engine.c (62812B)


      1 /* The interpreter engine: an explicit-stack dispatch loop over the lowered
      2  * bytecode. IR-level calls push/pop InterpFrames on the InterpStack instead of
      3  * recursing on the host C stack, so execution can be suspended and resumed.
      4  *
 * Dispatch is direct-threaded by default (KIT_INTERP_THREADED in
 * <kit/config.h>): with labels-as-values (GNU computed goto) each InterpInsn
 * caches its opcode's handler and handlers jump straight to the next one.
 * Compilers without labels-as-values fall back to a portable switch on the
 * record opcode, sharing the same handler bodies. */
      8 
      9 #include <kit/config.h> /* KIT_INTERP_THREADED: dispatch default */
     10 #include <string.h>
     11 
     12 #include "abi/abi.h"
     13 #include "cg/cgtarget.h"
     14 #include "cg/type.h"
     15 #include "core/arena.h"
     16 #include "core/core.h"
     17 #include "core/diag.h"
     18 #include "interp/interp.h"
     19 
     20 #define PERM_R KIT_INTERP_PERM_READ
     21 #define PERM_W KIT_INTERP_PERM_WRITE
     22 
     23 static SrcLoc iloc(void) {
     24   SrcLoc l;
     25   l.file_id = 0;
     26   l.line = 0;
     27   l.col = 0;
     28   return l;
     29 }
     30 
     31 /* ---- width / fp helpers ---- */
     32 
     33 static u64 mask_w(u64 v, u32 w) {
     34   if (w >= 8) return v;
     35   if (w == 0) return v;
     36   return v & ((1ull << (w * 8u)) - 1ull);
     37 }
     38 
     39 static i64 sext_w(u64 v, u32 w) {
     40   u32 bits;
     41   u64 m;
     42   if (w >= 8 || w == 0) return (i64)v;
     43   bits = w * 8u;
     44   v &= ((1ull << bits) - 1ull);
     45   m = 1ull << (bits - 1u);
     46   return (i64)((v ^ m) - m);
     47 }
     48 
     49 /* Low `width`-bit mask (width in *bits*, 0..64). */
     50 static u64 bits_mask(u32 width) {
     51   return width >= 64u ? ~0ull : ((1ull << width) - 1ull);
     52 }
     53 
     54 /* Interpreter-private va_list layout: a single cursor walks a contiguous buffer
     55  * of the anonymous arguments, each at an 8-byte (16 for >8B types) aligned
     56  * slot. The interpreter owns both the call-site buffer build and
     57  * va_start/va_arg, so the layout is self-consistent regardless of the target
     58  * ABI's real va_list. */
     59 static u32 va_align_of(u32 size) { return size > 8u ? 16u : 8u; }
     60 static u32 va_stride_of(u32 size) {
     61   return size > 8u ? ((size + 15u) & ~15u) : 8u;
     62 }
     63 
     64 static double rd_f(u64 bits, u32 w) {
     65   if (w == 4) {
     66     float f;
     67     u32 b = (u32)bits;
     68     memcpy(&f, &b, 4);
     69     return (double)f;
     70   }
     71   {
     72     double d;
     73     memcpy(&d, &bits, 8);
     74     return d;
     75   }
     76 }
     77 
     78 static u64 wr_f(double d, u32 w) {
     79   if (w == 4) {
     80     float f = (float)d;
     81     u32 b;
     82     memcpy(&b, &f, 4);
     83     return b;
     84   }
     85   {
     86     u64 b;
     87     memcpy(&b, &d, 8);
     88     return b;
     89   }
     90 }
     91 
     92 /* ---- memory access (always vtable-translated) ---- */
     93 /* A translation miss latches st->mem_fault; the run loop converts the latch to
     94  * a delivered fault at the next straight-line/branch re-check point. */
     95 
     96 static u64 mem_read(InterpStack* st, u64 addr, u32 size) {
     97   u8* host = interp_translate(st->prog, addr, size, PERM_R);
     98   u64 v = 0;
     99   if (!host) {
    100     st->mem_fault = 1;
    101     return 0;
    102   }
    103   memcpy(&v, host, size ? size : 8u);
    104   return v;
    105 }
    106 
    107 static void mem_write(InterpStack* st, u64 addr, u32 size, u64 v) {
    108   u8* host = interp_translate(st->prog, addr, size, PERM_W);
    109   if (!host) {
    110     st->mem_fault = 1;
    111     return;
    112   }
    113   memcpy(host, &v, size ? size : 8u);
    114 }
    115 
    116 static void mem_copy(InterpStack* st, u64 dst, u64 src, u32 n) {
    117   u8* d = interp_translate(st->prog, dst, n, PERM_W);
    118   u8* s = interp_translate(st->prog, src, n, PERM_R);
    119   if (!d || !s) {
    120     st->mem_fault = 1;
    121     return;
    122   }
    123   memmove(d, s, n);
    124 }
    125 
    126 /* ---- operand access ---- */
    127 
    128 static u64 frame_base(InterpStack* st, u32 mem_off) {
    129   return (u64)(uintptr_t)(st->mem_arena + mem_off);
    130 }
    131 
    132 /* addr_from_operand semantics: the abstract address an lvalue operand denotes.
    133  */
    134 static u64 op_addr(InterpStack* st, InterpFunc* fn, u64* regs, u32 mem_off,
    135                    const Operand* op) {
    136   switch ((OptOperandKind)op->kind) {
    137     case OPT_OPK_LOCAL:
    138       return frame_base(st, mem_off) + fn->slot_off[op->v.frame_slot];
    139     case OPT_OPK_GLOBAL:
    140       return (u64)(uintptr_t)interp_global_base(fn, op->v.global.sym) +
    141              (u64)op->v.global.addend;
    142     case OPT_OPK_INDIRECT: {
    143       u64 a = regs[op->v.ind.base];
    144       if (op->v.ind.index != (Reg)REG_NONE)
    145         a += regs[op->v.ind.index] << op->v.ind.log2_scale;
    146       a += (u64)(i64)op->v.ind.ofs;
    147       return a;
    148     }
    149     case OPT_OPK_REG:
    150       return regs[op->v.reg];
    151     default:
    152       return 0;
    153   }
    154 }
    155 
    156 /* loc_from_operand-as-value semantics: the scalar value of a value operand. */
    157 static u64 op_value(InterpStack* st, InterpFunc* fn, u64* regs, u32 mem_off,
    158                     const Operand* op) {
    159   switch ((OptOperandKind)op->kind) {
    160     case OPT_OPK_REG:
    161       return regs[op->v.reg];
    162     case OPT_OPK_IMM:
    163       return (u64)op->v.imm;
    164     case OPT_OPK_LOCAL:
    165     case OPT_OPK_GLOBAL:
    166     case OPT_OPK_INDIRECT: {
    167       u64 a = op_addr(st, fn, regs, mem_off, op);
    168       u32 sz = abi_cg_sizeof(fn->prog->c->abi, op->type);
    169       return mem_read(st, a, sz ? sz : 8u);
    170     }
    171     default:
    172       return 0;
    173   }
    174 }
    175 
    176 /* write_loc semantics: store a scalar result into a destination operand, which
    177  * may be a register OR a memory location (OPK_LOCAL/GLOBAL/INDIRECT). The
    178  * optimizer leaves un-promoted (e.g. address-taken) destinations as memory. */
    179 static void write_dst(InterpStack* st, InterpFunc* fn, u64* regs, u32 mem_off,
    180                       const Operand* op, u64 value) {
    181   if (op->kind == OPK_REG) {
    182     regs[op->v.reg] = value;
    183     return;
    184   }
    185   {
    186     u64 a = op_addr(st, fn, regs, mem_off, op);
    187     u32 sz = abi_cg_sizeof(fn->prog->c->abi, op->type);
    188     mem_write(st, a, sz ? sz : 8u, value);
    189   }
    190 }
    191 
    192 /* pointer_addr_from_operand semantics: the address an aggregate pointer
    193  * operand denotes. An OPK_LOCAL of pointer type *holds* the pointer (load it);
    194  * otherwise the local *is* the aggregate storage (its frame home is the
    195  * address). Used only by AGG_COPY/AGG_SET. */
    196 static u64 interp_ptr_addr(InterpStack* st, InterpFunc* fn, u64* regs,
    197                            u32 mem_off, const Operand* op) {
    198   if (op->kind == OPK_LOCAL && !cg_type_is_ptr(fn->prog->c, op->type))
    199     return frame_base(st, mem_off) + fn->slot_off[op->v.frame_slot];
    200   if (op->kind == OPK_LOCAL) {
    201     /* pointer-typed local: the slot holds the pointer value */
    202     u64 slot = frame_base(st, mem_off) + fn->slot_off[op->v.frame_slot];
    203     return mem_read(st, slot, 8u);
    204   }
    205   return op_addr(st, fn, regs, mem_off, op);
    206 }
    207 
/* Common compiler intrinsics. Returns 0 (and sets status) if unsupported.
 * Forward declaration only; the definition is not in this part of the file.
 * NOTE(review): `in` is presumably the instruction record that invoked the
 * intrinsic - confirm against the definition. */
static int interp_intrinsic(InterpStack* st, InterpFunc* fn, u64* regs,
                            u32 mem_off, InterpInsn* in);
    211 
    212 /* The register and addressable-memory arenas are FIXED reservations that never
    213  * move: an OP_ADDR_OF materializes a local's address as an absolute host
    214  * pointer into mem_arena, and that pointer can escape into a register or out to
    215  * another local, so reallocating (moving) the arena would dangle it. Frames
    216  * follow strict stack discipline (CALL bumps the top, RET rewinds it), so a
    217  * generous fixed reservation suffices; overflow traps cleanly as a stack
    218  * overflow rather than corrupting memory. */
/* Both reservations are 8 * 2^20. NOTE(review): presumably byte counts used
 * as the arena capacities - confirm where the arenas are set up (not visible
 * here). */
#define INTERP_REGS_RESERVE (8u * 1024u * 1024u)
#define INTERP_MEM_RESERVE (8u * 1024u * 1024u)
    221 
    222 static u32 bump(u8* arena, u32* top, u32 cap, u32 size, u32 align) {
    223   u32 off = (*top + align - 1u) & ~(align - 1u);
    224   (void)arena;
    225   if (off + size > cap || off + size < off) return 0xffffffffu; /* overflow */
    226   *top = off + size;
    227   return off;
    228 }
    229 
    230 /* Push a fresh frame for fn; returns its index, or 0xffffffff on overflow.
    231  * The arenas never move, so existing frame pointers stay valid. */
    232 static u32 frame_push(InterpStack* st, InterpFunc* fn) {
    233   InterpFrame* fr;
    234   u32 regs_off, mem_off;
    235   if (st->nframes == st->frames_cap) {
    236     Heap* h = st->prog->c->ctx->heap;
    237     u32 ncap = st->frames_cap ? st->frames_cap * 2u : 32u;
    238     InterpFrame* nf = (InterpFrame*)h->realloc(
    239         h, st->frames, sizeof(InterpFrame) * st->frames_cap,
    240         sizeof(InterpFrame) * ncap, _Alignof(InterpFrame));
    241     if (!nf) return 0xffffffffu;
    242     st->frames = nf;
    243     st->frames_cap = ncap;
    244   }
    245   regs_off = bump(st->regs_arena, &st->regs_top, st->regs_cap,
    246                   (fn->npregs ? fn->npregs : 1u) * 8u, 8u);
    247   mem_off = bump(st->mem_arena, &st->mem_top, st->mem_cap,
    248                  fn->frame_bytes ? fn->frame_bytes : 16u, fn->frame_align);
    249   if (regs_off == 0xffffffffu || mem_off == 0xffffffffu) return 0xffffffffu;
    250   fr = &st->frames[st->nframes];
    251   memset(fr, 0, sizeof *fr);
    252   fr->fn = fn;
    253   fr->regs_off = regs_off;
    254   fr->mem_off = mem_off;
    255   fr->frame_bytes = fn->frame_bytes;
    256   fr->alloca_top = fn->frame_bytes;
    257   fr->ip = &fn->code[fn->block_pc[fn->f->entry] == INTERP_PC_NONE
    258                          ? 0u
    259                          : fn->block_pc[fn->f->entry]];
    260   /* zero the register file */
    261   memset(st->regs_arena + regs_off, 0, (fn->npregs ? fn->npregs : 1u) * 8u);
    262   st->nframes++;
    263   return st->nframes - 1u;
    264 }
    265 
    266 static void unsupported(InterpStack* st, const char* what) {
    267   st->status = KIT_INTERP_ERROR;
    268   st->trap_reason = what;
    269   diag_emit(st->prog->c->ctx->diag, KIT_DIAG_ERROR, iloc(),
    270             "interp: %s not supported", what ? what : "operation");
    271 }
    272 
    273 static void fault(InterpStack* st, const char* what) {
    274   st->status = KIT_INTERP_TRAP;
    275   st->trap_reason = what;
    276   diag_emit(st->prog->c->ctx->diag, KIT_DIAG_ERROR, iloc(), "interp: trap: %s",
    277             what ? what : "fault");
    278 }
    279 
    280 /* ---- integer/fp arithmetic ---- */
    281 
    282 /* Shift-count mask for the spec's portable "reduce modulo width" rule
    283  * (doc/IR.md). The engine stores every scalar in a u64, so the meaningful
    284  * range is the storage width (<=64 bits); 16-byte scalars are lowered to
    285  * memory / 64-bit-half sequences before reaching here, never as a w==16 BINOP.
    286  * Clamping to the storage width keeps the host C shift in range regardless and
    287  * is identical to (w*8-1) for every width the engine actually carries (<=8). */
    288 static u32 shift_mask(u32 w) { return (w >= 8u ? 64u : w * 8u) - 1u; }
    289 
    290 static u64 do_binop(InterpStack* st, u32 binop, u64 a, u64 b, u32 w, u8 fp) {
    291   if (fp) {
    292     double x = rd_f(a, w), y = rd_f(b, w), r = 0;
    293     switch ((BinOp)binop) {
    294       case BO_FADD:
    295         r = x + y;
    296         break;
    297       case BO_FSUB:
    298         r = x - y;
    299         break;
    300       case BO_FMUL:
    301         r = x * y;
    302         break;
    303       case BO_FDIV:
    304         r = x / y;
    305         break;
    306       default:
    307         unsupported(st, "fp binop");
    308         return 0;
    309     }
    310     return wr_f(r, w);
    311   }
    312   switch ((BinOp)binop) {
    313     case BO_IADD:
    314       return mask_w(a + b, w);
    315     case BO_ISUB:
    316       return mask_w(a - b, w);
    317     case BO_IMUL:
    318       return mask_w(a * b, w);
    319     case BO_SDIV: {
    320       i64 x = sext_w(a, w), y = sext_w(b, w);
    321       if (y == 0) {
    322         fault(st, "integer divide by zero");
    323         return 0;
    324       }
    325       /* INT_MIN / -1 overflows (UB / SIGFPE on x86) — wraps to INT_MIN. */
    326       if (y == -1) return mask_w(0u - (u64)x, w);
    327       return mask_w((u64)(x / y), w);
    328     }
    329     case BO_UDIV: {
    330       u64 x = mask_w(a, w), y = mask_w(b, w);
    331       if (y == 0) {
    332         fault(st, "integer divide by zero");
    333         return 0;
    334       }
    335       return mask_w(x / y, w);
    336     }
    337     case BO_SREM: {
    338       i64 x = sext_w(a, w), y = sext_w(b, w);
    339       if (y == 0) {
    340         fault(st, "integer divide by zero");
    341         return 0;
    342       }
    343       if (y == -1) return 0; /* INT_MIN % -1 == 0 (avoids the overflow UB) */
    344       return mask_w((u64)(x % y), w);
    345     }
    346     case BO_UREM: {
    347       u64 x = mask_w(a, w), y = mask_w(b, w);
    348       if (y == 0) {
    349         fault(st, "integer divide by zero");
    350         return 0;
    351       }
    352       return mask_w(x % y, w);
    353     }
    354     case BO_AND:
    355       return mask_w(a & b, w);
    356     case BO_OR:
    357       return mask_w(a | b, w);
    358     case BO_XOR:
    359       return mask_w(a ^ b, w);
    360     case BO_SHL:
    361       return mask_w(a << (b & shift_mask(w)), w);
    362     case BO_SHR_S: {
    363       i64 x = sext_w(a, w);
    364       return mask_w((u64)(x >> (b & shift_mask(w))), w);
    365     }
    366     case BO_SHR_U:
    367       return mask_w(mask_w(a, w) >> (b & shift_mask(w)), w);
    368     default:
    369       unsupported(st, "int binop");
    370       return 0;
    371   }
    372 }
    373 
    374 static int do_cmp(InterpStack* st, u32 cmp, u64 a, u64 b, u32 w) {
    375   /* FP-ness is self-describing from the opcode (the FP block starts at
    376    * CMP_OEQ_F); no operand-class sniffing needed. */
    377   if (cmp >= CMP_OEQ_F) {
    378     double x = rd_f(a, w), y = rd_f(b, w);
    379     int uno = (x != x) || (y != y); /* unordered: either operand is NaN */
    380     switch ((CmpOp)cmp) {
    381       case CMP_OEQ_F:
    382         return x == y; /* ordered: false on NaN */
    383       case CMP_ONE_F:
    384         return !uno && (x != y);
    385       case CMP_OLT_F:
    386         return x < y;
    387       case CMP_OLE_F:
    388         return x <= y;
    389       case CMP_OGT_F:
    390         return x > y;
    391       case CMP_OGE_F:
    392         return x >= y;
    393       case CMP_UEQ_F:
    394         return uno || (x == y);
    395       case CMP_UNE_F:
    396         return x != y; /* unordered: true on NaN */
    397       case CMP_ULT_F:
    398         return uno || (x < y);
    399       case CMP_ULE_F:
    400         return uno || (x <= y);
    401       case CMP_UGT_F:
    402         return uno || (x > y);
    403       case CMP_UGE_F:
    404         return uno || (x >= y);
    405       default:
    406         break;
    407     }
    408   }
    409   switch ((CmpOp)cmp) {
    410     case CMP_EQ:
    411       return mask_w(a, w) == mask_w(b, w);
    412     case CMP_NE:
    413       return mask_w(a, w) != mask_w(b, w);
    414     case CMP_LT_S:
    415       return sext_w(a, w) < sext_w(b, w);
    416     case CMP_LE_S:
    417       return sext_w(a, w) <= sext_w(b, w);
    418     case CMP_GT_S:
    419       return sext_w(a, w) > sext_w(b, w);
    420     case CMP_GE_S:
    421       return sext_w(a, w) >= sext_w(b, w);
    422     case CMP_LT_U:
    423       return mask_w(a, w) < mask_w(b, w);
    424     case CMP_LE_U:
    425       return mask_w(a, w) <= mask_w(b, w);
    426     case CMP_GT_U:
    427       return mask_w(a, w) > mask_w(b, w);
    428     case CMP_GE_U:
    429       return mask_w(a, w) >= mask_w(b, w);
    430     default:
    431       unsupported(st, "cmp");
    432       return 0;
    433   }
    434 }
    435 
    436 /* Saturating float-to-integer (NaN -> 0, out-of-range -> clamped to the
    437  * destination width). Matches Wasm trunc_sat semantics and, crucially, avoids
    438  * the UB of casting a NaN/overflowing double to an integer (which traps under
    439  * UBSan). For in-range values this is identical to a plain truncating cast, so
    440  * well-defined C float->int conversions are unaffected. Avoids <math.h>
    441  * (libkit is freestanding) by building the 2^k bound with a loop. */
    442 static u64 ftoi_sat(double d, u32 wbytes, int is_signed) {
    443   u32 bits, i;
    444   double bound;
    445   if (d != d) return 0; /* NaN */
    446   if (wbytes == 0 || wbytes > 8) wbytes = 8;
    447   bits = wbytes * 8u;
    448   if (is_signed) {
    449     bound = 1.0;
    450     for (i = 0; i + 1u < bits; ++i) bound *= 2.0; /* 2^(bits-1) */
    451     if (d >= bound)
    452       return mask_w(
    453           bits >= 64 ? 0x7fffffffffffffffull : (((u64)1 << (bits - 1u)) - 1u),
    454           wbytes);
    455     if (d < -bound)
    456       return mask_w(
    457           bits >= 64 ? 0x8000000000000000ull : ((u64)1 << (bits - 1u)), wbytes);
    458     return mask_w((u64)(i64)d, wbytes);
    459   }
    460   bound = 1.0;
    461   for (i = 0; i < bits; ++i) bound *= 2.0; /* 2^bits */
    462   if (d < 0.0) return 0;
    463   if (d >= bound)
    464     return mask_w(bits >= 64 ? ~0ull : (((u64)1 << bits) - 1u), wbytes);
    465   return mask_w((u64)d, wbytes);
    466 }
    467 
    468 static u64 do_convert(InterpStack* st, InterpInsn* in, u64 v) {
    469   u32 wd = in->w0, ws = in->w1;
    470   switch ((ConvKind)in->sub) {
    471     case CV_SEXT:
    472       return mask_w((u64)sext_w(v, ws), wd);
    473     case CV_ZEXT:
    474       return mask_w(mask_w(v, ws), wd);
    475     case CV_TRUNC:
    476       return mask_w(v, wd);
    477     case CV_ITOF_S:
    478       return wr_f((double)sext_w(v, ws), wd);
    479     case CV_ITOF_U:
    480       return wr_f((double)mask_w(v, ws), wd);
    481     case CV_FTOI_S:
    482       return ftoi_sat(rd_f(v, ws), wd, 1);
    483     case CV_FTOI_U:
    484       return ftoi_sat(rd_f(v, ws), wd, 0);
    485     case CV_FEXT:
    486       return wr_f(rd_f(v, ws), wd);
    487     case CV_FTRUNC:
    488       return wr_f(rd_f(v, ws), wd);
    489     case CV_BITCAST:
    490       return mask_w(v, wd);
    491     default:
    492       unsupported(st, "convert");
    493       return 0;
    494   }
    495 }
    496 
    497 static u64 do_rmw(u32 op, u64 old, u64 val, u32 w) {
    498   switch ((KitCgAtomicOp)op) {
    499     case KIT_CG_ATOMIC_XCHG:
    500       return mask_w(val, w);
    501     case KIT_CG_ATOMIC_ADD:
    502       return mask_w(old + val, w);
    503     case KIT_CG_ATOMIC_SUB:
    504       return mask_w(old - val, w);
    505     case KIT_CG_ATOMIC_AND:
    506       return mask_w(old & val, w);
    507     case KIT_CG_ATOMIC_OR:
    508       return mask_w(old | val, w);
    509     case KIT_CG_ATOMIC_XOR:
    510       return mask_w(old ^ val, w);
    511     case KIT_CG_ATOMIC_NAND:
    512       return mask_w(~(old & val), w);
    513     default:
    514       return old;
    515   }
    516 }
    517 
    518 static u64 do_unop(InterpStack* st, u32 unop, u64 a, u32 w, u8 fp) {
    519   (void)fp;
    520   switch ((UnOp)unop) {
    521     case UO_NEG:
    522       return mask_w(0u - a, w); /* well-defined two's-complement */
    523     case UO_FNEG:
    524       return wr_f(-rd_f(a, w), w);
    525     case UO_NOT:
    526       return mask_w(a, w) == 0 ? 1u : 0u;
    527     case UO_BNOT:
    528       return mask_w(~a, w);
    529     default:
    530       unsupported(st, "unop");
    531       return 0;
    532   }
    533 }
    534 
    535 /* Bind call arguments into a freshly-pushed callee frame (value semantics). */
    536 static void bind_args(InterpStack* st, u32 caller_idx, u32 callee_idx,
    537                       const OptCGCallDesc* desc) {
    538   InterpProgram* p = st->prog;
    539   InterpFrame* caller = &st->frames[caller_idx];
    540   InterpFrame* callee = &st->frames[callee_idx];
    541   InterpFunc* cfn = caller->fn;
    542   InterpFunc* efn = callee->fn;
    543   u64* cregs = (u64*)(st->regs_arena + caller->regs_off);
    544   u64* eregs = (u64*)(st->regs_arena + callee->regs_off);
    545   u32 nbind = desc->nargs < efn->f->nparams ? desc->nargs : efn->f->nparams;
    546   u32 i;
    547   for (i = 0; i < nbind; ++i) {
    548     OptCGABIValue* arg = &desc->args[i];
    549     IRParam* pr = &efn->f->params[i];
    550     u32 size = abi_cg_sizeof(p->c->abi, arg->type);
    551     if (pr->storage.kind == CG_LOCAL_STORAGE_REG) {
    552       eregs[pr->storage.v.reg] =
    553           op_value(st, cfn, cregs, caller->mem_off, &arg->storage);
    554     } else {
    555       u64 dst = frame_base(st, callee->mem_off) +
    556                 efn->slot_off[pr->storage.v.frame_slot];
    557       if (cg_type_is_aggregate(p->c, arg->type) || size > 8u) {
    558         u64 src = op_addr(st, cfn, cregs, caller->mem_off, &arg->storage);
    559         mem_copy(st, dst, src, size);
    560       } else {
    561         mem_write(st, dst, size ? size : 8u,
    562                   op_value(st, cfn, cregs, caller->mem_off, &arg->storage));
    563       }
    564     }
    565   }
    566 }
    567 
    568 /* Lay out the anonymous (variadic) arguments of an internal call into a
    569  * contiguous buffer in the callee frame's addressable region, above its static
    570  * frame and any future alloca. Records the buffer offset on the callee frame so
    571  * IOP_VA_START can hand va_arg a cursor over it. Returns 0 on stack overflow.
    572  */
    573 static int build_varargs(InterpStack* st, u32 caller_idx, u32 callee_idx,
    574                          const OptCGCallDesc* desc) {
    575   InterpProgram* p = st->prog;
    576   InterpFrame* caller = &st->frames[caller_idx];
    577   InterpFrame* callee = &st->frames[callee_idx];
    578   InterpFunc* cfn = caller->fn;
    579   InterpFunc* efn = callee->fn;
    580   u64* cregs = (u64*)(st->regs_arena + caller->regs_off);
    581   u32 nfixed = efn->f->nparams;
    582   u32 cur = (callee->alloca_top + 15u) & ~15u; /* 16-align buffer start */
    583   u32 buf_start = cur;
    584   u32 i;
    585   if (desc->nargs <= nfixed) return 1; /* no anonymous args */
    586   for (i = nfixed; i < desc->nargs; ++i) {
    587     OptCGABIValue* arg = &desc->args[i];
    588     u32 size = abi_cg_sizeof(p->c->abi, arg->type);
    589     u32 al = va_align_of(size);
    590     u64 dst;
    591     cur = (cur + al - 1u) & ~(al - 1u);
    592     if ((u64)callee->mem_off + cur + va_stride_of(size) > st->mem_cap) return 0;
    593     dst = frame_base(st, callee->mem_off) + cur;
    594     if (cg_type_is_aggregate(p->c, arg->type) || size > 8u) {
    595       u64 src = op_addr(st, cfn, cregs, caller->mem_off, &arg->storage);
    596       mem_copy(st, dst, src, size);
    597     } else {
    598       mem_write(st, dst, 8u,
    599                 op_value(st, cfn, cregs, caller->mem_off, &arg->storage));
    600     }
    601     cur += va_stride_of(size);
    602   }
    603   callee->has_varargs = 1;
    604   callee->vararg_off = callee->mem_off + buf_start;
    605   callee->alloca_top = cur;
    606   if (callee->mem_off + cur > st->mem_top) st->mem_top = callee->mem_off + cur;
    607   return 1;
    608 }
    609 
    610 /* ---- external (host ABI) call marshalling ---- */
    611 
    612 /* Record an integer-register argument. Returns non-zero (with *why) on
    613  * overflow of the supported register-thunk family. */
    614 static int ffi_push_int(InterpFfiArgs* fa, u64 v, const char** why) {
    615   if (fa->nint >= 8u) {
    616     *why = "external call: too many int args";
    617     return 1;
    618   }
    619   fa->iargs[fa->nint++] = v;
    620   return 0;
    621 }
    622 
    623 /* Record an fp-register argument, tracking single vs double precision (the two
    624  * occupy the fp register differently). Returns non-zero (with *why) on overflow
    625  * or a float/double mix within one signature. */
    626 static int ffi_push_fp(InterpFfiArgs* fa, u64 bits, u32 size,
    627                        const char** why) {
    628   if (fa->nfp >= 8u) {
    629     *why = "external call: too many fp args";
    630     return 1;
    631   }
    632   if (size == 4u) {
    633     if (fa->nfp > 0u && !fa->args_fp_is_float) {
    634       *why = "external call: mixed float/double args";
    635       return 1;
    636     }
    637     fa->args_fp_is_float = 1u;
    638     fa->fargs_f[fa->nfp++] = (float)rd_f(bits, 4u);
    639   } else {
    640     if (fa->nfp > 0u && fa->args_fp_is_float) {
    641       *why = "external call: mixed float/double args";
    642       return 1;
    643     }
    644     fa->fargs[fa->nfp++] = rd_f(bits, size ? size : 8u);
    645   }
    646   return 0;
    647 }
    648 
    649 static u64 ext_call(InterpStack* st, InterpFrame* fr, u64* regs, void* host_fp,
    650                     const OptCGCallDesc* desc) {
    651   InterpProgram* p = st->prog;
    652   const ABIFuncInfo* fi = desc->abi;
    653   InterpFfiArgs fa;
    654   const char* reason = NULL;
    655   u32 i;
    656 
    657   if (!fi) {
    658     unsupported(st, "external call without ABI info");
    659     return 0;
    660   }
    661   if (fi->vararg_on_stack && fi->variadic) {
    662     unsupported(st, "variadic external call (stack-routed)");
    663     return 0;
    664   }
    665   memset(&fa, 0, sizeof fa);
    666   fa.fi = fi;
    667 
    668   /* hidden struct return: pass the caller's aggregate-return slot directly.
    669    * When the call is a tail call its result has no local home (ret.storage is
    670    * void) — forward this frame's own sret destination instead. */
    671   if (fi->has_sret) {
    672     u32 rsz = abi_cg_sizeof(p->c->abi, desc->ret.type);
    673     if (desc->ret.storage.kind == OPK_LOCAL ||
    674         desc->ret.storage.kind == OPK_GLOBAL ||
    675         desc->ret.storage.kind == OPK_INDIRECT) {
    676       u64 dst = op_addr(st, fr->fn, regs, fr->mem_off, &desc->ret.storage);
    677       fa.sret = interp_translate(p, dst, rsz, PERM_W);
    678     } else {
    679       fa.sret = fr->sret_ptr; /* tail call: deliver to our caller's sret slot */
    680     }
    681     if (!fa.sret) {
    682       unsupported(st, "sret destination");
    683       return 0;
    684     }
    685     fa.iargs[fa.nint++] = (u64)(uintptr_t)fa.sret;
    686     fa.ret_is_void = 1;
    687   }
    688 
    689   for (i = 0; i < desc->nargs; ++i) {
    690     OptCGABIValue* arg = &desc->args[i];
    691     const ABIArgInfo* ai = (i < fi->nparams) ? &fi->params[i] : NULL;
    692     if (ai && ai->kind == ABI_ARG_IGNORE) continue;
    693     if (ai && ai->kind == ABI_ARG_INDIRECT) {
    694       /* byval: pass a pointer to the aggregate (caller's copy). */
    695       u64 a = op_addr(st, fr->fn, regs, fr->mem_off, &arg->storage);
    696       u8* h = interp_translate(p, a, 1, PERM_R);
    697       if (fa.nint >= 8) {
    698         unsupported(st, "external call: too many int args");
    699         return 0;
    700       }
    701       fa.iargs[fa.nint++] = (u64)(uintptr_t)h;
    702       continue;
    703     }
    704     if (ai && ai->kind == ABI_ARG_DIRECT && ai->nparts > 1) {
    705       /* aggregate split across registers: read each part from memory. */
    706       u64 base = op_addr(st, fr->fn, regs, fr->mem_off, &arg->storage);
    707       u32 k;
    708       for (k = 0; k < ai->nparts; ++k) {
    709         const ABIArgPart* pt = &ai->parts[k];
    710         u64 chunk = mem_read(st, base + pt->src_offset, pt->size);
    711         int bad = (pt->cls == ABI_CLASS_FP)
    712                       ? ffi_push_fp(&fa, chunk, pt->size, &reason)
    713                       : ffi_push_int(&fa, chunk, &reason);
    714         if (bad) {
    715           unsupported(st, reason);
    716           return 0;
    717         }
    718       }
    719       continue;
    720     }
    721     /* scalar (or variadic extra arg): route by type. The named-parameter
    722      * aggregate/large cases are handled by the INDIRECT / multi-part branches
    723      * above; a variadic-tail arg has no ABI classification (ai==NULL), so an
    724      * aggregate or >8-byte scalar here can't be marshalled (and op_value's
    725      * 8-byte read would overflow) — diagnose rather than corrupt. */
    726     if (cg_type_is_aggregate(p->c, arg->type) ||
    727         abi_cg_sizeof(p->c->abi, arg->type) > 8u) {
    728       unsupported(st, "external call: aggregate/oversized variadic argument");
    729       return 0;
    730     }
    731     {
    732       ABITypeInfo ti = abi_cg_type_info(p->c->abi, arg->type);
    733       u64 v = op_value(st, fr->fn, regs, fr->mem_off, &arg->storage);
    734       int bad = (ti.scalar_kind == ABI_SC_FLOAT)
    735                     ? ffi_push_fp(&fa, v, ti.size ? ti.size : 8u, &reason)
    736                     : ffi_push_int(&fa, v, &reason);
    737       if (bad) {
    738         unsupported(st, reason);
    739         return 0;
    740       }
    741     }
    742   }
    743 
    744   /* Return classification from the ABI's own return descriptor (robust even
    745    * when desc->ret.type is void, e.g. a tail call whose result is not stored
    746    * into any caller local). A small struct can come back in up to two
    747    * registers; each part's class steers which return register the thunk reads.
    748    */
    749   if (!fi->has_sret) {
    750     if (fi->ret.kind == ABI_ARG_IGNORE || fi->ret.nparts == 0) {
    751       fa.ret_is_void = 1;
    752       fa.ret_nparts = 0;
    753     } else if (fi->ret.nparts > 2) {
    754       unsupported(st, "external call: 3+ register struct return");
    755       return 0;
    756     } else {
    757       u32 k;
    758       fa.ret_nparts = (u8)fi->ret.nparts;
    759       for (k = 0; k < fi->ret.nparts; ++k) {
    760         fa.ret_fp[k] = (fi->ret.parts[k].cls == ABI_CLASS_FP) ? 1u : 0u;
    761         fa.ret_size[k] = fi->ret.parts[k].size ? fi->ret.parts[k].size : 8u;
    762         /* A 4-byte fp return part is a single in the low half of an fp reg; the
    763          * two-register thunks read fp parts as doubles, so diagnose it. */
    764         if (fi->ret.nparts > 1u && fa.ret_fp[k] && fa.ret_size[k] == 4u) {
    765           unsupported(st, "external call: 32-bit fp struct-return field");
    766           return 0;
    767         }
    768       }
    769     }
    770   }
    771 
    772   {
    773     u64 out[2] = {0, 0};
    774     if (interp_ffi_invoke(host_fp, &fa, out, &reason) != 0) {
    775       unsupported(st, reason ? reason : "external call signature");
    776       return 0;
    777     }
    778     if (fa.ret_is_void || fa.ret_nparts == 0) return 0;
    779     /* Deliver the result. A register destination (OPK_REG) takes the low
    780      * register; a memory destination (an address-taken result local, or a small
    781      * aggregate returned in registers) receives each part's bytes scattered to
    782      * its src_offset. A value-less tail call has no home — the low register is
    783      * shuttled out as the scalar result. */
    784     if (desc->ret.storage.kind == OPK_REG) {
    785       if (fa.ret_nparts == 1) regs[desc->ret.storage.v.reg] = out[0];
    786     } else if (desc->ret.storage.kind == OPK_LOCAL ||
    787                desc->ret.storage.kind == OPK_GLOBAL ||
    788                desc->ret.storage.kind == OPK_INDIRECT) {
    789       u64 dst = op_addr(st, fr->fn, regs, fr->mem_off, &desc->ret.storage);
    790       u32 k;
    791       for (k = 0; k < fi->ret.nparts && k < 2u; ++k)
    792         mem_write(st, dst + fi->ret.parts[k].src_offset, fa.ret_size[k], out[k]);
    793     }
    794     return out[0];
    795   }
    796 }
    797 
    798 /* ---- engine ---- */
    799 
    800 /* Dispatch mechanism. With labels-as-values (GNU computed goto) the engine is
    801  * direct-threaded: each InterpInsn caches the &&handler of its opcode and every
    802  * handler tail-dispatches straight to the next via `goto *`, giving the branch
    803  * predictor a distinct indirect branch per opcode site. This is the default
    804  * (KIT_INTERP_THREADED in <kit/config.h>). GCC, clang, and kit itself
    805  * (__kit__) all implement labels-as-values; any other compiler transparently
    806  * falls back to a portable `switch`, sharing one set of handler bodies through
    807  * OP()/NEXT()/GO(). Force the choice with -DKIT_INTERP_THREADED=0|1. */
    808 #if !defined(KIT_INTERP_THREADED)
    809 /* Belt-and-braces: config.h normally defines this. Default on so a missed
    810  * include degrades to threaded-where-supported, never a silent switch. */
    811 #define KIT_INTERP_THREADED 1
    812 #endif
    813 /* Effective dispatch: requested AND the compiler can compile labels-as-values.
    814  */
    815 #if KIT_INTERP_THREADED && \
    816     (defined(__GNUC__) || defined(__clang__) || defined(__kit__))
    817 #define INTERP_DISPATCH_THREADED 1
    818 #else
    819 #define INTERP_DISPATCH_THREADED 0
    820 #endif
    821 
    822 /* The opcode roster: one entry per InterpOp with a handler, used to publish the
    823  * threaded dispatch table from the in-function &&labels. Must stay in sync with
    824  * the OP(...) handlers below (a missing/extra entry is a compile error: an
    825  * undefined or unused label). */
    826 // clang-format off
    827 #define INTERP_OPS(X)    \
    828   X(IOP_NOP)             \
    829   X(IOP_LOAD_IMM)        \
    830   X(IOP_LOAD_CONST)      \
    831   X(IOP_COPY)            \
    832   X(IOP_COPY_AGG)        \
    833   X(IOP_LOAD)            \
    834   X(IOP_LOAD_AGG)        \
    835   X(IOP_STORE)           \
    836   X(IOP_STORE_AGG)       \
    837   X(IOP_ADDR_OF)         \
    838   X(IOP_TLS_ADDR)        \
    839   X(IOP_BINOP)           \
    840   X(IOP_UNOP)            \
    841   X(IOP_CMP)             \
    842   X(IOP_CONVERT)         \
    843   X(IOP_CALL)            \
    844   X(IOP_BR)              \
    845   X(IOP_CONDBR)          \
    846   X(IOP_CMP_BRANCH)      \
    847   X(IOP_SWITCH)          \
    848   X(IOP_INDIRECT_BR)     \
    849   X(IOP_LOAD_LABEL_ADDR) \
    850   X(IOP_RET)             \
    851   X(IOP_RET_VOID)        \
    852   X(IOP_ALLOCA)          \
    853   X(IOP_AGG_COPY)        \
    854   X(IOP_AGG_SET)         \
    855   X(IOP_BITFIELD_LOAD)   \
    856   X(IOP_BITFIELD_STORE)  \
    857   X(IOP_VA_START)        \
    858   X(IOP_VA_ARG)          \
    859   X(IOP_VA_END)          \
    860   X(IOP_VA_COPY)         \
    861   X(IOP_ATOMIC_LOAD)     \
    862   X(IOP_ATOMIC_STORE)    \
    863   X(IOP_ATOMIC_RMW)      \
    864   X(IOP_ATOMIC_CAS)      \
    865   X(IOP_FENCE)           \
    866   X(IOP_INTRINSIC)       \
    867   X(IOP_UNREACHABLE)     \
    868   X(IOP_TRAP)
    869 // clang-format on
    870 
    871 #if INTERP_DISPATCH_THREADED
    872 #define OP(name) L_##name
    873 /* linear op: re-check the memory-fault latch, advance, dispatch the next insn
    874  */
    875 #define NEXT()                       \
    876   do {                               \
    877     if (st->mem_fault) goto fault_mem; \
    878     ++ip;                            \
    879     in = ip;                         \
    880     I = in->inst;                    \
    881     goto * in->handler;              \
    882   } while (0)
    883 /* branch op: ip already retargeted, dispatch without advancing */
    884 #define GO()            \
    885   do {                  \
    886     in = ip;            \
    887     I = in->inst;       \
    888     goto * in->handler; \
    889   } while (0)
    890 #if defined(__clang__)
    891 #pragma clang diagnostic push
    892 #pragma clang diagnostic ignored "-Wgnu-label-as-value"
    893 #pragma clang diagnostic ignored "-Wpedantic"
    894 #elif defined(__GNUC__)
    895 #pragma GCC diagnostic push
    896 #pragma GCC diagnostic ignored "-Wpedantic"
    897 #endif
    898 #else
    899 #define OP(name) case name
    900 #define NEXT() break
    901 #define GO() continue
    902 #endif
    903 
    904 KitInterpStatus interp_run_stack(InterpStack* st, int64_t* out_ret) {
    905   InterpProgram* p = st->prog;
    906   InterpFrame* fr;
    907   InterpFunc* fn;
    908   u64* regs;
    909   u32 mem_off;
    910   InterpInsn* ip;
    911   InterpInsn* in = NULL;
    912   const Inst* I = NULL;
    913 
    914   if (st->nframes == 0) {
    915     st->status = KIT_INTERP_DONE;
    916     if (out_ret) *out_ret = (int64_t)st->scalar_ret;
    917     return KIT_INTERP_DONE;
    918   }
    919 
    920 #if INTERP_DISPATCH_THREADED
    921   /* Per-function lazy threading: copy each opcode's handler into its record on
    922    * first entry to the function (RELOAD runs whenever the top frame changes).
    923    */
    924 #define RELOAD()                                             \
    925   do {                                                       \
    926     fr = &st->frames[st->nframes - 1u];                      \
    927     fn = fr->fn;                                             \
    928     regs = (u64*)(st->regs_arena + fr->regs_off);            \
    929     mem_off = fr->mem_off;                                   \
    930     ip = fr->ip;                                             \
    931     if (!fn->threaded) {                                     \
    932       u32 ti_;                                               \
    933       for (ti_ = 0; ti_ < fn->ncode; ++ti_) {                \
    934         u32 o_ = fn->code[ti_].op;                           \
    935         fn->code[ti_].handler =                              \
    936             g_dt[o_ < (u32)IOP__COUNT ? o_ : (u32)IOP_TRAP]; \
    937       }                                                      \
    938       fn->threaded = 1;                                      \
    939     }                                                        \
    940   } while (0)
    941 #else
    942 #define RELOAD()                                  \
    943   do {                                            \
    944     fr = &st->frames[st->nframes - 1u];           \
    945     fn = fr->fn;                                  \
    946     regs = (u64*)(st->regs_arena + fr->regs_off); \
    947     mem_off = fr->mem_off;                        \
    948     ip = fr->ip;                                  \
    949   } while (0)
    950 #endif
    951 
    952 #if INTERP_DISPATCH_THREADED
    953   static void* g_dt[IOP__COUNT];
    954   static int g_dt_ready = 0;
    955   if (!g_dt_ready) {
    956 #define DT_ENTRY(name) g_dt[name] = &&L_##name;
    957     INTERP_OPS(DT_ENTRY)
    958 #undef DT_ENTRY
    959     g_dt_ready = 1;
    960   }
    961 #endif
    962 
    963   RELOAD();
    964   if (!fn->ok) {
    965     unsupported(st, fn->reject_reason ? fn->reject_reason : "function");
    966     return (KitInterpStatus)st->status;
    967   }
    968   st->mem_fault = 0;
    969 
    970 #if INTERP_DISPATCH_THREADED
    971   in = ip;
    972   I = in->inst;
    973   goto * in->handler;
    974 #else
    975   for (;;) {
    976     in = ip;
    977     I = in->inst;
    978     switch ((InterpOp)in->op) {
    979 #endif
    980   OP(IOP_NOP) : NEXT();
    981   OP(IOP_LOAD_IMM)
    982       : write_dst(st, fn, regs, mem_off, &I->opnds[0], (u64)in->imm);
    983   NEXT();
    984   OP(IOP_LOAD_CONST) : {
    985     ConstBytes cb = I->extra.cbytes;
    986     u64 v = 0;
    987     u32 n = cb.size > 8u ? 8u : cb.size;
    988     if (cb.bytes && n) memcpy(&v, cb.bytes, n);
    989     write_dst(st, fn, regs, mem_off, &I->opnds[0], v);
    990     NEXT();
    991   }
    992   OP(IOP_COPY)
    993       : write_dst(st, fn, regs, mem_off, &I->opnds[0],
    994                   op_value(st, fn, regs, mem_off, &I->opnds[1]));
    995   NEXT();
    996   OP(IOP_COPY_AGG) : {
    997     u64 d = op_addr(st, fn, regs, mem_off, &I->opnds[0]);
    998     u64 s = op_addr(st, fn, regs, mem_off, &I->opnds[1]);
    999     mem_copy(st, d, s, abi_cg_sizeof(p->c->abi, I->opnds[0].type));
   1000     NEXT();
   1001   }
   1002   OP(IOP_LOAD) : {
   1003     u64 a = op_addr(st, fn, regs, mem_off, &I->opnds[1]);
   1004     write_dst(st, fn, regs, mem_off, &I->opnds[0],
   1005               mem_read(st, a, in->w0 ? in->w0 : 8u));
   1006     NEXT();
   1007   }
   1008   OP(IOP_LOAD_AGG) : {
   1009     u64 d = op_addr(st, fn, regs, mem_off, &I->opnds[0]);
   1010     u64 s = op_addr(st, fn, regs, mem_off, &I->opnds[1]);
   1011     mem_copy(st, d, s, abi_cg_sizeof(p->c->abi, I->opnds[0].type));
   1012     NEXT();
   1013   }
   1014   OP(IOP_STORE) : {
   1015     u64 a = op_addr(st, fn, regs, mem_off, &I->opnds[0]);
   1016     u64 v = op_value(st, fn, regs, mem_off, &I->opnds[1]);
   1017     mem_write(st, a, in->w0 ? in->w0 : 8u, v);
   1018     NEXT();
   1019   }
   1020   OP(IOP_STORE_AGG) : {
   1021     u64 d = op_addr(st, fn, regs, mem_off, &I->opnds[0]);
   1022     u64 s = op_addr(st, fn, regs, mem_off, &I->opnds[1]);
   1023     mem_copy(st, d, s, abi_cg_sizeof(p->c->abi, I->opnds[1].type));
   1024     NEXT();
   1025   }
   1026   OP(IOP_ADDR_OF)
   1027       : write_dst(st, fn, regs, mem_off, &I->opnds[0],
   1028                   op_addr(st, fn, regs, mem_off, &I->opnds[1]));
   1029   NEXT();
   1030   OP(IOP_BINOP) : {
   1031     u64 r = do_binop(st, in->sub, op_value(st, fn, regs, mem_off, &I->opnds[1]),
   1032                      op_value(st, fn, regs, mem_off, &I->opnds[2]), in->w0,
   1033                      in->fp0);
   1034     if (st->status) goto stop;
   1035     write_dst(st, fn, regs, mem_off, &I->opnds[0], r);
   1036     NEXT();
   1037   }
   1038   OP(IOP_UNOP) : {
   1039     u64 r = do_unop(st, in->sub, op_value(st, fn, regs, mem_off, &I->opnds[1]),
   1040                     in->w0, in->fp0);
   1041     if (st->status) goto stop;
   1042     write_dst(st, fn, regs, mem_off, &I->opnds[0], r);
   1043     NEXT();
   1044   }
   1045   OP(IOP_CMP) : {
   1046     u64 r =
   1047         (u64)do_cmp(st, in->sub, op_value(st, fn, regs, mem_off, &I->opnds[1]),
   1048                     op_value(st, fn, regs, mem_off, &I->opnds[2]), in->w0);
   1049     if (st->status) goto stop;
   1050     write_dst(st, fn, regs, mem_off, &I->opnds[0], r);
   1051     NEXT();
   1052   }
   1053   OP(IOP_CONVERT) : {
   1054     u64 r = do_convert(st, in, op_value(st, fn, regs, mem_off, &I->opnds[1]));
   1055     if (st->status) goto stop;
   1056     write_dst(st, fn, regs, mem_off, &I->opnds[0], r);
   1057     NEXT();
   1058   }
   1059   OP(IOP_ALLOCA) : {
   1060     u64 size = op_value(st, fn, regs, mem_off, &I->opnds[1]);
   1061     u32 align = in->imm ? (u32)in->imm : 16u;
   1062     u32 off = (fr->alloca_top + align - 1u) & ~(align - 1u);
   1063     if ((u64)fr->mem_off + off + size > st->mem_cap) {
   1064       fault(st, "alloca: stack overflow");
   1065       goto stop;
   1066     }
   1067     write_dst(st, fn, regs, mem_off, &I->opnds[0],
   1068               frame_base(st, fr->mem_off) + off);
   1069     fr->alloca_top = off + (u32)size;
   1070     /* Advance the global high-water so a nested call's frame is allocated
   1071      * ABOVE this live alloca region (otherwise it would alias it). */
   1072     if (fr->mem_off + fr->alloca_top > st->mem_top)
   1073       st->mem_top = fr->mem_off + fr->alloca_top;
   1074     NEXT();
   1075   }
   1076   OP(IOP_BR) : ip = &fn->code[in->t0];
   1077   GO();
   1078   OP(IOP_CONDBR) : {
   1079     u64 c = op_value(st, fn, regs, mem_off, &I->opnds[0]);
   1080     /* A faulting selector would otherwise branch on garbage: branch ops
   1081      * skip the straight-line fault re-check, so test the latch here. */
   1082     if (st->mem_fault) {
   1083       fault(st, "invalid memory access");
   1084       goto stop;
   1085     }
   1086     ip = &fn->code[c ? in->t0 : in->t1];
   1087     GO();
   1088   }
   1089   OP(IOP_CMP_BRANCH) : {
   1090     int taken = do_cmp(
   1091         st, in->sub, op_value(st, fn, regs, mem_off, &I->opnds[0]),
   1092         op_value(st, fn, regs, mem_off, &I->opnds[1]), in->w0 ? in->w0 : 8u);
   1093     if (st->status) goto stop;
   1094     if (st->mem_fault) {
   1095       fault(st, "invalid memory access");
   1096       goto stop;
   1097     }
   1098     ip = &fn->code[taken ? in->t0 : in->t1];
   1099     GO();
   1100   }
   1101   OP(IOP_SWITCH) : {
   1102     InterpSwitch* sw = &fn->switches[in->t0];
   1103     u64 sel = op_value(st, fn, regs, mem_off, &I->opnds[0]);
   1104     u32 ci;
   1105     u32 target = sw->default_pc;
   1106     u32 selw = (u32)abi_cg_sizeof(p->c->abi, sw->sel_type);
   1107     if (st->mem_fault) {
   1108       fault(st, "invalid memory access");
   1109       goto stop;
   1110     }
   1111     for (ci = 0; ci < sw->ncases; ++ci) {
   1112       if (mask_w(sel, selw) == mask_w(sw->aux->cases[ci].value, selw)) {
   1113         target = sw->case_pc[ci];
   1114         break; /* leaves the case-search loop, not the dispatch */
   1115       }
   1116     }
   1117     if (target == INTERP_PC_NONE) {
   1118       fault(st, "switch: no target");
   1119       goto stop;
   1120     }
   1121     ip = &fn->code[target];
   1122     GO();
   1123   }
   1124   OP(IOP_LOAD_LABEL_ADDR)
   1125       : /* encode target pc as the label address */
   1126         write_dst(st, fn, regs, mem_off, &I->opnds[0], (u64)in->t0);
   1127   NEXT();
   1128   OP(IOP_INDIRECT_BR) : {
   1129     u64 target = op_value(st, fn, regs, mem_off, &I->opnds[0]);
   1130     if (st->mem_fault) {
   1131       fault(st, "invalid memory access");
   1132       goto stop;
   1133     }
   1134     if (target >= fn->ncode) {
   1135       fault(st, "indirect branch out of range");
   1136       goto stop;
   1137     }
   1138     ip = &fn->code[target];
   1139     GO();
   1140   }
   1141   OP(IOP_CALL) : {
   1142     IRCallAux* aux = (IRCallAux*)I->extra.aux;
   1143     OptCGCallDesc* desc = &aux->desc;
   1144     InterpFunc* callee = NULL;
   1145     void* host_fp = NULL;
   1146     if (desc->callee.kind == OPK_GLOBAL) {
   1147       callee = interp_func_for_sym(p, desc->callee.v.global.sym);
   1148       if (callee && !callee->ok) {
   1149         /* A known internal callee we cannot interpret: propagate its
   1150          * reason rather than silently calling the native version (the
   1151          * --no-jit contract is that execution never falls back to JIT). */
   1152         unsupported(st,
   1153                     callee->reject_reason ? callee->reject_reason : "callee");
   1154         goto stop;
   1155       }
   1156       if (!callee) host_fp = interp_global_base(fn, desc->callee.v.global.sym);
   1157     } else if (desc->callee.kind == OPK_REG) {
   1158       host_fp = (void*)(uintptr_t)regs[desc->callee.v.reg];
   1159       /* If the function pointer targets a TU-internal function, interpret
   1160        * it (don't run its native code) so --no-jit truly never executes
   1161        * JITed code. External pointers fall through to the FFI path. */
   1162       callee = interp_func_for_addr(p, host_fp);
   1163       if (callee && !callee->ok) {
   1164         unsupported(st,
   1165                     callee->reject_reason ? callee->reject_reason : "callee");
   1166         goto stop;
   1167       }
   1168     }
   1169     if (callee) {
   1170       /* internal call: push a frame and bind args. */
   1171       u32 caller_idx = st->nframes - 1u;
   1172       u32 callee_idx;
   1173       if (!in->tail) fr->ip = ip + 1; /* resume after a non-tail call */
   1174       callee_idx = frame_push(st, callee);
   1175       if (callee_idx == 0xffffffffu) {
   1176         fault(st, "call: stack overflow");
   1177         goto stop;
   1178       }
   1179       bind_args(st, caller_idx, callee_idx, desc);
   1180       if (!build_varargs(st, caller_idx, callee_idx, desc)) {
   1181         fault(st, "call: stack overflow");
   1182         goto stop;
   1183       }
   1184       if (st->mem_fault) {
   1185         fault(st, "invalid memory access");
   1186         goto stop;
   1187       }
   1188       {
   1189         InterpFrame* cf = &st->frames[callee_idx];
   1190         InterpFrame* caller = &st->frames[caller_idx];
   1191         if (in->tail) {
   1192           /* True O(1) tail call: the callee's result IS this function's
   1193            * result, so inherit the tail-caller's return target and relocate
   1194            * the freshly-built callee frame down onto the (now dead) caller's
   1195            * register/memory region, rewinding the arenas. A tail loop then
   1196            * runs in constant interp+host stack space instead of growing the
   1197            * fixed reservation each iteration.
   1198            *
   1199            * Safe because the callee has not executed yet: no absolute
   1200            * pointers into its own frame exist (va_start runs later; an arg
   1201            * holding &caller_local would be UB, the caller being about to
   1202            * return). bind_args/build_varargs already copied every argument
   1203            * value out of the caller, so overwriting the caller is fine. */
   1204           u32 dst_regs = caller->regs_off;
   1205           u32 dst_mem = caller->mem_off;
   1206           u32 nregs_bytes = (callee->npregs ? callee->npregs : 1u) * 8u;
   1207           u32 mem_used = cf->alloca_top; /* static frame + vararg buffer */
   1208           cf->ret_wanted = caller->ret_wanted;
   1209           cf->ret_dst = caller->ret_dst;
   1210           cf->sret_ptr = caller->sret_ptr;
   1211           if (cf->regs_off != dst_regs)
   1212             memmove(st->regs_arena + dst_regs, st->regs_arena + cf->regs_off,
   1213                     nregs_bytes);
   1214           if (cf->mem_off != dst_mem) {
   1215             memmove(st->mem_arena + dst_mem, st->mem_arena + cf->mem_off,
   1216                     mem_used);
   1217             if (cf->has_varargs) cf->vararg_off -= (cf->mem_off - dst_mem);
   1218           }
   1219           cf->regs_off = dst_regs;
   1220           cf->mem_off = dst_mem;
   1221           *caller = *cf;
   1222           st->nframes = caller_idx + 1u;
   1223           st->regs_top = dst_regs + nregs_bytes;
   1224           st->mem_top = dst_mem + mem_used;
   1225         } else if (desc->ret.storage.kind == OPK_REG) {
   1226           cf->ret_wanted = 1;
   1227           cf->ret_dst = desc->ret.storage.v.reg;
   1228         } else if (desc->ret.storage.kind == OPK_LOCAL) {
   1229           /* aggregate return: callee writes into the caller's slot */
   1230           u64 a = frame_base(st, caller->mem_off) +
   1231                   caller->fn->slot_off[desc->ret.storage.v.frame_slot];
   1232           cf->sret_ptr = interp_translate(p, a, 1, PERM_W);
   1233         }
   1234       }
   1235       RELOAD();
   1236       GO();
   1237     }
   1238     if (!host_fp) {
   1239       unsupported(st, "unresolved call target");
   1240       goto stop;
   1241     }
   1242     {
   1243       u64 callret = ext_call(st, fr, regs, host_fp, desc);
   1244       if (st->status) goto stop;
   1245       if (in->tail) {
   1246         /* External tail call: the call's result is this function's
   1247          * result (desc.ret.storage may be empty for a tail call). */
   1248         u64 rv = callret;
   1249         u8 want = fr->ret_wanted;
   1250         u32 rdst = fr->ret_dst;
   1251         st->regs_top = fr->regs_off;
   1252         st->mem_top = fr->mem_off;
   1253         st->nframes--;
   1254         st->scalar_ret = rv;
   1255         if (st->nframes == 0) {
   1256           st->status = KIT_INTERP_DONE;
   1257           if (out_ret) *out_ret = (int64_t)rv;
   1258           return KIT_INTERP_DONE;
   1259         }
   1260         if (want) {
   1261           InterpFrame* caller = &st->frames[st->nframes - 1u];
   1262           u64* cregs = (u64*)(st->regs_arena + caller->regs_off);
   1263           cregs[rdst] = rv;
   1264         }
   1265         RELOAD();
   1266         GO();
   1267       }
   1268     }
   1269     NEXT();
   1270   }
   1271   OP(IOP_RET) : OP(IOP_RET_VOID) : {
   1272     u8 is_fp = 0;
   1273     u64 rv = 0;
   1274     u8* sret = fr->sret_ptr;
   1275     if (in->op == IOP_RET) {
   1276       IRRetAux* aux = (IRRetAux*)I->extra.aux;
   1277       OptCGABIValue* val = &aux->val;
   1278       if (cg_type_is_aggregate(p->c, val->type) ||
   1279           abi_cg_sizeof(p->c->abi, val->type) > 8u) {
   1280         if (sret) {
   1281           u64 src = op_addr(st, fn, regs, mem_off, &val->storage);
   1282           u8* s = interp_translate(p, src, abi_cg_sizeof(p->c->abi, val->type),
   1283                                    PERM_R);
   1284           if (s) memcpy(sret, s, abi_cg_sizeof(p->c->abi, val->type));
   1285         }
   1286       } else {
   1287         ABITypeInfo ti = abi_cg_type_info(p->c->abi, val->type);
   1288         u32 sz = abi_cg_sizeof(p->c->abi, val->type);
   1289         rv = op_value(st, fn, regs, mem_off, &val->storage);
   1290         is_fp = (ti.scalar_kind == ABI_SC_FLOAT) ? 1u : 0u;
   1291         /* A scalar result whose caller destination is a memory slot (an
   1292          * address-taken result local) is delivered via sret_ptr, not a
   1293          * register — write it there. */
   1294         if (sret) memcpy(sret, &rv, sz ? (sz > 8u ? 8u : sz) : 8u);
   1295       }
   1296     }
   1297     /* The popped (callee) frame records where its scalar result lands in
   1298      * the caller — capture before popping, then rewind the arenas to the
   1299      * frame's bases (strict stack discipline). */
   1300     {
   1301       u8 want = fr->ret_wanted;
   1302       u32 dst = fr->ret_dst;
   1303       st->regs_top = fr->regs_off;
   1304       st->mem_top = fr->mem_off;
   1305       st->nframes--;
   1306       st->scalar_ret = rv;
   1307       st->ret_is_fp = is_fp;
   1308       if (st->nframes == 0) {
   1309         st->status = KIT_INTERP_DONE;
   1310         if (out_ret) *out_ret = (int64_t)rv;
   1311         return KIT_INTERP_DONE;
   1312       }
   1313       if (want) {
   1314         InterpFrame* caller = &st->frames[st->nframes - 1u];
   1315         u64* cregs = (u64*)(st->regs_arena + caller->regs_off);
   1316         cregs[dst] = rv;
   1317       }
   1318     }
   1319     RELOAD();
   1320     GO();
   1321   }
   1322   OP(IOP_INTRINSIC) : {
   1323     if (!interp_intrinsic(st, fn, regs, mem_off, in)) goto stop;
   1324     NEXT();
   1325   }
   1326   OP(IOP_FENCE) : NEXT(); /* single-thread: no-op */
   1327   OP(IOP_AGG_SET) : OP(IOP_AGG_COPY) : {
   1328     /* AGG_COPY/SET use pointer-deref addressing (pointer_addr_from_operand):
   1329      * a LOCAL holding a pointer is dereferenced; otherwise it is the slot. */
   1330     u64 d = interp_ptr_addr(st, fn, regs, mem_off, &I->opnds[0]);
   1331     if (in->op == IOP_AGG_COPY) {
   1332       IRAggAux* aux = (IRAggAux*)I->extra.aux;
   1333       u64 s = interp_ptr_addr(st, fn, regs, mem_off, &I->opnds[1]);
   1334       mem_copy(st, d, s, aux ? aux->access.size : 0u);
   1335     } else {
   1336       IRAggAux* aux = (IRAggAux*)I->extra.aux;
   1337       u64 byte = op_value(st, fn, regs, mem_off, &I->opnds[1]);
   1338       u32 n = aux ? aux->access.size : 0u;
   1339       u8* h = interp_translate(p, d, n, PERM_W);
   1340       if (h)
   1341         memset(h, (int)(byte & 0xffu), n);
   1342       else
   1343         st->mem_fault = 1;
   1344     }
   1345     NEXT();
   1346   }
   1347   OP(IOP_TLS_ADDR) : {
   1348     /* A thread-local's symbol does not resolve to its storage on every
   1349      * target (a Mach-O symbol resolves to a TLV descriptor), so route
   1350      * through interp_tls_addr / the host resolve_tls hook, which returns the
   1351      * running thread's address of the variable (already +addend). */
   1352     IRTlsAux* aux = (IRTlsAux*)I->extra.aux;
   1353     void* addr = aux ? interp_tls_addr(fn, aux->sym, aux->addend) : NULL;
   1354     if (!addr) {
   1355       unsupported(st, "unresolved thread-local symbol");
   1356       goto stop;
   1357     }
   1358     write_dst(st, fn, regs, mem_off, &I->opnds[0], (u64)(uintptr_t)addr);
   1359     NEXT();
   1360   }
   1361   OP(IOP_BITFIELD_LOAD) : {
   1362     /* opnds[1] is the record address; the field bits live in the storage
   1363      * unit at record + storage_offset. Extract by shift+mask (target uses
   1364      * little-endian bit numbering), sign-extending signed fields. */
   1365     IRBitFieldAux* aux = (IRBitFieldAux*)I->extra.aux;
   1366     u64 rec, raw, v = 0;
   1367     u32 ssz, width;
   1368     if (!aux) {
   1369       unsupported(st, "bitfield access");
   1370       goto stop;
   1371     }
   1372     rec = op_addr(st, fn, regs, mem_off, &I->opnds[1]);
   1373     ssz = aux->access.storage.size ? aux->access.storage.size : 4u;
   1374     width = aux->access.bit_width;
   1375     if (width) {
   1376       raw = mem_read(st, rec + aux->access.storage_offset, ssz);
   1377       v = (raw >> aux->access.bit_offset) & bits_mask(width);
   1378       if (aux->access.signed_ && width < 64u && (v & (1ull << (width - 1u))))
   1379         v |= ~bits_mask(width);
   1380     }
   1381     write_dst(st, fn, regs, mem_off, &I->opnds[0], v);
   1382     NEXT();
   1383   }
   1384   OP(IOP_BITFIELD_STORE) : {
   1385     /* opnds[0] = record address, opnds[1] = source value. Read-modify-write
   1386      * the storage unit: clear the field bits, then OR in the masked, shifted
   1387      * source. A zero-width field is a layout barrier — no store. */
   1388     IRBitFieldAux* aux = (IRBitFieldAux*)I->extra.aux;
   1389     u64 rec, addr, ones, fmask, src, raw;
   1390     u32 ssz, width;
   1391     if (!aux) {
   1392       unsupported(st, "bitfield access");
   1393       goto stop;
   1394     }
   1395     width = aux->access.bit_width;
   1396     if (width == 0) NEXT();
   1397     rec = op_addr(st, fn, regs, mem_off, &I->opnds[0]);
   1398     ssz = aux->access.storage.size ? aux->access.storage.size : 4u;
   1399     addr = rec + aux->access.storage_offset;
   1400     ones = bits_mask(width);
   1401     fmask = ones << aux->access.bit_offset;
   1402     src = op_value(st, fn, regs, mem_off, &I->opnds[1]);
   1403     raw = mem_read(st, addr, ssz);
   1404     raw = (raw & ~fmask) | ((src & ones) << aux->access.bit_offset);
   1405     mem_write(st, addr, ssz, raw);
   1406     NEXT();
   1407   }
   1408   OP(IOP_VA_START) : {
   1409     /* opnds[0] is the va_list object's address (a pointer value). Seed it
   1410      * with a cursor over this frame's anonymous-argument buffer. */
   1411     u64 ap = op_value(st, fn, regs, mem_off, &I->opnds[0]);
   1412     u64 cursor = fr->has_varargs ? frame_base(st, fr->vararg_off) : 0u;
   1413     mem_write(st, ap, 8u, cursor);
   1414     NEXT();
   1415   }
   1416   OP(IOP_VA_END) : NEXT(); /* nothing to release in the cursor model */
   1417   OP(IOP_VA_COPY) : {
   1418     /* opnds = [dst va_list addr, src va_list addr]: duplicate the cursor. */
   1419     u64 d = op_value(st, fn, regs, mem_off, &I->opnds[0]);
   1420     u64 s = op_value(st, fn, regs, mem_off, &I->opnds[1]);
   1421     mem_write(st, d, 8u, mem_read(st, s, 8u));
   1422     NEXT();
   1423   }
   1424   OP(IOP_VA_ARG) : {
   1425     /* opnds[0] = dst (type drives the read width), opnds[1] = va_list addr.
   1426      * Align the cursor, read the slot, advance, store the cursor back. */
   1427     KitCgTypeId ty = I->opnds[0].type;
   1428     u64 ap = op_value(st, fn, regs, mem_off, &I->opnds[1]);
   1429     u64 cursor = mem_read(st, ap, 8u);
   1430     u32 size = abi_cg_sizeof(p->c->abi, ty);
   1431     u32 al = va_align_of(size);
   1432     cursor = (cursor + al - 1u) & ~((u64)al - 1u);
   1433     if (cg_type_is_aggregate(p->c, ty) || size > 8u) {
   1434       u64 dstaddr = op_addr(st, fn, regs, mem_off, &I->opnds[0]);
   1435       mem_copy(st, dstaddr, cursor, size);
   1436     } else {
   1437       write_dst(st, fn, regs, mem_off, &I->opnds[0],
   1438                 mem_read(st, cursor, size ? size : 8u));
   1439     }
   1440     mem_write(st, ap, 8u, cursor + va_stride_of(size));
   1441     NEXT();
   1442   }
   1443   /* Atomics: single-threaded interpreter, so the operation is serialized
   1444    * and the memory order is irrelevant (treated as seq-cst). */
   1445   OP(IOP_ATOMIC_LOAD) : {
   1446     u64 a = op_value(st, fn, regs, mem_off, &I->opnds[1]);
   1447     write_dst(st, fn, regs, mem_off, &I->opnds[0],
   1448               mem_read(st, a, in->w0 ? in->w0 : 8u));
   1449     NEXT();
   1450   }
   1451   OP(IOP_ATOMIC_STORE) : {
   1452     u64 a = op_value(st, fn, regs, mem_off, &I->opnds[0]);
   1453     mem_write(st, a, in->w0 ? in->w0 : 8u,
   1454               op_value(st, fn, regs, mem_off, &I->opnds[1]));
   1455     NEXT();
   1456   }
   1457   OP(IOP_ATOMIC_RMW) : {
   1458     u32 w = in->w0 ? in->w0 : 8u;
   1459     u64 a = op_value(st, fn, regs, mem_off, &I->opnds[1]);
   1460     u64 old = mem_read(st, a, w);
   1461     u64 v = op_value(st, fn, regs, mem_off, &I->opnds[2]);
   1462     mem_write(st, a, w, do_rmw(in->sub, old, v, w));
   1463     write_dst(st, fn, regs, mem_off, &I->opnds[0], old);
   1464     NEXT();
   1465   }
   1466   OP(IOP_ATOMIC_CAS) : {
   1467     u32 w = in->w0 ? in->w0 : 8u;
   1468     u64 a = op_value(st, fn, regs, mem_off, &I->opnds[2]);
   1469     u64 expected = op_value(st, fn, regs, mem_off, &I->opnds[3]);
   1470     u64 desired = op_value(st, fn, regs, mem_off, &I->opnds[4]);
   1471     u64 old = mem_read(st, a, w);
   1472     u64 ok = (mask_w(old, w) == mask_w(expected, w));
   1473     if (ok) mem_write(st, a, w, desired);
   1474     write_dst(st, fn, regs, mem_off, &I->opnds[0], old); /* prior */
   1475     write_dst(st, fn, regs, mem_off, &I->opnds[1], ok);  /* ok flag */
   1476     NEXT();
   1477   }
   1478   OP(IOP_UNREACHABLE) : fault(st, "unreachable");
   1479   goto stop;
   1480   OP(IOP_TRAP)
   1481       : unsupported(st, fn->reject_reason ? fn->reject_reason : "operation");
   1482   goto stop;
   1483 #if !INTERP_DISPATCH_THREADED
   1484   default:
   1485     unsupported(st, "opcode");
   1486     goto stop;
   1487 }
   1488 if (st->mem_fault) {
   1489   fault(st, "invalid memory access");
   1490   goto stop;
   1491 }
   1492 ip++;
   1493 }
   1494 #else
   1495     fault_mem:
   1496       fault(st, "invalid memory access");
   1497       /* fall through to stop */
   1498 #endif
   1499 
   1500 stop: fr->ip = ip;
   1501 return (KitInterpStatus)st->status;
   1502 #undef RELOAD
   1503 }
   1504 #if INTERP_DISPATCH_THREADED
   1505 #if defined(__clang__)
   1506 #pragma clang diagnostic pop
   1507 #elif defined(__GNUC__)
   1508 #pragma GCC diagnostic pop
   1509 #endif
   1510 #endif
   1511 
   1512 /* ---- intrinsics ---- */
   1513 
   1514 static u64 ipopcount(u64 v, u32 w) {
   1515   u64 m = (w >= 8) ? ~0ull : ((1ull << (w * 8u)) - 1ull);
   1516   u64 x = v & m;
   1517   u64 n = 0;
   1518   while (x) {
   1519     n += (x & 1u);
   1520     x >>= 1;
   1521   }
   1522   return n;
   1523 }
   1524 static u64 ictz(u64 v, u32 w) {
   1525   u32 bits = w * 8u;
   1526   u64 n = 0;
   1527   if ((v & ((bits >= 64) ? ~0ull : ((1ull << bits) - 1ull))) == 0) return bits;
   1528   while (!(v & 1u)) {
   1529     n++;
   1530     v >>= 1;
   1531   }
   1532   return n;
   1533 }
   1534 static u64 iclz(u64 v, u32 w) {
   1535   u32 bits = w * 8u;
   1536   u64 n = 0;
   1537   u64 top = 1ull << (bits - 1u);
   1538   v &= (bits >= 64) ? ~0ull : ((1ull << bits) - 1ull);
   1539   if (v == 0) return bits;
   1540   while (!(v & top)) {
   1541     n++;
   1542     v <<= 1;
   1543   }
   1544   return n;
   1545 }
   1546 static u64 ibswap(u64 v, u32 nbytes) {
   1547   u64 r = 0;
   1548   u32 i;
   1549   for (i = 0; i < nbytes; ++i) {
   1550     r = (r << 8) | (v & 0xffu);
   1551     v >>= 8;
   1552   }
   1553   return r;
   1554 }
   1555 
   1556 static int interp_intrinsic(InterpStack* st, InterpFunc* fn, u64* regs,
   1557                             u32 mem_off, InterpInsn* in) {
   1558   InterpProgram* p = st->prog;
   1559   IRIntrinAux* aux = (IRIntrinAux*)in->inst->extra.aux;
   1560   Compiler* c = p->c;
   1561   if (!aux) {
   1562     unsupported(st, "intrinsic");
   1563     return 0;
   1564   }
   1565 #define ARGV(i) op_value(st, fn, regs, mem_off, &aux->args[i])
   1566 #define AWID(i) ((u32)abi_cg_sizeof(c->abi, aux->args[i].type))
   1567 #define DWID(i) ((u32)abi_cg_sizeof(c->abi, aux->dsts[i].type))
   1568 #define DST0 \
   1569   (aux->ndst > 0 && aux->dsts[0].kind == OPK_REG ? aux->dsts[0].v.reg : 0u)
   1570   switch (aux->kind) {
   1571     case INTRIN_MEMMOVE: {
   1572       u64 d = ARGV(0), s = ARGV(1), n = ARGV(2);
   1573       mem_copy(st, d, s, (u32)n);
   1574       if (aux->ndst > 0 && aux->dsts[0].kind == OPK_REG) regs[DST0] = d;
   1575       return 1;
   1576     }
   1577     case INTRIN_POPCOUNT:
   1578       regs[DST0] = ipopcount(ARGV(0), AWID(0));
   1579       return 1;
   1580     case INTRIN_CTZ:
   1581       regs[DST0] = ictz(ARGV(0), AWID(0));
   1582       return 1;
   1583     case INTRIN_CLZ:
   1584       regs[DST0] = iclz(ARGV(0), AWID(0));
   1585       return 1;
   1586     case INTRIN_BSWAP:
   1587       regs[DST0] = ibswap(ARGV(0), DWID(0));
   1588       return 1;
   1589     case INTRIN_EXPECT:
   1590       regs[DST0] = ARGV(0);
   1591       return 1;
   1592     case INTRIN_ASSUME_ALIGNED:
   1593       if (aux->ndst > 0 && aux->dsts[0].kind == OPK_REG) regs[DST0] = ARGV(0);
   1594       return 1;
   1595     case INTRIN_PREFETCH:
   1596       return 1;
   1597     /* CPU hints and memory barriers have no observable effect in the
   1598      * single-threaded interpreter model: treat them as no-ops. */
   1599     case INTRIN_CPU_NOP:
   1600     case INTRIN_CPU_YIELD:
   1601     case INTRIN_ISB:
   1602     case INTRIN_DMB:
   1603     case INTRIN_DSB:
   1604       return 1;
   1605     case INTRIN_TRAP:
   1606       fault(st, "__builtin_trap");
   1607       return 0;
   1608     case INTRIN_SADD_OVERFLOW:
   1609     case INTRIN_UADD_OVERFLOW:
   1610     case INTRIN_SSUB_OVERFLOW:
   1611     case INTRIN_USUB_OVERFLOW:
   1612     case INTRIN_SMUL_OVERFLOW:
   1613     case INTRIN_UMUL_OVERFLOW: {
   1614       u32 w = AWID(0);
   1615       u64 a = ARGV(0), b = ARGV(1);
   1616       u64 res = 0;
   1617       int ovf = 0;
   1618       switch (aux->kind) {
   1619         /* For w<8 the operands fit in i64/u64 so the exact result is available
   1620          * and a re-narrow comparison detects overflow; for w==8 there is no
   1621          * wider type, so detect via sign/carry logic (the re-narrow trick would
   1622          * always read "no overflow"). */
   1623         case INTRIN_SADD_OVERFLOW: {
   1624           i64 x = sext_w(a, w), y = sext_w(b, w);
   1625           u64 r = (u64)x + (u64)y;
   1626           res = mask_w(r, w);
   1627           ovf = (w < 8) ? (sext_w(res, w) != x + y)
   1628                         : (int)((((u64)x ^ r) & ((u64)y ^ r)) >> 63);
   1629           break;
   1630         }
   1631         case INTRIN_UADD_OVERFLOW: {
   1632           u64 x = mask_w(a, w), y = mask_w(b, w), r = x + y;
   1633           res = mask_w(r, w);
   1634           ovf = (res != r) || (mask_w(r, w) < x);
   1635           break;
   1636         }
   1637         case INTRIN_SSUB_OVERFLOW: {
   1638           i64 x = sext_w(a, w), y = sext_w(b, w);
   1639           u64 r = (u64)x - (u64)y;
   1640           res = mask_w(r, w);
   1641           ovf = (w < 8) ? (sext_w(res, w) != x - y)
   1642                         : (int)((((u64)x ^ (u64)y) & ((u64)x ^ r)) >> 63);
   1643           break;
   1644         }
   1645         case INTRIN_USUB_OVERFLOW: {
   1646           ovf = mask_w(a, w) < mask_w(b, w);
   1647           res = mask_w(mask_w(a, w) - mask_w(b, w), w);
   1648           break;
   1649         }
   1650         case INTRIN_SMUL_OVERFLOW: {
   1651           i64 x = sext_w(a, w), y = sext_w(b, w);
   1652           u64 r = (u64)x * (u64)y;
   1653           res = mask_w(r, w);
   1654           if (w < 8) {
   1655             ovf = (sext_w(res, w) != x * y);
   1656           } else if (x == 0 || y == 0) {
   1657             ovf = 0;
   1658           } else if ((x == -1 && (u64)y == 0x8000000000000000ull) ||
   1659                      (y == -1 && (u64)x == 0x8000000000000000ull)) {
   1660             ovf = 1; /* INT64_MIN * -1 */
   1661           } else {
   1662             ovf = ((i64)r / x != y);
   1663           }
   1664           break;
   1665         }
   1666         case INTRIN_UMUL_OVERFLOW: {
   1667           u64 x = mask_w(a, w), y = mask_w(b, w), r = x * y;
   1668           res = mask_w(r, w);
   1669           ovf = (w < 8) ? (r != res) : (x != 0 && r / x != y);
   1670           break;
   1671         }
   1672         default:
   1673           break;
   1674       }
   1675       if (aux->ndst > 0 && aux->dsts[0].kind == OPK_REG)
   1676         regs[aux->dsts[0].v.reg] = res;
   1677       if (aux->ndst > 1 && aux->dsts[1].kind == OPK_REG)
   1678         regs[aux->dsts[1].v.reg] = (u64)ovf;
   1679       return 1;
   1680     }
   1681     default:
   1682       unsupported(st, "intrinsic");
   1683       return 0;
   1684   }
   1685 #undef ARGV
   1686 #undef AWID
   1687 #undef DWID
   1688 #undef DST0
   1689 }
   1690 
   1691 /* ---- public stack API ---- */
   1692 
   1693 KitInterpStack* kit_interp_stack_new(KitInterpProgram* pp) {
   1694   InterpProgram* p = (InterpProgram*)pp;
   1695   Heap* h;
   1696   InterpStack* st;
   1697   if (!p) return NULL;
   1698   h = p->c->ctx->heap;
   1699   st = (InterpStack*)h->alloc(h, sizeof(*st), _Alignof(InterpStack));
   1700   if (!st) return NULL;
   1701   memset(st, 0, sizeof *st);
   1702   st->prog = p;
   1703   /* Fixed, non-relocating arenas (see bump()/INTERP_*_RESERVE). */
   1704   st->regs_arena = (u8*)h->alloc(h, INTERP_REGS_RESERVE, 16u);
   1705   st->mem_arena = (u8*)h->alloc(h, INTERP_MEM_RESERVE, 16u);
   1706   if (!st->regs_arena || !st->mem_arena) {
   1707     if (st->regs_arena) h->free(h, st->regs_arena, INTERP_REGS_RESERVE);
   1708     if (st->mem_arena) h->free(h, st->mem_arena, INTERP_MEM_RESERVE);
   1709     h->free(h, st, sizeof *st);
   1710     return NULL;
   1711   }
   1712   st->regs_cap = INTERP_REGS_RESERVE;
   1713   st->mem_cap = INTERP_MEM_RESERVE;
   1714   return (KitInterpStack*)st;
   1715 }
   1716 
   1717 void kit_interp_stack_free(KitInterpStack* s) {
   1718   InterpStack* st = (InterpStack*)s;
   1719   Heap* h;
   1720   if (!st) return;
   1721   h = st->prog->c->ctx->heap;
   1722   if (st->frames) h->free(h, st->frames, sizeof(InterpFrame) * st->frames_cap);
   1723   if (st->regs_arena) h->free(h, st->regs_arena, st->regs_cap);
   1724   if (st->mem_arena) h->free(h, st->mem_arena, st->mem_cap);
   1725   h->free(h, st, sizeof *st);
   1726 }
   1727 
   1728 static void bind_entry_param(InterpStack* st, InterpFunc* fn, u32 idx, u32 i,
   1729                              u64 value) {
   1730   InterpFrame* fr = &st->frames[idx];
   1731   IRParam* pr;
   1732   if (i >= fn->f->nparams) return;
   1733   pr = &fn->f->params[i];
   1734   if (pr->storage.kind == CG_LOCAL_STORAGE_REG) {
   1735     u64* regs = (u64*)(st->regs_arena + fr->regs_off);
   1736     regs[pr->storage.v.reg] = value;
   1737   } else {
   1738     u64 dst =
   1739         frame_base(st, fr->mem_off) + fn->slot_off[pr->storage.v.frame_slot];
   1740     mem_write(st, dst, 8u, value);
   1741   }
   1742 }
   1743 
   1744 KitStatus kit_interp_call_on(KitInterpStack* s, KitInterpFunc* ff, int argc,
   1745                              char** argv) {
   1746   InterpStack* st = (InterpStack*)s;
   1747   InterpFunc* fn = (InterpFunc*)ff;
   1748   u32 idx;
   1749   if (!st || !fn) return KIT_INVALID;
   1750   idx = frame_push(st, fn);
   1751   if (idx == 0xffffffffu) return KIT_NOMEM;
   1752   bind_entry_param(st, fn, idx, 0u, (u64)(unsigned)argc);
   1753   bind_entry_param(st, fn, idx, 1u, (u64)(uintptr_t)argv);
   1754   return KIT_OK;
   1755 }
   1756 
   1757 KitInterpStatus kit_interp_resume(KitInterpStack* s, int64_t* out_ret) {
   1758   InterpStack* st = (InterpStack*)s;
   1759   if (!st) return KIT_INTERP_ERROR;
   1760   return interp_run_stack(st, out_ret);
   1761 }
   1762 
   1763 KitInterpStatus kit_interp_call(KitInterpProgram* pp, KitInterpFunc* ff,
   1764                                 int argc, char** argv, int64_t* out_ret) {
   1765   KitInterpStack* s = kit_interp_stack_new(pp);
   1766   KitInterpStatus rc;
   1767   if (!s) return KIT_INTERP_ERROR;
   1768   if (kit_interp_call_on(s, ff, argc, argv) != KIT_OK) {
   1769     kit_interp_stack_free(s);
   1770     return KIT_INTERP_ERROR;
   1771   }
   1772   rc = kit_interp_resume(s, out_ret);
   1773   kit_interp_stack_free(s);
   1774   return rc;
   1775 }
   1776 
   1777 KitInterpStatus kit_interp_call_args(KitInterpProgram* pp, KitInterpFunc* ff,
   1778                                      const uint64_t* args, uint32_t nargs,
   1779                                      int64_t* out_ret) {
   1780   InterpStack* st = (InterpStack*)kit_interp_stack_new(pp);
   1781   InterpFunc* fn = (InterpFunc*)ff;
   1782   KitInterpStatus rc;
   1783   u32 idx, i;
   1784   if (!st) return KIT_INTERP_ERROR;
   1785   if (!fn) {
   1786     kit_interp_stack_free((KitInterpStack*)st);
   1787     return KIT_INTERP_ERROR;
   1788   }
   1789   idx = frame_push(st, fn);
   1790   if (idx == 0xffffffffu) {
   1791     kit_interp_stack_free((KitInterpStack*)st);
   1792     return KIT_INTERP_ERROR;
   1793   }
   1794   for (i = 0; i < nargs; ++i) bind_entry_param(st, fn, idx, i, args[i]);
   1795   rc = interp_run_stack(st, out_ret);
   1796   kit_interp_stack_free((KitInterpStack*)st);
   1797   return rc;
   1798 }
   1799 
   1800 KitStatus kit_interp_stack_reset(KitInterpStack* s) {
   1801   InterpStack* st = (InterpStack*)s;
   1802   if (!st) return KIT_INVALID;
   1803   /* Keep the (fixed, non-relocating) arenas; rewind their bump tops and drop
   1804    * all frames + the return shuttle + any prior status/trap. */
   1805   st->nframes = 0;
   1806   st->regs_top = 0;
   1807   st->mem_top = 0;
   1808   st->scalar_ret = 0;
   1809   st->ret_is_fp = 0;
   1810   st->status = KIT_INTERP_DONE;
   1811   st->trap_reason = NULL;
   1812   st->mem_fault = 0;
   1813   return KIT_OK;
   1814 }
   1815 
   1816 KitStatus kit_interp_call_args_on(KitInterpStack* s, KitInterpFunc* ff,
   1817                                   const uint64_t* args, uint32_t nargs) {
   1818   InterpStack* st = (InterpStack*)s;
   1819   InterpFunc* fn = (InterpFunc*)ff;
   1820   u32 idx, i;
   1821   if (!st || !fn) return KIT_INVALID;
   1822   idx = frame_push(st, fn);
   1823   if (idx == 0xffffffffu) return KIT_NOMEM;
   1824   for (i = 0; i < nargs; ++i) bind_entry_param(st, fn, idx, i, args[i]);
   1825   return KIT_OK;
   1826 }
   1827 
   1828 const char* kit_interp_stack_trap_reason(KitInterpStack* s) {
   1829   InterpStack* st = (InterpStack*)s;
   1830   return st ? st->trap_reason : NULL;
   1831 }