kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

arith.c (74104B)


      1 #include "cg/internal.h"
      2 
      3 static int api_try_fold_int_convert(KitCg* g, ConvKind ck, KitCgTypeId sty,
      4                                     KitCgTypeId dty, i64 in, i64* out) {
      5   u32 sw;
      6   u32 dw;
      7   u64 r;
      8   if (!g || !out || !api_foldable_int_like_type(g->c, sty, &sw) ||
      9       !api_foldable_int_like_type(g->c, dty, &dw)) {
     10     return 0;
     11   }
     12   switch (ck) {
     13     case CV_SEXT:
     14       r = (u64)api_sign_extend_width((u64)in, sw);
     15       break;
     16     case CV_ZEXT:
     17       r = api_mask_width((u64)in, sw);
     18       break;
     19     case CV_TRUNC:
     20       r = api_mask_width((u64)in, dw);
     21       break;
     22     default:
     23       return 0;
     24   }
     25   *out = api_fold_result(g->c, dty, r, dw);
     26   return 1;
     27 }
     28 
     29 void api_cg_binop(KitCg* g, BinOp iop, u32 flags) {
     30   ApiSValue b, a;
     31   CgTarget* T;
     32   KitCgTypeId ty;
     33   Operand ra, rb;
     34   CGLocal rr;
     35   Operand dst;
     36   ApiSValue folded_sv;
     37   i64 folded;
     38   if (!g) return;
     39   T = g->target;
     40   b = api_pop(g);
     41   a = api_pop(g);
     42   ty = a.type ? a.type : b.type;
     43 
     44   if (!flags && api_sv_op_is(&a, OPK_IMM) && api_sv_op_is(&b, OPK_IMM) &&
     45       api_try_fold_int_binop(g, iop, ty, a.op.v.imm, b.op.v.imm, &folded)) {
     46     api_release(g, &a);
     47     api_release(g, &b);
     48     api_push(g, api_make_sv(api_op_imm(folded, ty), ty));
     49     return;
     50   }
     51 
     52   /* Strength-reduce mul/udiv/urem by a power of two into shift/and. Rewrites
     53    * iop and the operands in place; the result flows through the same delay /
     54    * identity / fallback machinery as any other shift or and. */
     55   if (!flags) api_try_strength_reduce(g, &iop, ty, &a, &b);
     56 
     57   if (api_can_delay_int_arith(g, ty, flags) &&
     58       api_try_fold_arith_chain(g, iop, ty, &a, &b, &folded_sv)) {
     59     api_release(g, &a);
     60     api_release(g, &b);
     61     api_push(g, folded_sv);
     62     return;
     63   }
     64 
     65   if (api_type_is_float(g->c, ty)) {
     66     ra = api_force_local(g, &a, ty);
     67     rb = api_force_local(g, &b, ty);
     68   } else {
     69     ra = api_force_local_unless_imm(g, &a, ty);
     70     rb = api_force_local_unless_imm(g, &b, ty);
     71   }
     72 
     73   if (api_can_delay_int_arith(g, ty, flags) &&
     74       api_try_collapse_binop_identity(g, iop, ty, &a, &b, &folded_sv)) {
     75     api_release(g, &a);
     76     api_release(g, &b);
     77     api_push(g, folded_sv);
     78     return;
     79   }
     80 
     81   if (api_can_delay_int_arith(g, ty, flags) &&
     82       (ra.kind == OPK_LOCAL || rb.kind == OPK_LOCAL) &&
     83       (ra.kind == OPK_LOCAL || ra.kind == OPK_IMM) &&
     84       (rb.kind == OPK_LOCAL || rb.kind == OPK_IMM)) {
     85     int a_owned = api_sv_owns_operand_local(&a, &ra);
     86     int b_owned = api_sv_owns_operand_local(&b, &rb);
     87     api_push(g, api_make_arith_binop(iop, ra, rb, ty, a_owned, b_owned));
     88     if (a_owned) a.res = RES_INHERENT;
     89     if (b_owned) b.res = RES_INHERENT;
     90     api_release(g, &a);
     91     api_release(g, &b);
     92     return;
     93   }
     94 
     95   rr = api_alloc_temp_local(g, ty);
     96   dst = api_op_local(rr, ty);
     97   T->binop(T, iop, dst, ra, rb);
     98   api_release(g, &a);
     99   api_release(g, &b);
    100   api_push(g, api_make_sv(dst, ty));
    101 }
    102 
    103 void api_cg_unop(KitCg* g, UnOp iop, u32 flags) {
    104   ApiSValue a;
    105   CgTarget* T;
    106   KitCgTypeId ty;
    107   Operand ra;
    108   CGLocal rr;
    109   Operand dst;
    110   ApiSValue folded_sv;
    111   i64 folded;
    112   if (!g) return;
    113   T = g->target;
    114   a = api_pop(g);
    115   ty = a.type ? a.type : a.op.type;
    116 
    117   if (iop == UO_FNEG) {
    118     if (!api_type_is_float(g->c, ty)) {
    119       compiler_panic(g->c, g->cur_loc,
    120                      "KitCg: FP negation requires floating operand");
    121     }
    122     ra = api_force_local(g, &a, ty);
    123     rr = api_alloc_temp_local(g, ty);
    124     dst = api_op_local(rr, ty);
    125     T->unop(T, iop, dst, ra);
    126     api_release(g, &a);
    127     api_push(g, api_make_sv(dst, ty));
    128     return;
    129   }
    130 
    131   /* Logical NOT of a delayed compare stays delayed: invert the predicate in
    132    * place. For FP this flips ordered<->unordered as well as the relation (via
    133    * api_invert_cmp), so `!(a<b)` becomes UGE (NaN -> true), matching IEEE
    134    * negation. The inverted compare keeps the same i32 result type. */
    135   if (iop == UO_NOT && a.kind == SV_CMP) {
    136     a.delayed.cmp.op = api_invert_cmp(a.delayed.cmp.op);
    137     api_push(g, a);
    138     return;
    139   }
    140 
    141   if (!flags && api_sv_op_is(&a, OPK_IMM) &&
    142       api_try_fold_int_unop(g, iop, ty, a.op.v.imm, &folded)) {
    143     api_release(g, &a);
    144     api_push(g, api_make_sv(api_op_imm(folded, ty), ty));
    145     return;
    146   }
    147 
    148   if (api_can_delay_int_arith(g, ty, flags) &&
    149       api_try_fold_unary_chain(&a, iop, ty, &folded_sv)) {
    150     api_release(g, &a);
    151     api_push(g, folded_sv);
    152     return;
    153   }
    154 
    155   ra = api_force_local_unless_imm(g, &a, ty);
    156   if (api_can_delay_int_arith(g, ty, flags) && ra.kind == OPK_LOCAL) {
    157     int a_owned = api_sv_owns_operand_local(&a, &ra);
    158     api_push(g, api_make_arith_unop(iop, ra, ty, a_owned));
    159     if (a_owned) a.res = RES_INHERENT;
    160     api_release(g, &a);
    161     return;
    162   }
    163   rr = api_alloc_temp_local(g, ty);
    164   dst = api_op_local(rr, ty);
    165   T->unop(T, iop, dst, ra);
    166   api_release(g, &a);
    167   api_push(g, api_make_sv(dst, ty));
    168 }
    169 
    170 void api_cg_cmp(KitCg* g, CmpOp cop) {
    171   ApiSValue b, a;
    172   KitCgTypeId opty;
    173   KitCgTypeId i32;
    174   Operand ra, rb;
    175   i64 folded;
    176   if (!g) return;
    177   b = api_pop(g);
    178   a = api_pop(g);
    179   opty = a.type ? a.type : b.type;
    180   i32 = builtin_id(KIT_CG_BUILTIN_I32);
    181 
    182   if (api_sv_op_is(&a, OPK_IMM) && api_sv_op_is(&b, OPK_IMM) &&
    183       api_try_fold_int_cmp(g, cop, opty, a.op.v.imm, b.op.v.imm, &folded)) {
    184     api_release(g, &a);
    185     api_release(g, &b);
    186     api_push(g, api_make_sv(api_op_imm(folded, i32), i32));
    187     return;
    188   }
    189 
    190   ra = api_force_local_unless_imm(g, &a, opty);
    191   rb = api_force_local_unless_imm(g, &b, opty);
    192   /* Both integer and FP compares are produced as delayed SV_CMP values.
    193    * Delaying is what lets api_branch_if (and api_cg_unop's UO_NOT) invert
    194    * the compare via api_invert_cmp, reaching the unordered FP duals
    195    * (UGE/UGT/ULE/ULT/UEQ/UNE) from `!(a<b)` etc. with NaN-correct semantics.
    196    * If the compare instead escapes into value context it is materialized
    197    * unchanged via api_materialize_cmp_to, which calls T->cmp with the same
    198    * opcode the eager path used to. */
    199   api_push(g, api_make_cmp(cop, ra, rb, i32, api_sv_owns_operand_local(&a, &ra),
    200                            api_sv_owns_operand_local(&b, &rb)));
    201 }
    202 
    203 int api_try_i128_convert(KitCg* g, ConvKind ck, KitCgTypeId sty,
    204                          KitCgTypeId dty, ApiSValue* v);
    205 int api_try_wide8_convert(KitCg* g, ConvKind ck, KitCgTypeId sty,
    206                           KitCgTypeId dty, ApiSValue* v);
    207 
    208 void api_cg_convert_kind(KitCg* g, KitCgTypeId dst_type, ConvKind ck) {
    209   ApiSValue v;
    210   CgTarget* T;
    211   KitCgTypeId sty;
    212   KitCgTypeId dty;
    213   Operand src;
    214   CGLocal rr;
    215   Operand dst;
    216   if (!g) return;
    217   T = g->target;
    218   dty = resolve_type(g->c, dst_type);
    219   if (!dty) return;
    220   v = api_pop(g);
    221   dty = api_unalias_type(g->c, dty);
    222   sty = api_unalias_type(g->c, v.type ? v.type : v.op.type);
    223   if (!sty) {
    224     api_release(g, &v);
    225     return;
    226   }
    227   if (sty == dty) {
    228     v.type = dty;
    229     v.op.type = dty;
    230     api_push(g, v);
    231     return;
    232   }
    233   if (api_sv_op_is(&v, OPK_IMM)) {
    234     i64 folded;
    235     if (api_try_fold_int_convert(g, ck, sty, dty, v.op.v.imm, &folded)) {
    236       api_release(g, &v);
    237       /* A folded split-lane 8-byte result must be memory-resident, not a bare
    238        * i64 immediate the backend would truncate. */
    239       if (api_is_wide8_scalar_type(g->c, dty))
    240         api_push(g, api_make_wide8_int_const(g, folded, dty));
    241       else
    242         api_push(g, api_make_sv(api_op_imm(folded, dty), dty));
    243       return;
    244     }
    245   }
    246   if (api_try_i128_convert(g, ck, sty, dty, &v)) return;
    247   if (api_try_wide8_convert(g, ck, sty, dty, &v)) return;
    248   if (ck == CV_BITCAST && abi_cg_sizeof(g->c->abi, sty) == 16 &&
    249       abi_cg_sizeof(g->c->abi, dty) == 16 &&
    250       (api_is_f128_type(g->c, sty) || api_is_f128_type(g->c, dty))) {
    251     CGLocal local = api_f128_temp_local(g, dty);
    252     Operand dst_lv = api_op_local(local, dty);
    253     if (api_is_lvalue_sv(&v) || v.op.kind == OPK_LOCAL ||
    254         v.op.kind == OPK_INDIRECT || v.op.kind == OPK_GLOBAL) {
    255       KitCgTypeId ptr_ty = cg_type_ptr_to(g->c, dty);
    256       ApiSValue src_lv = v;
    257       Operand dst_addr;
    258       Operand src_addr;
    259       AggregateAccess agg;
    260       src_lv.lvalue = 1;
    261       dst_addr = api_lvalue_addr(
    262           g,
    263           &(ApiSValue){
    264               .op = dst_lv, .type = dty, .kind = SV_OPERAND, .lvalue = 1},
    265           ptr_ty);
    266       src_addr = api_lvalue_addr(g, &src_lv, cg_type_ptr_to(g->c, sty));
    267       memset(&agg, 0, sizeof agg);
    268       agg.size = 16;
    269       agg.align = 16;
    270       g->target->copy_bytes(g->target, dst_addr, src_addr, agg);
    271       api_release_temp_local(g, dst_addr.v.local);
    272       api_release_temp_local(g, src_addr.v.local);
    273     } else if (v.op.kind == OPK_LOCAL) {
    274       g->target->store(g->target, dst_lv, v.op,
    275                        api_mem_for_lvalue(g, &dst_lv, sty));
    276     } else if (v.op.kind == OPK_IMM) {
    277       u8 bytes[16];
    278       u64 lo = (u64)v.op.v.imm;
    279       memset(bytes, 0, sizeof bytes);
    280       for (u32 i = 0; i < 8; ++i) {
    281         u32 idx = g->c->target.big_endian ? 15u - i : i;
    282         bytes[idx] = (u8)(lo >> (i * 8u));
    283       }
    284       api_store_f128_bytes(g, local, dty, bytes);
    285     } else {
    286       compiler_panic(g->c, g->cur_loc,
    287                      "KitCg: unsupported 16-byte bitcast source");
    288     }
    289     api_release(g, &v);
    290     api_push(g, api_make_lv(dst_lv, dty));
    291     return;
    292   }
    293 
    294   src = api_force_local(g, &v, sty);
    295   rr = api_alloc_temp_local(g, dty);
    296   dst = api_op_local(rr, dty);
    297   T->convert(T, ck, dst, src);
    298   api_release(g, &v);
    299   api_push(g, api_make_sv(dst, dty));
    300 }
    301 
    302 /* ============================================================
    303  * 128-bit integer lowering
    304  *
    305  * i128/u128 are 16-byte memory-resident scalars (see api_is_wide16
    306  * and src/cg/wide.c). The native backends only model <=64-bit
    307  * register ops, so every i128 arithmetic/compare/convert is lowered
    308  * here to a compiler-rt-style runtime call (rt/lib/int64). This
    309  * mirrors the f128 dispatch in kit_cg_fp_*.
    310  * ============================================================ */
    311 
    312 int api_i128_stack_top(KitCg* g, u32 depth) {
    313   if (!g || g->sp <= depth) return 0;
    314   return api_is_i128_type(g->c, api_sv_type(&g->stack[g->sp - 1u - depth]));
    315 }
    316 
    317 /* 64-bit integer split into two 32-bit lanes by the selected ABI. The native
    318  * backend handles add/sub/and/or/xor on such values as register pairs, but
    319  * mul/div/shift must be lowered to a __*di3 runtime call (see
    320  * api_wideint64_binop). i128 routes through its own ti3 path (api_i128_*), so
    321  * it is explicitly excluded here. */
    322 static int api_int_is_wide64(KitCg* g, KitCgTypeId ty) {
    323   if (!g) return 0;
    324   if (api_is_i128_type(g->c, ty)) return 0;
    325   if (kit_cg_type_int_width((KitCompiler*)g->c, ty) == 0) return 0;
    326   return api_is_wide8_scalar_type(g->c, ty);
    327 }
    328 
    329 static int api_wide64_stack_top(KitCg* g, u32 depth) {
    330   if (!g || g->sp <= depth) return 0;
    331   return api_int_is_wide64(g, api_sv_type(&g->stack[g->sp - 1u - depth]));
    332 }
    333 
    334 static int api_binop_is_shift(BinOp iop) {
    335   return iop == BO_SHL || iop == BO_SHR_U || iop == BO_SHR_S;
    336 }
    337 
    338 static int api_is_bool_type(Compiler* c, KitCgTypeId ty) {
    339   const CgType* cg = cg_type_get(c, api_unalias_type(c, ty));
    340   return cg && cg->kind == KIT_CG_TYPE_BOOL;
    341 }
    342 
    343 /* Materialize an i128 value as an lvalue and return a pointer local to it. */
    344 static Operand api_i128_addr(KitCg* g, ApiSValue* v) {
    345   KitCgTypeId i128 = builtin_id(KIT_CG_BUILTIN_I128);
    346   ApiSValue lv = api_wide16_materialize_lvalue(g, v, i128);
    347   return api_lvalue_addr(g, &lv, cg_type_ptr_to(g->c, i128));
    348 }
    349 
    350 /* Load a 64-bit lane of an i128 (addressed by `addr`) into a fresh i64. */
    351 static Operand api_i128_load_lane(KitCg* g, Operand addr, i32 off) {
    352   KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64);
    353   CGLocal rr = api_alloc_temp_local(g, i64);
    354   Operand dst = api_op_local(rr, i64);
    355   MemAccess ma;
    356   memset(&ma, 0, sizeof ma);
    357   ma.type = i64;
    358   ma.size = 8;
    359   ma.align = 8;
    360   g->target->load(g->target, dst, api_op_indirect(addr.v.local, off, i64), ma);
    361   return dst;
    362 }
    363 
    364 static void api_i128_binop(KitCg* g, BinOp iop) {
    365   KitCgTypeId i128 = builtin_id(KIT_CG_BUILTIN_I128);
    366   KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
    367   const char* name = api_i128_binop_helper(iop);
    368   KitCgTypeId ps[2];
    369   ApiSValue args[2];
    370   if (!name) {
    371     compiler_panic(g->c, g->cur_loc, "KitCg: unsupported i128 binop");
    372     return;
    373   }
    374   args[1] = api_pop(g);
    375   args[0] = api_pop(g);
    376   ps[0] = i128;
    377   ps[1] = api_binop_is_shift(iop) ? i32 : i128;
    378   api_runtime_call_values(g, name, i128, ps, 2, args);
    379 }
    380 
    381 /* Runtime helper name for a 64-bit-integer mul/div/rem/shift on a 32-bit
    382  * target. Mirrors api_i128_binop_helper but with the compiler-rt *di3 names.
    383  * Returns NULL for ops the inline backend handles (add/sub/and/or/xor). */
    384 static const char* api_wideint64_binop_helper(BinOp op) {
    385   switch (op) {
    386     case BO_IMUL:
    387       return "__muldi3";
    388     case BO_SDIV:
    389       return "__divdi3";
    390     case BO_UDIV:
    391       return "__udivdi3";
    392     case BO_SREM:
    393       return "__moddi3";
    394     case BO_UREM:
    395       return "__umoddi3";
    396     case BO_SHL:
    397       return "__ashldi3";
    398     case BO_SHR_U:
    399       return "__lshrdi3";
    400     case BO_SHR_S:
    401       return "__ashrdi3";
    402     default:
    403       return NULL;
    404   }
    405 }
    406 
    407 /* Lower a 64-bit mul/div/rem/shift to a runtime call. Mirrors api_i128_binop
    408  * but ret/params are builtin i64; the shift-count param is i32 (the __ashldi3
    409  * family takes (i64 value, i32 count) per compiler-rt). */
    410 static void api_wideint64_binop(KitCg* g, BinOp iop) {
    411   KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64);
    412   KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
    413   const char* name = api_wideint64_binop_helper(iop);
    414   KitCgTypeId ps[2];
    415   ApiSValue args[2];
    416   if (!name) {
    417     compiler_panic(g->c, g->cur_loc, "KitCg: unsupported wide i64 binop");
    418     return;
    419   }
    420   args[1] = api_pop(g);
    421   args[0] = api_pop(g);
    422   ps[0] = i64;
    423   ps[1] = api_binop_is_shift(iop) ? i32 : i64;
    424   api_runtime_call_values(g, name, i64, ps, 2, args);
    425 }
    426 
    427 /* ============================================================
    428  * wide8 inline 2-word lane arithmetic
    429  *
    430  * Some 32-bit ABIs represent a 64-bit integer as a memory-resident 8-byte
    431  * scalar split into two 32-bit lanes. add/sub/and/or/xor/neg/not and compares
    432  * have no compiler-rt helper (they would recurse), so they are emitted INLINE
    433  * here as lane ops. mul/div/rem/shift route to __*di3 (api_wideint64_*).
    434  * ============================================================ */
    435 
    436 static i32 wide8_lo_off(KitCg* g) { return g->c->target.big_endian ? 4 : 0; }
    437 static i32 wide8_hi_off(KitCg* g) { return g->c->target.big_endian ? 0 : 4; }
    438 
    439 /* Emit one i32 binop into a fresh temp and return it. */
    440 static Operand wide8_i32_binop(KitCg* g, BinOp op, Operand a, Operand b) {
    441   KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
    442   CGLocal r = api_alloc_temp_local(g, i32);
    443   Operand d = api_op_local(r, i32);
    444   g->target->binop(g->target, op, d, a, b);
    445   return d;
    446 }
    447 
    448 /* Emit one i32 compare (0/1 result) into a fresh temp and return it. */
    449 static Operand wide8_i32_cmp(KitCg* g, CmpOp op, Operand a, Operand b) {
    450   KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
    451   CGLocal r = api_alloc_temp_local(g, i32);
    452   Operand d = api_op_local(r, i32);
    453   g->target->cmp(g->target, op, d, a, b);
    454   return d;
    455 }
    456 
    457 /* (lo | hi) of the 8-byte value `v` as an i32, for a truthiness test. Consumes
    458  * nothing on the value stack (caller owns *v). */
    459 Operand api_wide8_or_lanes(KitCg* g, ApiSValue* v, KitCgTypeId ty) {
    460   Operand addr = api_wide8_addr(g, v, ty);
    461   Operand lo = api_wide8_load_lane(g, addr, wide8_lo_off(g));
    462   Operand hi = api_wide8_load_lane(g, addr, wide8_hi_off(g));
    463   return wide8_i32_binop(g, BO_OR, lo, hi);
    464 }
    465 
    466 /* add/sub/and/or/xor on two 8-byte ints, result pushed as a fresh 8-byte value.
    467  * add/sub carry/borrow through the high lane via an sltu (CMP_LT_U). */
    468 static void api_wide64_binop_inline(KitCg* g, BinOp iop) {
    469   ApiSValue b = api_pop(g);
    470   ApiSValue a = api_pop(g);
    471   KitCgTypeId ty = a.type ? a.type : b.type;
    472   int lo = wide8_lo_off(g), hi = wide8_hi_off(g);
    473   Operand aa = api_wide8_addr(g, &a, ty);
    474   Operand ab = api_wide8_addr(g, &b, ty);
    475   Operand alo = api_wide8_load_lane(g, aa, lo);
    476   Operand ahi = api_wide8_load_lane(g, aa, hi);
    477   Operand blo = api_wide8_load_lane(g, ab, lo);
    478   Operand bhi = api_wide8_load_lane(g, ab, hi);
    479   CGLocal res = api_wide8_temp_local(g, ty);
    480   ApiSValue res_lv = api_make_lv(api_op_local(res, ty), ty);
    481   Operand ar = api_lvalue_addr(g, &res_lv, cg_type_ptr_to(g->c, ty));
    482   Operand rlo;
    483   Operand rhi;
    484   switch (iop) {
    485     case BO_AND:
    486     case BO_OR:
    487     case BO_XOR:
    488       rlo = wide8_i32_binop(g, iop, alo, blo);
    489       rhi = wide8_i32_binop(g, iop, ahi, bhi);
    490       break;
    491     case BO_IADD: {
    492       Operand carry;
    493       rlo = wide8_i32_binop(g, BO_IADD, alo, blo);
    494       carry = wide8_i32_cmp(g, CMP_LT_U, rlo, alo); /* unsigned wrap -> carry */
    495       rhi = wide8_i32_binop(g, BO_IADD, ahi, bhi);
    496       rhi = wide8_i32_binop(g, BO_IADD, rhi, carry);
    497       break;
    498     }
    499     case BO_ISUB: {
    500       Operand borrow = wide8_i32_cmp(g, CMP_LT_U, alo, blo);
    501       rlo = wide8_i32_binop(g, BO_ISUB, alo, blo);
    502       rhi = wide8_i32_binop(g, BO_ISUB, ahi, bhi);
    503       rhi = wide8_i32_binop(g, BO_ISUB, rhi, borrow);
    504       break;
    505     }
    506     default:
    507       compiler_panic(g->c, g->cur_loc, "KitCg: unsupported wide i64 inline binop");
    508       return;
    509   }
    510   api_wide8_store_lane(g, ar, lo, rlo);
    511   api_wide8_store_lane(g, ar, hi, rhi);
    512   api_release(g, &a);
    513   api_release(g, &b);
    514   api_push(g, api_make_sv(api_op_local(res, ty), ty));
    515 }
    516 
    517 /* neg / bnot on an 8-byte int. NEG is two's complement: lo = 0-lo with borrow
    518  * into hi = 0-hi-borrow. BNOT is lane-wise xor -1. */
    519 static void api_wide64_unop_inline(KitCg* g, UnOp iop) {
    520   KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
    521   ApiSValue a = api_pop(g);
    522   KitCgTypeId ty = a.type ? a.type : a.op.type;
    523   int lo = wide8_lo_off(g), hi = wide8_hi_off(g);
    524   Operand aa = api_wide8_addr(g, &a, ty);
    525   Operand alo = api_wide8_load_lane(g, aa, lo);
    526   Operand ahi = api_wide8_load_lane(g, aa, hi);
    527   CGLocal res = api_wide8_temp_local(g, ty);
    528   ApiSValue res_lv = api_make_lv(api_op_local(res, ty), ty);
    529   Operand ar = api_lvalue_addr(g, &res_lv, cg_type_ptr_to(g->c, ty));
    530   Operand rlo;
    531   Operand rhi;
    532   if (iop == UO_BNOT) {
    533     rlo = wide8_i32_binop(g, BO_XOR, alo, api_op_imm(-1, i32));
    534     rhi = wide8_i32_binop(g, BO_XOR, ahi, api_op_imm(-1, i32));
    535   } else { /* UO_NEG: 0 - value */
    536     Operand zero = api_op_imm(0, i32);
    537     Operand borrow = wide8_i32_cmp(g, CMP_LT_U, zero, alo); /* 0<lo -> borrow */
    538     rlo = wide8_i32_binop(g, BO_ISUB, zero, alo);
    539     rhi = wide8_i32_binop(g, BO_ISUB, zero, ahi);
    540     rhi = wide8_i32_binop(g, BO_ISUB, rhi, borrow);
    541   }
    542   api_wide8_store_lane(g, ar, lo, rlo);
    543   api_wide8_store_lane(g, ar, hi, rhi);
    544   api_release(g, &a);
    545   api_push(g, api_make_sv(api_op_local(res, ty), ty));
    546 }
    547 
    548 /* a < b over 8-byte lanes: (a_hi <{s,u} b_hi) | (a_hi==b_hi & a_lo <u b_lo).
    549  * The high lane uses the signed/unsigned relation; the low lane is always
    550  * unsigned. Returns an i32 0/1. */
    551 static Operand wide8_lt(KitCg* g, int is_signed, Operand alo, Operand ahi,
    552                         Operand blo, Operand bhi) {
    553   Operand hi_lt = wide8_i32_cmp(g, is_signed ? CMP_LT_S : CMP_LT_U, ahi, bhi);
    554   Operand hi_eq = wide8_i32_cmp(g, CMP_EQ, ahi, bhi);
    555   Operand lo_lt = wide8_i32_cmp(g, CMP_LT_U, alo, blo);
    556   Operand t = wide8_i32_binop(g, BO_AND, hi_eq, lo_lt);
    557   return wide8_i32_binop(g, BO_OR, hi_lt, t);
    558 }
    559 
    560 static Operand wide8_eq(KitCg* g, Operand alo, Operand ahi, Operand blo,
    561                         Operand bhi) {
    562   KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
    563   Operand dlo = wide8_i32_binop(g, BO_XOR, alo, blo);
    564   Operand dhi = wide8_i32_binop(g, BO_XOR, ahi, bhi);
    565   Operand diff = wide8_i32_binop(g, BO_OR, dlo, dhi);
    566   return wide8_i32_cmp(g, CMP_EQ, diff, api_op_imm(0, i32));
    567 }
    568 
    569 static int cmp_is_signed_rel(CmpOp op) {
    570   return op == CMP_LT_S || op == CMP_LE_S || op == CMP_GT_S || op == CMP_GE_S;
    571 }
    572 
    573 /* 8-byte int compare -> eager i32 0/1 value (not a delayed SV_CMP). */
    574 static void api_wide64_cmp_inline(KitCg* g, CmpOp cop) {
    575   KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
    576   int sg = cmp_is_signed_rel(cop);
    577   ApiSValue b = api_pop(g);
    578   ApiSValue a = api_pop(g);
    579   KitCgTypeId ty = a.type ? a.type : b.type;
    580   int lo = wide8_lo_off(g), hi = wide8_hi_off(g);
    581   Operand aa = api_wide8_addr(g, &a, ty);
    582   Operand ab = api_wide8_addr(g, &b, ty);
    583   Operand alo = api_wide8_load_lane(g, aa, lo);
    584   Operand ahi = api_wide8_load_lane(g, aa, hi);
    585   Operand blo = api_wide8_load_lane(g, ab, lo);
    586   Operand bhi = api_wide8_load_lane(g, ab, hi);
    587   Operand one = api_op_imm(1, i32);
    588   Operand res;
    589   switch (cop) {
    590     case CMP_EQ:
    591       res = wide8_eq(g, alo, ahi, blo, bhi);
    592       break;
    593     case CMP_NE:
    594       res = wide8_i32_binop(g, BO_XOR, wide8_eq(g, alo, ahi, blo, bhi), one);
    595       break;
    596     case CMP_LT_S:
    597     case CMP_LT_U:
    598       res = wide8_lt(g, sg, alo, ahi, blo, bhi);
    599       break;
    600     case CMP_GT_S:
    601     case CMP_GT_U: /* a>b  ==  b<a */
    602       res = wide8_lt(g, sg, blo, bhi, alo, ahi);
    603       break;
    604     case CMP_LE_S:
    605     case CMP_LE_U: /* a<=b ==  !(b<a) */
    606       res = wide8_i32_binop(g, BO_XOR, wide8_lt(g, sg, blo, bhi, alo, ahi), one);
    607       break;
    608     case CMP_GE_S:
    609     case CMP_GE_U: /* a>=b ==  !(a<b) */
    610       res = wide8_i32_binop(g, BO_XOR, wide8_lt(g, sg, alo, ahi, blo, bhi), one);
    611       break;
    612     default:
    613       compiler_panic(g->c, g->cur_loc, "KitCg: unsupported wide i64 compare");
    614       return;
    615   }
    616   api_release(g, &a);
    617   api_release(g, &b);
    618   api_push(g, api_make_sv(res, i32));
    619 }
    620 
    621 /* ============================================================
    622  * wide64 __builtin_*_overflow on split-lane 64-bit values
    623  *
    624  * The native backends only model single-register overflow, so a 64-bit
    625  * operand traps there. Here we legalize the 6 overflow intrinsics for a
    626  * 64-bit operand pair into 32-bit lane ops, computing both the
    627  * 64-bit wrapped value (stored to a fresh 8-byte temp) and the boolean
    628  * overflow flag, then pushing [value, ok] exactly as the native path does.
    629  * add/sub reuse the carry/borrow lane logic; mul builds the full 128-bit
    630  * product from 32x32->64 partials (no MULHU opcode exists, so each partial
    631  * is itself synthesized from 16-bit halves).
    632  * ============================================================ */
    633 
    634 /* Unsigned 32x32 -> 64 product of i32 lanes a,b, returned as (*plo,*phi) i32
    635  * via the 16-bit-halves schoolbook method (the target has no high-multiply
    636  * opcode, and a plain BO_IMUL only yields the low 32 bits).
    637  *
    638  *   a = ah*2^16 + al,  b = bh*2^16 + bl
    639  *   a*b = ah*bh*2^32 + (ah*bl + al*bh)*2^16 + al*bl
    640  */
    641 static void wide8_umul32(KitCg* g, Operand a, Operand b, Operand* plo,
    642                          Operand* phi) {
    643   KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
    644   Operand mask = api_op_imm(0xffff, i32);
    645   Operand sh16 = api_op_imm(16, i32);
    646   Operand al = wide8_i32_binop(g, BO_AND, a, mask);
    647   Operand ah = wide8_i32_binop(g, BO_SHR_U, a, sh16);
    648   Operand bl = wide8_i32_binop(g, BO_AND, b, mask);
    649   Operand bh = wide8_i32_binop(g, BO_SHR_U, b, sh16);
    650   Operand ll = wide8_i32_binop(g, BO_IMUL, al, bl); /* bits 0..31  (<=32 bits) */
    651   Operand lh = wide8_i32_binop(g, BO_IMUL, al, bh); /* bits 16..47 */
    652   Operand hl = wide8_i32_binop(g, BO_IMUL, ah, bl); /* bits 16..47 */
    653   Operand hh = wide8_i32_binop(g, BO_IMUL, ah, bh); /* bits 32..63 */
    654   /* mid = lh + hl + (ll >> 16); a 33-bit sum -> track carry into bit 32. */
    655   Operand ll_hi = wide8_i32_binop(g, BO_SHR_U, ll, sh16);
    656   Operand mid = wide8_i32_binop(g, BO_IADD, lh, hl);
    657   /* carry out of (lh+hl) into bit 48 (i.e. +2^32 in the high word). */
    658   Operand c0 = wide8_i32_cmp(g, CMP_LT_U, mid, lh);
    659   Operand mid2 = wide8_i32_binop(g, BO_IADD, mid, ll_hi);
    660   Operand c1 = wide8_i32_cmp(g, CMP_LT_U, mid2, mid);
    661   Operand carry32 = wide8_i32_binop(g, BO_IADD, c0, c1); /* into high word */
    662   /* lo = (mid2 << 16) | (ll & 0xffff) */
    663   Operand mid2_lo = wide8_i32_binop(g, BO_AND, mid2, mask);
    664   Operand mid2_loshift = wide8_i32_binop(g, BO_SHL, mid2_lo, sh16);
    665   Operand ll_lo = wide8_i32_binop(g, BO_AND, ll, mask);
    666   *plo = wide8_i32_binop(g, BO_OR, mid2_loshift, ll_lo);
    667   /* hi = hh + (mid2 >> 16) + carry32*2^16 */
    668   Operand mid2_hi = wide8_i32_binop(g, BO_SHR_U, mid2, sh16);
    669   Operand carry_word = wide8_i32_binop(g, BO_SHL, carry32, sh16);
    670   Operand hi = wide8_i32_binop(g, BO_IADD, hh, mid2_hi);
    671   *phi = wide8_i32_binop(g, BO_IADD, hi, carry_word);
    672 }
    673 
    674 /* Add three i32 columns acc += addend, threading carry: returns the new sum and
    675  * adds the unsigned-wrap carry (0/1) into *carry. */
    676 static Operand wide8_addc(KitCg* g, Operand acc, Operand addend,
    677                           Operand* carry) {
    678   Operand sum = wide8_i32_binop(g, BO_IADD, acc, addend);
    679   Operand c = wide8_i32_cmp(g, CMP_LT_U, sum, acc);
    680   *carry = wide8_i32_binop(g, BO_IADD, *carry, c);
    681   return sum;
    682 }
    683 
    684 /* The 6 __builtin_*_overflow intrinsics for a split-lane wide64 operand pair.
    685  * Pops the two 8-byte args, computes the wrapped 64-bit value into a fresh
    686  * 8-byte temp and the bool overflow flag into an i32, then pushes [value, ok]
    687  * matching the contract of the native overflow path. */
    688 static void api_wide64_overflow_inline(KitCg* g, KitCgIntrinsic intrin) {
    689   KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
    690   KitCgTypeId bool_ty = builtin_id(KIT_CG_BUILTIN_BOOL);
    691   Operand sh31 = api_op_imm(31, i32);
    692   ApiSValue b = api_pop(g);
    693   ApiSValue a = api_pop(g);
    694   KitCgTypeId ty = a.type ? a.type : b.type;
    695   int lo = wide8_lo_off(g), hi = wide8_hi_off(g);
    696   Operand aa = api_wide8_addr(g, &a, ty);
    697   Operand ab = api_wide8_addr(g, &b, ty);
    698   Operand alo = api_wide8_load_lane(g, aa, lo);
    699   Operand ahi = api_wide8_load_lane(g, aa, hi);
    700   Operand blo = api_wide8_load_lane(g, ab, lo);
    701   Operand bhi = api_wide8_load_lane(g, ab, hi);
    702   CGLocal res = api_wide8_temp_local(g, ty);
    703   ApiSValue res_lv = api_make_lv(api_op_local(res, ty), ty);
    704   Operand ar = api_lvalue_addr(g, &res_lv, cg_type_ptr_to(g->c, ty));
    705   Operand rlo;
    706   Operand rhi;
    707   Operand ok;
    708   switch (intrin) {
    709     case KIT_CG_INTRIN_UADD_OVERFLOW:
    710     case KIT_CG_INTRIN_SADD_OVERFLOW: {
    711       Operand carry;
    712       rlo = wide8_i32_binop(g, BO_IADD, alo, blo);
    713       carry = wide8_i32_cmp(g, CMP_LT_U, rlo, alo);
    714       rhi = wide8_i32_binop(g, BO_IADD, ahi, bhi);
    715       /* carry-out of the high lane = (rhi<ahi) before +carry, OR wrap on +carry.
    716        * Compute rhi step by step so we can detect the final carry-out. */
    717       Operand c_hi0 = wide8_i32_cmp(g, CMP_LT_U, rhi, ahi);
    718       rhi = wide8_i32_binop(g, BO_IADD, rhi, carry);
    719       Operand c_hi1 = wide8_i32_cmp(g, CMP_LT_U, rhi, carry);
    720       if (intrin == KIT_CG_INTRIN_UADD_OVERFLOW) {
    721         /* unsigned: ok = carry-out of the high lane */
    722         ok = wide8_i32_binop(g, BO_OR, c_hi0, c_hi1);
    723       } else {
    724         /* signed: ok = ((a_hi ^ r_hi) & (b_hi ^ r_hi)) sign bit (bit 31) */
    725         Operand ar_x = wide8_i32_binop(g, BO_XOR, ahi, rhi);
    726         Operand br_x = wide8_i32_binop(g, BO_XOR, bhi, rhi);
    727         Operand both = wide8_i32_binop(g, BO_AND, ar_x, br_x);
    728         ok = wide8_i32_binop(g, BO_SHR_U, both, sh31);
    729       }
    730       break;
    731     }
    732     case KIT_CG_INTRIN_USUB_OVERFLOW:
    733     case KIT_CG_INTRIN_SSUB_OVERFLOW: {
    734       Operand borrow = wide8_i32_cmp(g, CMP_LT_U, alo, blo);
    735       rlo = wide8_i32_binop(g, BO_ISUB, alo, blo);
    736       Operand t = wide8_i32_binop(g, BO_ISUB, ahi, bhi);
    737       /* high-lane borrow-out: (ahi < bhi) OR (t < borrow after subtracting). */
    738       Operand b_hi0 = wide8_i32_cmp(g, CMP_LT_U, ahi, bhi);
    739       Operand b_hi1 = wide8_i32_cmp(g, CMP_LT_U, t, borrow);
    740       rhi = wide8_i32_binop(g, BO_ISUB, t, borrow);
    741       if (intrin == KIT_CG_INTRIN_USUB_OVERFLOW) {
    742         ok = wide8_i32_binop(g, BO_OR, b_hi0, b_hi1);
    743       } else {
    744         /* signed: ok = ((a_hi ^ b_hi) & (a_hi ^ r_hi)) sign bit (bit 31) */
    745         Operand ab_x = wide8_i32_binop(g, BO_XOR, ahi, bhi);
    746         Operand ar_x = wide8_i32_binop(g, BO_XOR, ahi, rhi);
    747         Operand both = wide8_i32_binop(g, BO_AND, ab_x, ar_x);
    748         ok = wide8_i32_binop(g, BO_SHR_U, both, sh31);
    749       }
    750       break;
    751     }
    752     case KIT_CG_INTRIN_UMUL_OVERFLOW:
    753     case KIT_CG_INTRIN_SMUL_OVERFLOW: {
    754       int is_signed = (intrin == KIT_CG_INTRIN_SMUL_OVERFLOW);
    755       /* For signed, compute |a|,|b| as unsigned 64-bit, do the unsigned 128-bit
    756        * product, then apply the result sign. Overflow tests below use the
    757        * unsigned magnitude product plus the expected sign. */
    758       Operand ua_lo = alo, ua_hi = ahi, ub_lo = blo, ub_hi = bhi;
    759       Operand sgn = (Operand){0};
    760       if (is_signed) {
    761         /* a_sign = ahi >> 31 (0 or 1 in i32, but as a mask we want -1/0). */
    762         Operand am = wide8_i32_binop(g, BO_SHR_S, ahi, sh31); /* 0 or -1 */
    763         Operand bm = wide8_i32_binop(g, BO_SHR_S, bhi, sh31);
    764         /* |a| = (a ^ am) - am  (two's-complement abs), lane-wise w/ borrow. */
    765         Operand axl = wide8_i32_binop(g, BO_XOR, alo, am);
    766         Operand axh = wide8_i32_binop(g, BO_XOR, ahi, am);
    767         Operand brwa = wide8_i32_cmp(g, CMP_LT_U, axl, am);
    768         ua_lo = wide8_i32_binop(g, BO_ISUB, axl, am);
    769         Operand tah = wide8_i32_binop(g, BO_ISUB, axh, am);
    770         ua_hi = wide8_i32_binop(g, BO_ISUB, tah, brwa);
    771         Operand bxl = wide8_i32_binop(g, BO_XOR, blo, bm);
    772         Operand bxh = wide8_i32_binop(g, BO_XOR, bhi, bm);
    773         Operand brwb = wide8_i32_cmp(g, CMP_LT_U, bxl, bm);
    774         ub_lo = wide8_i32_binop(g, BO_ISUB, bxl, bm);
    775         Operand tbh = wide8_i32_binop(g, BO_ISUB, bxh, bm);
    776         ub_hi = wide8_i32_binop(g, BO_ISUB, tbh, brwb);
    777         sgn = wide8_i32_binop(g, BO_XOR, am, bm); /* result sign mask -1/0 */
    778       }
    779       /* Unsigned 128-bit product of (ua_hi:ua_lo) * (ub_hi:ub_lo).
    780        *   P00 = ua_lo*ub_lo  -> columns 0,1
    781        *   P01 = ua_lo*ub_hi  -> columns 1,2
    782        *   P10 = ua_hi*ub_lo  -> columns 1,2
    783        *   P11 = ua_hi*ub_hi  -> columns 2,3 */
    784       Operand p00l, p00h, p01l, p01h, p10l, p10h, p11l, p11h;
    785       wide8_umul32(g, ua_lo, ub_lo, &p00l, &p00h);
    786       wide8_umul32(g, ua_lo, ub_hi, &p01l, &p01h);
    787       wide8_umul32(g, ua_hi, ub_lo, &p10l, &p10h);
    788       wide8_umul32(g, ua_hi, ub_hi, &p11l, &p11h);
    789       Operand zero = api_op_imm(0, i32);
    790       /* column 0 */
    791       Operand r0 = p00l;
    792       /* column 1 = p00h + p01l + p10l */
    793       Operand c1 = zero;
    794       Operand r1 = p00h;
    795       r1 = wide8_addc(g, r1, p01l, &c1);
    796       r1 = wide8_addc(g, r1, p10l, &c1);
    797       /* column 2 = p01h + p10h + p11l + c1 */
    798       Operand c2 = zero;
    799       Operand r2 = p01h;
    800       r2 = wide8_addc(g, r2, p10h, &c2);
    801       r2 = wide8_addc(g, r2, p11l, &c2);
    802       r2 = wide8_addc(g, r2, c1, &c2);
    803       /* column 3 = p11h + c2 */
    804       Operand r3 = wide8_i32_binop(g, BO_IADD, p11h, c2);
    805       /* low 64 bits = (r1:r0); high 64 bits = (r3:r2). */
    806       Operand mlo = r0, mhi = r1;
    807       Operand hi_lo = r2, hi_hi = r3;
    808       if (is_signed) {
    809         /* Apply result sign: negate the 128-bit magnitude if sgn==-1.
    810          * negated = (x ^ sgn) - sgn across all 4 words with borrow. */
    811         Operand w0 = wide8_i32_binop(g, BO_XOR, mlo, sgn);
    812         Operand w1 = wide8_i32_binop(g, BO_XOR, mhi, sgn);
    813         Operand w2 = wide8_i32_binop(g, BO_XOR, hi_lo, sgn);
    814         Operand w3 = wide8_i32_binop(g, BO_XOR, hi_hi, sgn);
    815         Operand bor0 = wide8_i32_cmp(g, CMP_LT_U, w0, sgn);
    816         mlo = wide8_i32_binop(g, BO_ISUB, w0, sgn);
    817         Operand t1 = wide8_i32_binop(g, BO_ISUB, w1, sgn);
    818         Operand bor1a = wide8_i32_cmp(g, CMP_LT_U, w1, sgn);
    819         Operand bor1b = wide8_i32_cmp(g, CMP_LT_U, t1, bor0);
    820         mhi = wide8_i32_binop(g, BO_ISUB, t1, bor0);
    821         Operand bor1 = wide8_i32_binop(g, BO_OR, bor1a, bor1b);
    822         Operand t2 = wide8_i32_binop(g, BO_ISUB, w2, sgn);
    823         Operand bor2a = wide8_i32_cmp(g, CMP_LT_U, w2, sgn);
    824         Operand bor2b = wide8_i32_cmp(g, CMP_LT_U, t2, bor1);
    825         hi_lo = wide8_i32_binop(g, BO_ISUB, t2, bor1);
    826         Operand bor2 = wide8_i32_binop(g, BO_OR, bor2a, bor2b);
    827         Operand t3 = wide8_i32_binop(g, BO_ISUB, w3, sgn);
    828         hi_hi = wide8_i32_binop(g, BO_ISUB, t3, bor2);
    829       }
    830       rlo = mlo;
    831       rhi = mhi;
    832       if (!is_signed) {
    833         /* unsigned overflow: high 64 bits nonzero. */
    834         Operand t = wide8_i32_binop(g, BO_OR, hi_lo, hi_hi);
    835         ok = wide8_i32_cmp(g, CMP_NE, t, zero);
    836       } else {
    837         /* signed overflow: the 128-bit result is not the sign-extension of its
    838          * low 64 bits. sext = (rhi >> 31) replicated; overflow if
    839          * (hi_lo != sext) | (hi_hi != sext) where sext = arithmetic sign of
    840          * the signed low-64 result (bit 63 = rhi sign). */
    841         Operand sext = wide8_i32_binop(g, BO_SHR_S, rhi, sh31); /* 0 or -1 */
    842         Operand d2 = wide8_i32_binop(g, BO_XOR, hi_lo, sext);
    843         Operand d3 = wide8_i32_binop(g, BO_XOR, hi_hi, sext);
    844         Operand d = wide8_i32_binop(g, BO_OR, d2, d3);
    845         ok = wide8_i32_cmp(g, CMP_NE, d, zero);
    846       }
    847       break;
    848     }
    849     default:
    850       compiler_panic(g->c, g->cur_loc,
    851                      "KitCg: unsupported wide i64 overflow intrinsic");
    852       api_release(g, &a);
    853       api_release(g, &b);
    854       return;
    855   }
    856   api_wide8_store_lane(g, ar, lo, rlo);
    857   api_wide8_store_lane(g, ar, hi, rhi);
    858   api_release(g, &a);
    859   api_release(g, &b);
    860   /* Materialize ok as a fresh bool temp so it has a stable home. */
    861   {
    862     CGLocal okl = api_alloc_temp_local(g, bool_ty);
    863     Operand okd = api_op_local(okl, bool_ty);
    864     g->target->binop(g->target, BO_AND, okd, ok, api_op_imm(1, i32));
    865     api_push(g, api_make_sv(api_op_local(res, ty), ty));
    866     api_push(g, api_make_sv(okd, bool_ty));
    867   }
    868 }
    869 
    870 /* int<->split-i64 conversions (sext/zext/trunc/bitcast across the 4<->8
    871  * boundary, and i64->bool). Returns 1 if it handled (and consumed) *v. The
    872  * i64<->float conversions are routed to libcalls in kit_cg_*_to_float /
    873  * kit_cg_float_to_* and never reach here. */
    874 int api_try_wide8_convert(KitCg* g, ConvKind ck, KitCgTypeId sty,
    875                           KitCgTypeId dty, ApiSValue* v) {
    876   KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
    877   int s_wide = api_is_wide8_scalar_type(g->c, sty);
    878   int d_wide = api_is_wide8_scalar_type(g->c, dty);
    879   int lo = wide8_lo_off(g), hi = wide8_hi_off(g);
    880   if (!s_wide && !d_wide) return 0;
    881   if (s_wide && d_wide) {
    882     /* i64<->soft-double reinterpret (same 8-byte layout) or i64<->u64. */
    883     v->type = dty;
    884     v->op.type = dty;
    885     api_push(g, *v);
    886     return 1;
    887   }
    888   if (d_wide) {
    889     /* narrower int -> i64: low lane is the (converted-to-i32) source; high lane
    890      * is the sign-extension (CV_SEXT) or zero (CV_ZEXT/CV_BITCAST of a ptr). */
    891     int sext = (ck == CV_SEXT);
    892     Operand src32;
    893     CGLocal res;
    894     ApiSValue res_lv;
    895     Operand ar;
    896     Operand hival;
    897     if (api_unalias_type(g->c, sty) != i32) {
    898       api_push(g, *v);
    899       api_cg_convert_kind(g, i32, ck == CV_SEXT ? CV_SEXT : CV_ZEXT);
    900       *v = api_pop(g);
    901     }
    902     src32 = api_force_local(g, v, i32);
    903     res = api_wide8_temp_local(g, dty);
    904     res_lv = api_make_lv(api_op_local(res, dty), dty);
    905     ar = api_lvalue_addr(g, &res_lv, cg_type_ptr_to(g->c, dty));
    906     api_wide8_store_lane(g, ar, lo, src32);
    907     if (sext)
    908       hival = wide8_i32_binop(g, BO_SHR_S, src32, api_op_imm(31, i32));
    909     else
    910       hival = api_op_imm(0, i32);
    911     api_wide8_store_lane(g, ar, hi, hival);
    912     api_release(g, v);
    913     api_push(g, api_make_sv(api_op_local(res, dty), dty));
    914     return 1;
    915   }
    916   /* s_wide: i64 -> narrower. _Bool is "any bit set"; else take the low lane and
    917    * truncate/convert further. */
    918   if (api_is_bool_type(g->c, dty)) {
    919     Operand orl = api_wide8_or_lanes(g, v, sty);
    920     api_release(g, v);
    921     api_push(g, api_make_sv(orl, i32));
    922     kit_cg_push_int(g, 0, i32);
    923     api_cg_cmp(g, CMP_NE);
    924     api_cg_convert_kind(g, dty, CV_TRUNC);
    925     return 1;
    926   }
    927   {
    928     Operand addr = api_wide8_addr(g, v, sty);
    929     Operand lolane = api_wide8_load_lane(g, addr, lo);
    930     api_release(g, v);
    931     api_push(g, api_make_sv(lolane, i32));
    932     if (api_unalias_type(g->c, dty) != i32) api_cg_convert_kind(g, dty, CV_TRUNC);
    933     return 1;
    934   }
    935 }
    936 
    937 static void api_i128_unop(KitCg* g, UnOp iop) {
    938   KitCgTypeId i128 = builtin_id(KIT_CG_BUILTIN_I128);
    939   const char* name = NULL;
    940   ApiSValue args[1];
    941   KitCgTypeId ps[1];
    942   if (iop == UO_NEG)
    943     name = "__negti2";
    944   else if (iop == UO_BNOT)
    945     name = "__kit_notti3";
    946   else {
    947     compiler_panic(g->c, g->cur_loc, "KitCg: unsupported i128 unop");
    948     return;
    949   }
    950   args[0] = api_pop(g);
    951   ps[0] = i128;
    952   api_runtime_call_values(g, name, i128, ps, 1, args);
    953 }
    954 
    955 /* Map a relational op to the form used to compare a __kit_*cmpti2
    956  * result (-1/0/1, a signed i32) against zero. */
    957 static CmpOp api_i128_cmp_vs_zero(CmpOp cop) {
    958   switch (cop) {
    959     case CMP_EQ:
    960       return CMP_EQ;
    961     case CMP_NE:
    962       return CMP_NE;
    963     case CMP_LT_S:
    964     case CMP_LT_U:
    965       return CMP_LT_S;
    966     case CMP_LE_S:
    967     case CMP_LE_U:
    968       return CMP_LE_S;
    969     case CMP_GT_S:
    970     case CMP_GT_U:
    971       return CMP_GT_S;
    972     case CMP_GE_S:
    973     case CMP_GE_U:
    974       return CMP_GE_S;
    975     default:
    976       return CMP_NE;
    977   }
    978 }
    979 
    980 static void api_i128_cmp(KitCg* g, CmpOp cop) {
    981   KitCgTypeId i128 = builtin_id(KIT_CG_BUILTIN_I128);
    982   KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
    983   const char* name =
    984       api_i128_cmp_is_unsigned(cop) ? "__kit_ucmpti2" : "__kit_cmpti2";
    985   KitCgTypeId ps[2] = {i128, i128};
    986   ApiSValue args[2];
    987   args[1] = api_pop(g);
    988   args[0] = api_pop(g);
    989   api_runtime_call_values(g, name, i32, ps, 2, args);
    990   kit_cg_push_int(g, 0, i32);
    991   api_cg_cmp(g, api_i128_cmp_vs_zero(cop));
    992 }
    993 
    994 /* int<->i128 conversions. Returns 1 if it handled the conversion and
    995  * consumed *v, 0 to fall through to the generic path. */
    996 int api_try_i128_convert(KitCg* g, ConvKind ck, KitCgTypeId sty,
    997                          KitCgTypeId dty, ApiSValue* v) {
    998   KitCgTypeId i128 = builtin_id(KIT_CG_BUILTIN_I128);
    999   KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64);
   1000   int s_is_128 = api_is_i128_type(g->c, sty);
   1001   int d_is_128 = api_is_i128_type(g->c, dty);
   1002   if (!s_is_128 && !d_is_128) return 0;
   1003   if (s_is_128 && d_is_128) {
   1004     /* signed<->unsigned i128 reinterpret: identical layout. */
   1005     v->type = dty;
   1006     v->op.type = dty;
   1007     api_push(g, *v);
   1008     return 1;
   1009   }
   1010   if (d_is_128) {
   1011     u32 sw = kit_cg_type_int_width((KitCompiler*)g->c, sty);
   1012     const char* name = (ck == CV_SEXT) ? "__kit_sext64ti" : "__kit_zext64ti";
   1013     ApiSValue arg;
   1014     KitCgTypeId ps[1];
   1015     if (sw == 0) return 0; /* float->i128 unsupported here */
   1016     if (sw >= 64) {
   1017       arg = *v;
   1018       arg.type = i64;
   1019       arg.op.type = i64;
   1020     } else {
   1021       api_push(g, *v);
   1022       api_cg_convert_kind(g, i64, ck);
   1023       arg = api_pop(g);
   1024     }
   1025     ps[0] = i64;
   1026     api_runtime_call_values(g, name, i128, ps, 1, &arg);
   1027     return 1;
   1028   }
   1029   /* s_is_128, dty is _Bool: "value != 0" over the full 128 bits, not a
   1030    * low-lane truncation (a value whose only set bits are above bit 63 must
   1031    * still become 1). Reuse the runtime i128 compare. */
   1032   if (api_is_bool_type(g->c, dty)) {
   1033     api_push(g, *v);
   1034     kit_cg_push_int(g, 0, i128);
   1035     api_i128_cmp(g, CMP_NE); /* leaves i32 0/1 */
   1036     api_cg_convert_kind(g, dty, CV_TRUNC);
   1037     return 1;
   1038   }
   1039   /* s_is_128, dty is a narrower integer: take the low 64 bits, then
   1040    * truncate further if needed. */
   1041   {
   1042     u32 dw = kit_cg_type_int_width((KitCompiler*)g->c, dty);
   1043     i32 lo_off = g->c->target.big_endian ? 8 : 0;
   1044     Operand addr;
   1045     Operand lo;
   1046     if (dw == 0) return 0; /* i128->float unsupported here */
   1047     addr = api_i128_addr(g, v);
   1048     lo = api_i128_load_lane(g, addr, lo_off);
   1049     api_release_temp_local(g, addr.v.local);
   1050     api_release(g, v);
   1051     if (dw >= 64) {
   1052       api_push(g, api_make_sv(lo, dty));
   1053     } else {
   1054       api_push(g, api_make_sv(lo, i64));
   1055       api_cg_convert_kind(g, dty, CV_TRUNC);
   1056     }
   1057     return 1;
   1058   }
   1059 }
   1060 
   1061 void kit_cg_int_binop(KitCg* g, KitCgIntBinOp op, uint32_t flags) {
   1062   BinOp iop = api_map_int_binop(op);
   1063   if (g && (api_i128_stack_top(g, 0) || api_i128_stack_top(g, 1))) {
   1064     api_i128_binop(g, iop);
   1065     return;
   1066   }
   1067   /* 64-bit int split into 32-bit lanes: mul/div/rem/shift become __*di3
   1068    * runtime calls; add/sub/and/or/xor are emitted inline as 2-word lane ops
   1069    * (no compiler-rt helper exists for them). Both keep the value memory-resident
   1070    * so the allocator never tries to put 8 bytes in one 4-byte value slot. */
   1071   if (g && (api_wide64_stack_top(g, 0) || api_wide64_stack_top(g, 1))) {
   1072     if (api_wideint64_binop_helper(iop))
   1073       api_wideint64_binop(g, iop);
   1074     else
   1075       api_wide64_binop_inline(g, iop);
   1076     return;
   1077   }
   1078   api_cg_binop(g, iop, flags);
   1079 }
   1080 
   1081 void kit_cg_int_unop(KitCg* g, KitCgIntUnOp op, uint32_t flags) {
   1082   UnOp iop = api_map_int_unop(op);
   1083   if (g && api_i128_stack_top(g, 0) && (iop == UO_NEG || iop == UO_BNOT)) {
   1084     api_i128_unop(g, iop);
   1085     return;
   1086   }
   1087   /* Split 64-bit int: neg/bnot are inline 2-word lane ops; logical-not (!x) is
   1088    * the full-value truthiness test (lo|hi)==0. */
   1089   if (g && api_wide64_stack_top(g, 0)) {
   1090     if (iop == UO_NEG || iop == UO_BNOT) {
   1091       api_wide64_unop_inline(g, iop);
   1092       return;
   1093     }
   1094     if (iop == UO_NOT) {
   1095       KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
   1096       ApiSValue v = api_pop(g);
   1097       KitCgTypeId ty = v.type ? v.type : v.op.type;
   1098       Operand orl = api_wide8_or_lanes(g, &v, ty);
   1099       api_release(g, &v);
   1100       api_push(g, api_make_sv(orl, i32));
   1101       kit_cg_push_int(g, 0, i32);
   1102       api_cg_cmp(g, CMP_EQ);
   1103       api_cg_convert_kind(g, ty, CV_ZEXT);
   1104       return;
   1105     }
   1106   }
   1107   api_cg_unop(g, iop, flags);
   1108 }
   1109 
   1110 void kit_cg_int_cmp(KitCg* g, KitCgIntCmpOp op) {
   1111   CmpOp cop = api_map_int_cmp(op);
   1112   if (g && (api_i128_stack_top(g, 0) || api_i128_stack_top(g, 1))) {
   1113     api_i128_cmp(g, cop);
   1114     return;
   1115   }
   1116   if (g && (api_wide64_stack_top(g, 0) || api_wide64_stack_top(g, 1))) {
   1117     api_wide64_cmp_inline(g, cop);
   1118     return;
   1119   }
   1120   api_cg_cmp(g, cop);
   1121 }
   1122 
   1123 const char* api_i128_binop_helper(BinOp op) {
   1124   switch (op) {
   1125     case BO_IADD:
   1126       return "__kit_addti3";
   1127     case BO_ISUB:
   1128       return "__kit_subti3";
   1129     case BO_IMUL:
   1130       return "__multi3";
   1131     case BO_SDIV:
   1132       return "__divti3";
   1133     case BO_UDIV:
   1134       return "__udivti3";
   1135     case BO_SREM:
   1136       return "__modti3";
   1137     case BO_UREM:
   1138       return "__umodti3";
   1139     case BO_AND:
   1140       return "__kit_andti3";
   1141     case BO_OR:
   1142       return "__kit_orti3";
   1143     case BO_XOR:
   1144       return "__kit_xorti3";
   1145     case BO_SHL:
   1146       return "__ashlti3";
   1147     case BO_SHR_U:
   1148       return "__lshrti3";
   1149     case BO_SHR_S:
   1150       return "__ashrti3";
   1151     case BO_FADD:
   1152     case BO_FSUB:
   1153     case BO_FMUL:
   1154     case BO_FDIV:
   1155     default:
   1156       return NULL;
   1157   }
   1158 }
   1159 
   1160 int api_i128_cmp_is_unsigned(CmpOp op) {
   1161   return op == CMP_LT_U || op == CMP_LE_U || op == CMP_GT_U || op == CMP_GE_U;
   1162 }
   1163 
   1164 const char* api_f128_binop_helper(KitCgFpBinOp op) {
   1165   switch (op) {
   1166     case KIT_CG_FP_ADD:
   1167       return "__addtf3";
   1168     case KIT_CG_FP_SUB:
   1169       return "__subtf3";
   1170     case KIT_CG_FP_MUL:
   1171       return "__multf3";
   1172     case KIT_CG_FP_DIV:
   1173       return "__divtf3";
   1174   }
   1175   return NULL;
   1176 }
   1177 
   1178 /* Runtime helper name for double (f64) arithmetic on a target that lacks a
   1179  * hardware double unit. Mirrors api_f128_binop_helper with the __*df3 names. */
   1180 static const char* api_softdf_binop_helper(KitCgFpBinOp op) {
   1181   switch (op) {
   1182     case KIT_CG_FP_ADD:
   1183       return "__adddf3";
   1184     case KIT_CG_FP_SUB:
   1185       return "__subdf3";
   1186     case KIT_CG_FP_MUL:
   1187       return "__muldf3";
   1188     case KIT_CG_FP_DIV:
   1189       return "__divdf3";
   1190   }
   1191   return NULL;
   1192 }
   1193 
   1194 int api_f128_stack_top(KitCg* g, u32 depth) {
   1195   if (!g || g->sp <= depth) return 0;
   1196   return api_is_f128_type(g->c, api_sv_type(&g->stack[g->sp - 1u - depth]));
   1197 }
   1198 
   1199 /* True when the target has no hardware double: float_abi is SOFT (ilp32/lp64,
   1200  * no FP regs) or SINGLE (ilp32f/lp64f, only float in FP regs — double is always
   1201  * soft). DOUBLE (rv64 lp64d) and DEFAULT (x64/aa64 hardware-double targets that
   1202  * never set float_abi) keep the inline hardware path, so existing rv64/x64/aa64
   1203  * codegen is unchanged. */
   1204 static int api_target_double_is_soft(KitCg* g) {
   1205   if (!g) return 0;
   1206   return g->c->target.float_abi == KIT_FLOAT_ABI_SOFT ||
   1207          g->c->target.float_abi == KIT_FLOAT_ABI_SINGLE;
   1208 }
   1209 
   1210 /* True when ty is a 64-bit float (double) AND the target lacks hardware double.
   1211  * f128 is handled by the separate api_f128_* path, so width must be exactly 64. */
   1212 static int api_type_is_soft_double(KitCg* g, KitCgTypeId ty) {
   1213   if (!api_target_double_is_soft(g)) return 0;
   1214   return kit_cg_type_float_width((KitCompiler*)g->c, ty) == 64;
   1215 }
   1216 
   1217 static int api_soft_double_stack_top(KitCg* g, u32 depth) {
   1218   if (!g || g->sp <= depth) return 0;
   1219   return api_type_is_soft_double(g, api_sv_type(&g->stack[g->sp - 1u - depth]));
   1220 }
   1221 
   1222 /* f32 under pure-soft ilp32/lp64 (float_abi SOFT, no FP unit): single-precision
   1223  * arithmetic/compare/convert is also a libcall. Under SINGLE (ilp32f) float is
   1224  * hardware (fadd.s etc.) so this is false; DOUBLE/DEFAULT keep hardware too. */
   1225 static int api_type_is_soft_single(KitCg* g, KitCgTypeId ty) {
   1226   if (!g || g->c->target.float_abi != KIT_FLOAT_ABI_SOFT) return 0;
   1227   return kit_cg_type_float_width((KitCompiler*)g->c, ty) == 32;
   1228 }
   1229 
   1230 static int api_soft_single_stack_top(KitCg* g, u32 depth) {
   1231   if (!g || g->sp <= depth) return 0;
   1232   return api_type_is_soft_single(g, api_sv_type(&g->stack[g->sp - 1u - depth]));
   1233 }
   1234 
   1235 /* Runtime helper for f32 arithmetic on a soft-float target (mirrors
   1236  * api_softdf_binop_helper with the __*sf3 names). */
   1237 static const char* api_softsf_binop_helper(KitCgFpBinOp op) {
   1238   switch (op) {
   1239     case KIT_CG_FP_ADD: return "__addsf3";
   1240     case KIT_CG_FP_SUB: return "__subsf3";
   1241     case KIT_CG_FP_MUL: return "__mulsf3";
   1242     case KIT_CG_FP_DIV: return "__divsf3";
   1243   }
   1244   return NULL;
   1245 }
   1246 
   1247 void api_f128_call_unary(KitCg* g, const char* name, KitCgTypeId ret,
   1248                          KitCgTypeId param) {
   1249   ApiSValue args[1];
   1250   KitCgTypeId ps[1];
   1251   args[0] = api_pop(g);
   1252   ps[0] = param;
   1253   api_runtime_call_values(g, name, ret, ps, 1, args);
   1254 }
   1255 
   1256 void kit_cg_fp_binop(KitCg* g, KitCgFpBinOp op, uint32_t flags) {
   1257   (void)flags;
   1258   if (api_f128_stack_top(g, 0) || api_f128_stack_top(g, 1)) {
   1259     KitCgTypeId f128 = builtin_id(KIT_CG_BUILTIN_F128);
   1260     KitCgTypeId ps[2];
   1261     ApiSValue args[2];
   1262     const char* name = api_f128_binop_helper(op);
   1263     if (!name)
   1264       compiler_panic(g->c, g->cur_loc, "KitCg: unsupported f128 binop");
   1265     args[1] = api_pop(g);
   1266     args[0] = api_pop(g);
   1267     ps[0] = f128;
   1268     ps[1] = f128;
   1269     api_runtime_call_values(g, name, f128, ps, 2, args);
   1270     return;
   1271   }
   1272   if (api_soft_double_stack_top(g, 0) || api_soft_double_stack_top(g, 1)) {
   1273     KitCgTypeId f64 = builtin_id(KIT_CG_BUILTIN_F64);
   1274     KitCgTypeId ps[2];
   1275     ApiSValue args[2];
   1276     const char* name = api_softdf_binop_helper(op);
   1277     if (!name)
   1278       compiler_panic(g->c, g->cur_loc, "KitCg: unsupported soft double binop");
   1279     args[1] = api_pop(g);
   1280     args[0] = api_pop(g);
   1281     ps[0] = f64;
   1282     ps[1] = f64;
   1283     api_runtime_call_values(g, name, f64, ps, 2, args);
   1284     return;
   1285   }
   1286   if (api_soft_single_stack_top(g, 0) || api_soft_single_stack_top(g, 1)) {
   1287     KitCgTypeId f32 = builtin_id(KIT_CG_BUILTIN_F32);
   1288     KitCgTypeId ps[2];
   1289     ApiSValue args[2];
   1290     const char* name = api_softsf_binop_helper(op);
   1291     if (!name)
   1292       compiler_panic(g->c, g->cur_loc, "KitCg: unsupported soft single binop");
   1293     args[1] = api_pop(g);
   1294     args[0] = api_pop(g);
   1295     ps[0] = f32;
   1296     ps[1] = f32;
   1297     api_runtime_call_values(g, name, f32, ps, 2, args);
   1298     return;
   1299   }
   1300   api_cg_binop(g, api_map_fp_binop(op), 0);
   1301 }
   1302 
   1303 void kit_cg_fp_unop(KitCg* g, KitCgFpUnOp op, uint32_t flags) {
   1304   (void)flags;
   1305   if (!g) return;
   1306   if (op != KIT_CG_FP_NEG) {
   1307     compiler_panic(g->c, g->cur_loc, "KitCg: FP unary op unsupported");
   1308   }
   1309   if (api_f128_stack_top(g, 0)) {
   1310     KitCgTypeId f128 = builtin_id(KIT_CG_BUILTIN_F128);
   1311     api_f128_call_unary(g, "__negtf2", f128, f128);
   1312     return;
   1313   }
   1314   /* Soft float has no FP unit, so negation is a libcall too (the inline FNEG
   1315    * path emits fsgnj on an FP register, which does not exist here). */
   1316   if (api_soft_double_stack_top(g, 0)) {
   1317     KitCgTypeId f64 = builtin_id(KIT_CG_BUILTIN_F64);
   1318     api_f128_call_unary(g, "__negdf2", f64, f64);
   1319     return;
   1320   }
   1321   if (api_soft_single_stack_top(g, 0)) {
   1322     KitCgTypeId f32 = builtin_id(KIT_CG_BUILTIN_F32);
   1323     api_f128_call_unary(g, "__negsf2", f32, f32);
   1324     return;
   1325   }
   1326   api_cg_unop(g, UO_FNEG, 0);
   1327 }
   1328 
   1329 /* Soft-float single-libcall comparison: call `name(a,b)` (both operands of type
   1330  * `opty`) and test its i32 three-way result against 0 with `icmp`. Consumes the
   1331  * two operands on the stack and pushes the i32 boolean. Shared by the f128 (tf)
   1332  * and soft-double (df) paths — only the helper name and operand type differ; the
   1333  * compiler-rt NaN-sign convention is identical for both. */
   1334 static void api_softfp_cmp_call(KitCg* g, const char* name, KitCgTypeId opty,
   1335                                 CmpOp icmp) {
   1336   KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
   1337   KitCgTypeId ps[2];
   1338   ApiSValue args[2];
   1339   ps[0] = opty;
   1340   ps[1] = opty;
   1341   args[1] = api_pop(g);
   1342   args[0] = api_pop(g);
   1343   api_runtime_call_values(g, name, i32, ps, 2, args);
   1344   kit_cg_push_int(g, 0, i32);
   1345   api_cg_cmp(g, icmp);
   1346 }
   1347 
   1348 /* UEQ and ONE are the only soft-float predicates that cannot be a single
   1349  * libcall: "equal" and "unordered" both yield a nonzero magnitude from
   1350  * __eq*2/__ne*2, so they need a separate __unord*2 to split them.
   1351  *   UEQ = (__eq*2(a,b) == 0) || (__unord*2(a,b) != 0)
   1352  *   ONE = (__ne*2(a,b) != 0) && (__unord*2(a,b) == 0)
   1353  * `suffix` is "tf" (f128) or "df" (double); `opty` the matching operand type.
   1354  * The operands are dup'd (kit_cg_dup copies into a fresh owned local) so each
   1355  * libcall consumes its own copy. */
   1356 static void api_softfp_cmp_with_unord(KitCg* g, KitCgFpCmpOp op,
   1357                                       const char* suffix, KitCgTypeId opty) {
   1358   char relname[16];
   1359   char unordname[16];
   1360   CmpOp relcmp = (op == KIT_CG_FP_UEQ) ? CMP_EQ : CMP_NE;
   1361   const char* rel = (op == KIT_CG_FP_UEQ) ? "eq" : "ne";
   1362   snprintf(relname, sizeof relname, "__%s%s2", rel, suffix);
   1363   snprintf(unordname, sizeof unordname, "__unord%s2", suffix);
   1364   /* [a, b] -> [a, b, a, b] */
   1365   kit_cg_dup2(g);
   1366   /* relation on the top (dup'd) copy: [a, b, R] */
   1367   api_softfp_cmp_call(g, relname, opty, relcmp);
   1368   /* bring the original a, b back to TOS with R underneath: [R, a, b] */
   1369   kit_cg_rot3(g);
   1370   kit_cg_rot3(g);
   1371   if (op == KIT_CG_FP_UEQ) {
   1372     api_softfp_cmp_call(g, unordname, opty, CMP_NE); /* [R, unordered?] */
   1373     api_cg_binop(g, BO_OR, 0);                       /* R || unordered */
   1374   } else {
   1375     api_softfp_cmp_call(g, unordname, opty, CMP_EQ); /* [R, ordered?] */
   1376     api_cg_binop(g, BO_AND, 0);                      /* R && ordered */
   1377   }
   1378 }
   1379 
   1380 /* Emit a soft-float comparison for either f128 (suffix "tf", opty f128) or
   1381  * soft double (suffix "df", opty f64). The predicate->helper mapping and the
   1382  * compiler-rt NaN-sign convention are XLEN/width-neutral, so a single body
   1383  * serves both — only the suffix and operand type vary. */
   1384 static void api_softfp_cmp(KitCg* g, KitCgFpCmpOp op, const char* suffix,
   1385                            KitCgTypeId opty) {
   1386   char name[16];
   1387   switch (op) {
   1388     case KIT_CG_FP_OEQ:
   1389       snprintf(name, sizeof name, "__eq%s2", suffix);
   1390       api_softfp_cmp_call(g, name, opty, CMP_EQ);
   1391       return;
   1392     case KIT_CG_FP_UNE:
   1393       snprintf(name, sizeof name, "__ne%s2", suffix);
   1394       api_softfp_cmp_call(g, name, opty, CMP_NE);
   1395       return;
   1396     case KIT_CG_FP_OLT:
   1397       snprintf(name, sizeof name, "__lt%s2", suffix);
   1398       api_softfp_cmp_call(g, name, opty, CMP_LT_S);
   1399       return;
   1400     case KIT_CG_FP_OLE:
   1401       snprintf(name, sizeof name, "__le%s2", suffix);
   1402       api_softfp_cmp_call(g, name, opty, CMP_LE_S);
   1403       return;
   1404     case KIT_CG_FP_OGT:
   1405       snprintf(name, sizeof name, "__gt%s2", suffix);
   1406       api_softfp_cmp_call(g, name, opty, CMP_GT_S);
   1407       return;
   1408     case KIT_CG_FP_OGE:
   1409       snprintf(name, sizeof name, "__ge%s2", suffix);
   1410       api_softfp_cmp_call(g, name, opty, CMP_GE_S);
   1411       return;
   1412     /* unordered duals via the opposite-sign helper (NaN flips the test): */
   1413     case KIT_CG_FP_UGE:
   1414       snprintf(name, sizeof name, "__lt%s2", suffix);
   1415       api_softfp_cmp_call(g, name, opty, CMP_GE_S);
   1416       return;
   1417     case KIT_CG_FP_UGT:
   1418       snprintf(name, sizeof name, "__le%s2", suffix);
   1419       api_softfp_cmp_call(g, name, opty, CMP_GT_S);
   1420       return;
   1421     case KIT_CG_FP_ULT:
   1422       snprintf(name, sizeof name, "__ge%s2", suffix);
   1423       api_softfp_cmp_call(g, name, opty, CMP_LT_S);
   1424       return;
   1425     case KIT_CG_FP_ULE:
   1426       snprintf(name, sizeof name, "__gt%s2", suffix);
   1427       api_softfp_cmp_call(g, name, opty, CMP_LE_S);
   1428       return;
   1429     case KIT_CG_FP_UEQ:
   1430     case KIT_CG_FP_ONE:
   1431       api_softfp_cmp_with_unord(g, op, suffix, opty);
   1432       return;
   1433   }
   1434 }
   1435 
   1436 void kit_cg_fp_cmp(KitCg* g, KitCgFpCmpOp op) {
   1437   /* f128/long double and soft double are both soft-float: the comparison is a
   1438    * libcall returning a three-way i32 we test against 0. kit's runtime uses the
   1439    * standard compiler-rt sign convention (rt/lib/impl/fp_compare_impl.inc):
   1440    *   __le-family (__eq*2/__ne*2/__lt*2/__le*2): NaN -> +1
   1441    *   __ge-family (__ge*2/__gt*2):               NaN -> -1
   1442    * so each ordered predicate AND its unordered dual maps to one libcall,
   1443    * choosing the helper whose NaN sign makes the integer test fall the right
   1444    * way (ordered: NaN must fail; unordered: NaN must pass). Only UEQ/ONE, which
   1445    * must split "equal" from "unordered", need a second (__unord*2) call. The
   1446    * convention is width-neutral, so the same logic drives the tf and df
   1447    * suffixes via api_softfp_cmp. */
   1448   if (api_f128_stack_top(g, 0) || api_f128_stack_top(g, 1)) {
   1449     api_softfp_cmp(g, op, "tf", builtin_id(KIT_CG_BUILTIN_F128));
   1450     return;
   1451   }
   1452   if (api_soft_double_stack_top(g, 0) || api_soft_double_stack_top(g, 1)) {
   1453     api_softfp_cmp(g, op, "df", builtin_id(KIT_CG_BUILTIN_F64));
   1454     return;
   1455   }
   1456   if (api_soft_single_stack_top(g, 0) || api_soft_single_stack_top(g, 1)) {
   1457     api_softfp_cmp(g, op, "sf", builtin_id(KIT_CG_BUILTIN_F32));
   1458     return;
   1459   }
   1460   api_cg_cmp(g, api_map_fp_cmp(op));
   1461 }
   1462 
   1463 void kit_cg_sext(KitCg* g, KitCgTypeId dst) {
   1464   api_cg_convert_kind(g, dst, CV_SEXT);
   1465 }
   1466 
   1467 void kit_cg_zext(KitCg* g, KitCgTypeId dst) {
   1468   api_cg_convert_kind(g, dst, CV_ZEXT);
   1469 }
   1470 
   1471 void kit_cg_trunc(KitCg* g, KitCgTypeId dst) {
   1472   api_cg_convert_kind(g, dst, CV_TRUNC);
   1473 }
   1474 
   1475 void kit_cg_ptr_to_int(KitCg* g, KitCgTypeId dst) {
   1476   api_cg_convert_kind(g, dst, CV_BITCAST);
   1477 }
   1478 
   1479 void kit_cg_int_to_ptr(KitCg* g, KitCgTypeId dst) {
   1480   api_cg_convert_kind(g, dst, CV_BITCAST);
   1481 }
   1482 
   1483 void kit_cg_bitcast(KitCg* g, KitCgTypeId dst) {
   1484   api_cg_convert_kind(g, dst, CV_BITCAST);
   1485 }
   1486 
   1487 void kit_cg_fpext(KitCg* g, KitCgTypeId dst) {
   1488   KitCgTypeId dty = resolve_type(g->c, dst);
   1489   if (api_is_f128_type(g->c, dty)) {
   1490     ApiSValue v = api_pop(g);
   1491     KitCgTypeId sty = api_unalias_type(g->c, api_sv_type(&v));
   1492     const char* name = sty == builtin_id(KIT_CG_BUILTIN_F32) ? "__extendsftf2"
   1493                                                              : "__extenddftf2";
   1494     api_push(g, v);
   1495     api_f128_call_unary(g, name, dty, sty);
   1496     return;
   1497   }
   1498   /* float -> soft double: runtime widen via __extendsfdf2. */
   1499   if (api_type_is_soft_double(g, dty)) {
   1500     ApiSValue v = api_pop(g);
   1501     KitCgTypeId sty = api_unalias_type(g->c, api_sv_type(&v));
   1502     api_push(g, v);
   1503     api_f128_call_unary(g, "__extendsfdf2", dty, sty);
   1504     return;
   1505   }
   1506   api_cg_convert_kind(g, dst, CV_FEXT);
   1507 }
   1508 
   1509 void kit_cg_fptrunc(KitCg* g, KitCgTypeId dst) {
   1510   KitCgTypeId dty = resolve_type(g->c, dst);
   1511   if (api_f128_stack_top(g, 0)) {
   1512     ApiSValue v = api_pop(g);
   1513     KitCgTypeId f128 = builtin_id(KIT_CG_BUILTIN_F128);
   1514     const char* name =
   1515         dty == builtin_id(KIT_CG_BUILTIN_F32) ? "__trunctfsf2" : "__trunctfdf2";
   1516     api_push(g, v);
   1517     api_f128_call_unary(g, name, dty, f128);
   1518     return;
   1519   }
   1520   /* soft double -> float: runtime narrow via __truncdfsf2. */
   1521   if (api_soft_double_stack_top(g, 0)) {
   1522     ApiSValue v = api_pop(g);
   1523     KitCgTypeId sty = api_unalias_type(g->c, api_sv_type(&v));
   1524     api_push(g, v);
   1525     api_f128_call_unary(g, "__truncdfsf2", dty, sty);
   1526     return;
   1527   }
   1528   api_cg_convert_kind(g, dst, CV_FTRUNC);
   1529 }
   1530 
   1531 void kit_cg_sint_to_float(KitCg* g, KitCgTypeId dst, KitCgRounding rounding) {
   1532   (void)rounding;
   1533   if (api_is_f128_type(g->c, resolve_type(g->c, dst))) {
   1534     ApiSValue v = api_pop(g);
   1535     KitCgTypeId sty = api_unalias_type(g->c, api_sv_type(&v));
   1536     u32 sz = (u32)abi_cg_sizeof(g->c->abi, sty);
   1537     KitCgTypeId pty = sz > 8 ? builtin_id(KIT_CG_BUILTIN_I128)
   1538                              : (sz > 4 ? builtin_id(KIT_CG_BUILTIN_I64)
   1539                                        : builtin_id(KIT_CG_BUILTIN_I32));
   1540     const char* name =
   1541         sz > 8 ? "__floattitf" : (sz > 4 ? "__floatditf" : "__floatsitf");
   1542     api_push(g, v);
   1543     api_f128_call_unary(g, name, resolve_type(g->c, dst), pty);
   1544     return;
   1545   }
   1546   /* signed int -> soft double: __floatsidf (i32) / __floatdidf (i64). */
   1547   if (api_type_is_soft_double(g, resolve_type(g->c, dst))) {
   1548     ApiSValue v = api_pop(g);
   1549     KitCgTypeId sty = api_unalias_type(g->c, api_sv_type(&v));
   1550     u32 sz = (u32)abi_cg_sizeof(g->c->abi, sty);
   1551     KitCgTypeId pty = sz > 8 ? builtin_id(KIT_CG_BUILTIN_I128)
   1552                              : (sz > 4 ? builtin_id(KIT_CG_BUILTIN_I64)
   1553                                        : builtin_id(KIT_CG_BUILTIN_I32));
   1554     const char* name =
   1555         sz > 8 ? "__floattidf" : (sz > 4 ? "__floatdidf" : "__floatsidf");
   1556     api_push(g, v);
   1557     api_f128_call_unary(g, name, resolve_type(g->c, dst), pty);
   1558     return;
   1559   }
   1560   /* signed split-i64 -> hardware single float: use __floatdisf. */
   1561   if (api_wide64_stack_top(g, 0)) {
   1562     api_f128_call_unary(g, "__floatdisf", resolve_type(g->c, dst),
   1563                         builtin_id(KIT_CG_BUILTIN_I64));
   1564     return;
   1565   }
   1566   /* i32 -> soft single float (ilp32, no FPU): __floatsisf. */
   1567   if (api_type_is_soft_single(g, resolve_type(g->c, dst))) {
   1568     api_f128_call_unary(g, "__floatsisf", resolve_type(g->c, dst),
   1569                         builtin_id(KIT_CG_BUILTIN_I32));
   1570     return;
   1571   }
   1572   api_cg_convert_kind(g, dst, CV_ITOF_S);
   1573 }
   1574 
   1575 void kit_cg_uint_to_float(KitCg* g, KitCgTypeId dst, KitCgRounding rounding) {
   1576   (void)rounding;
   1577   if (api_is_f128_type(g->c, resolve_type(g->c, dst))) {
   1578     ApiSValue v = api_pop(g);
   1579     KitCgTypeId sty = api_unalias_type(g->c, api_sv_type(&v));
   1580     u32 sz = (u32)abi_cg_sizeof(g->c->abi, sty);
   1581     KitCgTypeId pty = sz > 8 ? builtin_id(KIT_CG_BUILTIN_I128)
   1582                              : (sz > 4 ? builtin_id(KIT_CG_BUILTIN_I64)
   1583                                        : builtin_id(KIT_CG_BUILTIN_I32));
   1584     const char* name =
   1585         sz > 8 ? "__floatuntitf" : (sz > 4 ? "__floatunditf" : "__floatunsitf");
   1586     api_push(g, v);
   1587     api_f128_call_unary(g, name, resolve_type(g->c, dst), pty);
   1588     return;
   1589   }
   1590   /* unsigned int -> soft double: __floatunsidf (i32) / __floatundidf (i64). */
   1591   if (api_type_is_soft_double(g, resolve_type(g->c, dst))) {
   1592     ApiSValue v = api_pop(g);
   1593     KitCgTypeId sty = api_unalias_type(g->c, api_sv_type(&v));
   1594     u32 sz = (u32)abi_cg_sizeof(g->c->abi, sty);
   1595     KitCgTypeId pty = sz > 8 ? builtin_id(KIT_CG_BUILTIN_I128)
   1596                              : (sz > 4 ? builtin_id(KIT_CG_BUILTIN_I64)
   1597                                        : builtin_id(KIT_CG_BUILTIN_I32));
   1598     const char* name =
   1599         sz > 8 ? "__floatuntidf" : (sz > 4 ? "__floatundidf" : "__floatunsidf");
   1600     api_push(g, v);
   1601     api_f128_call_unary(g, name, resolve_type(g->c, dst), pty);
   1602     return;
   1603   }
   1604   /* unsigned i64 -> hardware single float: __floatundisf. */
   1605   if (api_wide64_stack_top(g, 0)) {
   1606     api_f128_call_unary(g, "__floatundisf", resolve_type(g->c, dst),
   1607                         builtin_id(KIT_CG_BUILTIN_I64));
   1608     return;
   1609   }
   1610   /* u32 -> soft single float (ilp32, no FPU): __floatunsisf. */
   1611   if (api_type_is_soft_single(g, resolve_type(g->c, dst))) {
   1612     api_f128_call_unary(g, "__floatunsisf", resolve_type(g->c, dst),
   1613                         builtin_id(KIT_CG_BUILTIN_I32));
   1614     return;
   1615   }
   1616   api_cg_convert_kind(g, dst, CV_ITOF_U);
   1617 }
   1618 
   1619 void kit_cg_float_to_sint(KitCg* g, KitCgTypeId dst, KitCgRounding rounding) {
   1620   (void)rounding;
   1621   if (api_f128_stack_top(g, 0)) {
   1622     KitCgTypeId dty = resolve_type(g->c, dst);
   1623     u32 sz = (u32)abi_cg_sizeof(g->c->abi, dty);
   1624     KitCgTypeId rty = sz > 8 ? builtin_id(KIT_CG_BUILTIN_I128)
   1625                              : (sz > 4 ? builtin_id(KIT_CG_BUILTIN_I64)
   1626                                        : builtin_id(KIT_CG_BUILTIN_I32));
   1627     const char* name =
   1628         sz > 8 ? "__fixtfti" : (sz > 4 ? "__fixtfdi" : "__fixtfsi");
   1629     api_f128_call_unary(g, name, rty, builtin_id(KIT_CG_BUILTIN_F128));
   1630     if (rty != dty) api_cg_convert_kind(g, dty, CV_TRUNC);
   1631     return;
   1632   }
   1633   /* soft double -> signed int: __fixdfsi (i32) / __fixdfdi (i64). */
   1634   if (api_soft_double_stack_top(g, 0)) {
   1635     KitCgTypeId dty = resolve_type(g->c, dst);
   1636     KitCgTypeId f64 = builtin_id(KIT_CG_BUILTIN_F64);
   1637     u32 sz = (u32)abi_cg_sizeof(g->c->abi, dty);
   1638     KitCgTypeId rty = sz > 8 ? builtin_id(KIT_CG_BUILTIN_I128)
   1639                              : (sz > 4 ? builtin_id(KIT_CG_BUILTIN_I64)
   1640                                        : builtin_id(KIT_CG_BUILTIN_I32));
   1641     const char* name =
   1642         sz > 8 ? "__fixdfti" : (sz > 4 ? "__fixdfdi" : "__fixdfsi");
   1643     api_f128_call_unary(g, name, rty, f64);
   1644     if (rty != dty) api_cg_convert_kind(g, dty, CV_TRUNC);
   1645     return;
   1646   }
   1647   /* hardware single float -> split-i64: use __fixsfdi. */
   1648   if (api_is_wide8_scalar_type(g->c, resolve_type(g->c, dst))) {
   1649     api_f128_call_unary(g, "__fixsfdi", resolve_type(g->c, dst),
   1650                         builtin_id(KIT_CG_BUILTIN_F32));
   1651     return;
   1652   }
   1653   /* soft single float -> signed int <=32 (ilp32, no FPU): __fixsfsi. */
   1654   if (api_soft_single_stack_top(g, 0)) {
   1655     KitCgTypeId dty = resolve_type(g->c, dst);
   1656     KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
   1657     api_f128_call_unary(g, "__fixsfsi", i32, builtin_id(KIT_CG_BUILTIN_F32));
   1658     if (i32 != dty) api_cg_convert_kind(g, dty, CV_TRUNC);
   1659     return;
   1660   }
   1661   api_cg_convert_kind(g, dst, CV_FTOI_S);
   1662 }
   1663 
   1664 void kit_cg_float_to_uint(KitCg* g, KitCgTypeId dst, KitCgRounding rounding) {
   1665   (void)rounding;
   1666   if (api_f128_stack_top(g, 0)) {
   1667     KitCgTypeId dty = resolve_type(g->c, dst);
   1668     u32 sz = (u32)abi_cg_sizeof(g->c->abi, dty);
   1669     KitCgTypeId rty = sz > 8 ? builtin_id(KIT_CG_BUILTIN_I128)
   1670                              : (sz > 4 ? builtin_id(KIT_CG_BUILTIN_I64)
   1671                                        : builtin_id(KIT_CG_BUILTIN_I32));
   1672     const char* name =
   1673         sz > 8 ? "__fixunstfti" : (sz > 4 ? "__fixunstfdi" : "__fixunstfsi");
   1674     api_f128_call_unary(g, name, rty, builtin_id(KIT_CG_BUILTIN_F128));
   1675     if (rty != dty) api_cg_convert_kind(g, dty, CV_TRUNC);
   1676     return;
   1677   }
   1678   /* soft double -> unsigned int: __fixunsdfsi (i32) / __fixunsdfdi (i64). */
   1679   if (api_soft_double_stack_top(g, 0)) {
   1680     KitCgTypeId dty = resolve_type(g->c, dst);
   1681     KitCgTypeId f64 = builtin_id(KIT_CG_BUILTIN_F64);
   1682     u32 sz = (u32)abi_cg_sizeof(g->c->abi, dty);
   1683     KitCgTypeId rty = sz > 8 ? builtin_id(KIT_CG_BUILTIN_I128)
   1684                              : (sz > 4 ? builtin_id(KIT_CG_BUILTIN_I64)
   1685                                        : builtin_id(KIT_CG_BUILTIN_I32));
   1686     const char* name =
   1687         sz > 8 ? "__fixunsdfti" : (sz > 4 ? "__fixunsdfdi" : "__fixunsdfsi");
   1688     api_f128_call_unary(g, name, rty, f64);
   1689     if (rty != dty) api_cg_convert_kind(g, dty, CV_TRUNC);
   1690     return;
   1691   }
   1692   /* hardware single float -> split-u64: use __fixunssfdi. */
   1693   if (api_is_wide8_scalar_type(g->c, resolve_type(g->c, dst))) {
   1694     api_f128_call_unary(g, "__fixunssfdi", resolve_type(g->c, dst),
   1695                         builtin_id(KIT_CG_BUILTIN_F32));
   1696     return;
   1697   }
   1698   /* soft single float -> unsigned int <=32 (ilp32, no FPU): __fixunssfsi. */
   1699   if (api_soft_single_stack_top(g, 0)) {
   1700     KitCgTypeId dty = resolve_type(g->c, dst);
   1701     KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
   1702     api_f128_call_unary(g, "__fixunssfsi", i32, builtin_id(KIT_CG_BUILTIN_F32));
   1703     if (i32 != dty) api_cg_convert_kind(g, dty, CV_TRUNC);
   1704     return;
   1705   }
   1706   api_cg_convert_kind(g, dst, CV_FTOI_U);
   1707 }
   1708 
   1709 /* ============================================================
   1710  * Intrinsics (stub)
   1711  * ============================================================ */
   1712 
   1713 /* One descriptor per KitCgIntrinsic, indexed by the enum value. The four
   1714  * accessors below are field reads off this single source of truth; unmapped
   1715  * intrinsics (FMA/cache/coro) use an INTRIN_NONE row. The table is laid
   1716  * out in enum order; the _Static_assert guards its length so a new enumerator
   1717  * is a compile error rather than a silently truncated index. */
   1718 typedef struct IntrinDesc {
   1719   IntrinKind kind;
   1720   const char* name;
   1721   bool is_void;
   1722   bool is_overflow;
   1723 } IntrinDesc;
   1724 
   1725 static const IntrinDesc kIntrinTable[] = {
   1726     [KIT_CG_INTRIN_TRAP] = {INTRIN_TRAP, "trap", true, false},
   1727     [KIT_CG_INTRIN_CLZ] = {INTRIN_CLZ, "clz", false, false},
   1728     [KIT_CG_INTRIN_CTZ] = {INTRIN_CTZ, "ctz", false, false},
   1729     [KIT_CG_INTRIN_POPCOUNT] = {INTRIN_POPCOUNT, "popcount", false, false},
   1730     [KIT_CG_INTRIN_BSWAP] = {INTRIN_BSWAP, "bswap", false, false},
   1731     [KIT_CG_INTRIN_SETJMP] = {INTRIN_SETJMP, "setjmp", false, false},
   1732     [KIT_CG_INTRIN_LONGJMP] = {INTRIN_LONGJMP, "longjmp", true, false},
   1733     [KIT_CG_INTRIN_SADD_OVERFLOW] =
   1734         {INTRIN_SADD_OVERFLOW, "sadd_overflow", false, true},
   1735     [KIT_CG_INTRIN_UADD_OVERFLOW] =
   1736         {INTRIN_UADD_OVERFLOW, "uadd_overflow", false, true},
   1737     [KIT_CG_INTRIN_SSUB_OVERFLOW] =
   1738         {INTRIN_SSUB_OVERFLOW, "ssub_overflow", false, true},
   1739     [KIT_CG_INTRIN_USUB_OVERFLOW] =
   1740         {INTRIN_USUB_OVERFLOW, "usub_overflow", false, true},
   1741     [KIT_CG_INTRIN_SMUL_OVERFLOW] =
   1742         {INTRIN_SMUL_OVERFLOW, "smul_overflow", false, true},
   1743     [KIT_CG_INTRIN_UMUL_OVERFLOW] =
   1744         {INTRIN_UMUL_OVERFLOW, "umul_overflow", false, true},
   1745     [KIT_CG_INTRIN_FMA] = {INTRIN_NONE, "fma", false, false},
   1746     [KIT_CG_INTRIN_PREFETCH] = {INTRIN_PREFETCH, "prefetch", true, false},
   1747     [KIT_CG_INTRIN_EXPECT] = {INTRIN_EXPECT, "expect", false, false},
   1748     [KIT_CG_INTRIN_ASSUME_ALIGNED] =
   1749         {INTRIN_ASSUME_ALIGNED, "assume_aligned", false, false},
   1750     [KIT_CG_INTRIN_SYSCALL] = {INTRIN_SYSCALL, "syscall", false, false},
   1751     [KIT_CG_INTRIN_IRQ_SAVE] = {INTRIN_IRQ_SAVE, "irq_save", false, false},
   1752     [KIT_CG_INTRIN_IRQ_RESTORE] =
   1753         {INTRIN_IRQ_RESTORE, "irq_restore", true, false},
   1754     [KIT_CG_INTRIN_IRQ_DISABLE] =
   1755         {INTRIN_IRQ_DISABLE, "irq_disable", true, false},
   1756     [KIT_CG_INTRIN_IRQ_ENABLE] = {INTRIN_IRQ_ENABLE, "irq_enable", true, false},
   1757     [KIT_CG_INTRIN_DMB] = {INTRIN_DMB, "dmb", true, false},
   1758     [KIT_CG_INTRIN_DSB] = {INTRIN_DSB, "dsb", true, false},
   1759     [KIT_CG_INTRIN_ISB] = {INTRIN_ISB, "isb", true, false},
   1760     [KIT_CG_INTRIN_DCACHE_CLEAN] = {INTRIN_NONE, "dcache_clean", false, false},
   1761     [KIT_CG_INTRIN_DCACHE_INVALIDATE] =
   1762         {INTRIN_NONE, "dcache_invalidate", false, false},
   1763     [KIT_CG_INTRIN_DCACHE_CLEAN_INVALIDATE] =
   1764         {INTRIN_NONE, "dcache_clean_invalidate", false, false},
   1765     [KIT_CG_INTRIN_ICACHE_INVALIDATE] =
   1766         {INTRIN_NONE, "icache_invalidate", false, false},
   1767     [KIT_CG_INTRIN_CPU_NOP] = {INTRIN_CPU_NOP, "cpu_nop", true, false},
   1768     [KIT_CG_INTRIN_CPU_YIELD] = {INTRIN_CPU_YIELD, "cpu_yield", true, false},
   1769     [KIT_CG_INTRIN_WFI] = {INTRIN_WFI, "wfi", true, false},
   1770     [KIT_CG_INTRIN_WFE] = {INTRIN_WFE, "wfe", true, false},
   1771     [KIT_CG_INTRIN_SEV] = {INTRIN_SEV, "sev", true, false},
   1772     [KIT_CG_INTRIN_CORO_SWITCH] = {INTRIN_NONE, "coro_switch", false, false},
   1773     [KIT_CG_INTRIN_FRAME_ADDRESS] =
   1774         {INTRIN_FRAME_ADDRESS, "frame_address", false, false},
   1775     [KIT_CG_INTRIN_RETURN_ADDRESS] =
   1776         {INTRIN_RETURN_ADDRESS, "return_address", false, false},
   1777 };
   1778 
   1779 _Static_assert(sizeof(kIntrinTable) / sizeof(kIntrinTable[0]) ==
   1780                    KIT_CG_INTRIN_RETURN_ADDRESS + 1,
   1781                "kIntrinTable must have exactly one row per KitCgIntrinsic");
   1782 
   1783 /* Bounds-guarded row lookup: an out-of-range intrinsic falls back to the NONE
   1784  * row, preserving the defensive `default:` behavior the four switches carried
   1785  * before they collapsed into kIntrinTable. */
   1786 static const IntrinDesc* intrin_desc(KitCgIntrinsic intrin) {
   1787   static const IntrinDesc none = {INTRIN_NONE, NULL, false, false};
   1788   unsigned i = (unsigned)intrin;
   1789   return i < sizeof(kIntrinTable) / sizeof(kIntrinTable[0]) ? &kIntrinTable[i]
   1790                                                             : &none;
   1791 }
   1792 
   1793 IntrinKind api_map_intrinsic(KitCg* g, KitCgIntrinsic intrin,
   1794                              KitCgTypeId result_type) {
   1795   /* Width-by-type: backends derive operand width from the result type, so the
   1796    * mapping no longer needs the size here. */
   1797   (void)g;
   1798   (void)result_type;
   1799   return intrin_desc(intrin)->kind;
   1800 }
   1801 
   1802 int api_intrinsic_is_void(KitCgIntrinsic intrin) {
   1803   return intrin_desc(intrin)->is_void;
   1804 }
   1805 
   1806 int api_intrinsic_is_overflow(KitCgIntrinsic intrin) {
   1807   return intrin_desc(intrin)->is_overflow;
   1808 }
   1809 
   1810 const char* api_intrinsic_name(KitCgIntrinsic intrin) {
   1811   const char* name = intrin_desc(intrin)->name;
   1812   return name ? name : "intrinsic";
   1813 }
   1814 
   1815 void kit_cg_intrinsic(KitCg* g, KitCgIntrinsic intrin, uint32_t nargs,
   1816                       KitCgTypeId result_type) {
   1817   CgTarget* T;
   1818   KitCgTypeId rty;
   1819   KitCgTypeId int_ty;
   1820   IntrinKind kind;
   1821   ApiSValue* svs;
   1822   Operand* args;
   1823   Operand dsts[2];
   1824   u32 ndst = 0;
   1825   Heap* h;
   1826   if (!g) return;
   1827   /* clz/ctz/popcount/bswap on a split 64-bit value cannot use the backend's
   1828    * single-register software sequence. Route them to the compiler-rt __*di2
   1829    * helpers, which decompose into 32-bit operations. (32-bit forms still lower
   1830    * inline.) */
   1831   if (nargs == 1 && api_wide64_stack_top(g, 0)) {
   1832     const char* name = NULL;
   1833     KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
   1834     KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64);
   1835     KitCgTypeId ret = i32;
   1836     switch (intrin) {
   1837       case KIT_CG_INTRIN_CLZ: name = "__clzdi2"; break;
   1838       case KIT_CG_INTRIN_CTZ: name = "__ctzdi2"; break;
   1839       case KIT_CG_INTRIN_POPCOUNT: name = "__popcountdi2"; break;
   1840       case KIT_CG_INTRIN_BSWAP: name = "__bswapdi2"; ret = i64; break;
   1841       default: break;
   1842     }
   1843     if (name) {
   1844       ApiSValue arg = api_pop(g);
   1845       KitCgTypeId ps[1] = {i64};
   1846       api_runtime_call_values(g, name, ret, ps, 1, &arg);
   1847       if (ret == i32 && api_unalias_type(g->c, result_type) != i32)
   1848         api_cg_convert_kind(g, result_type, CV_ZEXT);
   1849       return;
   1850     }
   1851   }
   1852   /* __builtin_*_overflow on a split 64-bit operand pair traps in the native
   1853    * backend (it only models single-register overflow). Legalize all 6 forms
   1854    * inline as 2-lane / 4-lane ops, pushing [value, ok] like the native path.
   1855    * Gated on both operands being wide64 so other targets are unchanged. */
   1856   if (nargs == 2 && api_intrinsic_is_overflow(intrin) &&
   1857       api_wide64_stack_top(g, 0) && api_wide64_stack_top(g, 1)) {
   1858     api_wide64_overflow_inline(g, intrin);
   1859     return;
   1860   }
   1861   if (nargs == 2 && intrin == KIT_CG_INTRIN_EXPECT &&
   1862       api_wide64_stack_top(g, 1)) {
   1863     ApiSValue expected = api_pop(g);
   1864     ApiSValue val = api_pop(g);
   1865     api_release(g, &expected);
   1866     api_push(g, val);
   1867     return;
   1868   }
   1869   T = g->target;
   1870   h = g->c->ctx->heap;
   1871   rty = resolve_type(g->c, result_type);
   1872   int_ty = builtin_id(KIT_CG_BUILTIN_I32);
   1873   kind = api_map_intrinsic(g, intrin, result_type);
   1874   if (!kit_cg_target_supports_intrinsic(g->c, intrin) || kind == INTRIN_NONE) {
   1875     compiler_panic(
   1876         g->c, g->cur_loc, "KitCg: target '%s' does not support intrinsic '%s'",
   1877         arch_kind_name(g->c->target.arch), api_intrinsic_name(intrin));
   1878     return;
   1879   }
   1880 
   1881   svs = NULL;
   1882   args = NULL;
   1883   if (nargs) {
   1884     svs = (ApiSValue*)h->alloc(h, sizeof(*svs) * nargs, _Alignof(ApiSValue));
   1885     args = (Operand*)h->alloc(h, sizeof(*args) * nargs, _Alignof(Operand));
   1886     memset(args, 0, sizeof(*args) * nargs);
   1887     for (u32 i = 0; i < nargs; ++i) {
   1888       u32 idx = nargs - 1u - i;
   1889       KitCgTypeId aty;
   1890       svs[idx] = api_pop(g);
   1891       aty = api_sv_type(&svs[idx]);
   1892       if (api_sv_op_is(&svs[idx], OPK_IMM) &&
   1893           (intrin == KIT_CG_INTRIN_EXPECT ||
   1894            intrin == KIT_CG_INTRIN_ASSUME_ALIGNED ||
   1895            intrin == KIT_CG_INTRIN_PREFETCH || intrin == KIT_CG_INTRIN_DMB ||
   1896            intrin == KIT_CG_INTRIN_DSB ||
   1897            intrin == KIT_CG_INTRIN_FRAME_ADDRESS ||
   1898            intrin == KIT_CG_INTRIN_RETURN_ADDRESS)) {
   1899         args[idx] = svs[idx].op;
   1900       } else {
   1901         args[idx] = api_force_local(g, &svs[idx], aty);
   1902       }
   1903     }
   1904   }
   1905 
   1906   if (api_intrinsic_is_overflow(intrin)) {
   1907     KitCgTypeId vty = rty ? rty : (nargs ? api_sv_type(&svs[0]) : int_ty);
   1908     KitCgTypeId bool_ty = builtin_id(KIT_CG_BUILTIN_BOOL);
   1909     CGLocal rr = api_alloc_temp_local(g, vty);
   1910     CGLocal ok = api_alloc_temp_local(g, bool_ty);
   1911     dsts[0] = api_op_local(rr, vty);
   1912     dsts[1] = api_op_local(ok, bool_ty);
   1913     ndst = 2;
   1914   } else if (!api_intrinsic_is_void(intrin) && !cg_type_is_void(g->c, rty)) {
   1915     CGLocal rr = api_alloc_temp_local(g, rty);
   1916     dsts[0] = api_op_local(rr, rty);
   1917     ndst = 1;
   1918   }
   1919 
   1920   T->intrinsic(T, kind, ndst ? dsts : NULL, ndst, args, nargs);
   1921 
   1922   for (u32 i = 0; i < nargs; ++i) api_release(g, &svs[i]);
   1923   if (svs) h->free(h, svs, sizeof(*svs) * nargs);
   1924   if (args) h->free(h, args, sizeof(*args) * nargs);
   1925 
   1926   if (api_intrinsic_is_overflow(intrin)) {
   1927     api_push(g, api_make_sv(dsts[0], dsts[0].type));
   1928     api_push(g, api_make_sv(dsts[1], dsts[1].type));
   1929   } else if (ndst == 1) {
   1930     api_push(g, api_make_sv(dsts[0], rty));
   1931   }
   1932 }
   1933 
   1934 /* ============================================================
   1935  * Atomics (stub)
   1936  * ============================================================ */
   1937 
   1938 KitCgTypeId api_atomic_pointee(KitCg* g, KitCgTypeId pty, const char* who) {
   1939   KitCgTypeId pointee = cg_type_pointee(g->c, pty);
   1940   if (!pointee) {
   1941     compiler_panic(g->c, g->cur_loc, "%.*s: operand is not a pointer",
   1942                    SLICE_ARG(slice_from_cstr(who)));
   1943     return builtin_id(KIT_CG_BUILTIN_I32);
   1944   }
   1945   return pointee;
   1946 }