cg: add i128 arithmetic via libcalls, wire supports_label_table through ir_recorder - kit

commit a691bcbf26887ba8ddcb59d1ebbf17408d2a3fca
parent ab11c06f26a5bd56cb96a76ff54c04d6ecbd44ea
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Tue, 26 May 2026 17:53:31 -0700

cg: add i128 arithmetic via libcalls, wire supports_label_table through ir_recorder

Extend the semantic CG layer with i128 integer arithmetic and conversions via
runtime library calls (__addti2, __multi2, etc.), mirroring existing f128
support. ir_recorder.c now delegates supports_label_table() so Wasm correctly
returns false and native targets return true, enabling correct switch lowering
decisions at the recording layer.

Diffstat:
M src/cg/arith.c  | 217 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M src/cg/call.c  | 29 +++++++++++++++++++++++------
M src/cg/control.c  | 34 +++++++++++++++++++++++-----------
M src/cg/ir_recorder.c  | 9 +++++++++
M src/cg/memory.c  | 59 +++++++++++++++++++++++++++++++++++++++++++++--------------
M src/cg/native_direct_target.c  | 33 ++++++++++++++++++---------------
M src/cg/wide.c  | 33 +++++++++++++++++++++++----------

7 files changed, 353 insertions(+), 61 deletions(-)
diff --git a/src/cg/arith.c b/src/cg/arith.c
@@ -179,8 +179,9 @@ void api_cg_cmp(CfreeCg* g, CmpOp cop) {
   ra = api_force_local_unless_imm(g, &a, opty);
   rb = api_force_local_unless_imm(g, &b, opty);
   if (!api_type_is_float(g->c, opty)) {
-    api_push(g, api_make_cmp(cop, ra, rb, i32, api_sv_owns_operand_local(&a, &ra),
-                             api_sv_owns_operand_local(&b, &rb)));
+    api_push(g,
+             api_make_cmp(cop, ra, rb, i32, api_sv_owns_operand_local(&a, &ra),
+                          api_sv_owns_operand_local(&b, &rb)));
     return;
   }
   rr = api_alloc_temp_local(g, i32);
@@ -191,6 +192,9 @@ void api_cg_cmp(CfreeCg* g, CmpOp cop) {
   api_push(g, api_make_sv(dst, i32));
 }
 
+int api_try_i128_convert(CfreeCg* g, ConvKind ck, CfreeCgTypeId sty,
+                         CfreeCgTypeId dty, ApiSValue* v);
+
 void api_cg_convert_kind(CfreeCg* g, CfreeCgTypeId dst_type, ConvKind ck) {
   ApiSValue v;
   CgTarget* T;
@@ -224,6 +228,7 @@ void api_cg_convert_kind(CfreeCg* g, CfreeCgTypeId dst_type, ConvKind ck) {
       return;
     }
   }
+  if (api_try_i128_convert(g, ck, sty, dty, &v)) return;
   if (ck == CV_BITCAST && abi_cg_sizeof(g->c->abi, sty) == 16 &&
       abi_cg_sizeof(g->c->abi, dty) == 16 &&
       (api_is_f128_type(g->c, sty) || api_is_f128_type(g->c, dty))) {
@@ -278,16 +283,218 @@ void api_cg_convert_kind(CfreeCg* g, CfreeCgTypeId dst_type, ConvKind ck) {
   api_push(g, api_make_sv(dst, dty));
 }
 
+/* ============================================================
+ * 128-bit integer lowering
+ *
+ * i128/u128 are 16-byte memory-resident scalars (see api_is_wide16
+ * and src/cg/wide.c). The native backends only model <=64-bit
+ * register ops, so every i128 arithmetic/compare/convert is lowered
+ * here to a compiler-rt-style runtime call (rt/lib/int64). This
+ * mirrors the f128 dispatch in cfree_cg_fp_*.
+ * ============================================================ */
+
+int api_i128_stack_top(CfreeCg* g, u32 depth) { /* depth 0 = top of the value stack, 1 = the entry just below it */
+  if (!g || g->sp <= depth) return 0; /* NULL g, or fewer than depth+1 entries on the stack: answer 0 */
+  return api_is_i128_type(g->c, api_sv_type(&g->stack[g->sp - 1u - depth])); /* verdict of api_is_i128_type on that entry's type */
+}
+
+static int api_binop_is_shift(BinOp iop) { /* 1 for the three shift ops listed below, 0 for every other BinOp */
+  return iop == BO_SHL || iop == BO_SHR_U || iop == BO_SHR_S;
+}
+
+static int api_is_bool_type(Compiler* c, CfreeCgTypeId ty) { /* nonzero iff ty, after api_unalias_type, has kind CFREE_CG_TYPE_BOOL */
+  const CgType* cg = cg_type_get(c, api_unalias_type(c, ty));
+  return cg && cg->kind == CFREE_CG_TYPE_BOOL; /* a NULL lookup result yields 0 rather than a dereference */
+}
+
+/* Materialize an i128 value as an lvalue and return a pointer local to it. */
+static Operand api_i128_addr(CfreeCg* g, ApiSValue* v) {
+  CfreeCgTypeId i128 = builtin_id(CFREE_CG_BUILTIN_I128); /* always the builtin I128 id, whatever type *v itself carries */
+  ApiSValue lv = api_wide16_materialize_lvalue(g, v, i128);
+  return api_lvalue_addr(g, &lv, cg_type_ptr_to(g->c, i128)); /* address requested as pointer-to-i128; api_try_i128_convert releases the result's .v.local after use */
+}
+
+/* Load a 64-bit lane of an i128 (addressed by `addr`) into a fresh i64. */
+static Operand api_i128_load_lane(CfreeCg* g, Operand addr, i32 off) { /* off: offset of the lane from addr (the visible caller passes 0 or 8) */
+  CfreeCgTypeId i64 = builtin_id(CFREE_CG_BUILTIN_I64);
+  CGLocal rr = api_alloc_temp_local(g, i64); /* not released here; it travels back inside the returned operand */
+  Operand dst = api_op_local(rr, i64);
+  MemAccess ma;
+  memset(&ma, 0, sizeof ma); /* every MemAccess field not assigned below stays zero */
+  ma.type = i64;
+  ma.size = 8;
+  ma.align = 8; /* NOTE(review): presumes each lane is 8-byte aligned -- confirm for all i128 storage */
+  g->target->load(g->target, dst, api_op_indirect(addr.v.local, off, i64), ma); /* reads addr.v.local, so addr is expected to be a local-kind operand */
+  return dst;
+}
+
+static void api_i128_binop(CfreeCg* g, BinOp iop) { /* lowers a binary op to a runtime call of the helper named by api_i128_binop_helper */
+  CfreeCgTypeId i128 = builtin_id(CFREE_CG_BUILTIN_I128);
+  CfreeCgTypeId i32 = builtin_id(CFREE_CG_BUILTIN_I32);
+  const char* name = api_i128_binop_helper(iop);
+  CfreeCgTypeId ps[2];
+  ApiSValue args[2];
+  if (!name) { /* checked before any pop, so both operands are still on the stack on this path */
+    compiler_panic(g->c, g->cur_loc, "CfreeCg: unsupported i128 binop");
+    return;
+  }
+  args[1] = api_pop(g); /* popped first (stack top), passed as the second argument */
+  args[0] = api_pop(g);
+  ps[0] = i128;
+  ps[1] = api_binop_is_shift(iop) ? i32 : i128; /* shift helpers are declared with an i32 second parameter; all others take two i128s */
+  api_runtime_call_values(g, name, i128, ps, 2, args); /* declared return type i128; presumably leaves the result on the value stack -- confirm */
+}
+
+static void api_i128_unop(CfreeCg* g, UnOp iop) { /* lowers UO_NEG / UO_BNOT to a one-argument runtime call; any other op panics */
+  CfreeCgTypeId i128 = builtin_id(CFREE_CG_BUILTIN_I128);
+  const char* name = NULL;
+  ApiSValue args[1];
+  CfreeCgTypeId ps[1];
+  if (iop == UO_NEG)
+    name = "__negti2";
+  else if (iop == UO_BNOT)
+    name = "__cfree_notti3";
+  else { /* reached before the operand is popped, so the stack is unchanged on this path */
+    compiler_panic(g->c, g->cur_loc, "CfreeCg: unsupported i128 unop");
+    return;
+  }
+  args[0] = api_pop(g);
+  ps[0] = i128;
+  api_runtime_call_values(g, name, i128, ps, 1, args); /* parameter and return are both declared i128 */
+}
+
+/* Map a relational op to the form used to compare a __cfree_*cmpti2
+ * result (-1/0/1, a signed i32) against zero. */
+static CmpOp api_i128_cmp_vs_zero(CmpOp cop) {
+  switch (cop) {
+    case CMP_EQ:
+      return CMP_EQ;
+    case CMP_NE:
+      return CMP_NE;
+    case CMP_LT_S:
+    case CMP_LT_U: /* unsigned variants collapse onto the signed test: the helper result itself is signed */
+      return CMP_LT_S;
+    case CMP_LE_S:
+    case CMP_LE_U:
+      return CMP_LE_S;
+    case CMP_GT_S:
+    case CMP_GT_U:
+      return CMP_GT_S;
+    case CMP_GE_S:
+    case CMP_GE_U:
+      return CMP_GE_S;
+    default: /* NOTE(review): any op not listed above is silently treated as "!= 0" -- confirm no other CmpOp can reach here */
+      return CMP_NE;
+  }
+}
+
+static void api_i128_cmp(CfreeCg* g, CmpOp cop) { /* three-way compare through a runtime helper, then test that result against 0 */
+  CfreeCgTypeId i128 = builtin_id(CFREE_CG_BUILTIN_I128);
+  CfreeCgTypeId i32 = builtin_id(CFREE_CG_BUILTIN_I32);
+  const char* name =
+      api_i128_cmp_is_unsigned(cop) ? "__cfree_ucmpti2" : "__cfree_cmpti2"; /* helper chosen by the signedness api_i128_cmp_is_unsigned reports for cop */
+  CfreeCgTypeId ps[2] = {i128, i128};
+  ApiSValue args[2];
+  args[1] = api_pop(g); /* popped first (stack top), passed as the second argument */
+  args[0] = api_pop(g);
+  api_runtime_call_values(g, name, i32, ps, 2, args); /* declared return type i32; presumably pushed on the value stack, since the next two calls consume it -- confirm */
+  cfree_cg_push_int(g, 0, i32);
+  api_cg_cmp(g, api_i128_cmp_vs_zero(cop)); /* generic compare of (helper result, 0) using the remapped op */
+}
+
+/* int<->i128 conversions. Returns 1 if it handled the conversion and
+ * consumed *v, 0 to fall through to the generic path. */
+int api_try_i128_convert(CfreeCg* g, ConvKind ck, CfreeCgTypeId sty,
+                         CfreeCgTypeId dty, ApiSValue* v) {
+  CfreeCgTypeId i128 = builtin_id(CFREE_CG_BUILTIN_I128);
+  CfreeCgTypeId i64 = builtin_id(CFREE_CG_BUILTIN_I64);
+  int s_is_128 = api_is_i128_type(g->c, sty);
+  int d_is_128 = api_is_i128_type(g->c, dty);
+  if (!s_is_128 && !d_is_128) return 0; /* no 128-bit integer on either side: not this function's job */
+  if (s_is_128 && d_is_128) {
+    /* signed<->unsigned i128 reinterpret: identical layout. */
+    v->type = dty; /* retag in place; no code is emitted on this path */
+    v->op.type = dty;
+    api_push(g, *v);
+    return 1;
+  }
+  if (d_is_128) { /* widening: integer source -> 128-bit destination */
+    u32 sw = cfree_cg_type_int_width((CfreeCompiler*)g->c, sty);
+    const char* name =
+        (ck == CV_SEXT) ? "__cfree_sext64ti" : "__cfree_zext64ti"; /* every ck other than CV_SEXT selects the zero-extending helper */
+    ApiSValue arg;
+    CfreeCgTypeId ps[1];
+    if (sw == 0) return 0; /* float->i128 unsupported here */
+    if (sw >= 64) { /* already at least 64 bits wide: just retag the value as i64 */
+      arg = *v;
+      arg.type = i64;
+      arg.op.type = i64;
+    } else { /* narrower source: widen to i64 first through the generic converter, using the same ck */
+      api_push(g, *v);
+      api_cg_convert_kind(g, i64, ck);
+      arg = api_pop(g);
+    }
+    ps[0] = i64;
+    api_runtime_call_values(g, name, i128, ps, 1, &arg); /* NOTE(review): result is declared as builtin I128 rather than dty -- confirm this is intended when dty is a different 128-bit id */
+    return 1;
+  }
+  /* s_is_128, dty is _Bool: "value != 0" over the full 128 bits, not a
+   * low-lane truncation (a value whose only set bits are above bit 63 must
+   * still become 1). Reuse the runtime i128 compare. */
+  if (api_is_bool_type(g->c, dty)) {
+    api_push(g, *v);
+    cfree_cg_push_int(g, 0, i128); /* 128-bit zero as the right-hand operand of the compare */
+    api_i128_cmp(g, CMP_NE); /* leaves i32 0/1 */
+    api_cg_convert_kind(g, dty, CV_TRUNC); /* i32 0/1 -> dty through the generic converter */
+    return 1;
+  }
+  /* s_is_128, dty is a narrower integer: take the low 64 bits, then
+   * truncate further if needed. */
+  {
+    u32 dw = cfree_cg_type_int_width((CfreeCompiler*)g->c, dty);
+    i32 lo_off = g->c->target.big_endian ? 8 : 0; /* the low lane sits at offset 8 on big-endian targets, 0 otherwise */
+    Operand addr;
+    Operand lo;
+    if (dw == 0) return 0; /* i128->float unsupported here */
+    addr = api_i128_addr(g, v);
+    lo = api_i128_load_lane(g, addr, lo_off);
+    api_release_temp_local(g, addr.v.local); /* the address temp is dead once the lane has been loaded */
+    api_release(g, v); /* the source value is consumed here, matching the "returns 1" contract */
+    if (dw >= 64) {
+      api_push(g, api_make_sv(lo, dty)); /* destination is at least 64 bits: the loaded lane is the result, tagged dty */
+    } else {
+      api_push(g, api_make_sv(lo, i64)); /* narrower destination: push as i64, then truncate generically */
+      api_cg_convert_kind(g, dty, CV_TRUNC);
+    }
+    return 1;
+  }
+}
+
 void cfree_cg_int_binop(CfreeCg* g, CfreeCgIntBinOp op, uint32_t flags) {
-  api_cg_binop(g, api_map_int_binop(op), flags);
+  BinOp iop = api_map_int_binop(op);
+  if (g && (api_i128_stack_top(g, 0) || api_i128_stack_top(g, 1))) { /* either of the two topmost stack entries is i128 */
+    api_i128_binop(g, iop); /* flags is not forwarded on the i128 path */
+    return;
+  }
+  api_cg_binop(g, iop, flags);
 }
 
 void cfree_cg_int_unop(CfreeCg* g, CfreeCgIntUnOp op, uint32_t flags) {
-  api_cg_unop(g, api_map_int_unop(op), flags);
+  UnOp iop = api_map_int_unop(op);
+  if (g && api_i128_stack_top(g, 0) && (iop == UO_NEG || iop == UO_BNOT)) { /* only NEG and BNOT take the i128 path; other unops on an i128 fall through */
+    api_i128_unop(g, iop); /* flags is not forwarded on the i128 path */
+    return;
+  }
+  api_cg_unop(g, iop, flags);
 }
 
 void cfree_cg_int_cmp(CfreeCg* g, CfreeCgIntCmpOp op) {
-  api_cg_cmp(g, api_map_int_cmp(op));
+  CmpOp cop = api_map_int_cmp(op);
+  if (g && (api_i128_stack_top(g, 0) || api_i128_stack_top(g, 1))) { /* either of the two topmost stack entries is i128 */
+    api_i128_cmp(g, cop); /* runtime three-way compare followed by a test against zero */
+    return;
+  }
+  api_cg_cmp(g, cop);
 }
 
 const char* api_i128_binop_helper(BinOp op) {
diff --git a/src/cg/call.c b/src/cg/call.c
@@ -50,7 +50,23 @@ static CGLocal api_materialize_call_local(CfreeCg* g, ApiSValue* arg,
   CGLocal r = api_alloc_temp_local(g, ty);
   Operand dst = api_op_local(r, ty);
   if (op.kind == OPK_IMM) {
-    g->target->load_imm(g->target, dst, op.v.imm);
+    if (api_is_wide16_scalar_type(g->c, ty)) {
+      /* A 16-byte scalar immediate (an i128 small constant) only carries
+       * 64 bits in op.v.imm; load_imm would leave the high lane as stack
+       * garbage. Write both lanes, sign-extending into the high half. */
+      u8 bytes[16];
+      u64 lo = (u64)op.v.imm;
+      u64 hi = (op.v.imm < 0) ? ~(u64)0 : 0;
+      for (u32 i = 0; i < 8; ++i) {
+        u32 lo_idx = g->c->target.big_endian ? 15u - i : i;
+        u32 hi_idx = g->c->target.big_endian ? 7u - i : 8u + i;
+        bytes[lo_idx] = (u8)(lo >> (i * 8u));
+        bytes[hi_idx] = (u8)(hi >> (i * 8u));
+      }
+      api_store_f128_bytes(g, r, ty, bytes);
+    } else {
+      g->target->load_imm(g->target, dst, op.v.imm);
+    }
   } else if (op.kind == OPK_LOCAL) {
     g->target->copy(g->target, dst, op);
   } else {
@@ -64,8 +80,8 @@ static CGLocal api_materialize_call_local(CfreeCg* g, ApiSValue* arg,
 void api_pack_call_arg(CfreeCg* g, CGLocal* out, CfreeCgTypeId fty, u32 idx) {
   ApiSValue arg = api_pop(g);
   u32 nfixed = api_func_nparams(g, fty);
-  CfreeCgTypeId aty = idx >= nfixed ? api_sv_type(&arg)
-                                    : cg_type_func_param_id(g->c, fty, idx);
+  CfreeCgTypeId aty =
+      idx >= nfixed ? api_sv_type(&arg) : cg_type_func_param_id(g->c, fty, idx); /* at or past the declared parameter count: use the argument's own type */
   if (!aty) aty = api_sv_type(&arg);
   *out = api_materialize_call_local(g, &arg, aty);
 }
@@ -82,7 +98,8 @@ void api_release_call_args(CfreeCg* g, CGLocal* args, u32 nargs) {
 
 void api_push_call_result(CfreeCg* g, CGLocal result, CfreeCgTypeId ret_ty) {
   Operand op = api_op_local(result, ret_ty);
-  if (cg_type_is_aggregate(g->c, ret_ty) || api_is_wide16_scalar_type(g->c, ret_ty)) {
+  if (cg_type_is_aggregate(g->c, ret_ty) ||
+      api_is_wide16_scalar_type(g->c, ret_ty)) {
     api_push(g, api_make_lv(op, ret_ty));
   } else {
     api_push(g, api_make_sv(op, ret_ty));
@@ -131,8 +148,8 @@ static void api_tail_fallback_ret(CfreeCg* g, CfreeCgTypeId ret_ty) {
 
 static void api_finish_call(CfreeCg* g, CGCallDesc* desc, CGLocal* args,
                             u32 nargs, Operand callee_op, ApiSValue* callee,
-                            CfreeCgTypeId ret_ty, int has_result,
-                            int want_tail, int emit_tail) {
+                            CfreeCgTypeId ret_ty, int has_result, int want_tail,
+                            int emit_tail) {
   if (emit_tail) api_temp_locals_finish(g);
   if (!emit_tail) api_call_clobber_boundary(g, desc);
   g->target->call(g->target, desc);
diff --git a/src/cg/control.c b/src/cg/control.c
@@ -323,12 +323,21 @@ void cfree_cg_switch(CfreeCg* g, CfreeCgSwitch sw) {
   /* Direct O0 targets may override switch_ for a single-pass branch-chain
    * lowering. Still honor an explicit jump-table hint so tests and frontends
    * can exercise the semantic label-table path without enabling O1. */
-  native_switch_override =
-      (g->target->switch_ && g->opt_level == 0 &&
-       desc.hint != CFREE_CG_SWITCH_JUMP_TABLE);
+  native_switch_override = (g->target->switch_ && g->opt_level == 0 &&
+                            desc.hint != CFREE_CG_SWITCH_JUMP_TABLE);
   plan = native_switch_override ? (CGSwitchPlan){CG_SWITCH_PLAN_CHAIN, 0, 0}
                                 : cg_plan_switch(g, &desc);
 
+  /* The label-table lowering materializes a rodata table of code-label
+   * addresses and an indirect branch. Targets that can't express that (Wasm)
+   * realize dense dispatch through their switch_ hook (br_table) instead, so
+   * hand the plan—hint and all—to switch_ rather than the table path. */
+  if (plan.kind == CG_SWITCH_PLAN_TABLE && g->target->switch_ &&
+      g->target->supports_label_table &&
+      !g->target->supports_label_table(g->target)) {
+    plan.kind = CG_SWITCH_PLAN_CHAIN;
+  }
+
   if (plan.kind == CG_SWITCH_PLAN_TABLE) {
     /* Selector stays on the value stack; cg_emit_switch_table consumes
      * it via cg-API ops so the path also records cleanly under opt. */
@@ -337,7 +346,8 @@ void cfree_cg_switch(CfreeCg* g, CfreeCgSwitch sw) {
   } else {
     metrics_count(g->c, "cg.switch.chain", 1);
     selector = api_pop(g);
-    desc.selector = api_force_local_unless_imm(g, &selector, desc.selector_type);
+    desc.selector =
+        api_force_local_unless_imm(g, &selector, desc.selector_type);
     if (g->target->switch_) {
       g->target->switch_(g->target, &desc);
     } else {
@@ -667,8 +677,9 @@ void cfree_cg_alloca(CfreeCg* g, uint32_t align,
   sz = api_pop(g);
   pty = resolve_type(g->c, result_ptr_type);
   if (!pty) pty = cg_type_ptr_to(g->c, builtin_id(CFREE_CG_BUILTIN_VOID));
-  sz_op = api_sv_op_is(&sz, OPK_IMM) ? sz.op
-                                     : api_force_local(g, &sz, api_sv_type(&sz));
+  sz_op = api_sv_op_is(&sz, OPK_IMM)
+              ? sz.op
+              : api_force_local(g, &sz, api_sv_type(&sz));
   rr = api_alloc_temp_local(g, pty);
   dst = api_op_local(rr, pty);
   T->alloca_(T, dst, sz_op, align ? align : 16);
@@ -903,7 +914,8 @@ void cfree_cg_index(CfreeCg* g, uint64_t offset) {
   if (!base_info || base_info->kind != CFREE_CG_TYPE_ARRAY)
     api_release(g, &base);
   api_release(g, &idx);
-  api_push(g, api_make_lv(api_op_indirect(result.v.local, 0, elem_ty), elem_ty));
+  api_push(g,
+           api_make_lv(api_op_indirect(result.v.local, 0, elem_ty), elem_ty));
 }
 
 void cfree_cg_field(CfreeCg* g, uint32_t field_index) {
@@ -997,8 +1009,8 @@ void cfree_cg_field(CfreeCg* g, uint32_t field_index) {
                api_op_imm((i64)field_offset, rec_ptr_ty));
       api_release(g, &base);
     }
-    api_push(g,
-             api_make_lv(api_op_indirect(result.v.local, 0, field_ty), field_ty));
+    api_push(
+        g, api_make_lv(api_op_indirect(result.v.local, 0, field_ty), field_ty));
   } else if (base.op.kind == OPK_GLOBAL) {
     result =
         api_op_global(base.op.v.global.sym,
@@ -1024,8 +1036,8 @@ void cfree_cg_field(CfreeCg* g, uint32_t field_index) {
                api_op_imm((i64)field_offset, rec_ptr_ty));
       api_release_temp_local(g, base_addr.v.local);
     }
-    api_push(g,
-             api_make_lv(api_op_indirect(result.v.local, 0, field_ty), field_ty));
+    api_push(
+        g, api_make_lv(api_op_indirect(result.v.local, 0, field_ty), field_ty));
   }
 }
 
diff --git a/src/cg/ir_recorder.c b/src/cg/ir_recorder.c
@@ -223,6 +223,14 @@ static const char* rec_data_label_addr_unsupported_msg(CgTarget* t) {
   return "IR recorder supports function-local label address data";
 }
 
+/* A target that cannot resolve code-label addresses in static data (it set
+ * data_label_addr_unsupported_msg) likewise cannot build a label-address jump
+ * table; report that so cfree_cg_switch routes table plans through switch_. */
+static int rec_supports_label_table(CgTarget* t) {
+  CgIrRecorder* r = rec_of(t);
+  return r->data_label_addr_unsupported_msg ? 0 : 1; /* 0 when the recorder holds an "unsupported" message, 1 when that field is NULL */
+}
+
 static CGScope rec_scope_begin(CgTarget* t, const CGScopeDesc* desc) {
   CgIrRecorder* r = rec_of(t);
   CgIrInst* in = emit(r, CG_IR_SCOPE_BEGIN);
@@ -582,6 +590,7 @@ CgTarget* cg_ir_recorder_new(Compiler* c, ObjBuilder* obj,
   r->base.jump = rec_jump;
   r->base.cmp_branch = rec_cmp_branch;
   r->base.switch_ = rec_switch;
+  r->base.supports_label_table = rec_supports_label_table;
   r->base.indirect_branch = rec_indirect_branch;
   r->base.load_label_addr = rec_load_label_addr;
   r->base.local_static_data_begin = rec_local_static_data_begin;
diff --git a/src/cg/memory.c b/src/cg/memory.c
@@ -5,6 +5,13 @@ void cfree_cg_push_int(CfreeCg* g, uint64_t value, CfreeCgTypeId type) {
   if (!g) return;
   ty = resolve_type(g->c, type);
   if (!ty) return;
+  /* A 16-byte scalar immediate cannot be represented by the 64-bit op.v.imm
+   * alone; materialize it into addressable storage with both lanes
+   * sign-extended so no downstream consumer sees an undefined high half. */
+  if (api_is_wide16_scalar_type(g->c, ty)) {
+    api_push(g, api_make_wide16_int_const(g, (i64)value, ty));
+    return;
+  }
   api_push(g, api_make_sv(api_op_imm((i64)value, ty), ty));
 }
 
@@ -182,10 +189,10 @@ static int scale_to_log2(uint32_t scale) {
  * this helper does not free it.
  */
 static Operand fold_ea_into_operand(CfreeCg* g, Operand addr, i64 offset,
-                                     CGLocal index, u8 log2_scale,
-                                     CfreeCgTypeId access_ty,
-                                     int addr_is_pointer_value,
-                                     CGLocal* out_owned_base) {
+                                    CGLocal index, u8 log2_scale,
+                                    CfreeCgTypeId access_ty,
+                                    int addr_is_pointer_value,
+                                    CGLocal* out_owned_base) {
   CgTarget* T = g->target;
   CfreeCgTypeId base_ty = cg_type_is_ptr(g->c, addr.type)
                               ? addr.type
@@ -303,7 +310,8 @@ static Operand fold_ea_into_operand(CfreeCg* g, Operand addr, i64 offset,
  *   *out_log2 = log2_scale (0..3) if scale was normalized to one of {1,2,4,8}
  *               or to 0 if we materialized the scaled value (log2=0).
  */
-static CGLocal pop_and_normalize_index(CfreeCg* g, uint32_t scale, u8* out_log2) {
+static CGLocal pop_and_normalize_index(CfreeCg* g, uint32_t scale,
+                                       u8* out_log2) {
   ApiSValue idx;
   CfreeCgTypeId idx_ty;
   int lg2;
@@ -485,7 +493,8 @@ void cfree_cg_load(CfreeCg* g, CfreeCgMemAccess access, CfreeCgEffAddr ea) {
       !api_sv_local_storage_is_aggregate(g, &base) &&
       !cg_type_is_aggregate(g->c, api_sv_type(&base)) &&
       !cg_type_is_aggregate(g->c, ty) &&
-      api_unalias_type(g->c, api_sv_type(&base)) == api_unalias_type(g->c, ty)) {
+      api_unalias_type(g->c, api_sv_type(&base)) ==
+          api_unalias_type(g->c, ty)) {
     base.lvalue = 0;
     base.res = RES_FIXED_LOCAL;
     api_push(g, base);
@@ -733,6 +742,25 @@ void cfree_cg_store(CfreeCg* g, CfreeCgMemAccess access, CfreeCgEffAddr ea) {
   /* Wide-16 scalar store: keep the pre-existing wide16 lowering for the plain
    * (no-EA) case. */
   if (!has_index && !is_bitfield && api_is_wide16_scalar_type(g->c, ty)) {
+    /* Normalize the destination up front into a single offset-0 lvalue
+     * operand so every sub-branch below addresses the right location. Two
+     * cases otherwise misbehave: a pointer-rvalue base (`*p`) is the address
+     * itself and must be dereferenced (not treated as storage), and a field
+     * offset (a struct member) must be folded in. Both collapse to an
+     * OPK_INDIRECT lvalue here. */
+    if (!is_lvalue) {
+      /* Pointer-rvalue base: the operand value is the destination address. */
+      Operand ptr_op = api_force_local(g, &base, api_sv_type(&base));
+      base =
+          api_make_lv(api_op_indirect(ptr_op.v.local, (i32)ea.offset, ty), ty);
+      ea.offset = 0;
+      is_lvalue = 1;
+    } else if (ea.offset != 0 && base.op.kind == OPK_LOCAL) {
+      CfreeCgTypeId base_ptr_ty = cg_type_ptr_to(g->c, ty);
+      Operand addr = api_lvalue_addr(g, &base, base_ptr_ty);
+      base = api_make_lv(api_op_indirect(addr.v.local, (i32)ea.offset, ty), ty);
+      ea.offset = 0;
+    }
     if (base.source_local != CFREE_CG_LOCAL_NONE) {
       api_local_const_clear(api_local_from_handle(g, base.source_local));
     } else if (base.op.kind == OPK_INDIRECT || base.op.kind == OPK_GLOBAL ||
@@ -750,8 +778,8 @@ void cfree_cg_store(CfreeCg* g, CfreeCgMemAccess access, CfreeCgEffAddr ea) {
         if (ea.offset == 0) {
           dst_addr = base.op;
         } else {
-          dst_addr = fold_ea_into_operand(g, base.op, ea.offset, CG_LOCAL_NONE, 0,
-                                          ty, 0, &owned_base);
+          dst_addr = fold_ea_into_operand(g, base.op, ea.offset, CG_LOCAL_NONE,
+                                          0, ty, 0, &owned_base);
           dst_addr_owned = owned_base != CG_LOCAL_NONE;
         }
       } else if (is_lvalue) {
@@ -771,9 +799,9 @@ void cfree_cg_store(CfreeCg* g, CfreeCgMemAccess access, CfreeCgEffAddr ea) {
       agg.align = access.align ? access.align : 16;
       T->copy_bytes(T, dst_addr, src_addr, agg);
       if (dst_addr_owned) {
-        api_release_temp_local(g,
-                     dst_addr.kind == OPK_INDIRECT ? dst_addr.v.ind.base
-                                                   : dst_addr.v.local);
+        api_release_temp_local(g, dst_addr.kind == OPK_INDIRECT
+                                      ? dst_addr.v.ind.base
+                                      : dst_addr.v.local);
       }
       if (src_addr_owned) api_release_temp_local(g, src_addr.v.local);
     } else if (rv.op.kind == OPK_IMM) {
@@ -841,7 +869,8 @@ void cfree_cg_store(CfreeCg* g, CfreeCgMemAccess access, CfreeCgEffAddr ea) {
       !api_sv_local_storage_is_aggregate(g, &base) &&
       !cg_type_is_aggregate(g->c, api_sv_type(&base)) &&
       !cg_type_is_aggregate(g->c, ty) &&
-      api_unalias_type(g->c, api_sv_type(&base)) == api_unalias_type(g->c, ty)) {
+      api_unalias_type(g->c, api_sv_type(&base)) ==
+          api_unalias_type(g->c, ty)) {
     Operand dst = base.op;
     if (src.kind == OPK_IMM) {
       T->load_imm(T, dst, src.v.imm);
@@ -934,7 +963,8 @@ void cfree_cg_dup(CfreeCg* g) {
       ty = api_owned_local_type(g, &v);
       r = api_alloc_temp_local(g, ty);
       dst = api_op_local(r, ty);
-      g->target->copy(g->target, dst, api_op_local((CGLocal)api_local_of_sv(&v), ty));
+      g->target->copy(g->target, dst,
+                      api_op_local((CGLocal)api_local_of_sv(&v), ty));
       dup = v;
       api_set_owned_local(&dup, r);
       dup.res = RES_LOCAL;
@@ -951,7 +981,8 @@ void cfree_cg_dup(CfreeCg* g) {
   ty = api_owned_local_type(g, &v);
   r = api_alloc_temp_local(g, ty);
   dst = api_op_local(r, ty);
-  g->target->copy(g->target, dst, api_op_local((CGLocal)api_local_of_sv(&v), ty));
+  g->target->copy(g->target, dst,
+                  api_op_local((CGLocal)api_local_of_sv(&v), ty));
   g->stack[g->sp - 1].pinned = 0;
   dup = v;
   api_set_owned_local(&dup, r);
diff --git a/src/cg/native_direct_target.c b/src/cg/native_direct_target.c
@@ -494,8 +494,7 @@ static void nd_copy_to_reg(NativeDirectTarget* d, NativeLoc dst,
       break;
     case NATIVE_LOC_STACK: {
       NativeAddr addr;
-      MemAccess mem = nd_scalar_mem(dst.type, d->base.c->target.ptr_size,
-                                    d->base.c->target.ptr_align);
+      MemAccess mem = nd_type_mem(d, dst.type);
       memset(&addr, 0, sizeof addr);
       addr.base_kind = NATIVE_ADDR_BASE_FRAME;
       addr.base.frame = src.v.stack.slot;
@@ -746,10 +745,11 @@ static int nd_local_static_data_begin(CgTarget* t,
   if (d->local_static_active) nd_panic(d, "nested local static data");
   if (desc->attrs.section) {
     name = (Sym)desc->attrs.section;
-    kind = (desc->attrs.flags & CFREE_CG_DATADEF_READONLY) ? SEC_RODATA
-                                                           : SEC_DATA;
-    flags = (desc->attrs.flags & CFREE_CG_DATADEF_READONLY) ? SF_ALLOC
-                                                            : (SF_ALLOC | SF_WRITE);
+    kind =
+        (desc->attrs.flags & CFREE_CG_DATADEF_READONLY) ? SEC_RODATA : SEC_DATA;
+    flags = (desc->attrs.flags & CFREE_CG_DATADEF_READONLY)
+                ? SF_ALLOC
+                : (SF_ALLOC | SF_WRITE);
   } else if (desc->attrs.flags & CFREE_CG_DATADEF_READONLY) {
     name = pool_intern_slice(t->c->global, SLICE_LIT(".rodata"));
     kind = SEC_RODATA;
@@ -795,7 +795,8 @@ static void nd_local_static_data_label_addr(CgTarget* t, Label target,
   u8 zero[8];
   (void)width;
   (void)address_space;
-  if (!d->local_static_active) nd_panic(d, "label address outside local static data");
+  if (!d->local_static_active)
+    nd_panic(d, "label address outside local static data");
   if (width != 8u) nd_panic(d, "unsupported local static label address width");
   memset(zero, 0, sizeof zero);
   off = d->local_static_base + d->local_static_size;
@@ -912,15 +913,15 @@ static void nd_copy(CgTarget* t, Operand dst, Operand src) {
     memset(&access, 0, sizeof access);
     access.type = dst.type;
     access.size = (u32)size;
-    access.align = dst.type ? cg_type_align(t->c, dst.type)
-                            : (u32)t->c->target.ptr_align;
+    access.align =
+        dst.type ? cg_type_align(t->c, dst.type) : (u32)t->c->target.ptr_align;
     access.mem.type = dst.type;
     access.mem.size = access.size;
     access.mem.align = access.align;
-    NativeAddr da = nd_addr_materialize(d, nd_addr_storage(d, dst), &dt,
-                                        access.mem);
-    NativeAddr sa = nd_addr_materialize(d, nd_addr_storage(d, src), &st,
-                                        access.mem);
+    NativeAddr da =
+        nd_addr_materialize(d, nd_addr_storage(d, dst), &dt, access.mem);
+    NativeAddr sa =
+        nd_addr_materialize(d, nd_addr_storage(d, src), &st, access.mem);
     ND_REQUIRE_NATIVE(d, copy_bytes, "target does not copy bytes");
     d->native->copy_bytes(d->native, da, sa, access);
     nd_addr_temps_release(d, &st);
@@ -935,7 +936,8 @@ static void nd_copy(CgTarget* t, Operand dst, Operand src) {
 static void nd_load(CgTarget* t, Operand dst, Operand addr, MemAccess mem) {
   NativeDirectTarget* d = nd_of(t);
   NdAddrTemps temps;
-  u64 size = mem.size ? mem.size : (mem.type ? cg_type_size(t->c, mem.type) : 0);
+  u64 size =
+      mem.size ? mem.size : (mem.type ? cg_type_size(t->c, mem.type) : 0);
   if (mem.flags & MF_VOLATILE)
     nd_barrier(d,
                NATIVE_DIRECT_BARRIER_MEMORY | NATIVE_DIRECT_BARRIER_VOLATILE);
@@ -967,7 +969,8 @@ static void nd_load(CgTarget* t, Operand dst, Operand addr, MemAccess mem) {
 static void nd_store(CgTarget* t, Operand addr, Operand src, MemAccess mem) {
   NativeDirectTarget* d = nd_of(t);
   NdAddrTemps temps;
-  u64 size = mem.size ? mem.size : (mem.type ? cg_type_size(t->c, mem.type) : 0);
+  u64 size =
+      mem.size ? mem.size : (mem.type ? cg_type_size(t->c, mem.type) : 0);
   if (mem.flags & MF_VOLATILE)
     nd_barrier(d,
                NATIVE_DIRECT_BARRIER_MEMORY | NATIVE_DIRECT_BARRIER_VOLATILE);
diff --git a/src/cg/wide.c b/src/cg/wide.c
@@ -19,6 +19,28 @@ u64 api_u64_from_target_bytes(CfreeCg* g, const u8* bytes) {
   return v;
 }
 
+void api_wide16_sext_imm_bytes(CfreeCg* g, i64 imm, u8 bytes[16]) {
+  /* A 16-byte scalar immediate only carries 64 bits in op.v.imm; the full
+   * value is its sign-extension. Fill both lanes accordingly, honoring the
+   * target byte order. */
+  u64 lo = (u64)imm;
+  u64 hi = imm < 0 ? ~(u64)0 : 0; /* NOTE(review): sign fill is applied whatever the type's signedness; cfree_cg_push_int casts a uint64_t to i64 before reaching here -- confirm for unsigned 128-bit constants with bit 63 set */
+  for (u32 i = 0; i < 8; ++i) { /* i counts bytes from least significant; all 16 output bytes get written */
+    u32 lo_idx = g->c->target.big_endian ? 15u - i : i; /* low lane occupies bytes 0..7 little-endian, 15..8 big-endian */
+    u32 hi_idx = g->c->target.big_endian ? 7u - i : 8u + i; /* high lane occupies bytes 8..15 little-endian, 7..0 big-endian */
+    bytes[lo_idx] = (u8)(lo >> (i * 8u));
+    bytes[hi_idx] = (u8)(hi >> (i * 8u));
+  }
+}
+
+ApiSValue api_make_wide16_int_const(CfreeCg* g, i64 value, CfreeCgTypeId ty) { /* builds a 16-byte constant from a 64-bit value and returns it as an lvalue over a temp local */
+  u8 bytes[16];
+  CGLocal local = api_f128_temp_local(g, ty); /* the f128-named temp helper is reused for the 16-byte integer */
+  api_wide16_sext_imm_bytes(g, value, bytes); /* fills all 16 bytes, sign-extending value into the high lane */
+  api_store_f128_bytes(g, local, ty, bytes);
+  return api_make_lv(api_op_local(local, ty), ty); /* lvalue, not scalar value: the constant lives in addressable storage */
+}
+
 void api_store_f128_bytes(CfreeCg* g, CGLocal local, CfreeCgTypeId ty,
                           const u8 bytes[16]) {
   CfreeCgTypeId i64_ty = builtin_id(CFREE_CG_BUILTIN_I64);
@@ -145,16 +167,7 @@ ApiSValue api_wide16_materialize_lvalue(CfreeCg* g, ApiSValue* v,
     return api_make_lv(dst, ty);
   }
   if (v->op.kind == OPK_IMM) {
-    u8 bytes[16];
-    u64 lo = (u64)v->op.v.imm;
-    memset(bytes, 0, sizeof bytes);
-    for (u32 i = 0; i < 8; ++i) {
-      u32 idx = g->c->target.big_endian ? 15u - i : i;
-      bytes[idx] = (u8)(lo >> (i * 8u));
-    }
-    CGLocal local = api_f128_temp_local(g, ty);
-    api_store_f128_bytes(g, local, ty, bytes);
-    return api_make_lv(api_op_local(local, ty), ty);
+    return api_make_wide16_int_const(g, v->op.v.imm, ty);
   }
   compiler_panic(
       g->c, g->cur_loc,

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	src/cg/arith.c	\|	217	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M	src/cg/call.c	\|	29	+++++++++++++++++++++++------
M	src/cg/control.c	\|	34	+++++++++++++++++++++++-----------
M	src/cg/ir_recorder.c	\|	9	+++++++++
M	src/cg/memory.c	\|	59	+++++++++++++++++++++++++++++++++++++++++++++--------------
M	src/cg/native_direct_target.c	\|	33	++++++++++++++++++---------------
M	src/cg/wide.c	\|	33	+++++++++++++++++++++++----------