cg: flow i128/f128 as VALUEs, collapse wide16 special paths (Track 7.3) - kit

commit 6f48bfde8f810cd7e705dc9c31f7391da7607acc
parent e554263a21a3c16282604bbdaad3bee5283f039c
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Tue,  2 Jun 2026 03:54:03 -0700

cg: flow i128/f128 as VALUEs, collapse wide16 special paths (Track 7.3)

Diffstat:
M src/cg/call.c  | 32 ++++++++++++--------------------
M src/cg/memory.c  | 87 +++++++------------------------------------------------------------------------

2 files changed, 19 insertions(+), 100 deletions(-)
diff --git a/src/cg/call.c b/src/cg/call.c
@@ -41,7 +41,14 @@ static CGLocal api_materialize_call_local(CfreeCg* g, ApiSValue* arg,
     return r;
   }
   CfreeCgTypeId src_ty = api_sv_type(arg);
-  Operand op = api_force_local_unless_imm(g, arg, src_ty);
+  Operand op;
+  /* A 16-byte scalar immediate (an i128 small constant) only carries 64 bits in
+   * op.v.imm; materialize it into both sign-extended lanes so it flows as an
+   * ordinary 16-byte value rather than load_imm'ing only the low lane. */
+  if (api_sv_op_is(arg, OPK_IMM) && api_is_wide16_scalar_type(g->c, ty)) {
+    *arg = api_make_wide16_int_const(g, arg->op.v.imm, ty);
+  }
+  op = api_force_local_unless_imm(g, arg, src_ty);
   if (op.kind == OPK_LOCAL &&
       api_unalias_type(g->c, op.type) == api_unalias_type(g->c, ty)) {
     return op.v.local;
@@ -50,23 +57,7 @@ static CGLocal api_materialize_call_local(CfreeCg* g, ApiSValue* arg,
   CGLocal r = api_alloc_temp_local(g, ty);
   Operand dst = api_op_local(r, ty);
   if (op.kind == OPK_IMM) {
-    if (api_is_wide16_scalar_type(g->c, ty)) {
-      /* A 16-byte scalar immediate (an i128 small constant) only carries
-       * 64 bits in op.v.imm; load_imm would leave the high lane as stack
-       * garbage. Write both lanes, sign-extending into the high half. */
-      u8 bytes[16];
-      u64 lo = (u64)op.v.imm;
-      u64 hi = (op.v.imm < 0) ? ~(u64)0 : 0;
-      for (u32 i = 0; i < 8; ++i) {
-        u32 lo_idx = g->c->target.big_endian ? 15u - i : i;
-        u32 hi_idx = g->c->target.big_endian ? 7u - i : 8u + i;
-        bytes[lo_idx] = (u8)(lo >> (i * 8u));
-        bytes[hi_idx] = (u8)(hi >> (i * 8u));
-      }
-      api_store_f128_bytes(g, r, ty, bytes);
-    } else {
-      g->target->load_imm(g->target, dst, op.v.imm);
-    }
+    g->target->load_imm(g->target, dst, op.v.imm);
   } else if (op.kind == OPK_LOCAL) {
     g->target->copy(g->target, dst, op);
   } else {
@@ -98,8 +89,9 @@ void api_release_call_args(CfreeCg* g, CGLocal* args, u32 nargs) {
 
 void api_push_call_result(CfreeCg* g, CGLocal result, CfreeCgTypeId ret_ty) {
   Operand op = api_op_local(result, ret_ty);
-  if (cg_type_is_aggregate(g->c, ret_ty) ||
-      api_is_wide16_scalar_type(g->c, ret_ty)) {
+  /* An aggregate result is a PLACE (it is addressed/copied, never a scalar
+   * VALUE); i128/f128 are scalar VALUEs and flow like any other result. */
+  if (cg_type_is_aggregate(g->c, ret_ty)) {
     api_push(g, api_make_lv(op, ret_ty));
   } else {
     api_push(g, api_make_sv(op, ret_ty));
diff --git a/src/cg/memory.c b/src/cg/memory.c
@@ -258,12 +258,6 @@ void cfree_cg_load(CfreeCg* g, CfreeCgMemAccess access) {
     return;
   }
 
-  /* Wide-16 scalar place: keep the addressable storage as the value. */
-  if (!is_bitfield && api_is_wide16_scalar_type(g->c, ty)) {
-    api_push(g, base);
-    return;
-  }
-
   /* Resolve the place into a single backend memop operand. */
   if (!api_operand_can_address(&base.op)) {
     CfreeCgTypeId pty = cg_type_ptr_to(g->c, api_sv_type(&base));
@@ -441,80 +435,13 @@ void cfree_cg_store(CfreeCg* g, CfreeCgMemAccess access) {
 
   if (!is_bitfield) api_validate_memory_value(g, "store", ty, api_sv_type(&rv));
 
-  /* Wide-16 scalar store. */
-  if (!is_bitfield && api_is_wide16_scalar_type(g->c, ty)) {
-    if (base.source_local != CFREE_CG_LOCAL_NONE) {
-      api_local_const_clear(api_local_from_handle(g, base.source_local));
-    } else if (base.op.kind == OPK_INDIRECT || base.op.kind == OPK_GLOBAL ||
-               (access.flags & CFREE_CG_MEM_VOLATILE)) {
-      api_local_const_memory_boundary(g);
-    }
-    if (api_is_lvalue_sv(&rv)) {
-      CfreeCgTypeId ptr_ty = cg_type_ptr_to(g->c, ty);
-      Operand dst_addr;
-      Operand src_addr;
-      int dst_addr_owned = 0;
-      int src_addr_owned = 0;
-      AggregateAccess agg;
-      if (base.op.kind == OPK_LOCAL) {
-        dst_addr = base.op;
-      } else {
-        dst_addr = api_lvalue_addr(g, &base, ptr_ty);
-        dst_addr_owned = 1;
-      }
-      if (rv.op.kind == OPK_LOCAL) {
-        src_addr = rv.op;
-      } else {
-        src_addr = api_lvalue_addr(g, &rv, ptr_ty);
-        src_addr_owned = 1;
-      }
-      memset(&agg, 0, sizeof agg);
-      agg.size = 16;
-      agg.align = access.align ? access.align : 16;
-      T->copy_bytes(T, dst_addr, src_addr, agg);
-      if (dst_addr_owned) {
-        api_release_temp_local(g, dst_addr.kind == OPK_INDIRECT
-                                      ? dst_addr.v.ind.base
-                                      : dst_addr.v.local);
-      }
-      if (src_addr_owned) api_release_temp_local(g, src_addr.v.local);
-    } else if (rv.op.kind == OPK_IMM) {
-      u8 bytes[16];
-      u64 lo = (u64)rv.op.v.imm;
-      u64 hi = rv.op.v.imm < 0 ? ~(u64)0 : 0;
-      memset(bytes, 0, sizeof bytes);
-      for (u32 i = 0; i < 8; ++i) {
-        u32 lo_idx = g->c->target.big_endian ? 15u - i : i;
-        u32 hi_idx = g->c->target.big_endian ? 7u - i : 8u + i;
-        bytes[lo_idx] = (u8)(lo >> (i * 8u));
-        bytes[hi_idx] = (u8)(hi >> (i * 8u));
-      }
-      if (base.op.kind == OPK_LOCAL) {
-        api_store_f128_bytes(g, base.op.v.local, ty, bytes);
-      } else {
-        CGLocal local = api_f128_temp_local(g, ty);
-        ApiSValue tmp = api_make_lv(api_op_local(local, ty), ty);
-        CfreeCgTypeId ptr_ty = cg_type_ptr_to(g->c, ty);
-        Operand dst_addr;
-        Operand src_addr;
-        AggregateAccess agg;
-        api_store_f128_bytes(g, local, ty, bytes);
-        dst_addr = api_lvalue_addr(g, &base, ptr_ty);
-        src_addr = api_lvalue_addr(g, &tmp, ptr_ty);
-        memset(&agg, 0, sizeof agg);
-        agg.size = 16;
-        agg.align = access.align ? access.align : 16;
-        T->copy_bytes(T, dst_addr, src_addr, agg);
-        api_release_temp_local(g, dst_addr.v.local);
-        api_release_temp_local(g, src_addr.v.local);
-      }
-    } else {
-      src = api_force_local(g, &rv, ty);
-      T->store(T, base.op, src, api_mem_from_access(g, &base.op, access));
-    }
-    api_release(g, &base);
-    api_release(g, &rv);
-    return;
+  /* A 16-byte scalar immediate (an i128 small constant) only carries 64 bits in
+   * op.v.imm; materialize it into both sign-extended lanes so the general store
+   * path moves a correct 16-byte value rather than load_imm'ing the low lane and
+   * leaving the high half as garbage. */
+  if (!is_bitfield && api_sv_op_is(&rv, OPK_IMM) &&
+      api_is_wide16_scalar_type(g->c, ty)) {
+    rv = api_make_wide16_int_const(g, rv.op.v.imm, ty);
   }
 
   /* General scalar / bit-field store. Compute the source operand first so its

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log | Files | Refs | README

M	src/cg/call.c	|	32	++++++++++++--------------------
M	src/cg/memory.c	|	87	+++++++------------------------------------------------------------------------