rv32: use ABI split-lane scalar metadata - kit

commit e4a333278d9c614276c265232293826e26aa3b63
parent ea0e478e0cdbf5187b454ea2844b83a9e12e2964
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Wed,  3 Jun 2026 21:22:40 -0700

rv32: use ABI split-lane scalar metadata

Add target data-model width selection for C long/size_t spellings, expose ABI scalar split-lane metadata, and use it to gate wide8 lowering/backend guards. Also fix frame-backed indirect index materialization to load the pointer-width value and add an RV32 parser fixture for data-model widths.

Diffstat:
M lang/c/abi/c_abi.c  | 2 ++
M lang/c/type/type.c  | 4 ++--
M lang/cpp/cpp_support.h  | 7 +++++++
M lang/cpp/pp/pp.c  | 2 +-
M src/abi/abi.c  | 5 +++++
M src/abi/abi.h  | 1 +
M src/abi/abi_internal.h  | 4 ++++
M src/abi/abi_rv64.c  | 14 ++++++++++++++
M src/cg/arith.c  | 60 +++++++++++++++++++++++++++++-------------------------------
M src/cg/call.c  | 5 ++---
M src/cg/control.c  | 6 +++---
M src/cg/local.c  | 4 ++--
M src/cg/memory.c  | 14 ++++++--------
M src/cg/native_direct_target.c  | 45 ++++++++++++++++++++++++---------------------
M src/cg/value.c  | 30 ++++++++++++++----------------
M src/cg/wide.c  | 29 ++++++++++++++---------------
M src/opt/cg_ir_lower.c  | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
A test/parse/cases/6_2_5_04_data_model_widths.c  | 23 +++++++++++++++++++++++
A test/parse/cases/6_2_5_04_data_model_widths.expected  | 1 +

19 files changed, 238 insertions(+), 119 deletions(-)
diff --git a/lang/c/abi/c_abi.c b/lang/c/abi/c_abi.c
@@ -96,6 +96,7 @@ const ABIFuncInfo* c_abi_func_info(KitCompiler* a, Pool* p,
 
 static const Type* c_size_or_uintptr(KitCompiler* a, Pool* p) {
   KitTargetSpec target = kit_compiler_target_spec(a);
+  if (kit_target_uses_lp64(target)) return type_prim(p, TY_ULONG);
   return target.ptr_size == 8 ? type_prim(p, TY_ULLONG) : type_prim(p, TY_UINT);
 }
 
@@ -105,6 +106,7 @@ const Type* c_abi_size_type(KitCompiler* a, Pool* p) {
 
 const Type* c_abi_ptrdiff_type(KitCompiler* a, Pool* p) {
   KitTargetSpec target = kit_compiler_target_spec(a);
+  if (kit_target_uses_lp64(target)) return type_prim(p, TY_LONG);
   return target.ptr_size == 8 ? type_prim(p, TY_LLONG) : type_prim(p, TY_INT);
 }
 
diff --git a/lang/c/type/type.c b/lang/c/type/type.c
@@ -538,8 +538,8 @@ static KitCgTypeId type_cg_builtin(KitCompiler* c, TypeKind kind) {
       return b.id[KIT_CG_BUILTIN_I32];
     case TY_LONG:
     case TY_ULONG:
-      if (target.os == KIT_OS_WINDOWS) return b.id[KIT_CG_BUILTIN_I32];
-      return b.id[KIT_CG_BUILTIN_I64];
+      return b.id[kit_target_uses_lp64(target) ? KIT_CG_BUILTIN_I64
+                                               : KIT_CG_BUILTIN_I32];
     case TY_LLONG:
     case TY_ULLONG:
       return b.id[KIT_CG_BUILTIN_I64];
diff --git a/lang/cpp/cpp_support.h b/lang/cpp/cpp_support.h
@@ -35,6 +35,13 @@ typedef struct Pool {
   void* type_cache; /* opaque slot owned by the C frontend; unused by cpp */
 } Pool;
 
+/* C data model for frontend-visible scalar spelling. kit currently uses LP64
+ * for 64-bit non-Windows targets, LLP64 for 64-bit Windows targets, and ILP32
+ * for 32-bit targets. */
+static inline int kit_target_uses_lp64(KitTargetSpec t) {
+  return t.ptr_size == 8 && t.os != KIT_OS_WINDOWS;
+}
+
 static inline Pool* c_pool_new(Compiler* c) {
   Heap* h = kit_compiler_context(c)->heap;
   Pool* p = h ? (Pool*)h->alloc(h, sizeof(*p), _Alignof(Pool)) : NULL;
diff --git a/lang/cpp/pp/pp.c b/lang/cpp/pp/pp.c
@@ -350,7 +350,7 @@ static void pp_register_target_predefined(Pp* pp) {
   uint32_t i;
   int ptr64 = (target.ptr_size == 8);
   int win = (target.os == KIT_OS_WINDOWS);
-  int lp64 = ptr64 && !win;
+  int lp64 = kit_target_uses_lp64(target);
   int wchar16 = win;
 
   for (i = 0; i < narch_defs; ++i) {
diff --git a/src/abi/abi.c b/src/abi/abi.c
@@ -98,6 +98,11 @@ u32 abi_cg_alignof(TargetABI* a, KitCgTypeId id) {
   return abi_cg_type_info(a, id).align;
 }
 
+u32 abi_cg_scalar_split_lane_size(TargetABI* a, KitCgTypeId id) {
+  if (!a || !a->vt || !a->vt->scalar_split_lane_size) return 0;
+  return a->vt->scalar_split_lane_size(a, id);
+}
+
 /* ---- record layout (struct/union) ----
  *
  * The CG type constructor computes the shared source-facing record layout.
diff --git a/src/abi/abi.h b/src/abi/abi.h
@@ -149,6 +149,7 @@ Compiler* abi_compiler(TargetABI*);
 ABITypeInfo abi_cg_type_info(TargetABI*, KitCgTypeId);
 u32 abi_cg_sizeof(TargetABI*, KitCgTypeId);
 u32 abi_cg_alignof(TargetABI*, KitCgTypeId);
+u32 abi_cg_scalar_split_lane_size(TargetABI*, KitCgTypeId);
 const ABIRecordLayout* abi_cg_record_layout(TargetABI*, KitCgTypeId);
 const ABIFuncInfo* abi_cg_func_info(TargetABI*, KitCgTypeId fn_type);
 ABITypeInfo abi_va_list_info(TargetABI*);
diff --git a/src/abi/abi_internal.h b/src/abi/abi_internal.h
@@ -13,6 +13,10 @@ typedef struct ABIVtable {
   /* Compute the ABIFuncInfo for a function type. The cache wrapper in
    * abi.c calls this once per CgTypeId and memoizes the result. */
   ABIFuncInfo* (*compute_func_info)(TargetABI*, KitCgTypeId fn);
+  /* Optional. Return the byte width of each lane when a scalar has to be
+   * lowered by generic CG as multiple addressable machine-word lanes, or 0
+   * when the target ABI treats it as one scalar value. */
+  u32 (*scalar_split_lane_size)(TargetABI*, KitCgTypeId);
   ABITypeInfo va_list_info;
   ABIVaListInfo va_list_layout;
 } ABIVtable;
diff --git a/src/abi/abi_rv64.c b/src/abi/abi_rv64.c
@@ -178,6 +178,19 @@ static void classify_scalar(TargetABI* a, KitCgTypeId t, ABIArgInfo* out) {
   out->nparts = 1;
 }
 
+static u32 riscv32_scalar_split_lane_size(TargetABI* a, KitCgTypeId t) {
+  RiscvAbiDesc d = riscv_abi_desc(a);
+  ABITypeInfo ti = abi_internal_type_info(a, t);
+  int fp_part;
+  if (d.gpr_bytes != 4u) return 0;
+  fp_part = (ti.scalar_kind == ABI_SC_FLOAT) &&
+            riscv_fp_eligible(d.flen, ti.size);
+  if (ti.size == 2u * d.gpr_bytes && !fp_part &&
+      (ti.scalar_kind == ABI_SC_INT || ti.scalar_kind == ABI_SC_FLOAT))
+    return d.gpr_bytes;
+  return 0;
+}
+
 static void classify_void(ABIArgInfo* out) {
   memset(out, 0, sizeof *out);
   out->kind = ABI_ARG_IGNORE;
@@ -326,6 +339,7 @@ const ABIVtable rv64_vtable = {
 
 const ABIVtable rv32_vtable = {
     .compute_func_info = riscv_compute_func_info,
+    .scalar_split_lane_size = riscv32_scalar_split_lane_size,
     .va_list_info = {4, 4, ABI_SC_PTR, 0, 0, 0},
     /* ILP32* va_list is a plain 4-byte pointer; the variadic register-save
      * area is the 8 integer arg registers (a0..a7) spilled contiguously =
diff --git a/src/cg/arith.c b/src/cg/arith.c
@@ -234,8 +234,8 @@ void api_cg_convert_kind(KitCg* g, KitCgTypeId dst_type, ConvKind ck) {
     i64 folded;
     if (api_try_fold_int_convert(g, ck, sty, dty, v.op.v.imm, &folded)) {
       api_release(g, &v);
-      /* A folded result of rv32 8-byte type must be memory-resident (two lanes),
-       * not a bare i64 immediate the backend would truncate. */
+      /* A folded split-lane 8-byte result must be memory-resident, not a bare
+       * i64 immediate the backend would truncate. */
       if (api_is_wide8_scalar_type(g->c, dty))
         api_push(g, api_make_wide8_int_const(g, folded, dty));
       else
@@ -314,16 +314,16 @@ int api_i128_stack_top(KitCg* g, u32 depth) {
   return api_is_i128_type(g->c, api_sv_type(&g->stack[g->sp - 1u - depth]));
 }
 
-/* "Wider than the target machine word but NOT i128" — i.e. a 64-bit integer on
- * a 32-bit target (rv32). The native backend handles add/sub/and/or/xor on such
- * values as register pairs, but mul/div/shift must be lowered to a __*di3
- * runtime call (see api_wideint64_binop). i128 routes through its own ti3 path
- * (api_i128_*), so it is explicitly excluded here. */
+/* 64-bit integer split into two 32-bit lanes by the selected ABI. The native
+ * backend handles add/sub/and/or/xor on such values as register pairs, but
+ * mul/div/shift must be lowered to a __*di3 runtime call (see
+ * api_wideint64_binop). i128 routes through its own ti3 path (api_i128_*), so
+ * it is explicitly excluded here. */
 static int api_int_is_wide64(KitCg* g, KitCgTypeId ty) {
   if (!g) return 0;
   if (api_is_i128_type(g->c, ty)) return 0;
   if (kit_cg_type_int_width((KitCompiler*)g->c, ty) == 0) return 0;
-  return abi_cg_sizeof(g->c->abi, ty) > g->c->target.ptr_size;
+  return api_is_wide8_scalar_type(g->c, ty);
 }
 
 static int api_wide64_stack_top(KitCg* g, u32 depth) {
@@ -425,13 +425,12 @@ static void api_wideint64_binop(KitCg* g, BinOp iop) {
 }
 
 /* ============================================================
- * wide8 inline 2-word lane arithmetic (rv32 i64)
+ * wide8 inline 2-word lane arithmetic
  *
- * On rv32 a 64-bit integer is a memory-resident 8-byte scalar. add/sub/and/or/
- * xor/neg/not and compares have no compiler-rt helper (they would recurse), so
- * they are emitted INLINE here as 32-bit lane ops, mirroring the i128 lane
- * primitives but operating on register-class i32 lanes loaded from / stored to
- * the value's memory home. mul/div/rem/shift route to __*di3 (api_wideint64_*).
+ * Some 32-bit ABIs represent a 64-bit integer as a memory-resident 8-byte
+ * scalar split into two 32-bit lanes. add/sub/and/or/xor/neg/not and compares
+ * have no compiler-rt helper (they would recurse), so they are emitted INLINE
+ * here as lane ops. mul/div/rem/shift route to __*di3 (api_wideint64_*).
  * ============================================================ */
 
 static i32 wide8_lo_off(KitCg* g) { return g->c->target.big_endian ? 4 : 0; }
@@ -620,11 +619,11 @@ static void api_wide64_cmp_inline(KitCg* g, CmpOp cop) {
 }
 
 /* ============================================================
- * wide64 __builtin_*_overflow on rv32 (inline, 2-lane)
+ * wide64 __builtin_*_overflow on split-lane 64-bit values
  *
  * The native backends only model single-register overflow, so a 64-bit
  * operand traps there. Here we legalize the 6 overflow intrinsics for a
- * 64-bit (rv32 i64) operand pair into 32-bit lane ops, computing both the
+ * 64-bit operand pair into 32-bit lane ops, computing both the
  * 64-bit wrapped value (stored to a fresh 8-byte temp) and the boolean
  * overflow flag, then pushing [value, ok] exactly as the native path does.
  * add/sub reuse the carry/borrow lane logic; mul builds the full 128-bit
@@ -682,7 +681,7 @@ static Operand wide8_addc(KitCg* g, Operand acc, Operand addend,
   return sum;
 }
 
-/* The 6 __builtin_*_overflow intrinsics for a wide64 (rv32 i64) operand pair.
+/* The 6 __builtin_*_overflow intrinsics for a split-lane wide64 operand pair.
  * Pops the two 8-byte args, computes the wrapped 64-bit value into a fresh
  * 8-byte temp and the bool overflow flag into an i32, then pushes [value, ok]
  * matching the contract of the native overflow path. */
@@ -868,7 +867,7 @@ static void api_wide64_overflow_inline(KitCg* g, KitCgIntrinsic intrin) {
   }
 }
 
-/* int<->i64 conversions on rv32 (sext/zext/trunc/bitcast across the 4<->8
+/* int<->split-i64 conversions (sext/zext/trunc/bitcast across the 4<->8
  * boundary, and i64->bool). Returns 1 if it handled (and consumed) *v. The
  * i64<->float conversions are routed to libcalls in kit_cg_*_to_float /
  * kit_cg_float_to_* and never reach here. */
@@ -1065,10 +1064,10 @@ void kit_cg_int_binop(KitCg* g, KitCgIntBinOp op, uint32_t flags) {
     api_i128_binop(g, iop);
     return;
   }
-  /* 64-bit int on a 32-bit target (rv32): mul/div/rem/shift become __*di3
+  /* 64-bit int split into 32-bit lanes: mul/div/rem/shift become __*di3
    * runtime calls; add/sub/and/or/xor are emitted inline as 2-word lane ops
    * (no compiler-rt helper exists for them). Both keep the value memory-resident
-   * so the allocator never tries to put 8 bytes in one 4-byte register. */
+   * so the allocator never tries to put 8 bytes in one 4-byte value slot. */
   if (g && (api_wide64_stack_top(g, 0) || api_wide64_stack_top(g, 1))) {
     if (api_wideint64_binop_helper(iop))
       api_wideint64_binop(g, iop);
@@ -1085,7 +1084,7 @@ void kit_cg_int_unop(KitCg* g, KitCgIntUnOp op, uint32_t flags) {
     api_i128_unop(g, iop);
     return;
   }
-  /* rv32 64-bit int: neg/bnot are inline 2-word lane ops; logical-not (!x) is
+  /* Split 64-bit int: neg/bnot are inline 2-word lane ops; logical-not (!x) is
    * the full-value truthiness test (lo|hi)==0. */
   if (g && api_wide64_stack_top(g, 0)) {
     if (iop == UO_NEG || iop == UO_BNOT) {
@@ -1557,8 +1556,7 @@ void kit_cg_sint_to_float(KitCg* g, KitCgTypeId dst, KitCgRounding rounding) {
     api_f128_call_unary(g, name, resolve_type(g->c, dst), pty);
     return;
   }
-  /* signed i64 -> hardware single float: rv32 has no fcvt.s.l, so the i64->f32
-   * conversion is a __floatdisf runtime call (mirrors clang under ilp32f). */
+  /* signed split-i64 -> hardware single float: use __floatdisf. */
   if (api_wide64_stack_top(g, 0)) {
     api_f128_call_unary(g, "__floatdisf", resolve_type(g->c, dst),
                         builtin_id(KIT_CG_BUILTIN_I64));
@@ -1645,7 +1643,7 @@ void kit_cg_float_to_sint(KitCg* g, KitCgTypeId dst, KitCgRounding rounding) {
     if (rty != dty) api_cg_convert_kind(g, dty, CV_TRUNC);
     return;
   }
-  /* hardware single float -> i64: rv32 has no fcvt.l.s, so __fixsfdi. */
+  /* hardware single float -> split-i64: use __fixsfdi. */
   if (api_is_wide8_scalar_type(g->c, resolve_type(g->c, dst))) {
     api_f128_call_unary(g, "__fixsfdi", resolve_type(g->c, dst),
                         builtin_id(KIT_CG_BUILTIN_F32));
@@ -1690,7 +1688,7 @@ void kit_cg_float_to_uint(KitCg* g, KitCgTypeId dst, KitCgRounding rounding) {
     if (rty != dty) api_cg_convert_kind(g, dty, CV_TRUNC);
     return;
   }
-  /* hardware single float -> u64: __fixunssfdi. */
+  /* hardware single float -> split-u64: use __fixunssfdi. */
   if (api_is_wide8_scalar_type(g->c, resolve_type(g->c, dst))) {
     api_f128_call_unary(g, "__fixunssfdi", resolve_type(g->c, dst),
                         builtin_id(KIT_CG_BUILTIN_F32));
@@ -1821,10 +1819,10 @@ void kit_cg_intrinsic(KitCg* g, KitCgIntrinsic intrin, uint32_t nargs,
   u32 ndst = 0;
   Heap* h;
   if (!g) return;
-  /* rv32: clz/ctz/popcount/bswap on a 64-bit value cannot be the single-register
-   * software sequence the backend emits (it would shift by 32 — an illegal rv32
-   * shamt). Route them to the compiler-rt __*di2 helpers, which decompose into
-   * 32-bit operations. (32-bit forms still lower inline.) */
+  /* clz/ctz/popcount/bswap on a split 64-bit value cannot use the backend's
+   * single-register software sequence. Route them to the compiler-rt __*di2
+   * helpers, which decompose into 32-bit operations. (32-bit forms still lower
+   * inline.) */
   if (nargs == 1 && api_wide64_stack_top(g, 0)) {
     const char* name = NULL;
     KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
@@ -1846,10 +1844,10 @@ void kit_cg_intrinsic(KitCg* g, KitCgIntrinsic intrin, uint32_t nargs,
       return;
     }
   }
-  /* rv32: __builtin_*_overflow on a 64-bit operand pair traps in the native
+  /* __builtin_*_overflow on a split 64-bit operand pair traps in the native
    * backend (it only models single-register overflow). Legalize all 6 forms
    * inline as 2-lane / 4-lane ops, pushing [value, ok] like the native path.
-   * Gated on both operands being wide64 so 32-bit / non-rv32 are unchanged. */
+   * Gated on both operands being wide64 so other targets are unchanged. */
   if (nargs == 2 && api_intrinsic_is_overflow(intrin) &&
       api_wide64_stack_top(g, 0) && api_wide64_stack_top(g, 1)) {
     api_wide64_overflow_inline(g, intrin);
diff --git a/src/cg/call.c b/src/cg/call.c
@@ -48,9 +48,8 @@ static CGLocal api_materialize_call_local(KitCg* g, ApiSValue* arg,
   if (api_sv_op_is(arg, OPK_IMM) && api_is_wide16_scalar_type(g->c, ty)) {
     *arg = api_make_wide16_int_const(g, arg->op.v.imm, ty);
   }
-  /* Same for an rv32 8-byte immediate argument: materialize it as a 2-lane
-   * memory value so the multi-part ABI path marshals both words into the GPR
-   * pair, instead of load_imm'ing only the low word into one register. */
+  /* Same for a split-lane 8-byte immediate argument: materialize it as a
+   * 2-lane memory value so the multi-part ABI path marshals both words. */
   if (api_sv_op_is(arg, OPK_IMM) && api_is_wide8_scalar_type(g->c, ty)) {
     *arg = api_make_wide8_int_const(g, arg->op.v.imm, ty);
   }
diff --git a/src/cg/control.c b/src/cg/control.c
@@ -50,9 +50,9 @@ void api_branch_if(KitCg* g, ApiSValue* v, int branch_when_true, Label label) {
     api_branch_if(g, &cmp, branch_when_true, label);
     return;
   }
-  /* rv32 8-byte int (or soft double) truthiness: branch on (lo | hi) != 0. The
-   * value is memory-resident, so a single-register CMP_NE-vs-zero would only see
-   * the low word; OR the two lanes into an i32 first. */
+  /* Split-lane 8-byte truthiness: branch on (lo | hi) != 0. The value is
+   * memory-resident, so a single-slot CMP_NE-vs-zero would only see the low
+   * word; OR the two lanes into an i32 first. */
   if (api_is_wide8_scalar_type(g->c, ty)) {
     KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
     Operand orl = api_wide8_or_lanes(g, v, ty);
diff --git a/src/cg/local.c b/src/cg/local.c
@@ -5,8 +5,8 @@ int api_local_requires_memory(KitCg* g, KitCgTypeId ty, KitCgLocalAttrs attrs) {
                      KIT_CG_LOCAL_COMPILER_TEMP;
   if (g && g->debug && attrs.name && (attrs.flags & hidden_flags) == 0)
     return 1;
-  /* Aggregates (records, arrays), wide16 (f128/i128), wide8 (rv32 i64/double),
-   * vararg state, and any non-scalar type must live in memory. */
+  /* Aggregates (records, arrays), wide16 (f128/i128), split-lane wide8, vararg
+   * state, and any non-scalar type must live in memory. */
   if (api_is_wide16_scalar_type(g->c, ty)) return 1;
   if (api_is_wide8_scalar_type(g->c, ty)) return 1;
   return !(cg_type_is_int(g->c, ty) || cg_type_is_float(g->c, ty) ||
diff --git a/src/cg/memory.c b/src/cg/memory.c
@@ -12,9 +12,8 @@ void kit_cg_push_int(KitCg* g, uint64_t value, KitCgTypeId type) {
     api_push(g, api_make_wide16_int_const(g, (i64)value, ty));
     return;
   }
-  /* rv32 8-byte int: the 64-bit value fits in op.v.imm, but the value is
-   * memory-resident, so materialize it as two 32-bit lanes (a register
-   * load_imm of an i64 on rv32 would keep only the low word). */
+  /* Split-lane 8-byte int: the 64-bit value fits in op.v.imm, but the value is
+   * memory-resident, so materialize it as two 32-bit lanes. */
   if (api_is_wide8_scalar_type(g->c, ty)) {
     api_push(g, api_make_wide8_int_const(g, (i64)value, ty));
     return;
@@ -40,9 +39,8 @@ void kit_cg_push_float(KitCg* g, double value, KitCgTypeId type) {
     api_push(g, api_make_f128_const(g, value, ty));
     return;
   }
-  /* rv32 soft double: the 8-byte value is memory-resident, so materialize the
-   * IEEE-754 binary64 pattern as two 32-bit lanes (a register load_const of an
-   * 8-byte value on rv32 would keep only the low word). */
+  /* Split-lane double: the 8-byte value is memory-resident, so materialize the
+   * IEEE-754 binary64 pattern as two 32-bit lanes. */
   if (api_is_wide8_scalar_type(g->c, ty)) {
     union {
       double d;
@@ -441,8 +439,8 @@ void kit_cg_store(KitCg* g, KitCgMemAccess access) {
       api_is_wide16_scalar_type(g->c, ty)) {
     rv = api_make_wide16_int_const(g, rv.op.v.imm, ty);
   }
-  /* Same for an rv32 8-byte immediate: lower it to a 2-lane memory value so the
-   * store moves a full 64-bit value rather than load_imm'ing only the low word. */
+  /* Same for a split-lane 8-byte immediate: lower it to a 2-lane memory value
+   * so the store moves a full 64-bit value rather than only the low word. */
   if (!is_bitfield && api_sv_op_is(&rv, OPK_IMM) &&
       api_is_wide8_scalar_type(g->c, ty)) {
     rv = api_make_wide8_int_const(g, rv.op.v.imm, ty);
diff --git a/src/cg/native_direct_target.c b/src/cg/native_direct_target.c
@@ -15,6 +15,7 @@
 
 #include <string.h>
 
+#include "abi/abi.h"
 #include "cg/type.h"
 #include "core/arena.h"
 #include "core/pool.h"
@@ -1207,7 +1208,7 @@ static void nd_load_imm(CgTarget* t, Operand dst, i64 imm) {
   NativeLoc reg;
   if (nd_is_wide64_int(d, dst.type))
     nd_panic(d,
-             "64-bit integer immediate reached the rv32 backend un-lowered "
+             "64-bit integer immediate reached the backend un-lowered "
              "(cg should materialize it as two 32-bit lanes)");
   reg = nd_dst_reg(d, dst);
   ND_REQUIRE_NATIVE(d, load_imm, "target does not emit immediates");
@@ -1220,7 +1221,7 @@ static void nd_load_const(CgTarget* t, Operand dst, ConstBytes cbytes) {
   NativeLoc reg;
   if (nd_is_wide64_int(d, dst.type) || nd_is_soft_double(d, dst.type))
     nd_panic(d,
-             "8-byte constant reached the rv32 backend un-lowered (cg should "
+             "8-byte constant reached the backend un-lowered (cg should "
              "materialize it as two 32-bit lanes)");
   reg = nd_dst_reg(d, dst);
   ND_REQUIRE_NATIVE(d, load_const, "target does not emit byte constants");
@@ -1456,22 +1457,23 @@ static void nd_bitfield_store(CgTarget* t, Operand record_addr, Operand src,
   nd_addr_temps_release(d, &temps);
 }
 
-/* Last line of defense against an unlowered wide/soft-float op reaching the
- * machine backend. The cg-layer gates in src/cg/arith.c route i64 mul/div/shift
- * (rv32) and all soft-double arith/convert/compare to runtime calls; if one
- * escapes, the native backend would silently emit wrong code. These guards turn
- * that into a loud compiler_panic. Every condition is gated on ptr_size==4
- * (rv32) and/or float_abi SOFT|SINGLE, so x64/aa64/rv64 (ptr_size 8, or double
- * in FP regs) never trip them. */
+/* Last line of defense against an unlowered split-scalar/soft-float op reaching
+ * the machine backend. The cg-layer gates in src/cg/arith.c route split i64
+ * mul/div/shift and all soft-double arith/convert/compare to runtime calls; if
+ * one escapes, the native backend would silently emit wrong code. */
+static int nd_is_split_wide8_scalar(NativeDirectTarget* d, KitCgTypeId ty) {
+  return abi_cg_scalar_split_lane_size(d->base.c->abi, ty) == 4u &&
+         native_type_size(d->native, ty) == 8u;
+}
+
 static int nd_is_wide64_int(NativeDirectTarget* d, KitCgTypeId ty) {
-  if (d->base.c->target.ptr_size != 4) return 0; /* rv32 only */
+  if (!nd_is_split_wide8_scalar(d, ty)) return 0;
   if (kit_cg_type_int_width((KitCompiler*)d->base.c, ty) == 0) return 0;
-  return cg_type_size(d->base.c, ty) > d->base.c->target.ptr_size;
+  return 1;
 }
 
 static int nd_is_soft_double(NativeDirectTarget* d, KitCgTypeId ty) {
-  u8 abi = d->base.c->target.float_abi;
-  if (abi != KIT_FLOAT_ABI_SOFT && abi != KIT_FLOAT_ABI_SINGLE) return 0;
+  if (!nd_is_split_wide8_scalar(d, ty)) return 0;
   return kit_cg_type_float_width((KitCompiler*)d->base.c, ty) == 64;
 }
 
@@ -1480,13 +1482,14 @@ static void nd_binop(CgTarget* t, BinOp op, Operand dst, Operand a, Operand b) {
   NativeLoc ar;
   NativeLoc br;
   NativeLoc dr;
-  /* No 8-byte value reaches a single GPR op on rv32: the cg layer lowers i64
-   * add/sub/and/or/xor to inline 2-word lane sequences and mul/div/rem/shift to
-   * __*di3 runtime calls (src/cg/arith.c). Anything that slips through here would
-   * silently compute only the low word, so fail loudly instead. */
+  /* No split-lane 8-byte value reaches a single-register op: the cg layer
+   * lowers i64 add/sub/and/or/xor to inline 2-word lane sequences and
+   * mul/div/rem/shift to __*di3 runtime calls (src/cg/arith.c). Anything that
+   * slips through here would silently compute only the low word, so fail
+   * loudly instead. */
   if (nd_is_wide64_int(d, a.type) || nd_is_wide64_int(d, dst.type)) {
     nd_panic(d,
-             "64-bit integer arithmetic reached the rv32 backend un-lowered "
+             "64-bit integer arithmetic reached the backend un-lowered "
              "(cg should emit a 2-word lane sequence or a __*di3 runtime call)");
   }
   if (nd_is_soft_double(d, a.type) || nd_is_soft_double(d, dst.type)) {
@@ -1513,7 +1516,7 @@ static void nd_unop(CgTarget* t, UnOp op, Operand dst, Operand a) {
    * OTHER soft-double unop reaching the backend is an unlowered escape. */
   if (nd_is_wide64_int(d, a.type) || nd_is_wide64_int(d, dst.type)) {
     nd_panic(d,
-             "64-bit integer unary op reached the rv32 backend un-lowered "
+             "64-bit integer unary op reached the backend un-lowered "
              "(cg should emit a 2-word lane sequence)");
   }
   if (op != UO_FNEG &&
@@ -1538,7 +1541,7 @@ static void nd_cmp(CgTarget* t, CmpOp op, Operand dst, Operand a, Operand b) {
    * GPR compare here. */
   if (nd_is_wide64_int(d, a.type) || nd_is_wide64_int(d, b.type)) {
     nd_panic(d,
-             "64-bit integer compare reached the rv32 backend un-lowered "
+             "64-bit integer compare reached the backend un-lowered "
              "(cg should emit a 2-word lane sequence)");
   }
   if (nd_is_soft_double(d, a.type) || nd_is_soft_double(d, b.type)) {
@@ -1565,7 +1568,7 @@ static void nd_convert(CgTarget* t, ConvKind op, Operand dst, Operand src) {
    * calls; none reaches a single-register convert here. */
   if (nd_is_wide64_int(d, src.type) || nd_is_wide64_int(d, dst.type)) {
     nd_panic(d,
-             "64-bit integer conversion reached the rv32 backend un-lowered "
+             "64-bit integer conversion reached the backend un-lowered "
              "(cg should emit a 2-word lane sequence or a runtime call)");
   }
   if (nd_is_soft_double(d, src.type) || nd_is_soft_double(d, dst.type)) {
diff --git a/src/cg/value.c b/src/cg/value.c
@@ -25,18 +25,17 @@ int api_is_wide16_scalar_type(Compiler* c, KitCgTypeId ty) {
   return api_is_f128_type(c, ty) || api_is_i128_type(c, ty);
 }
 
-/* rv32 only: an 8-byte scalar (long long / int64_t, and — under ilp32f/ilp32 —
- * soft double) is twice the 4-byte machine word. Like wide16 (i128/f128) on a
- * 64-bit target it cannot live in a single GPR, so it is forced memory-resident
- * and every operation is legalized to a 2-word lane sequence: add/sub/and/or/
- * xor/neg/not/compare inline (src/cg/wide.c, no compiler-rt helper exists for
- * 64-bit add), mul/div/rem/shift and all soft-double arith/convert to runtime
- * calls. Defined as exactly 8 bytes on a 4-byte-pointer target, so it matches
- * only rv32 i64/double and never fires on rv64/x64/aa64 (ptr_size 8). */
+/* 8-byte scalar split into two 4-byte lanes by the selected ABI. This covers
+ * 32-bit native ABIs whose generic CG path cannot keep the value in a single
+ * scalar register/value. Such values are forced memory-resident so operations
+ * can be legalized as lane sequences or runtime calls. */
 int api_is_wide8_scalar_type(Compiler* c, KitCgTypeId ty) {
-  if (c->target.ptr_size != 4u) return 0;
-  if (!(cg_type_is_int(c, ty) || cg_type_is_float(c, ty))) return 0;
-  return abi_cg_sizeof(c->abi, ty) == 8u;
+  ABITypeInfo ti;
+  if (!c || !c->abi || !ty) return 0;
+  if (abi_cg_scalar_split_lane_size(c->abi, ty) != 4u) return 0;
+  ti = abi_cg_type_info(c->abi, ty);
+  return ti.size == 8u &&
+         (ti.scalar_kind == ABI_SC_INT || ti.scalar_kind == ABI_SC_FLOAT);
 }
 
 Operand api_op_imm(i64 v, KitCgTypeId ty) {
@@ -255,11 +254,10 @@ CGLocal api_alloc_temp_local(KitCg* g, KitCgTypeId ty) {
     d.size = abi_cg_sizeof(g->c->abi, ty);
     d.align = abi_cg_alignof(g->c->abi, ty);
   }
-  /* An rv32 8-byte scalar temp (i64/soft-double arithmetic result, call result,
-   * etc.) must live in memory so its two words are addressable for lane ops and
-   * the multi-part ABI path; the allocator gives an unflagged scalar a single
-   * register, which would truncate it. (wide16 temps are already forced via the
-   * size>word auto-home in cg_ir_lower.) */
+  /* A split-lane 8-byte scalar temp must live in memory so its two words are
+   * addressable for lane ops and the multi-part ABI path; the allocator gives
+   * an unflagged scalar one value slot, which would truncate it. (wide16 temps
+   * are already forced via the size>word auto-home in cg_ir_lower.) */
   if (ty && api_is_wide8_scalar_type(g->c, ty))
     d.flags |= CG_LOCAL_MEMORY_REQUIRED;
   local = g->target->local(g->target, &d);
diff --git a/src/cg/wide.c b/src/cg/wide.c
@@ -130,17 +130,16 @@ ApiSValue api_make_f128_const(KitCg* g, double value, KitCgTypeId ty) {
 }
 
 /* ============================================================
- * wide8 — rv32 8-byte (2-word) scalar lane plumbing
+ * wide8 — 8-byte scalar split into two 4-byte lanes
  *
- * On rv32 a long long / int64_t (and, under ilp32f/ilp32, a soft double) is two
- * machine words. Like the wide16 (i128/f128) scalars above it is memory-resident
- * (api_is_wide8_scalar_type forces CG_LOCAL_MEMORY_REQUIRED), but its arithmetic
- * is done INLINE as 2-word lane sequences (src/cg/arith.c) rather than via a
- * runtime call, because compiler-rt has no 64-bit add/sub/and/or/xor helper. The
- * lane size is the 4-byte word (ptr_size); the low word is at offset 0 on a
- * little-endian target (rv32 is LE; the big-endian offsets are kept for parity
- * with the wide16 helpers). These primitives are the inline analogue of
- * api_store_f128_bytes / api_i128_addr / api_i128_load_lane.
+ * Some 32-bit ABIs represent long long / int64_t, and sometimes soft double,
+ * as two machine words. Like the wide16 (i128/f128) scalars above it is
+ * memory-resident (api_is_wide8_scalar_type forces CG_LOCAL_MEMORY_REQUIRED),
+ * but its arithmetic is done INLINE as 2-word lane sequences (src/cg/arith.c)
+ * rather than via a runtime call, because compiler-rt has no 64-bit
+ * add/sub/and/or/xor helper. The lane size is 4 bytes; the low word is at
+ * offset 0 on a little-endian target. These primitives are the inline analogue
+ * of api_store_f128_bytes / api_i128_addr / api_i128_load_lane.
  * ============================================================ */
 
 /* Allocate an 8-byte memory-resident, address-taken scalar temp. */
@@ -196,11 +195,11 @@ Operand api_wide8_addr(KitCg* g, ApiSValue* v, KitCgTypeId ty) {
   } else {
     lv = *v;
   }
-  /* A delayed value (SV_CMP/SV_ARITH) — e.g. an rv32 i64 produced by `!cmp`
-   * routed here through api_wide64_cmp_inline — is not yet a place. Materialize
-   * it first: api_ensure_local lowers it into a memory-resident wide8 temp
-   * (api_alloc_temp_local forces CG_LOCAL_MEMORY_REQUIRED for an 8-byte scalar),
-   * which is a real addressable home. Materialization, however, clears
+  /* A delayed value (SV_CMP/SV_ARITH) routed here through the wide64 helpers is
+   * not yet a place. Materialize it first: api_ensure_local lowers it into a
+   * memory-resident wide8 temp (api_alloc_temp_local forces
+   * CG_LOCAL_MEMORY_REQUIRED for an 8-byte scalar), which is a real addressable
+   * home. Materialization, however, clears
    * sv.lvalue (fold.c), so we must set the flag AFTER it runs — otherwise the
    * lvalue check in api_lvalue_addr fails ("addr operand is not an lvalue").
    * Doing this before api_lvalue_addr also makes its own api_ensure_local a
diff --git a/src/opt/cg_ir_lower.c b/src/opt/cg_ir_lower.c
@@ -37,10 +37,16 @@ typedef struct CgIrLower {
   u32* inst_block;
   u8* leader;
   CGLocal mat_local[CG_IR_LOWER_MAX_MAT];
+  u8 mat_role[CG_IR_LOWER_MAX_MAT];
   Reg mat_reg[CG_IR_LOWER_MAX_MAT];
   u32 nmat;
 } CgIrLower;
 
+typedef enum CgIrMatRole {
+  CG_IR_MAT_BASE = 0,
+  CG_IR_MAT_INDEX = 1,
+} CgIrMatRole;
+
 static _Noreturn void lower_panic(CgIrLower* l, SrcLoc loc, const char* msg) {
   compiler_panic(l->c, loc, "opt cg-ir lower: %s", msg);
 }
@@ -483,26 +489,54 @@ static OptOperand opt_frame_operand(OptLocalMap* m) {
  * in its frame home, so storage.v.reg is meaningless; load the home into a
  * fresh PReg. prematerialize_indirect_bases emits that load before the using
  * instruction; here we just look the result up (l->mat_*). */
-static Reg resolve_indirect_base_reg(CgIrLower* l, CGLocal local, SrcLoc loc) {
+static Reg resolve_materialized_reg(CgIrLower* l, CGLocal local,
+                                    CgIrMatRole role, SrcLoc loc) {
   OptLocalMap* m = local_map(l, local, loc);
   if (m->storage.kind == CG_LOCAL_STORAGE_REG) return m->storage.v.reg;
   for (u32 i = 0; i < l->nmat; ++i)
-    if (l->mat_local[i] == local) return l->mat_reg[i];
-  lower_panic(l, loc, "indirect base local not materialized");
+    if (l->mat_local[i] == local && l->mat_role[i] == (u8)role)
+      return l->mat_reg[i];
+  lower_panic(l, loc, role == CG_IR_MAT_INDEX
+                          ? "indirect index local not materialized"
+                          : "indirect base local not materialized");
+}
+
+static KitCgTypeId pointer_sized_int_type(CgIrLower* l) {
+  return builtin_id(l->c->target.ptr_size <= 4u ? KIT_CG_BUILTIN_I32
+                                                : KIT_CG_BUILTIN_I64);
+}
+
+static void remember_materialized_reg(CgIrLower* l, CGLocal local,
+                                      CgIrMatRole role, Reg r, SrcLoc loc) {
+  if (l->nmat >= CG_IR_LOWER_MAX_MAT)
+    lower_panic(l, loc, "too many frame indirect operands in one instruction");
+  l->mat_local[l->nmat] = local;
+  l->mat_role[l->nmat] = (u8)role;
+  l->mat_reg[l->nmat] = r;
+  l->nmat++;
+}
+
+static int materialized_reg_exists(CgIrLower* l, CGLocal local,
+                                   CgIrMatRole role) {
+  for (u32 i = 0; i < l->nmat; ++i)
+    if (l->mat_local[i] == local && l->mat_role[i] == (u8)role) return 1;
+  return 0;
 }
 
-/* Emit `r = load <local home>` once per instruction for each FRAME-storage
- * local used as an OPK_INDIRECT base/index, recording r in l->mat_*. Must run
- * before the consuming instruction is emitted so the load dominates its uses.
- */
+static OptOperand opt_frame_operand_as(OptLocalMap* m, KitCgTypeId type) {
+  OptOperand out = opt_frame_operand(m);
+  out.type = type ? type : m->type;
+  return out;
+}
+
+/* Emit the pre-materialization needed for a FRAME-storage local used as an
+ * OPK_INDIRECT base. A pointer-typed local holds the base pointer value and is
+ * loaded. A non-pointer local names storage, so its frame address is the base. */
 static void materialize_frame_base(CgIrLower* l, u32 block, CGLocal local,
                                    SrcLoc loc) {
   OptLocalMap* m = local_map(l, local, loc);
   if (m->storage.kind == CG_LOCAL_STORAGE_REG) return;
-  for (u32 i = 0; i < l->nmat; ++i)
-    if (l->mat_local[i] == local) return;
-  if (l->nmat >= CG_IR_LOWER_MAX_MAT)
-    lower_panic(l, loc, "too many frame indirect bases in one instruction");
+  if (materialized_reg_exists(l, local, CG_IR_MAT_BASE)) return;
   PReg r = ir_alloc_preg(l->f, m->type, RC_INT);
   OptOperand ops[2];
   ops[1] = opt_frame_operand(m);
@@ -537,9 +571,40 @@ static void materialize_frame_base(CgIrLower* l, u32 block, CGLocal local,
     ao->def = (Val)r;
     ao->type = m->type;
   }
-  l->mat_local[l->nmat] = local;
-  l->mat_reg[l->nmat] = (Reg)r;
-  l->nmat++;
+  remember_materialized_reg(l, local, CG_IR_MAT_BASE, (Reg)r, loc);
+}
+
+/* Emit `r = load <local home>` for a FRAME-storage local used as an
+ * OPK_INDIRECT index. Unlike a non-pointer base, an index always needs the
+ * local's value. On rv32, Toy indexes are i64 and therefore memory-backed; the
+ * address calculation only consumes the pointer-width low word. */
+static void materialize_frame_index(CgIrLower* l, u32 block, CGLocal local,
+                                    SrcLoc loc) {
+  OptLocalMap* m = local_map(l, local, loc);
+  if (m->storage.kind == CG_LOCAL_STORAGE_REG) return;
+  if (materialized_reg_exists(l, local, CG_IR_MAT_INDEX)) return;
+  KitCgTypeId idx_ty = pointer_sized_int_type(l);
+  PReg r = ir_alloc_preg(l->f, idx_ty, RC_INT);
+  OptOperand ops[2];
+  Inst* ld = ir_emit(l->f, block, IR_LOAD);
+  ld->loc = loc;
+  memset(&ops[0], 0, sizeof ops[0]);
+  ops[0].kind = OPK_REG;
+  ops[0].cls = RC_INT;
+  ops[0].type = idx_ty;
+  ops[0].v.reg = (Reg)r;
+  ops[1] = opt_frame_operand_as(m, idx_ty);
+  ld->opnds = dup_opt_ops(l, ops, 2);
+  ld->nopnds = 2;
+  ld->def = (Val)r;
+  ld->type = idx_ty;
+  memset(&ld->extra.mem, 0, sizeof ld->extra.mem);
+  ld->extra.mem.type = idx_ty;
+  ld->extra.mem.size = l->c->target.ptr_size;
+  ld->extra.mem.align = m->align && m->align < l->c->target.ptr_size
+                            ? m->align
+                            : l->c->target.ptr_size;
+  remember_materialized_reg(l, local, CG_IR_MAT_INDEX, (Reg)r, loc);
 }
 
 /* Scan the CG instruction's operands for OPK_INDIRECT bases/indices that are
@@ -552,7 +617,7 @@ static void prematerialize_indirect_bases(CgIrLower* l, const CgIrInst* in,
     if (op->kind != OPK_INDIRECT) continue;
     materialize_frame_base(l, block, op->v.ind.base, in->loc);
     if (op->v.ind.index != CG_LOCAL_NONE)
-      materialize_frame_base(l, block, op->v.ind.index, in->loc);
+      materialize_frame_index(l, block, op->v.ind.index, in->loc);
   }
 }
 
@@ -590,10 +655,12 @@ static OptOperand lower_operand_addr(CgIrLower* l, const Operand* in,
     case OPK_INDIRECT: {
       out.kind = OPK_INDIRECT;
       out.cls = RC_INT;
-      out.v.ind.base = resolve_indirect_base_reg(l, in->v.ind.base, loc);
+      out.v.ind.base =
+          resolve_materialized_reg(l, in->v.ind.base, CG_IR_MAT_BASE, loc);
       out.v.ind.index = REG_NONE;
       if (in->v.ind.index != CG_LOCAL_NONE)
-        out.v.ind.index = resolve_indirect_base_reg(l, in->v.ind.index, loc);
+        out.v.ind.index =
+            resolve_materialized_reg(l, in->v.ind.index, CG_IR_MAT_INDEX, loc);
       out.v.ind.log2_scale = in->v.ind.log2_scale;
       out.v.ind.ofs = in->v.ind.ofs;
       return out;
diff --git a/test/parse/cases/6_2_5_04_data_model_widths.c b/test/parse/cases/6_2_5_04_data_model_widths.c
@@ -0,0 +1,23 @@
+#include <stddef.h>
+#include <stdint.h>
+
+_Static_assert(sizeof(long) == __SIZEOF_LONG__, "long width");
+_Static_assert(sizeof(unsigned long) == __SIZEOF_LONG__, "unsigned long width");
+_Static_assert(sizeof(size_t) == __SIZEOF_SIZE_T__, "size_t width");
+_Static_assert(sizeof(ptrdiff_t) == __SIZEOF_PTRDIFF_T__, "ptrdiff_t width");
+_Static_assert(sizeof(intptr_t) == __SIZEOF_POINTER__, "intptr_t width");
+_Static_assert(sizeof(uintptr_t) == __SIZEOF_POINTER__, "uintptr_t width");
+
+int test_main(void) {
+  int a[2];
+  int score = 0;
+  if (sizeof(long) == __SIZEOF_LONG__) score += 1;
+  if (sizeof(unsigned long) == __SIZEOF_LONG__) score += 2;
+  if (sizeof(size_t) == __SIZEOF_SIZE_T__) score += 4;
+  if (sizeof(ptrdiff_t) == __SIZEOF_PTRDIFF_T__) score += 8;
+  if (sizeof(intptr_t) == __SIZEOF_POINTER__) score += 16;
+  if (sizeof(uintptr_t) == __SIZEOF_POINTER__) score += 5;
+  score += _Generic(sizeof(int), size_t: 3, default: 0);
+  score += _Generic((&a[1] - &a[0]), ptrdiff_t: 3, default: 0);
+  return score;
+}
diff --git a/test/parse/cases/6_2_5_04_data_model_widths.expected b/test/parse/cases/6_2_5_04_data_model_widths.expected
@@ -0,0 +1 @@
+42

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log | Files | Refs | README

M	lang/c/abi/c_abi.c	|	2	++
M	lang/c/type/type.c	|	4	++--
M	lang/cpp/cpp_support.h	|	7	+++++++
M	lang/cpp/pp/pp.c	|	2	+-
M	src/abi/abi.c	|	5	+++++
M	src/abi/abi.h	|	1	+
M	src/abi/abi_internal.h	|	4	++++
M	src/abi/abi_rv64.c	|	14	++++++++++++++
M	src/cg/arith.c	|	60	+++++++++++++++++++++++++++++-------------------------------
M	src/cg/call.c	|	5	++---
M	src/cg/control.c	|	6	+++---
M	src/cg/local.c	|	4	++--
M	src/cg/memory.c	|	14	++++++--------
M	src/cg/native_direct_target.c	|	45	++++++++++++++++++++++++---------------------
M	src/cg/value.c	|	30	++++++++++++++----------------
M	src/cg/wide.c	|	29	++++++++++++++---------------
M	src/opt/cg_ir_lower.c	|	101	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
A	test/parse/cases/6_2_5_04_data_model_widths.c	|	23	+++++++++++++++++++++++
A	test/parse/cases/6_2_5_04_data_model_widths.expected	|	1	+