commit e4a333278d9c614276c265232293826e26aa3b63
parent ea0e478e0cdbf5187b454ea2844b83a9e12e2964
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Wed, 3 Jun 2026 21:22:40 -0700
rv32: use ABI split-lane scalar metadata
Add target data-model width selection for C long/size_t spellings, expose ABI scalar split-lane metadata, and use it to gate wide8 lowering/backend guards. Also fix frame-backed indirect index materialization to load the pointer-width value and add an RV32 parser fixture for data-model widths.
Diffstat:
19 files changed, 238 insertions(+), 119 deletions(-)
diff --git a/lang/c/abi/c_abi.c b/lang/c/abi/c_abi.c
@@ -96,6 +96,7 @@ const ABIFuncInfo* c_abi_func_info(KitCompiler* a, Pool* p,
static const Type* c_size_or_uintptr(KitCompiler* a, Pool* p) {
KitTargetSpec target = kit_compiler_target_spec(a);
+ if (kit_target_uses_lp64(target)) return type_prim(p, TY_ULONG);
return target.ptr_size == 8 ? type_prim(p, TY_ULLONG) : type_prim(p, TY_UINT);
}
@@ -105,6 +106,7 @@ const Type* c_abi_size_type(KitCompiler* a, Pool* p) {
const Type* c_abi_ptrdiff_type(KitCompiler* a, Pool* p) {
KitTargetSpec target = kit_compiler_target_spec(a);
+ if (kit_target_uses_lp64(target)) return type_prim(p, TY_LONG);
return target.ptr_size == 8 ? type_prim(p, TY_LLONG) : type_prim(p, TY_INT);
}
diff --git a/lang/c/type/type.c b/lang/c/type/type.c
@@ -538,8 +538,8 @@ static KitCgTypeId type_cg_builtin(KitCompiler* c, TypeKind kind) {
return b.id[KIT_CG_BUILTIN_I32];
case TY_LONG:
case TY_ULONG:
- if (target.os == KIT_OS_WINDOWS) return b.id[KIT_CG_BUILTIN_I32];
- return b.id[KIT_CG_BUILTIN_I64];
+ return b.id[kit_target_uses_lp64(target) ? KIT_CG_BUILTIN_I64
+ : KIT_CG_BUILTIN_I32];
case TY_LLONG:
case TY_ULLONG:
return b.id[KIT_CG_BUILTIN_I64];
diff --git a/lang/cpp/cpp_support.h b/lang/cpp/cpp_support.h
@@ -35,6 +35,13 @@ typedef struct Pool {
void* type_cache; /* opaque slot owned by the C frontend; unused by cpp */
} Pool;
+/* C data model for frontend-visible scalar spelling. kit currently uses LP64
+ * for 64-bit non-Windows targets, LLP64 for 64-bit Windows targets, and ILP32
+ * for 32-bit targets. */
+static inline int kit_target_uses_lp64(KitTargetSpec t) {
+ return t.ptr_size == 8 && t.os != KIT_OS_WINDOWS;
+}
+
static inline Pool* c_pool_new(Compiler* c) {
Heap* h = kit_compiler_context(c)->heap;
Pool* p = h ? (Pool*)h->alloc(h, sizeof(*p), _Alignof(Pool)) : NULL;
diff --git a/lang/cpp/pp/pp.c b/lang/cpp/pp/pp.c
@@ -350,7 +350,7 @@ static void pp_register_target_predefined(Pp* pp) {
uint32_t i;
int ptr64 = (target.ptr_size == 8);
int win = (target.os == KIT_OS_WINDOWS);
- int lp64 = ptr64 && !win;
+ int lp64 = kit_target_uses_lp64(target);
int wchar16 = win;
for (i = 0; i < narch_defs; ++i) {
diff --git a/src/abi/abi.c b/src/abi/abi.c
@@ -98,6 +98,11 @@ u32 abi_cg_alignof(TargetABI* a, KitCgTypeId id) {
return abi_cg_type_info(a, id).align;
}
+u32 abi_cg_scalar_split_lane_size(TargetABI* a, KitCgTypeId id) {
+ if (!a || !a->vt || !a->vt->scalar_split_lane_size) return 0;
+ return a->vt->scalar_split_lane_size(a, id);
+}
+
/* ---- record layout (struct/union) ----
*
* The CG type constructor computes the shared source-facing record layout.
diff --git a/src/abi/abi.h b/src/abi/abi.h
@@ -149,6 +149,7 @@ Compiler* abi_compiler(TargetABI*);
ABITypeInfo abi_cg_type_info(TargetABI*, KitCgTypeId);
u32 abi_cg_sizeof(TargetABI*, KitCgTypeId);
u32 abi_cg_alignof(TargetABI*, KitCgTypeId);
+u32 abi_cg_scalar_split_lane_size(TargetABI*, KitCgTypeId);
const ABIRecordLayout* abi_cg_record_layout(TargetABI*, KitCgTypeId);
const ABIFuncInfo* abi_cg_func_info(TargetABI*, KitCgTypeId fn_type);
ABITypeInfo abi_va_list_info(TargetABI*);
diff --git a/src/abi/abi_internal.h b/src/abi/abi_internal.h
@@ -13,6 +13,10 @@ typedef struct ABIVtable {
/* Compute the ABIFuncInfo for a function type. The cache wrapper in
* abi.c calls this once per CgTypeId and memoizes the result. */
ABIFuncInfo* (*compute_func_info)(TargetABI*, KitCgTypeId fn);
+ /* Optional. Return the byte width of each lane when a scalar has to be
+ * lowered by generic CG as multiple addressable machine-word lanes, or 0
+ * when the target ABI treats it as one scalar value. */
+ u32 (*scalar_split_lane_size)(TargetABI*, KitCgTypeId);
ABITypeInfo va_list_info;
ABIVaListInfo va_list_layout;
} ABIVtable;
diff --git a/src/abi/abi_rv64.c b/src/abi/abi_rv64.c
@@ -178,6 +178,19 @@ static void classify_scalar(TargetABI* a, KitCgTypeId t, ABIArgInfo* out) {
out->nparts = 1;
}
+static u32 riscv32_scalar_split_lane_size(TargetABI* a, KitCgTypeId t) {
+ RiscvAbiDesc d = riscv_abi_desc(a);
+ ABITypeInfo ti = abi_internal_type_info(a, t);
+ int fp_part;
+ if (d.gpr_bytes != 4u) return 0;
+ fp_part = (ti.scalar_kind == ABI_SC_FLOAT) &&
+ riscv_fp_eligible(d.flen, ti.size);
+ if (ti.size == 2u * d.gpr_bytes && !fp_part &&
+ (ti.scalar_kind == ABI_SC_INT || ti.scalar_kind == ABI_SC_FLOAT))
+ return d.gpr_bytes;
+ return 0;
+}
+
static void classify_void(ABIArgInfo* out) {
memset(out, 0, sizeof *out);
out->kind = ABI_ARG_IGNORE;
@@ -326,6 +339,7 @@ const ABIVtable rv64_vtable = {
const ABIVtable rv32_vtable = {
.compute_func_info = riscv_compute_func_info,
+ .scalar_split_lane_size = riscv32_scalar_split_lane_size,
.va_list_info = {4, 4, ABI_SC_PTR, 0, 0, 0},
/* ILP32* va_list is a plain 4-byte pointer; the variadic register-save
* area is the 8 integer arg registers (a0..a7) spilled contiguously =
diff --git a/src/cg/arith.c b/src/cg/arith.c
@@ -234,8 +234,8 @@ void api_cg_convert_kind(KitCg* g, KitCgTypeId dst_type, ConvKind ck) {
i64 folded;
if (api_try_fold_int_convert(g, ck, sty, dty, v.op.v.imm, &folded)) {
api_release(g, &v);
- /* A folded result of rv32 8-byte type must be memory-resident (two lanes),
- * not a bare i64 immediate the backend would truncate. */
+ /* A folded split-lane 8-byte result must be memory-resident, not a bare
+ * i64 immediate the backend would truncate. */
if (api_is_wide8_scalar_type(g->c, dty))
api_push(g, api_make_wide8_int_const(g, folded, dty));
else
@@ -314,16 +314,16 @@ int api_i128_stack_top(KitCg* g, u32 depth) {
return api_is_i128_type(g->c, api_sv_type(&g->stack[g->sp - 1u - depth]));
}
-/* "Wider than the target machine word but NOT i128" — i.e. a 64-bit integer on
- * a 32-bit target (rv32). The native backend handles add/sub/and/or/xor on such
- * values as register pairs, but mul/div/shift must be lowered to a __*di3
- * runtime call (see api_wideint64_binop). i128 routes through its own ti3 path
- * (api_i128_*), so it is explicitly excluded here. */
+/* 64-bit integer split into two 32-bit lanes by the selected ABI. The native
+ * backend handles add/sub/and/or/xor on such values as register pairs, but
+ * mul/div/shift must be lowered to a __*di3 runtime call (see
+ * api_wideint64_binop). i128 routes through its own ti3 path (api_i128_*), so
+ * it is explicitly excluded here. */
static int api_int_is_wide64(KitCg* g, KitCgTypeId ty) {
if (!g) return 0;
if (api_is_i128_type(g->c, ty)) return 0;
if (kit_cg_type_int_width((KitCompiler*)g->c, ty) == 0) return 0;
- return abi_cg_sizeof(g->c->abi, ty) > g->c->target.ptr_size;
+ return api_is_wide8_scalar_type(g->c, ty);
}
static int api_wide64_stack_top(KitCg* g, u32 depth) {
@@ -425,13 +425,12 @@ static void api_wideint64_binop(KitCg* g, BinOp iop) {
}
/* ============================================================
- * wide8 inline 2-word lane arithmetic (rv32 i64)
+ * wide8 inline 2-word lane arithmetic
*
- * On rv32 a 64-bit integer is a memory-resident 8-byte scalar. add/sub/and/or/
- * xor/neg/not and compares have no compiler-rt helper (they would recurse), so
- * they are emitted INLINE here as 32-bit lane ops, mirroring the i128 lane
- * primitives but operating on register-class i32 lanes loaded from / stored to
- * the value's memory home. mul/div/rem/shift route to __*di3 (api_wideint64_*).
+ * Some 32-bit ABIs represent a 64-bit integer as a memory-resident 8-byte
+ * scalar split into two 32-bit lanes. add/sub/and/or/xor/neg/not and compares
+ * have no compiler-rt helper (they would recurse), so they are emitted INLINE
+ * here as lane ops. mul/div/rem/shift route to __*di3 (api_wideint64_*).
* ============================================================ */
static i32 wide8_lo_off(KitCg* g) { return g->c->target.big_endian ? 4 : 0; }
@@ -620,11 +619,11 @@ static void api_wide64_cmp_inline(KitCg* g, CmpOp cop) {
}
/* ============================================================
- * wide64 __builtin_*_overflow on rv32 (inline, 2-lane)
+ * wide64 __builtin_*_overflow on split-lane 64-bit values
*
* The native backends only model single-register overflow, so a 64-bit
* operand traps there. Here we legalize the 6 overflow intrinsics for a
- * 64-bit (rv32 i64) operand pair into 32-bit lane ops, computing both the
+ * 64-bit operand pair into 32-bit lane ops, computing both the
* 64-bit wrapped value (stored to a fresh 8-byte temp) and the boolean
* overflow flag, then pushing [value, ok] exactly as the native path does.
* add/sub reuse the carry/borrow lane logic; mul builds the full 128-bit
@@ -682,7 +681,7 @@ static Operand wide8_addc(KitCg* g, Operand acc, Operand addend,
return sum;
}
-/* The 6 __builtin_*_overflow intrinsics for a wide64 (rv32 i64) operand pair.
+/* The 6 __builtin_*_overflow intrinsics for a split-lane wide64 operand pair.
* Pops the two 8-byte args, computes the wrapped 64-bit value into a fresh
* 8-byte temp and the bool overflow flag into an i32, then pushes [value, ok]
* matching the contract of the native overflow path. */
@@ -868,7 +867,7 @@ static void api_wide64_overflow_inline(KitCg* g, KitCgIntrinsic intrin) {
}
}
-/* int<->i64 conversions on rv32 (sext/zext/trunc/bitcast across the 4<->8
+/* int<->split-i64 conversions (sext/zext/trunc/bitcast across the 4<->8
* boundary, and i64->bool). Returns 1 if it handled (and consumed) *v. The
* i64<->float conversions are routed to libcalls in kit_cg_*_to_float /
* kit_cg_float_to_* and never reach here. */
@@ -1065,10 +1064,10 @@ void kit_cg_int_binop(KitCg* g, KitCgIntBinOp op, uint32_t flags) {
api_i128_binop(g, iop);
return;
}
- /* 64-bit int on a 32-bit target (rv32): mul/div/rem/shift become __*di3
+ /* 64-bit int split into 32-bit lanes: mul/div/rem/shift become __*di3
* runtime calls; add/sub/and/or/xor are emitted inline as 2-word lane ops
* (no compiler-rt helper exists for them). Both keep the value memory-resident
- * so the allocator never tries to put 8 bytes in one 4-byte register. */
+ * so the allocator never tries to put 8 bytes in one 4-byte value slot. */
if (g && (api_wide64_stack_top(g, 0) || api_wide64_stack_top(g, 1))) {
if (api_wideint64_binop_helper(iop))
api_wideint64_binop(g, iop);
@@ -1085,7 +1084,7 @@ void kit_cg_int_unop(KitCg* g, KitCgIntUnOp op, uint32_t flags) {
api_i128_unop(g, iop);
return;
}
- /* rv32 64-bit int: neg/bnot are inline 2-word lane ops; logical-not (!x) is
+ /* Split 64-bit int: neg/bnot are inline 2-word lane ops; logical-not (!x) is
* the full-value truthiness test (lo|hi)==0. */
if (g && api_wide64_stack_top(g, 0)) {
if (iop == UO_NEG || iop == UO_BNOT) {
@@ -1557,8 +1556,7 @@ void kit_cg_sint_to_float(KitCg* g, KitCgTypeId dst, KitCgRounding rounding) {
api_f128_call_unary(g, name, resolve_type(g->c, dst), pty);
return;
}
- /* signed i64 -> hardware single float: rv32 has no fcvt.s.l, so the i64->f32
- * conversion is a __floatdisf runtime call (mirrors clang under ilp32f). */
+ /* signed split-i64 -> hardware single float: use __floatdisf. */
if (api_wide64_stack_top(g, 0)) {
api_f128_call_unary(g, "__floatdisf", resolve_type(g->c, dst),
builtin_id(KIT_CG_BUILTIN_I64));
@@ -1645,7 +1643,7 @@ void kit_cg_float_to_sint(KitCg* g, KitCgTypeId dst, KitCgRounding rounding) {
if (rty != dty) api_cg_convert_kind(g, dty, CV_TRUNC);
return;
}
- /* hardware single float -> i64: rv32 has no fcvt.l.s, so __fixsfdi. */
+ /* hardware single float -> split-i64: use __fixsfdi. */
if (api_is_wide8_scalar_type(g->c, resolve_type(g->c, dst))) {
api_f128_call_unary(g, "__fixsfdi", resolve_type(g->c, dst),
builtin_id(KIT_CG_BUILTIN_F32));
@@ -1690,7 +1688,7 @@ void kit_cg_float_to_uint(KitCg* g, KitCgTypeId dst, KitCgRounding rounding) {
if (rty != dty) api_cg_convert_kind(g, dty, CV_TRUNC);
return;
}
- /* hardware single float -> u64: __fixunssfdi. */
+ /* hardware single float -> split-u64: use __fixunssfdi. */
if (api_is_wide8_scalar_type(g->c, resolve_type(g->c, dst))) {
api_f128_call_unary(g, "__fixunssfdi", resolve_type(g->c, dst),
builtin_id(KIT_CG_BUILTIN_F32));
@@ -1821,10 +1819,10 @@ void kit_cg_intrinsic(KitCg* g, KitCgIntrinsic intrin, uint32_t nargs,
u32 ndst = 0;
Heap* h;
if (!g) return;
- /* rv32: clz/ctz/popcount/bswap on a 64-bit value cannot be the single-register
- * software sequence the backend emits (it would shift by 32 — an illegal rv32
- * shamt). Route them to the compiler-rt __*di2 helpers, which decompose into
- * 32-bit operations. (32-bit forms still lower inline.) */
+ /* clz/ctz/popcount/bswap on a split 64-bit value cannot use the backend's
+ * single-register software sequence. Route them to the compiler-rt __*di2
+ * helpers, which decompose into 32-bit operations. (32-bit forms still lower
+ * inline.) */
if (nargs == 1 && api_wide64_stack_top(g, 0)) {
const char* name = NULL;
KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
@@ -1846,10 +1844,10 @@ void kit_cg_intrinsic(KitCg* g, KitCgIntrinsic intrin, uint32_t nargs,
return;
}
}
- /* rv32: __builtin_*_overflow on a 64-bit operand pair traps in the native
+ /* __builtin_*_overflow on a split 64-bit operand pair traps in the native
* backend (it only models single-register overflow). Legalize all 6 forms
* inline as 2-lane / 4-lane ops, pushing [value, ok] like the native path.
- * Gated on both operands being wide64 so 32-bit / non-rv32 are unchanged. */
+ * Gated on both operands being wide64 so other targets are unchanged. */
if (nargs == 2 && api_intrinsic_is_overflow(intrin) &&
api_wide64_stack_top(g, 0) && api_wide64_stack_top(g, 1)) {
api_wide64_overflow_inline(g, intrin);
diff --git a/src/cg/call.c b/src/cg/call.c
@@ -48,9 +48,8 @@ static CGLocal api_materialize_call_local(KitCg* g, ApiSValue* arg,
if (api_sv_op_is(arg, OPK_IMM) && api_is_wide16_scalar_type(g->c, ty)) {
*arg = api_make_wide16_int_const(g, arg->op.v.imm, ty);
}
- /* Same for an rv32 8-byte immediate argument: materialize it as a 2-lane
- * memory value so the multi-part ABI path marshals both words into the GPR
- * pair, instead of load_imm'ing only the low word into one register. */
+ /* Same for a split-lane 8-byte immediate argument: materialize it as a
+ * 2-lane memory value so the multi-part ABI path marshals both words. */
if (api_sv_op_is(arg, OPK_IMM) && api_is_wide8_scalar_type(g->c, ty)) {
*arg = api_make_wide8_int_const(g, arg->op.v.imm, ty);
}
diff --git a/src/cg/control.c b/src/cg/control.c
@@ -50,9 +50,9 @@ void api_branch_if(KitCg* g, ApiSValue* v, int branch_when_true, Label label) {
api_branch_if(g, &cmp, branch_when_true, label);
return;
}
- /* rv32 8-byte int (or soft double) truthiness: branch on (lo | hi) != 0. The
- * value is memory-resident, so a single-register CMP_NE-vs-zero would only see
- * the low word; OR the two lanes into an i32 first. */
+ /* Split-lane 8-byte truthiness: branch on (lo | hi) != 0. The value is
+ * memory-resident, so a single-slot CMP_NE-vs-zero would only see the low
+ * word; OR the two lanes into an i32 first. */
if (api_is_wide8_scalar_type(g->c, ty)) {
KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
Operand orl = api_wide8_or_lanes(g, v, ty);
diff --git a/src/cg/local.c b/src/cg/local.c
@@ -5,8 +5,8 @@ int api_local_requires_memory(KitCg* g, KitCgTypeId ty, KitCgLocalAttrs attrs) {
KIT_CG_LOCAL_COMPILER_TEMP;
if (g && g->debug && attrs.name && (attrs.flags & hidden_flags) == 0)
return 1;
- /* Aggregates (records, arrays), wide16 (f128/i128), wide8 (rv32 i64/double),
- * vararg state, and any non-scalar type must live in memory. */
+ /* Aggregates (records, arrays), wide16 (f128/i128), split-lane wide8, vararg
+ * state, and any non-scalar type must live in memory. */
if (api_is_wide16_scalar_type(g->c, ty)) return 1;
if (api_is_wide8_scalar_type(g->c, ty)) return 1;
return !(cg_type_is_int(g->c, ty) || cg_type_is_float(g->c, ty) ||
diff --git a/src/cg/memory.c b/src/cg/memory.c
@@ -12,9 +12,8 @@ void kit_cg_push_int(KitCg* g, uint64_t value, KitCgTypeId type) {
api_push(g, api_make_wide16_int_const(g, (i64)value, ty));
return;
}
- /* rv32 8-byte int: the 64-bit value fits in op.v.imm, but the value is
- * memory-resident, so materialize it as two 32-bit lanes (a register
- * load_imm of an i64 on rv32 would keep only the low word). */
+ /* Split-lane 8-byte int: the 64-bit value fits in op.v.imm, but the value is
+ * memory-resident, so materialize it as two 32-bit lanes. */
if (api_is_wide8_scalar_type(g->c, ty)) {
api_push(g, api_make_wide8_int_const(g, (i64)value, ty));
return;
@@ -40,9 +39,8 @@ void kit_cg_push_float(KitCg* g, double value, KitCgTypeId type) {
api_push(g, api_make_f128_const(g, value, ty));
return;
}
- /* rv32 soft double: the 8-byte value is memory-resident, so materialize the
- * IEEE-754 binary64 pattern as two 32-bit lanes (a register load_const of an
- * 8-byte value on rv32 would keep only the low word). */
+ /* Split-lane double: the 8-byte value is memory-resident, so materialize the
+ * IEEE-754 binary64 pattern as two 32-bit lanes. */
if (api_is_wide8_scalar_type(g->c, ty)) {
union {
double d;
@@ -441,8 +439,8 @@ void kit_cg_store(KitCg* g, KitCgMemAccess access) {
api_is_wide16_scalar_type(g->c, ty)) {
rv = api_make_wide16_int_const(g, rv.op.v.imm, ty);
}
- /* Same for an rv32 8-byte immediate: lower it to a 2-lane memory value so the
- * store moves a full 64-bit value rather than load_imm'ing only the low word. */
+ /* Same for a split-lane 8-byte immediate: lower it to a 2-lane memory value
+ * so the store moves a full 64-bit value rather than only the low word. */
if (!is_bitfield && api_sv_op_is(&rv, OPK_IMM) &&
api_is_wide8_scalar_type(g->c, ty)) {
rv = api_make_wide8_int_const(g, rv.op.v.imm, ty);
diff --git a/src/cg/native_direct_target.c b/src/cg/native_direct_target.c
@@ -15,6 +15,7 @@
#include <string.h>
+#include "abi/abi.h"
#include "cg/type.h"
#include "core/arena.h"
#include "core/pool.h"
@@ -1207,7 +1208,7 @@ static void nd_load_imm(CgTarget* t, Operand dst, i64 imm) {
NativeLoc reg;
if (nd_is_wide64_int(d, dst.type))
nd_panic(d,
- "64-bit integer immediate reached the rv32 backend un-lowered "
+ "64-bit integer immediate reached the backend un-lowered "
"(cg should materialize it as two 32-bit lanes)");
reg = nd_dst_reg(d, dst);
ND_REQUIRE_NATIVE(d, load_imm, "target does not emit immediates");
@@ -1220,7 +1221,7 @@ static void nd_load_const(CgTarget* t, Operand dst, ConstBytes cbytes) {
NativeLoc reg;
if (nd_is_wide64_int(d, dst.type) || nd_is_soft_double(d, dst.type))
nd_panic(d,
- "8-byte constant reached the rv32 backend un-lowered (cg should "
+ "8-byte constant reached the backend un-lowered (cg should "
"materialize it as two 32-bit lanes)");
reg = nd_dst_reg(d, dst);
ND_REQUIRE_NATIVE(d, load_const, "target does not emit byte constants");
@@ -1456,22 +1457,23 @@ static void nd_bitfield_store(CgTarget* t, Operand record_addr, Operand src,
nd_addr_temps_release(d, &temps);
}
-/* Last line of defense against an unlowered wide/soft-float op reaching the
- * machine backend. The cg-layer gates in src/cg/arith.c route i64 mul/div/shift
- * (rv32) and all soft-double arith/convert/compare to runtime calls; if one
- * escapes, the native backend would silently emit wrong code. These guards turn
- * that into a loud compiler_panic. Every condition is gated on ptr_size==4
- * (rv32) and/or float_abi SOFT|SINGLE, so x64/aa64/rv64 (ptr_size 8, or double
- * in FP regs) never trip them. */
+/* Last line of defense against an unlowered split-scalar/soft-float op reaching
+ * the machine backend. The cg-layer gates in src/cg/arith.c route split i64
+ * mul/div/shift and all soft-double arith/convert/compare to runtime calls; if
+ * one escapes, the native backend would silently emit wrong code. */
+static int nd_is_split_wide8_scalar(NativeDirectTarget* d, KitCgTypeId ty) {
+ return abi_cg_scalar_split_lane_size(d->base.c->abi, ty) == 4u &&
+ native_type_size(d->native, ty) == 8u;
+}
+
static int nd_is_wide64_int(NativeDirectTarget* d, KitCgTypeId ty) {
- if (d->base.c->target.ptr_size != 4) return 0; /* rv32 only */
+ if (!nd_is_split_wide8_scalar(d, ty)) return 0;
if (kit_cg_type_int_width((KitCompiler*)d->base.c, ty) == 0) return 0;
- return cg_type_size(d->base.c, ty) > d->base.c->target.ptr_size;
+ return 1;
}
static int nd_is_soft_double(NativeDirectTarget* d, KitCgTypeId ty) {
- u8 abi = d->base.c->target.float_abi;
- if (abi != KIT_FLOAT_ABI_SOFT && abi != KIT_FLOAT_ABI_SINGLE) return 0;
+ if (!nd_is_split_wide8_scalar(d, ty)) return 0;
return kit_cg_type_float_width((KitCompiler*)d->base.c, ty) == 64;
}
@@ -1480,13 +1482,14 @@ static void nd_binop(CgTarget* t, BinOp op, Operand dst, Operand a, Operand b) {
NativeLoc ar;
NativeLoc br;
NativeLoc dr;
- /* No 8-byte value reaches a single GPR op on rv32: the cg layer lowers i64
- * add/sub/and/or/xor to inline 2-word lane sequences and mul/div/rem/shift to
- * __*di3 runtime calls (src/cg/arith.c). Anything that slips through here would
- * silently compute only the low word, so fail loudly instead. */
+ /* No split-lane 8-byte value reaches a single-register op: the cg layer
+ * lowers i64 add/sub/and/or/xor to inline 2-word lane sequences and
+ * mul/div/rem/shift to __*di3 runtime calls (src/cg/arith.c). Anything that
+ * slips through here would silently compute only the low word, so fail
+ * loudly instead. */
if (nd_is_wide64_int(d, a.type) || nd_is_wide64_int(d, dst.type)) {
nd_panic(d,
- "64-bit integer arithmetic reached the rv32 backend un-lowered "
+ "64-bit integer arithmetic reached the backend un-lowered "
"(cg should emit a 2-word lane sequence or a __*di3 runtime call)");
}
if (nd_is_soft_double(d, a.type) || nd_is_soft_double(d, dst.type)) {
@@ -1513,7 +1516,7 @@ static void nd_unop(CgTarget* t, UnOp op, Operand dst, Operand a) {
* OTHER soft-double unop reaching the backend is an unlowered escape. */
if (nd_is_wide64_int(d, a.type) || nd_is_wide64_int(d, dst.type)) {
nd_panic(d,
- "64-bit integer unary op reached the rv32 backend un-lowered "
+ "64-bit integer unary op reached the backend un-lowered "
"(cg should emit a 2-word lane sequence)");
}
if (op != UO_FNEG &&
@@ -1538,7 +1541,7 @@ static void nd_cmp(CgTarget* t, CmpOp op, Operand dst, Operand a, Operand b) {
* GPR compare here. */
if (nd_is_wide64_int(d, a.type) || nd_is_wide64_int(d, b.type)) {
nd_panic(d,
- "64-bit integer compare reached the rv32 backend un-lowered "
+ "64-bit integer compare reached the backend un-lowered "
"(cg should emit a 2-word lane sequence)");
}
if (nd_is_soft_double(d, a.type) || nd_is_soft_double(d, b.type)) {
@@ -1565,7 +1568,7 @@ static void nd_convert(CgTarget* t, ConvKind op, Operand dst, Operand src) {
* calls; none reaches a single-register convert here. */
if (nd_is_wide64_int(d, src.type) || nd_is_wide64_int(d, dst.type)) {
nd_panic(d,
- "64-bit integer conversion reached the rv32 backend un-lowered "
+ "64-bit integer conversion reached the backend un-lowered "
"(cg should emit a 2-word lane sequence or a runtime call)");
}
if (nd_is_soft_double(d, src.type) || nd_is_soft_double(d, dst.type)) {
diff --git a/src/cg/value.c b/src/cg/value.c
@@ -25,18 +25,17 @@ int api_is_wide16_scalar_type(Compiler* c, KitCgTypeId ty) {
return api_is_f128_type(c, ty) || api_is_i128_type(c, ty);
}
-/* rv32 only: an 8-byte scalar (long long / int64_t, and — under ilp32f/ilp32 —
- * soft double) is twice the 4-byte machine word. Like wide16 (i128/f128) on a
- * 64-bit target it cannot live in a single GPR, so it is forced memory-resident
- * and every operation is legalized to a 2-word lane sequence: add/sub/and/or/
- * xor/neg/not/compare inline (src/cg/wide.c, no compiler-rt helper exists for
- * 64-bit add), mul/div/rem/shift and all soft-double arith/convert to runtime
- * calls. Defined as exactly 8 bytes on a 4-byte-pointer target, so it matches
- * only rv32 i64/double and never fires on rv64/x64/aa64 (ptr_size 8). */
+/* 8-byte scalar split into two 4-byte lanes by the selected ABI. This covers
+ * 32-bit native ABIs whose generic CG path cannot keep the value in a single
+ * scalar register/value slot. Such values are forced memory-resident so
+ * operations can be legalized as lane sequences or runtime calls. */
int api_is_wide8_scalar_type(Compiler* c, KitCgTypeId ty) {
- if (c->target.ptr_size != 4u) return 0;
- if (!(cg_type_is_int(c, ty) || cg_type_is_float(c, ty))) return 0;
- return abi_cg_sizeof(c->abi, ty) == 8u;
+ ABITypeInfo ti;
+ if (!c || !c->abi || !ty) return 0;
+ if (abi_cg_scalar_split_lane_size(c->abi, ty) != 4u) return 0;
+ ti = abi_cg_type_info(c->abi, ty);
+ return ti.size == 8u &&
+ (ti.scalar_kind == ABI_SC_INT || ti.scalar_kind == ABI_SC_FLOAT);
}
Operand api_op_imm(i64 v, KitCgTypeId ty) {
@@ -255,11 +254,10 @@ CGLocal api_alloc_temp_local(KitCg* g, KitCgTypeId ty) {
d.size = abi_cg_sizeof(g->c->abi, ty);
d.align = abi_cg_alignof(g->c->abi, ty);
}
- /* An rv32 8-byte scalar temp (i64/soft-double arithmetic result, call result,
- * etc.) must live in memory so its two words are addressable for lane ops and
- * the multi-part ABI path; the allocator gives an unflagged scalar a single
- * register, which would truncate it. (wide16 temps are already forced via the
- * size>word auto-home in cg_ir_lower.) */
+ /* A split-lane 8-byte scalar temp must live in memory so its two words are
+ * addressable for lane ops and the multi-part ABI path; the allocator gives
+ * an unflagged scalar one value slot, which would truncate it. (wide16 temps
+ * are already forced via the size>word auto-home in cg_ir_lower.) */
if (ty && api_is_wide8_scalar_type(g->c, ty))
d.flags |= CG_LOCAL_MEMORY_REQUIRED;
local = g->target->local(g->target, &d);
diff --git a/src/cg/wide.c b/src/cg/wide.c
@@ -130,17 +130,16 @@ ApiSValue api_make_f128_const(KitCg* g, double value, KitCgTypeId ty) {
}
/* ============================================================
- * wide8 — rv32 8-byte (2-word) scalar lane plumbing
+ * wide8 — 8-byte scalar split into two 4-byte lanes
*
- * On rv32 a long long / int64_t (and, under ilp32f/ilp32, a soft double) is two
- * machine words. Like the wide16 (i128/f128) scalars above it is memory-resident
- * (api_is_wide8_scalar_type forces CG_LOCAL_MEMORY_REQUIRED), but its arithmetic
- * is done INLINE as 2-word lane sequences (src/cg/arith.c) rather than via a
- * runtime call, because compiler-rt has no 64-bit add/sub/and/or/xor helper. The
- * lane size is the 4-byte word (ptr_size); the low word is at offset 0 on a
- * little-endian target (rv32 is LE; the big-endian offsets are kept for parity
- * with the wide16 helpers). These primitives are the inline analogue of
- * api_store_f128_bytes / api_i128_addr / api_i128_load_lane.
+ * Some 32-bit ABIs represent long long / int64_t, and sometimes soft double,
+ * as two machine words. Like the wide16 (i128/f128) scalars above, each is
+ * memory-resident (api_is_wide8_scalar_type forces CG_LOCAL_MEMORY_REQUIRED),
+ * but its arithmetic is done INLINE as 2-word lane sequences (src/cg/arith.c)
+ * rather than via a runtime call, because compiler-rt has no 64-bit
+ * add/sub/and/or/xor helper. The lane size is 4 bytes; the low word is at
+ * offset 0 on a little-endian target. These primitives are the inline analogue
+ * of api_store_f128_bytes / api_i128_addr / api_i128_load_lane.
* ============================================================ */
/* Allocate an 8-byte memory-resident, address-taken scalar temp. */
@@ -196,11 +195,11 @@ Operand api_wide8_addr(KitCg* g, ApiSValue* v, KitCgTypeId ty) {
} else {
lv = *v;
}
- /* A delayed value (SV_CMP/SV_ARITH) — e.g. an rv32 i64 produced by `!cmp`
- * routed here through api_wide64_cmp_inline — is not yet a place. Materialize
- * it first: api_ensure_local lowers it into a memory-resident wide8 temp
- * (api_alloc_temp_local forces CG_LOCAL_MEMORY_REQUIRED for an 8-byte scalar),
- * which is a real addressable home. Materialization, however, clears
+ /* A delayed value (SV_CMP/SV_ARITH) routed here through the wide64 helpers is
+ * not yet a place. Materialize it first: api_ensure_local lowers it into a
+ * memory-resident wide8 temp (api_alloc_temp_local forces
+ * CG_LOCAL_MEMORY_REQUIRED for an 8-byte scalar), which is a real addressable
+ * home. Materialization, however, clears
* sv.lvalue (fold.c), so we must set the flag AFTER it runs — otherwise the
* lvalue check in api_lvalue_addr fails ("addr operand is not an lvalue").
* Doing this before api_lvalue_addr also makes its own api_ensure_local a
diff --git a/src/opt/cg_ir_lower.c b/src/opt/cg_ir_lower.c
@@ -37,10 +37,16 @@ typedef struct CgIrLower {
u32* inst_block;
u8* leader;
CGLocal mat_local[CG_IR_LOWER_MAX_MAT];
+ u8 mat_role[CG_IR_LOWER_MAX_MAT];
Reg mat_reg[CG_IR_LOWER_MAX_MAT];
u32 nmat;
} CgIrLower;
+typedef enum CgIrMatRole {
+ CG_IR_MAT_BASE = 0,
+ CG_IR_MAT_INDEX = 1,
+} CgIrMatRole;
+
static _Noreturn void lower_panic(CgIrLower* l, SrcLoc loc, const char* msg) {
compiler_panic(l->c, loc, "opt cg-ir lower: %s", msg);
}
@@ -483,26 +489,54 @@ static OptOperand opt_frame_operand(OptLocalMap* m) {
* in its frame home, so storage.v.reg is meaningless; load the home into a
* fresh PReg. prematerialize_indirect_bases emits that load before the using
* instruction; here we just look the result up (l->mat_*). */
-static Reg resolve_indirect_base_reg(CgIrLower* l, CGLocal local, SrcLoc loc) {
+static Reg resolve_materialized_reg(CgIrLower* l, CGLocal local,
+ CgIrMatRole role, SrcLoc loc) {
OptLocalMap* m = local_map(l, local, loc);
if (m->storage.kind == CG_LOCAL_STORAGE_REG) return m->storage.v.reg;
for (u32 i = 0; i < l->nmat; ++i)
- if (l->mat_local[i] == local) return l->mat_reg[i];
- lower_panic(l, loc, "indirect base local not materialized");
+ if (l->mat_local[i] == local && l->mat_role[i] == (u8)role)
+ return l->mat_reg[i];
+ lower_panic(l, loc, role == CG_IR_MAT_INDEX
+ ? "indirect index local not materialized"
+ : "indirect base local not materialized");
+}
+
+static KitCgTypeId pointer_sized_int_type(CgIrLower* l) {
+ return builtin_id(l->c->target.ptr_size <= 4u ? KIT_CG_BUILTIN_I32
+ : KIT_CG_BUILTIN_I64);
+}
+
+static void remember_materialized_reg(CgIrLower* l, CGLocal local,
+ CgIrMatRole role, Reg r, SrcLoc loc) {
+ if (l->nmat >= CG_IR_LOWER_MAX_MAT)
+ lower_panic(l, loc, "too many frame indirect operands in one instruction");
+ l->mat_local[l->nmat] = local;
+ l->mat_role[l->nmat] = (u8)role;
+ l->mat_reg[l->nmat] = r;
+ l->nmat++;
+}
+
+static int materialized_reg_exists(CgIrLower* l, CGLocal local,
+ CgIrMatRole role) {
+ for (u32 i = 0; i < l->nmat; ++i)
+ if (l->mat_local[i] == local && l->mat_role[i] == (u8)role) return 1;
+ return 0;
}
-/* Emit `r = load <local home>` once per instruction for each FRAME-storage
- * local used as an OPK_INDIRECT base/index, recording r in l->mat_*. Must run
- * before the consuming instruction is emitted so the load dominates its uses.
- */
+static OptOperand opt_frame_operand_as(OptLocalMap* m, KitCgTypeId type) {
+ OptOperand out = opt_frame_operand(m);
+ out.type = type ? type : m->type;
+ return out;
+}
+
+/* Emit the pre-materialization needed for a FRAME-storage local used as an
+ * OPK_INDIRECT base. A pointer-typed local holds the base pointer value and is
+ * loaded. A non-pointer local names storage, so its frame address is the base. */
static void materialize_frame_base(CgIrLower* l, u32 block, CGLocal local,
SrcLoc loc) {
OptLocalMap* m = local_map(l, local, loc);
if (m->storage.kind == CG_LOCAL_STORAGE_REG) return;
- for (u32 i = 0; i < l->nmat; ++i)
- if (l->mat_local[i] == local) return;
- if (l->nmat >= CG_IR_LOWER_MAX_MAT)
- lower_panic(l, loc, "too many frame indirect bases in one instruction");
+ if (materialized_reg_exists(l, local, CG_IR_MAT_BASE)) return;
PReg r = ir_alloc_preg(l->f, m->type, RC_INT);
OptOperand ops[2];
ops[1] = opt_frame_operand(m);
@@ -537,9 +571,40 @@ static void materialize_frame_base(CgIrLower* l, u32 block, CGLocal local,
ao->def = (Val)r;
ao->type = m->type;
}
- l->mat_local[l->nmat] = local;
- l->mat_reg[l->nmat] = (Reg)r;
- l->nmat++;
+ remember_materialized_reg(l, local, CG_IR_MAT_BASE, (Reg)r, loc);
+}
+
+/* Load the value of a FRAME-storage local that is used as an OPK_INDIRECT
+ * index into a fresh pointer-width register. Unlike a non-pointer base, an
+ * index is always consumed by value. On rv32, Toy indexes are i64 and hence
+ * memory-backed; only the pointer-width low word feeds the address.
+ * NOTE(review): assumes that low word sits at the slot's start -- confirm. */
+static void materialize_frame_index(CgIrLower* l, u32 block, CGLocal local,
+                                    SrcLoc loc) {
+  OptLocalMap* m = local_map(l, local, loc);
+  if (m->storage.kind == CG_LOCAL_STORAGE_REG) return; /* lives in a register */
+  if (materialized_reg_exists(l, local, CG_IR_MAT_INDEX)) return; /* cached */
+  KitCgTypeId word_ty = pointer_sized_int_type(l);
+  PReg dst = ir_alloc_preg(l->f, word_ty, RC_INT);
+  Inst* load = ir_emit(l->f, block, IR_LOAD);
+  OptOperand ops[2];
+  memset(&ops[0], 0, sizeof ops[0]);
+  ops[0].kind = OPK_REG; /* operand 0: the destination register */
+  ops[0].cls = RC_INT;
+  ops[0].type = word_ty;
+  ops[0].v.reg = (Reg)dst;
+  ops[1] = opt_frame_operand_as(m, word_ty); /* operand 1: the frame slot */
+  load->loc = loc;
+  load->opnds = dup_opt_ops(l, ops, 2);
+  load->nopnds = 2;
+  load->def = (Val)dst;
+  load->type = word_ty;
+  memset(&load->extra.mem, 0, sizeof load->extra.mem);
+  load->extra.mem.type = word_ty;
+  load->extra.mem.size = l->c->target.ptr_size;
+  int narrow = m->align && m->align < l->c->target.ptr_size;
+  load->extra.mem.align = narrow ? m->align : l->c->target.ptr_size;
+  remember_materialized_reg(l, local, CG_IR_MAT_INDEX, (Reg)dst, loc);
 }
/* Scan the CG instruction's operands for OPK_INDIRECT bases/indices that are
@@ -552,7 +617,7 @@ static void prematerialize_indirect_bases(CgIrLower* l, const CgIrInst* in,
if (op->kind != OPK_INDIRECT) continue;
materialize_frame_base(l, block, op->v.ind.base, in->loc);
if (op->v.ind.index != CG_LOCAL_NONE)
- materialize_frame_base(l, block, op->v.ind.index, in->loc);
+ materialize_frame_index(l, block, op->v.ind.index, in->loc);
}
}
@@ -590,10 +655,12 @@ static OptOperand lower_operand_addr(CgIrLower* l, const Operand* in,
case OPK_INDIRECT: {
out.kind = OPK_INDIRECT;
out.cls = RC_INT;
- out.v.ind.base = resolve_indirect_base_reg(l, in->v.ind.base, loc);
+ out.v.ind.base =
+ resolve_materialized_reg(l, in->v.ind.base, CG_IR_MAT_BASE, loc);
out.v.ind.index = REG_NONE;
if (in->v.ind.index != CG_LOCAL_NONE)
- out.v.ind.index = resolve_indirect_base_reg(l, in->v.ind.index, loc);
+ out.v.ind.index =
+ resolve_materialized_reg(l, in->v.ind.index, CG_IR_MAT_INDEX, loc);
out.v.ind.log2_scale = in->v.ind.log2_scale;
out.v.ind.ofs = in->v.ind.ofs;
return out;
diff --git a/test/parse/cases/6_2_5_04_data_model_widths.c b/test/parse/cases/6_2_5_04_data_model_widths.c
@@ -0,0 +1,27 @@
+#include <stddef.h>
+#include <stdint.h>
+
+/* Data-model widths: long, size_t, ptrdiff_t and (u)intptr_t must agree with
+ * the predefined __SIZEOF_*__ macros, both at compile time and at run time.
+ * test_main() returns 42 when every run-time check holds. */
+_Static_assert(sizeof(long) == __SIZEOF_LONG__, "long width");
+_Static_assert(sizeof(unsigned long) == __SIZEOF_LONG__,
+               "unsigned long width");
+_Static_assert(sizeof(size_t) == __SIZEOF_SIZE_T__, "size_t width");
+_Static_assert(sizeof(ptrdiff_t) == __SIZEOF_PTRDIFF_T__, "ptrdiff_t width");
+_Static_assert(sizeof(intptr_t) == __SIZEOF_POINTER__, "intptr_t width");
+_Static_assert(sizeof(uintptr_t) == __SIZEOF_POINTER__, "uintptr_t width");
+
+int test_main(void) {
+  int a[2]; /* operands for the pointer-difference type check only */
+  int score = 0;
+  score += (sizeof(long) == __SIZEOF_LONG__) ? 1 : 0;
+  score += (sizeof(unsigned long) == __SIZEOF_LONG__) ? 2 : 0;
+  score += (sizeof(size_t) == __SIZEOF_SIZE_T__) ? 4 : 0;
+  score += (sizeof(ptrdiff_t) == __SIZEOF_PTRDIFF_T__) ? 8 : 0;
+  score += (sizeof(intptr_t) == __SIZEOF_POINTER__) ? 16 : 0;
+  score += (sizeof(uintptr_t) == __SIZEOF_POINTER__) ? 5 : 0;
+  score += _Generic(sizeof(int), size_t: 3, default: 0);
+  score += _Generic((&a[1] - &a[0]), ptrdiff_t: 3, default: 0);
+  return score;
+}
diff --git a/test/parse/cases/6_2_5_04_data_model_widths.expected b/test/parse/cases/6_2_5_04_data_model_widths.expected
@@ -0,0 +1 @@
+42