kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit e4a333278d9c614276c265232293826e26aa3b63
parent ea0e478e0cdbf5187b454ea2844b83a9e12e2964
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Wed,  3 Jun 2026 21:22:40 -0700

rv32: use ABI split-lane scalar metadata

Add target data-model width selection for C long/size_t spellings, expose ABI scalar split-lane metadata, and use it to gate wide8 lowering/backend guards. Also fix frame-backed indirect index materialization to load the pointer-width value and add an RV32 parser fixture for data-model widths.

Diffstat:
M lang/c/abi/c_abi.c | 2 ++
M lang/c/type/type.c | 4 ++--
M lang/cpp/cpp_support.h | 7 +++++++
M lang/cpp/pp/pp.c | 2 +-
M src/abi/abi.c | 5 +++++
M src/abi/abi.h | 1 +
M src/abi/abi_internal.h | 4 ++++
M src/abi/abi_rv64.c | 14 ++++++++++++++
M src/cg/arith.c | 60 +++++++++++++++++++++++++++++-------------------------------
M src/cg/call.c | 5 ++---
M src/cg/control.c | 6 +++---
M src/cg/local.c | 4 ++--
M src/cg/memory.c | 14 ++++++--------
M src/cg/native_direct_target.c | 45 ++++++++++++++++++++++++---------------------
M src/cg/value.c | 30 ++++++++++++++----------------
M src/cg/wide.c | 29 ++++++++++++++---------------
M src/opt/cg_ir_lower.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
A test/parse/cases/6_2_5_04_data_model_widths.c | 23 +++++++++++++++++++++++
A test/parse/cases/6_2_5_04_data_model_widths.expected | 1 +
19 files changed, 238 insertions(+), 119 deletions(-)

diff --git a/lang/c/abi/c_abi.c b/lang/c/abi/c_abi.c @@ -96,6 +96,7 @@ const ABIFuncInfo* c_abi_func_info(KitCompiler* a, Pool* p, static const Type* c_size_or_uintptr(KitCompiler* a, Pool* p) { KitTargetSpec target = kit_compiler_target_spec(a); + if (kit_target_uses_lp64(target)) return type_prim(p, TY_ULONG); return target.ptr_size == 8 ? type_prim(p, TY_ULLONG) : type_prim(p, TY_UINT); } @@ -105,6 +106,7 @@ const Type* c_abi_size_type(KitCompiler* a, Pool* p) { const Type* c_abi_ptrdiff_type(KitCompiler* a, Pool* p) { KitTargetSpec target = kit_compiler_target_spec(a); + if (kit_target_uses_lp64(target)) return type_prim(p, TY_LONG); return target.ptr_size == 8 ? type_prim(p, TY_LLONG) : type_prim(p, TY_INT); } diff --git a/lang/c/type/type.c b/lang/c/type/type.c @@ -538,8 +538,8 @@ static KitCgTypeId type_cg_builtin(KitCompiler* c, TypeKind kind) { return b.id[KIT_CG_BUILTIN_I32]; case TY_LONG: case TY_ULONG: - if (target.os == KIT_OS_WINDOWS) return b.id[KIT_CG_BUILTIN_I32]; - return b.id[KIT_CG_BUILTIN_I64]; + return b.id[kit_target_uses_lp64(target) ? KIT_CG_BUILTIN_I64 + : KIT_CG_BUILTIN_I32]; case TY_LLONG: case TY_ULLONG: return b.id[KIT_CG_BUILTIN_I64]; diff --git a/lang/cpp/cpp_support.h b/lang/cpp/cpp_support.h @@ -35,6 +35,13 @@ typedef struct Pool { void* type_cache; /* opaque slot owned by the C frontend; unused by cpp */ } Pool; +/* C data model for frontend-visible scalar spelling. kit currently uses LP64 + * for 64-bit non-Windows targets, LLP64 for 64-bit Windows targets, and ILP32 + * for 32-bit targets. */ +static inline int kit_target_uses_lp64(KitTargetSpec t) { + return t.ptr_size == 8 && t.os != KIT_OS_WINDOWS; +} + static inline Pool* c_pool_new(Compiler* c) { Heap* h = kit_compiler_context(c)->heap; Pool* p = h ? 
(Pool*)h->alloc(h, sizeof(*p), _Alignof(Pool)) : NULL; diff --git a/lang/cpp/pp/pp.c b/lang/cpp/pp/pp.c @@ -350,7 +350,7 @@ static void pp_register_target_predefined(Pp* pp) { uint32_t i; int ptr64 = (target.ptr_size == 8); int win = (target.os == KIT_OS_WINDOWS); - int lp64 = ptr64 && !win; + int lp64 = kit_target_uses_lp64(target); int wchar16 = win; for (i = 0; i < narch_defs; ++i) { diff --git a/src/abi/abi.c b/src/abi/abi.c @@ -98,6 +98,11 @@ u32 abi_cg_alignof(TargetABI* a, KitCgTypeId id) { return abi_cg_type_info(a, id).align; } +u32 abi_cg_scalar_split_lane_size(TargetABI* a, KitCgTypeId id) { + if (!a || !a->vt || !a->vt->scalar_split_lane_size) return 0; + return a->vt->scalar_split_lane_size(a, id); +} + /* ---- record layout (struct/union) ---- * * The CG type constructor computes the shared source-facing record layout. diff --git a/src/abi/abi.h b/src/abi/abi.h @@ -149,6 +149,7 @@ Compiler* abi_compiler(TargetABI*); ABITypeInfo abi_cg_type_info(TargetABI*, KitCgTypeId); u32 abi_cg_sizeof(TargetABI*, KitCgTypeId); u32 abi_cg_alignof(TargetABI*, KitCgTypeId); +u32 abi_cg_scalar_split_lane_size(TargetABI*, KitCgTypeId); const ABIRecordLayout* abi_cg_record_layout(TargetABI*, KitCgTypeId); const ABIFuncInfo* abi_cg_func_info(TargetABI*, KitCgTypeId fn_type); ABITypeInfo abi_va_list_info(TargetABI*); diff --git a/src/abi/abi_internal.h b/src/abi/abi_internal.h @@ -13,6 +13,10 @@ typedef struct ABIVtable { /* Compute the ABIFuncInfo for a function type. The cache wrapper in * abi.c calls this once per CgTypeId and memoizes the result. */ ABIFuncInfo* (*compute_func_info)(TargetABI*, KitCgTypeId fn); + /* Optional. Return the byte width of each lane when a scalar has to be + * lowered by generic CG as multiple addressable machine-word lanes, or 0 + * when the target ABI treats it as one scalar value. 
*/ + u32 (*scalar_split_lane_size)(TargetABI*, KitCgTypeId); ABITypeInfo va_list_info; ABIVaListInfo va_list_layout; } ABIVtable; diff --git a/src/abi/abi_rv64.c b/src/abi/abi_rv64.c @@ -178,6 +178,19 @@ static void classify_scalar(TargetABI* a, KitCgTypeId t, ABIArgInfo* out) { out->nparts = 1; } +static u32 riscv32_scalar_split_lane_size(TargetABI* a, KitCgTypeId t) { + RiscvAbiDesc d = riscv_abi_desc(a); + ABITypeInfo ti = abi_internal_type_info(a, t); + int fp_part; + if (d.gpr_bytes != 4u) return 0; + fp_part = (ti.scalar_kind == ABI_SC_FLOAT) && + riscv_fp_eligible(d.flen, ti.size); + if (ti.size == 2u * d.gpr_bytes && !fp_part && + (ti.scalar_kind == ABI_SC_INT || ti.scalar_kind == ABI_SC_FLOAT)) + return d.gpr_bytes; + return 0; +} + static void classify_void(ABIArgInfo* out) { memset(out, 0, sizeof *out); out->kind = ABI_ARG_IGNORE; @@ -326,6 +339,7 @@ const ABIVtable rv64_vtable = { const ABIVtable rv32_vtable = { .compute_func_info = riscv_compute_func_info, + .scalar_split_lane_size = riscv32_scalar_split_lane_size, .va_list_info = {4, 4, ABI_SC_PTR, 0, 0, 0}, /* ILP32* va_list is a plain 4-byte pointer; the variadic register-save * area is the 8 integer arg registers (a0..a7) spilled contiguously = diff --git a/src/cg/arith.c b/src/cg/arith.c @@ -234,8 +234,8 @@ void api_cg_convert_kind(KitCg* g, KitCgTypeId dst_type, ConvKind ck) { i64 folded; if (api_try_fold_int_convert(g, ck, sty, dty, v.op.v.imm, &folded)) { api_release(g, &v); - /* A folded result of rv32 8-byte type must be memory-resident (two lanes), - * not a bare i64 immediate the backend would truncate. */ + /* A folded split-lane 8-byte result must be memory-resident, not a bare + * i64 immediate the backend would truncate. 
*/ if (api_is_wide8_scalar_type(g->c, dty)) api_push(g, api_make_wide8_int_const(g, folded, dty)); else @@ -314,16 +314,16 @@ int api_i128_stack_top(KitCg* g, u32 depth) { return api_is_i128_type(g->c, api_sv_type(&g->stack[g->sp - 1u - depth])); } -/* "Wider than the target machine word but NOT i128" — i.e. a 64-bit integer on - * a 32-bit target (rv32). The native backend handles add/sub/and/or/xor on such - * values as register pairs, but mul/div/shift must be lowered to a __*di3 - * runtime call (see api_wideint64_binop). i128 routes through its own ti3 path - * (api_i128_*), so it is explicitly excluded here. */ +/* 64-bit integer split into two 32-bit lanes by the selected ABI. The native + * backend handles add/sub/and/or/xor on such values as register pairs, but + * mul/div/shift must be lowered to a __*di3 runtime call (see + * api_wideint64_binop). i128 routes through its own ti3 path (api_i128_*), so + * it is explicitly excluded here. */ static int api_int_is_wide64(KitCg* g, KitCgTypeId ty) { if (!g) return 0; if (api_is_i128_type(g->c, ty)) return 0; if (kit_cg_type_int_width((KitCompiler*)g->c, ty) == 0) return 0; - return abi_cg_sizeof(g->c->abi, ty) > g->c->target.ptr_size; + return api_is_wide8_scalar_type(g->c, ty); } static int api_wide64_stack_top(KitCg* g, u32 depth) { @@ -425,13 +425,12 @@ static void api_wideint64_binop(KitCg* g, BinOp iop) { } /* ============================================================ - * wide8 inline 2-word lane arithmetic (rv32 i64) + * wide8 inline 2-word lane arithmetic * - * On rv32 a 64-bit integer is a memory-resident 8-byte scalar. add/sub/and/or/ - * xor/neg/not and compares have no compiler-rt helper (they would recurse), so - * they are emitted INLINE here as 32-bit lane ops, mirroring the i128 lane - * primitives but operating on register-class i32 lanes loaded from / stored to - * the value's memory home. mul/div/rem/shift route to __*di3 (api_wideint64_*). 
+ * Some 32-bit ABIs represent a 64-bit integer as a memory-resident 8-byte + * scalar split into two 32-bit lanes. add/sub/and/or/xor/neg/not and compares + * have no compiler-rt helper (they would recurse), so they are emitted INLINE + * here as lane ops. mul/div/rem/shift route to __*di3 (api_wideint64_*). * ============================================================ */ static i32 wide8_lo_off(KitCg* g) { return g->c->target.big_endian ? 4 : 0; } @@ -620,11 +619,11 @@ static void api_wide64_cmp_inline(KitCg* g, CmpOp cop) { } /* ============================================================ - * wide64 __builtin_*_overflow on rv32 (inline, 2-lane) + * wide64 __builtin_*_overflow on split-lane 64-bit values * * The native backends only model single-register overflow, so a 64-bit * operand traps there. Here we legalize the 6 overflow intrinsics for a - * 64-bit (rv32 i64) operand pair into 32-bit lane ops, computing both the + * 64-bit operand pair into 32-bit lane ops, computing both the * 64-bit wrapped value (stored to a fresh 8-byte temp) and the boolean * overflow flag, then pushing [value, ok] exactly as the native path does. * add/sub reuse the carry/borrow lane logic; mul builds the full 128-bit @@ -682,7 +681,7 @@ static Operand wide8_addc(KitCg* g, Operand acc, Operand addend, return sum; } -/* The 6 __builtin_*_overflow intrinsics for a wide64 (rv32 i64) operand pair. +/* The 6 __builtin_*_overflow intrinsics for a split-lane wide64 operand pair. * Pops the two 8-byte args, computes the wrapped 64-bit value into a fresh * 8-byte temp and the bool overflow flag into an i32, then pushes [value, ok] * matching the contract of the native overflow path. */ @@ -868,7 +867,7 @@ static void api_wide64_overflow_inline(KitCg* g, KitCgIntrinsic intrin) { } } -/* int<->i64 conversions on rv32 (sext/zext/trunc/bitcast across the 4<->8 +/* int<->split-i64 conversions (sext/zext/trunc/bitcast across the 4<->8 * boundary, and i64->bool). 
Returns 1 if it handled (and consumed) *v. The * i64<->float conversions are routed to libcalls in kit_cg_*_to_float / * kit_cg_float_to_* and never reach here. */ @@ -1065,10 +1064,10 @@ void kit_cg_int_binop(KitCg* g, KitCgIntBinOp op, uint32_t flags) { api_i128_binop(g, iop); return; } - /* 64-bit int on a 32-bit target (rv32): mul/div/rem/shift become __*di3 + /* 64-bit int split into 32-bit lanes: mul/div/rem/shift become __*di3 * runtime calls; add/sub/and/or/xor are emitted inline as 2-word lane ops * (no compiler-rt helper exists for them). Both keep the value memory-resident - * so the allocator never tries to put 8 bytes in one 4-byte register. */ + * so the allocator never tries to put 8 bytes in one 4-byte value slot. */ if (g && (api_wide64_stack_top(g, 0) || api_wide64_stack_top(g, 1))) { if (api_wideint64_binop_helper(iop)) api_wideint64_binop(g, iop); @@ -1085,7 +1084,7 @@ void kit_cg_int_unop(KitCg* g, KitCgIntUnOp op, uint32_t flags) { api_i128_unop(g, iop); return; } - /* rv32 64-bit int: neg/bnot are inline 2-word lane ops; logical-not (!x) is + /* Split 64-bit int: neg/bnot are inline 2-word lane ops; logical-not (!x) is * the full-value truthiness test (lo|hi)==0. */ if (g && api_wide64_stack_top(g, 0)) { if (iop == UO_NEG || iop == UO_BNOT) { @@ -1557,8 +1556,7 @@ void kit_cg_sint_to_float(KitCg* g, KitCgTypeId dst, KitCgRounding rounding) { api_f128_call_unary(g, name, resolve_type(g->c, dst), pty); return; } - /* signed i64 -> hardware single float: rv32 has no fcvt.s.l, so the i64->f32 - * conversion is a __floatdisf runtime call (mirrors clang under ilp32f). */ + /* signed split-i64 -> hardware single float: use __floatdisf. 
*/ if (api_wide64_stack_top(g, 0)) { api_f128_call_unary(g, "__floatdisf", resolve_type(g->c, dst), builtin_id(KIT_CG_BUILTIN_I64)); @@ -1645,7 +1643,7 @@ void kit_cg_float_to_sint(KitCg* g, KitCgTypeId dst, KitCgRounding rounding) { if (rty != dty) api_cg_convert_kind(g, dty, CV_TRUNC); return; } - /* hardware single float -> i64: rv32 has no fcvt.l.s, so __fixsfdi. */ + /* hardware single float -> split-i64: use __fixsfdi. */ if (api_is_wide8_scalar_type(g->c, resolve_type(g->c, dst))) { api_f128_call_unary(g, "__fixsfdi", resolve_type(g->c, dst), builtin_id(KIT_CG_BUILTIN_F32)); @@ -1690,7 +1688,7 @@ void kit_cg_float_to_uint(KitCg* g, KitCgTypeId dst, KitCgRounding rounding) { if (rty != dty) api_cg_convert_kind(g, dty, CV_TRUNC); return; } - /* hardware single float -> u64: __fixunssfdi. */ + /* hardware single float -> split-u64: use __fixunssfdi. */ if (api_is_wide8_scalar_type(g->c, resolve_type(g->c, dst))) { api_f128_call_unary(g, "__fixunssfdi", resolve_type(g->c, dst), builtin_id(KIT_CG_BUILTIN_F32)); @@ -1821,10 +1819,10 @@ void kit_cg_intrinsic(KitCg* g, KitCgIntrinsic intrin, uint32_t nargs, u32 ndst = 0; Heap* h; if (!g) return; - /* rv32: clz/ctz/popcount/bswap on a 64-bit value cannot be the single-register - * software sequence the backend emits (it would shift by 32 — an illegal rv32 - * shamt). Route them to the compiler-rt __*di2 helpers, which decompose into - * 32-bit operations. (32-bit forms still lower inline.) */ + /* clz/ctz/popcount/bswap on a split 64-bit value cannot use the backend's + * single-register software sequence. Route them to the compiler-rt __*di2 + * helpers, which decompose into 32-bit operations. (32-bit forms still lower + * inline.) 
*/ if (nargs == 1 && api_wide64_stack_top(g, 0)) { const char* name = NULL; KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32); @@ -1846,10 +1844,10 @@ void kit_cg_intrinsic(KitCg* g, KitCgIntrinsic intrin, uint32_t nargs, return; } } - /* rv32: __builtin_*_overflow on a 64-bit operand pair traps in the native + /* __builtin_*_overflow on a split 64-bit operand pair traps in the native * backend (it only models single-register overflow). Legalize all 6 forms * inline as 2-lane / 4-lane ops, pushing [value, ok] like the native path. - * Gated on both operands being wide64 so 32-bit / non-rv32 are unchanged. */ + * Gated on both operands being wide64 so other targets are unchanged. */ if (nargs == 2 && api_intrinsic_is_overflow(intrin) && api_wide64_stack_top(g, 0) && api_wide64_stack_top(g, 1)) { api_wide64_overflow_inline(g, intrin); diff --git a/src/cg/call.c b/src/cg/call.c @@ -48,9 +48,8 @@ static CGLocal api_materialize_call_local(KitCg* g, ApiSValue* arg, if (api_sv_op_is(arg, OPK_IMM) && api_is_wide16_scalar_type(g->c, ty)) { *arg = api_make_wide16_int_const(g, arg->op.v.imm, ty); } - /* Same for an rv32 8-byte immediate argument: materialize it as a 2-lane - * memory value so the multi-part ABI path marshals both words into the GPR - * pair, instead of load_imm'ing only the low word into one register. */ + /* Same for a split-lane 8-byte immediate argument: materialize it as a + * 2-lane memory value so the multi-part ABI path marshals both words. */ if (api_sv_op_is(arg, OPK_IMM) && api_is_wide8_scalar_type(g->c, ty)) { *arg = api_make_wide8_int_const(g, arg->op.v.imm, ty); } diff --git a/src/cg/control.c b/src/cg/control.c @@ -50,9 +50,9 @@ void api_branch_if(KitCg* g, ApiSValue* v, int branch_when_true, Label label) { api_branch_if(g, &cmp, branch_when_true, label); return; } - /* rv32 8-byte int (or soft double) truthiness: branch on (lo | hi) != 0. 
The - * value is memory-resident, so a single-register CMP_NE-vs-zero would only see - * the low word; OR the two lanes into an i32 first. */ + /* Split-lane 8-byte truthiness: branch on (lo | hi) != 0. The value is + * memory-resident, so a single-slot CMP_NE-vs-zero would only see the low + * word; OR the two lanes into an i32 first. */ if (api_is_wide8_scalar_type(g->c, ty)) { KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32); Operand orl = api_wide8_or_lanes(g, v, ty); diff --git a/src/cg/local.c b/src/cg/local.c @@ -5,8 +5,8 @@ int api_local_requires_memory(KitCg* g, KitCgTypeId ty, KitCgLocalAttrs attrs) { KIT_CG_LOCAL_COMPILER_TEMP; if (g && g->debug && attrs.name && (attrs.flags & hidden_flags) == 0) return 1; - /* Aggregates (records, arrays), wide16 (f128/i128), wide8 (rv32 i64/double), - * vararg state, and any non-scalar type must live in memory. */ + /* Aggregates (records, arrays), wide16 (f128/i128), split-lane wide8, vararg + * state, and any non-scalar type must live in memory. */ if (api_is_wide16_scalar_type(g->c, ty)) return 1; if (api_is_wide8_scalar_type(g->c, ty)) return 1; return !(cg_type_is_int(g->c, ty) || cg_type_is_float(g->c, ty) || diff --git a/src/cg/memory.c b/src/cg/memory.c @@ -12,9 +12,8 @@ void kit_cg_push_int(KitCg* g, uint64_t value, KitCgTypeId type) { api_push(g, api_make_wide16_int_const(g, (i64)value, ty)); return; } - /* rv32 8-byte int: the 64-bit value fits in op.v.imm, but the value is - * memory-resident, so materialize it as two 32-bit lanes (a register - * load_imm of an i64 on rv32 would keep only the low word). */ + /* Split-lane 8-byte int: the 64-bit value fits in op.v.imm, but the value is + * memory-resident, so materialize it as two 32-bit lanes. 
*/ if (api_is_wide8_scalar_type(g->c, ty)) { api_push(g, api_make_wide8_int_const(g, (i64)value, ty)); return; @@ -40,9 +39,8 @@ void kit_cg_push_float(KitCg* g, double value, KitCgTypeId type) { api_push(g, api_make_f128_const(g, value, ty)); return; } - /* rv32 soft double: the 8-byte value is memory-resident, so materialize the - * IEEE-754 binary64 pattern as two 32-bit lanes (a register load_const of an - * 8-byte value on rv32 would keep only the low word). */ + /* Split-lane double: the 8-byte value is memory-resident, so materialize the + * IEEE-754 binary64 pattern as two 32-bit lanes. */ if (api_is_wide8_scalar_type(g->c, ty)) { union { double d; @@ -441,8 +439,8 @@ void kit_cg_store(KitCg* g, KitCgMemAccess access) { api_is_wide16_scalar_type(g->c, ty)) { rv = api_make_wide16_int_const(g, rv.op.v.imm, ty); } - /* Same for an rv32 8-byte immediate: lower it to a 2-lane memory value so the - * store moves a full 64-bit value rather than load_imm'ing only the low word. */ + /* Same for a split-lane 8-byte immediate: lower it to a 2-lane memory value + * so the store moves a full 64-bit value rather than only the low word. 
*/ if (!is_bitfield && api_sv_op_is(&rv, OPK_IMM) && api_is_wide8_scalar_type(g->c, ty)) { rv = api_make_wide8_int_const(g, rv.op.v.imm, ty); diff --git a/src/cg/native_direct_target.c b/src/cg/native_direct_target.c @@ -15,6 +15,7 @@ #include <string.h> +#include "abi/abi.h" #include "cg/type.h" #include "core/arena.h" #include "core/pool.h" @@ -1207,7 +1208,7 @@ static void nd_load_imm(CgTarget* t, Operand dst, i64 imm) { NativeLoc reg; if (nd_is_wide64_int(d, dst.type)) nd_panic(d, - "64-bit integer immediate reached the rv32 backend un-lowered " + "64-bit integer immediate reached the backend un-lowered " "(cg should materialize it as two 32-bit lanes)"); reg = nd_dst_reg(d, dst); ND_REQUIRE_NATIVE(d, load_imm, "target does not emit immediates"); @@ -1220,7 +1221,7 @@ static void nd_load_const(CgTarget* t, Operand dst, ConstBytes cbytes) { NativeLoc reg; if (nd_is_wide64_int(d, dst.type) || nd_is_soft_double(d, dst.type)) nd_panic(d, - "8-byte constant reached the rv32 backend un-lowered (cg should " + "8-byte constant reached the backend un-lowered (cg should " "materialize it as two 32-bit lanes)"); reg = nd_dst_reg(d, dst); ND_REQUIRE_NATIVE(d, load_const, "target does not emit byte constants"); @@ -1456,22 +1457,23 @@ static void nd_bitfield_store(CgTarget* t, Operand record_addr, Operand src, nd_addr_temps_release(d, &temps); } -/* Last line of defense against an unlowered wide/soft-float op reaching the - * machine backend. The cg-layer gates in src/cg/arith.c route i64 mul/div/shift - * (rv32) and all soft-double arith/convert/compare to runtime calls; if one - * escapes, the native backend would silently emit wrong code. These guards turn - * that into a loud compiler_panic. Every condition is gated on ptr_size==4 - * (rv32) and/or float_abi SOFT|SINGLE, so x64/aa64/rv64 (ptr_size 8, or double - * in FP regs) never trip them. */ +/* Last line of defense against an unlowered split-scalar/soft-float op reaching + * the machine backend. 
The cg-layer gates in src/cg/arith.c route split i64 + * mul/div/shift and all soft-double arith/convert/compare to runtime calls; if + * one escapes, the native backend would silently emit wrong code. */ +static int nd_is_split_wide8_scalar(NativeDirectTarget* d, KitCgTypeId ty) { + return abi_cg_scalar_split_lane_size(d->base.c->abi, ty) == 4u && + native_type_size(d->native, ty) == 8u; +} + static int nd_is_wide64_int(NativeDirectTarget* d, KitCgTypeId ty) { - if (d->base.c->target.ptr_size != 4) return 0; /* rv32 only */ + if (!nd_is_split_wide8_scalar(d, ty)) return 0; if (kit_cg_type_int_width((KitCompiler*)d->base.c, ty) == 0) return 0; - return cg_type_size(d->base.c, ty) > d->base.c->target.ptr_size; + return 1; } static int nd_is_soft_double(NativeDirectTarget* d, KitCgTypeId ty) { - u8 abi = d->base.c->target.float_abi; - if (abi != KIT_FLOAT_ABI_SOFT && abi != KIT_FLOAT_ABI_SINGLE) return 0; + if (!nd_is_split_wide8_scalar(d, ty)) return 0; return kit_cg_type_float_width((KitCompiler*)d->base.c, ty) == 64; } @@ -1480,13 +1482,14 @@ static void nd_binop(CgTarget* t, BinOp op, Operand dst, Operand a, Operand b) { NativeLoc ar; NativeLoc br; NativeLoc dr; - /* No 8-byte value reaches a single GPR op on rv32: the cg layer lowers i64 - * add/sub/and/or/xor to inline 2-word lane sequences and mul/div/rem/shift to - * __*di3 runtime calls (src/cg/arith.c). Anything that slips through here would - * silently compute only the low word, so fail loudly instead. */ + /* No split-lane 8-byte value reaches a single-register op: the cg layer + * lowers i64 add/sub/and/or/xor to inline 2-word lane sequences and + * mul/div/rem/shift to __*di3 runtime calls (src/cg/arith.c). Anything that + * slips through here would silently compute only the low word, so fail + * loudly instead. 
*/ if (nd_is_wide64_int(d, a.type) || nd_is_wide64_int(d, dst.type)) { nd_panic(d, - "64-bit integer arithmetic reached the rv32 backend un-lowered " + "64-bit integer arithmetic reached the backend un-lowered " "(cg should emit a 2-word lane sequence or a __*di3 runtime call)"); } if (nd_is_soft_double(d, a.type) || nd_is_soft_double(d, dst.type)) { @@ -1513,7 +1516,7 @@ static void nd_unop(CgTarget* t, UnOp op, Operand dst, Operand a) { * OTHER soft-double unop reaching the backend is an unlowered escape. */ if (nd_is_wide64_int(d, a.type) || nd_is_wide64_int(d, dst.type)) { nd_panic(d, - "64-bit integer unary op reached the rv32 backend un-lowered " + "64-bit integer unary op reached the backend un-lowered " "(cg should emit a 2-word lane sequence)"); } if (op != UO_FNEG && @@ -1538,7 +1541,7 @@ static void nd_cmp(CgTarget* t, CmpOp op, Operand dst, Operand a, Operand b) { * GPR compare here. */ if (nd_is_wide64_int(d, a.type) || nd_is_wide64_int(d, b.type)) { nd_panic(d, - "64-bit integer compare reached the rv32 backend un-lowered " + "64-bit integer compare reached the backend un-lowered " "(cg should emit a 2-word lane sequence)"); } if (nd_is_soft_double(d, a.type) || nd_is_soft_double(d, b.type)) { @@ -1565,7 +1568,7 @@ static void nd_convert(CgTarget* t, ConvKind op, Operand dst, Operand src) { * calls; none reaches a single-register convert here. 
*/ if (nd_is_wide64_int(d, src.type) || nd_is_wide64_int(d, dst.type)) { nd_panic(d, - "64-bit integer conversion reached the rv32 backend un-lowered " + "64-bit integer conversion reached the backend un-lowered " "(cg should emit a 2-word lane sequence or a runtime call)"); } if (nd_is_soft_double(d, src.type) || nd_is_soft_double(d, dst.type)) { diff --git a/src/cg/value.c b/src/cg/value.c @@ -25,18 +25,17 @@ int api_is_wide16_scalar_type(Compiler* c, KitCgTypeId ty) { return api_is_f128_type(c, ty) || api_is_i128_type(c, ty); } -/* rv32 only: an 8-byte scalar (long long / int64_t, and — under ilp32f/ilp32 — - * soft double) is twice the 4-byte machine word. Like wide16 (i128/f128) on a - * 64-bit target it cannot live in a single GPR, so it is forced memory-resident - * and every operation is legalized to a 2-word lane sequence: add/sub/and/or/ - * xor/neg/not/compare inline (src/cg/wide.c, no compiler-rt helper exists for - * 64-bit add), mul/div/rem/shift and all soft-double arith/convert to runtime - * calls. Defined as exactly 8 bytes on a 4-byte-pointer target, so it matches - * only rv32 i64/double and never fires on rv64/x64/aa64 (ptr_size 8). */ +/* 8-byte scalar split into two 4-byte lanes by the selected ABI. This covers + * 32-bit native ABIs whose generic CG path cannot keep the value in a single + * scalar register/value. Such values are forced memory-resident so operations + * can be legalized as lane sequences or runtime calls. 
*/ int api_is_wide8_scalar_type(Compiler* c, KitCgTypeId ty) { - if (c->target.ptr_size != 4u) return 0; - if (!(cg_type_is_int(c, ty) || cg_type_is_float(c, ty))) return 0; - return abi_cg_sizeof(c->abi, ty) == 8u; + ABITypeInfo ti; + if (!c || !c->abi || !ty) return 0; + if (abi_cg_scalar_split_lane_size(c->abi, ty) != 4u) return 0; + ti = abi_cg_type_info(c->abi, ty); + return ti.size == 8u && + (ti.scalar_kind == ABI_SC_INT || ti.scalar_kind == ABI_SC_FLOAT); } Operand api_op_imm(i64 v, KitCgTypeId ty) { @@ -255,11 +254,10 @@ CGLocal api_alloc_temp_local(KitCg* g, KitCgTypeId ty) { d.size = abi_cg_sizeof(g->c->abi, ty); d.align = abi_cg_alignof(g->c->abi, ty); } - /* An rv32 8-byte scalar temp (i64/soft-double arithmetic result, call result, - * etc.) must live in memory so its two words are addressable for lane ops and - * the multi-part ABI path; the allocator gives an unflagged scalar a single - * register, which would truncate it. (wide16 temps are already forced via the - * size>word auto-home in cg_ir_lower.) */ + /* A split-lane 8-byte scalar temp must live in memory so its two words are + * addressable for lane ops and the multi-part ABI path; the allocator gives + * an unflagged scalar one value slot, which would truncate it. (wide16 temps + * are already forced via the size>word auto-home in cg_ir_lower.) */ if (ty && api_is_wide8_scalar_type(g->c, ty)) d.flags |= CG_LOCAL_MEMORY_REQUIRED; local = g->target->local(g->target, &d); diff --git a/src/cg/wide.c b/src/cg/wide.c @@ -130,17 +130,16 @@ ApiSValue api_make_f128_const(KitCg* g, double value, KitCgTypeId ty) { } /* ============================================================ - * wide8 — rv32 8-byte (2-word) scalar lane plumbing + * wide8 — 8-byte scalar split into two 4-byte lanes * - * On rv32 a long long / int64_t (and, under ilp32f/ilp32, a soft double) is two - * machine words. 
Like the wide16 (i128/f128) scalars above it is memory-resident - * (api_is_wide8_scalar_type forces CG_LOCAL_MEMORY_REQUIRED), but its arithmetic - * is done INLINE as 2-word lane sequences (src/cg/arith.c) rather than via a - * runtime call, because compiler-rt has no 64-bit add/sub/and/or/xor helper. The - * lane size is the 4-byte word (ptr_size); the low word is at offset 0 on a - * little-endian target (rv32 is LE; the big-endian offsets are kept for parity - * with the wide16 helpers). These primitives are the inline analogue of - * api_store_f128_bytes / api_i128_addr / api_i128_load_lane. + * Some 32-bit ABIs represent long long / int64_t, and sometimes soft double, + * as two machine words. Like the wide16 (i128/f128) scalars above it is + * memory-resident (api_is_wide8_scalar_type forces CG_LOCAL_MEMORY_REQUIRED), + * but its arithmetic is done INLINE as 2-word lane sequences (src/cg/arith.c) + * rather than via a runtime call, because compiler-rt has no 64-bit + * add/sub/and/or/xor helper. The lane size is 4 bytes; the low word is at + * offset 0 on a little-endian target. These primitives are the inline analogue + * of api_store_f128_bytes / api_i128_addr / api_i128_load_lane. * ============================================================ */ /* Allocate an 8-byte memory-resident, address-taken scalar temp. */ @@ -196,11 +195,11 @@ Operand api_wide8_addr(KitCg* g, ApiSValue* v, KitCgTypeId ty) { } else { lv = *v; } - /* A delayed value (SV_CMP/SV_ARITH) — e.g. an rv32 i64 produced by `!cmp` - * routed here through api_wide64_cmp_inline — is not yet a place. Materialize - * it first: api_ensure_local lowers it into a memory-resident wide8 temp - * (api_alloc_temp_local forces CG_LOCAL_MEMORY_REQUIRED for an 8-byte scalar), - * which is a real addressable home. Materialization, however, clears + /* A delayed value (SV_CMP/SV_ARITH) routed here through the wide64 helpers is + * not yet a place. 
Materialize it first: api_ensure_local lowers it into a + * memory-resident wide8 temp (api_alloc_temp_local forces + * CG_LOCAL_MEMORY_REQUIRED for an 8-byte scalar), which is a real addressable + * home. Materialization, however, clears * sv.lvalue (fold.c), so we must set the flag AFTER it runs — otherwise the * lvalue check in api_lvalue_addr fails ("addr operand is not an lvalue"). * Doing this before api_lvalue_addr also makes its own api_ensure_local a diff --git a/src/opt/cg_ir_lower.c b/src/opt/cg_ir_lower.c @@ -37,10 +37,16 @@ typedef struct CgIrLower { u32* inst_block; u8* leader; CGLocal mat_local[CG_IR_LOWER_MAX_MAT]; + u8 mat_role[CG_IR_LOWER_MAX_MAT]; Reg mat_reg[CG_IR_LOWER_MAX_MAT]; u32 nmat; } CgIrLower; +typedef enum CgIrMatRole { + CG_IR_MAT_BASE = 0, + CG_IR_MAT_INDEX = 1, +} CgIrMatRole; + static _Noreturn void lower_panic(CgIrLower* l, SrcLoc loc, const char* msg) { compiler_panic(l->c, loc, "opt cg-ir lower: %s", msg); } @@ -483,26 +489,54 @@ static OptOperand opt_frame_operand(OptLocalMap* m) { * in its frame home, so storage.v.reg is meaningless; load the home into a * fresh PReg. prematerialize_indirect_bases emits that load before the using * instruction; here we just look the result up (l->mat_*). */ -static Reg resolve_indirect_base_reg(CgIrLower* l, CGLocal local, SrcLoc loc) { +static Reg resolve_materialized_reg(CgIrLower* l, CGLocal local, + CgIrMatRole role, SrcLoc loc) { OptLocalMap* m = local_map(l, local, loc); if (m->storage.kind == CG_LOCAL_STORAGE_REG) return m->storage.v.reg; for (u32 i = 0; i < l->nmat; ++i) - if (l->mat_local[i] == local) return l->mat_reg[i]; - lower_panic(l, loc, "indirect base local not materialized"); + if (l->mat_local[i] == local && l->mat_role[i] == (u8)role) + return l->mat_reg[i]; + lower_panic(l, loc, role == CG_IR_MAT_INDEX + ? 
"indirect index local not materialized" + : "indirect base local not materialized"); +} + +static KitCgTypeId pointer_sized_int_type(CgIrLower* l) { + return builtin_id(l->c->target.ptr_size <= 4u ? KIT_CG_BUILTIN_I32 + : KIT_CG_BUILTIN_I64); +} + +static void remember_materialized_reg(CgIrLower* l, CGLocal local, + CgIrMatRole role, Reg r, SrcLoc loc) { + if (l->nmat >= CG_IR_LOWER_MAX_MAT) + lower_panic(l, loc, "too many frame indirect operands in one instruction"); + l->mat_local[l->nmat] = local; + l->mat_role[l->nmat] = (u8)role; + l->mat_reg[l->nmat] = r; + l->nmat++; +} + +static int materialized_reg_exists(CgIrLower* l, CGLocal local, + CgIrMatRole role) { + for (u32 i = 0; i < l->nmat; ++i) + if (l->mat_local[i] == local && l->mat_role[i] == (u8)role) return 1; + return 0; } -/* Emit `r = load <local home>` once per instruction for each FRAME-storage - * local used as an OPK_INDIRECT base/index, recording r in l->mat_*. Must run - * before the consuming instruction is emitted so the load dominates its uses. - */ +static OptOperand opt_frame_operand_as(OptLocalMap* m, KitCgTypeId type) { + OptOperand out = opt_frame_operand(m); + out.type = type ? type : m->type; + return out; +} + +/* Emit the pre-materialization needed for a FRAME-storage local used as an + * OPK_INDIRECT base. A pointer-typed local holds the base pointer value and is + * loaded. A non-pointer local names storage, so its frame address is the base. 
*/ static void materialize_frame_base(CgIrLower* l, u32 block, CGLocal local, SrcLoc loc) { OptLocalMap* m = local_map(l, local, loc); if (m->storage.kind == CG_LOCAL_STORAGE_REG) return; - for (u32 i = 0; i < l->nmat; ++i) - if (l->mat_local[i] == local) return; - if (l->nmat >= CG_IR_LOWER_MAX_MAT) - lower_panic(l, loc, "too many frame indirect bases in one instruction"); + if (materialized_reg_exists(l, local, CG_IR_MAT_BASE)) return; PReg r = ir_alloc_preg(l->f, m->type, RC_INT); OptOperand ops[2]; ops[1] = opt_frame_operand(m); @@ -537,9 +571,40 @@ static void materialize_frame_base(CgIrLower* l, u32 block, CGLocal local, ao->def = (Val)r; ao->type = m->type; } - l->mat_local[l->nmat] = local; - l->mat_reg[l->nmat] = (Reg)r; - l->nmat++; + remember_materialized_reg(l, local, CG_IR_MAT_BASE, (Reg)r, loc); +} + +/* Emit `r = load <local home>` for a FRAME-storage local used as an + * OPK_INDIRECT index. Unlike a non-pointer base, an index always needs the + * local's value. On rv32, Toy indexes are i64 and therefore memory-backed; the + * address calculation only consumes the pointer-width low word. 
*/ +static void materialize_frame_index(CgIrLower* l, u32 block, CGLocal local, + SrcLoc loc) { + OptLocalMap* m = local_map(l, local, loc); + if (m->storage.kind == CG_LOCAL_STORAGE_REG) return; + if (materialized_reg_exists(l, local, CG_IR_MAT_INDEX)) return; + KitCgTypeId idx_ty = pointer_sized_int_type(l); + PReg r = ir_alloc_preg(l->f, idx_ty, RC_INT); + OptOperand ops[2]; + Inst* ld = ir_emit(l->f, block, IR_LOAD); + ld->loc = loc; + memset(&ops[0], 0, sizeof ops[0]); + ops[0].kind = OPK_REG; + ops[0].cls = RC_INT; + ops[0].type = idx_ty; + ops[0].v.reg = (Reg)r; + ops[1] = opt_frame_operand_as(m, idx_ty); + ld->opnds = dup_opt_ops(l, ops, 2); + ld->nopnds = 2; + ld->def = (Val)r; + ld->type = idx_ty; + memset(&ld->extra.mem, 0, sizeof ld->extra.mem); + ld->extra.mem.type = idx_ty; + ld->extra.mem.size = l->c->target.ptr_size; + ld->extra.mem.align = m->align && m->align < l->c->target.ptr_size + ? m->align + : l->c->target.ptr_size; + remember_materialized_reg(l, local, CG_IR_MAT_INDEX, (Reg)r, loc); } /* Scan the CG instruction's operands for OPK_INDIRECT bases/indices that are @@ -552,7 +617,7 @@ static void prematerialize_indirect_bases(CgIrLower* l, const CgIrInst* in, if (op->kind != OPK_INDIRECT) continue; materialize_frame_base(l, block, op->v.ind.base, in->loc); if (op->v.ind.index != CG_LOCAL_NONE) - materialize_frame_base(l, block, op->v.ind.index, in->loc); + materialize_frame_index(l, block, op->v.ind.index, in->loc); } } @@ -590,10 +655,12 @@ static OptOperand lower_operand_addr(CgIrLower* l, const Operand* in, case OPK_INDIRECT: { out.kind = OPK_INDIRECT; out.cls = RC_INT; - out.v.ind.base = resolve_indirect_base_reg(l, in->v.ind.base, loc); + out.v.ind.base = + resolve_materialized_reg(l, in->v.ind.base, CG_IR_MAT_BASE, loc); out.v.ind.index = REG_NONE; if (in->v.ind.index != CG_LOCAL_NONE) - out.v.ind.index = resolve_indirect_base_reg(l, in->v.ind.index, loc); + out.v.ind.index = + resolve_materialized_reg(l, in->v.ind.index, CG_IR_MAT_INDEX, 
loc); out.v.ind.log2_scale = in->v.ind.log2_scale; out.v.ind.ofs = in->v.ind.ofs; return out; diff --git a/test/parse/cases/6_2_5_04_data_model_widths.c b/test/parse/cases/6_2_5_04_data_model_widths.c @@ -0,0 +1,23 @@ +#include <stddef.h> +#include <stdint.h> + +_Static_assert(sizeof(long) == __SIZEOF_LONG__, "long width"); +_Static_assert(sizeof(unsigned long) == __SIZEOF_LONG__, "unsigned long width"); +_Static_assert(sizeof(size_t) == __SIZEOF_SIZE_T__, "size_t width"); +_Static_assert(sizeof(ptrdiff_t) == __SIZEOF_PTRDIFF_T__, "ptrdiff_t width"); +_Static_assert(sizeof(intptr_t) == __SIZEOF_POINTER__, "intptr_t width"); +_Static_assert(sizeof(uintptr_t) == __SIZEOF_POINTER__, "uintptr_t width"); + +int test_main(void) { + int a[2]; + int score = 0; + if (sizeof(long) == __SIZEOF_LONG__) score += 1; + if (sizeof(unsigned long) == __SIZEOF_LONG__) score += 2; + if (sizeof(size_t) == __SIZEOF_SIZE_T__) score += 4; + if (sizeof(ptrdiff_t) == __SIZEOF_PTRDIFF_T__) score += 8; + if (sizeof(intptr_t) == __SIZEOF_POINTER__) score += 16; + if (sizeof(uintptr_t) == __SIZEOF_POINTER__) score += 5; + score += _Generic(sizeof(int), size_t: 3, default: 0); + score += _Generic((&a[1] - &a[0]), ptrdiff_t: 3, default: 0); + return score; +} diff --git a/test/parse/cases/6_2_5_04_data_model_widths.expected b/test/parse/cases/6_2_5_04_data_model_widths.expected @@ -0,0 +1 @@ +42