kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 1576564d10ca3d46a2d2b708384883adcda9270c
parent 86a819d3f72a1da40cc323e83bc411fefd5fea50
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Wed,  3 Jun 2026 20:37:25 -0700

rv32: fix 8-byte variadic high-word drop, O1 label-addr width, long double, atomics

Variadic 8-byte args (i64 / soft-double) were dropping their high 32 bits on
rv32. The caller synthesized a single 8-byte ABI part for an unnamed arg, which
the per-part marshaller loaded into one GPR (low word only); the callee va_arg
fetched 4 bytes and advanced the cursor by one slot.

- rv_param_abi: split a variadic scalar wider than one GPR into one INT part per
  word (low word in the lower-numbered reg), matching the named-arg classifier.
- rv_va_arg_wide: new helper that copies the full value out of the (contiguous)
  save area and advances the cursor by the whole span; used by both the -O0
  direct and -O1 native-emit paths for any value too wide for one GPR.
- pass_native_emit IR_VA_ARG: route aggregate/oversized va_arg through its memory
  destination instead of a scratch register. rv64/aa64/x64 unaffected (no test
  passes an 8-byte value as wide there).

-O1 static-data label addresses (jump tables, &&label-in-data) panicked with
"unsupported local static label width" on rv32: the native-emit path hardcoded
R_ABS64. Make it width-aware (R_ABS32 for 4-byte slots), mirroring the -O0
direct path. This also fixes the gnu_label_addr_* parse cases.

long double on rv32 is a double alias (8-byte), not IEEE-754 binary128, so the
ldbl128_* cases (which assert sizeof==16 / MANT_DIG==113) can't hold there; add
.rv32.skip sidecars. Generic long-double use (6_7_2_12) keeps working as double.

builtin_24_atomic_lock_free: size the lock-free checks against the native word
(sizeof(void*)) instead of a fixed 8 bytes, and drop the __int128 dead code, so
the same .expected (42) holds on ilp32 and lp64. rv32 has no 64-bit AMO, so an
8-byte atomic is correctly not lock-free there.

Diffstat:
M src/arch/riscv/native.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
M src/opt/pass_native_emit.c | 29 +++++++++++++++++++++++++----
M test/parse/cases/builtin_24_atomic_lock_free.c | 19 +++++++------------
A test/parse/cases/ldbl128_01_layout_macros.rv32.skip | 1 +
A test/parse/cases/ldbl128_02_literal_to_int.rv32.skip | 1 +
A test/parse/cases/ldbl128_03_arith.rv32.skip | 1 +
A test/parse/cases/ldbl128_04_conversions.rv32.skip | 1 +
A test/parse/cases/ldbl128_05_compare.rv32.skip | 1 +
A test/parse/cases/ldbl128_06_call_return.rv32.skip | 1 +
A test/parse/cases/ldbl128_07_struct_storage.rv32.skip | 1 +
A test/parse/cases/ldbl128_08_literal_bits.rv32.skip | 1 +
A test/parse/cases/ldbl128_09_global_init.rv32.skip | 1 +
A test/parse/cases/ldbl128_10_unary_neg.rv32.skip | 1 +
A test/parse/cases/ldbl128_11_array_copy.rv32.skip | 1 +
A test/parse/cases/ldbl128_12_stack_args.rv32.skip | 1 +
A test/parse/cases/ldbl128_13_mixed_arith.rv32.skip | 1 +
A test/parse/cases/ldbl128_14_struct_return.rv32.skip | 1 +
A test/parse/cases/ldbl128_15_arbitrary_mul.rv32.skip | 1 +
18 files changed, 137 insertions(+), 25 deletions(-)

diff --git a/src/arch/riscv/native.c b/src/arch/riscv/native.c @@ -1854,19 +1854,43 @@ static const ABIArgInfo* rv_param_abi(NativeTarget* t, const ABIFuncInfo* abi, * passes variadic FP args in INTEGER registers (as their bit pattern), not * the FP pool — so a variadic float part is ABI_CLASS_INT. */ int variadic = abi && i >= abi->nparams; + u32 gpr = rv_of(t)->variant->ptr_bytes; /* GPR width: 4 ilp32 / 8 lp64 */ + u32 sz, align; + int is_fp; if (abi && i < abi->nparams) return &abi->params[i]; + sz = native_type_size(t, desc->args[i].type); + align = native_type_align(t, desc->args[i].type); + /* A variadic FP arg rides the INTEGER pool as its bit pattern (RISC-V passes + * unnamed FP args in GPRs), so it is INT-class here. */ + is_fp = !variadic && cg_type_is_float(t->c, desc->args[i].type); memset(scratch, 0, sizeof *scratch); scratch->kind = ABI_ARG_DIRECT; + /* A scalar wider than one GPR (an 8-byte i64 / soft-double on ilp32) rides a + * register pair, matching the named-arg classifier (abi_rv64.c). Synthesize + * one INT part per GPR-word so the per-part marshaller fills both registers + * (low word in the lower-numbered reg) instead of dropping the high half into + * a single register. FP-class args (hardware-float, size<=GPR) stay single. */ + if (!is_fp && sz > gpr) { + u32 nparts = (sz + gpr - 1u) / gpr, p; + ABIArgPart* parts = arena_zarray(t->c->tu, ABIArgPart, nparts); + for (p = 0; p < nparts; ++p) { + u32 off = p * gpr; + parts[p].cls = ABI_CLASS_INT; + parts[p].loc = ABI_LOC_REG; + parts[p].size = (sz - off) < gpr ? (sz - off) : gpr; + parts[p].align = gpr; + parts[p].src_offset = off; + } + scratch->nparts = (u16)nparts; + scratch->parts = parts; + return scratch; + } scratch->nparts = 1; scratch->parts = arena_zarray(t->c->tu, ABIArgPart, 1); - ((ABIArgPart*)scratch->parts)[0].cls = - (!variadic && cg_type_is_float(t->c, desc->args[i].type)) ? ABI_CLASS_FP - : ABI_CLASS_INT; + ((ABIArgPart*)scratch->parts)[0].cls = is_fp ? 
ABI_CLASS_FP : ABI_CLASS_INT; ((ABIArgPart*)scratch->parts)[0].loc = ABI_LOC_REG; - ((ABIArgPart*)scratch->parts)[0].size = - native_type_size(t, desc->args[i].type); - ((ABIArgPart*)scratch->parts)[0].align = - native_type_align(t, desc->args[i].type); + ((ABIArgPart*)scratch->parts)[0].size = sz; + ((ABIArgPart*)scratch->parts)[0].align = align; return scratch; } @@ -2737,6 +2761,41 @@ static void rv_va_start_core(RvNativeTarget* a, NativeAddr ap) { native_mem_for_type(t, i64t, v->ptr_bytes)); } +/* Wide / aggregate va_arg: a value too large for a single GPR (an 8-byte + * i64 / soft-double on ilp32) occupies consecutive GP slots in the save area + * and cannot move through one register. Read the cursor, advance it past the + * whole span, then byte-copy the value from the (saved) cursor into the + * destination memory. RV_TMP2 holds the cursor across the rv_copy_bytes call, + * which itself uses RV_TMP0/RV_TMP1/RV_TMP3. */ +static void rv_va_arg_wide(RvNativeTarget* a, NativeAddr dst, NativeAddr ap, + u32 sz) { + NativeTarget* t = &a->base; + const RiscvVariant* v = a->variant; + MCEmitter* mc = t->mc; + KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64); + ABIVaListInfo vai = abi_va_list_layout(t->c->abi); + u32 slot = vai.gp_slot_size ? vai.gp_slot_size : v->gp_slot_bytes; + u32 span = align_up_u32(sz, slot); + NativeLoc cur = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP2); + NativeLoc nxt = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1); + NativeAddr src; + AggregateAccess acc; + /* cur = *ap; *ap = cur + span. */ + rv_emit_mem(a, 1, cur, ap, native_mem_for_type(t, i64t, v->ptr_bytes)); + rv64_emit32(mc, rv_addi(RV_TMP1, RV_TMP2, (i32)span)); + rv_emit_mem(a, 0, nxt, ap, native_mem_for_type(t, i64t, v->ptr_bytes)); + /* Copy sz bytes from [cur] to the destination. 
*/ + memset(&src, 0, sizeof src); + src.base_kind = NATIVE_ADDR_BASE_REG; + src.base.reg = RV_TMP2; + src.base_type = i64t; + memset(&acc, 0, sizeof acc); + acc.type = i64t; + acc.size = sz; + acc.align = slot; + rv_copy_bytes(t, dst, src, acc); +} + static void rv_va_arg_core(RvNativeTarget* a, NativeLoc dst, NativeAddr ap, KitCgTypeId type) { NativeTarget* t = &a->base; @@ -2796,9 +2855,22 @@ static NativeAddr rv_va_addr_from_ptr(NativeLoc ap_ptr) { static void rv_va_start_native(NativeTarget* t, NativeLoc ap_ptr) { rv_va_start_core(rv_of(t), rv_va_addr_from_ptr(ap_ptr)); } +/* A scalar whose value cannot move through one GPR (size > GPR width, e.g. an + * 8-byte i64 / soft-double on ilp32). pass_native_emit hands such a va_arg its + * memory destination directly rather than a scratch register. */ +static int rv_va_arg_is_wide(NativeTarget* t, KitCgTypeId type) { + return native_type_size(t, type) > rv_of(t)->variant->ptr_bytes; +} + static void rv_va_arg_native(NativeTarget* t, NativeLoc dst, NativeLoc ap_ptr, KitCgTypeId type) { - rv_va_arg_core(rv_of(t), dst, rv_va_addr_from_ptr(ap_ptr), type); + RvNativeTarget* a = rv_of(t); + if (rv_va_arg_is_wide(t, type)) { + rv_va_arg_wide(a, rv_loc_addr(a, dst, 0), rv_va_addr_from_ptr(ap_ptr), + native_type_size(t, type)); + return; + } + rv_va_arg_core(a, dst, rv_va_addr_from_ptr(ap_ptr), type); } static void rv_va_end_native(NativeTarget* t, NativeLoc ap_ptr) { (void)t; @@ -3852,9 +3924,18 @@ static void rv_va_start_(NativeDirectTarget* d, Operand ap_addr) { static void rv_va_arg_(NativeDirectTarget* d, Operand dst, Operand ap_addr, KitCgTypeId type) { RvNativeTarget* a = rv_of(d->native); + NativeAllocClass cls; + /* A value too wide for one GPR (8-byte i64 / soft-double on ilp32) is copied + * straight from the save area into its destination memory. 
*/ + if (rv_va_arg_is_wide(d->native, type)) { + rv_va_arg_wide(a, rv_direct_addr(d, dst), + rv_direct_va_base(d, ap_addr, RV_TMP3), + native_type_size(d->native, type)); + return; + } /* Float-ABI-aware class: a soft (or wider-than-flen) float is INT-class so * the va_arg fetch never lands a double in an FP register on rv32. */ - NativeAllocClass cls = native_class_for_type_fp_le8(d->native, type); + cls = native_class_for_type_fp_le8(d->native, type); NativeLoc res = native_loc_reg(type, cls, cls == NATIVE_REG_FP ? RV_FTMP0 : RV_TMP0); NativeAddr dst_addr; diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c @@ -98,13 +98,23 @@ static void emit_local_static_label_addr(NativeEmitCtx* e, MCLabel target, i64 addend, u32 width, SrcLoc loc) { u8 zero[8]; u32 off; + RelocKind kind; if (!e->local_static_active) emit_panic(e, loc, "local static data inactive"); - if (width != 8u) emit_panic(e, loc, "unsupported local static label width"); + /* A jump-table / label-address slot is one target pointer wide: 8 bytes + * (R_ABS64) on a 64-bit target, 4 bytes (R_ABS32) on rv32/ELFCLASS32. 
*/ + if (width == 8u) + kind = R_ABS64; + else if (width == 4u) + kind = R_ABS32; + else { + emit_panic(e, loc, "unsupported local static label width"); + return; + } memset(zero, 0, sizeof zero); off = e->local_static_base + e->local_static_size; obj_write(e->target->obj, e->local_static_sec, zero, width); e->target->mc->emit_label_data_reloc(e->target->mc, e->local_static_sec, off, - target, R_ABS64, width, addend); + target, kind, width, addend); e->local_static_size += width; } @@ -1186,10 +1196,21 @@ static void emit_inst(NativeEmitCtx* e, u32 block, u32 order_index, Inst* in, NativeLoc ap = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc), NATIVE_REG_INT, in->opnds[1].type, REG_NONE, REG_NONE, in->loc); + NativeLoc res; + if (type_is_aggregate_or_large(e, ty)) { + /* A value too wide for one register (an 8-byte i64/double on a 32-bit + * target, or an aggregate) can't pass through a scratch register; hand + * the target its memory destination so it can copy the value directly. + */ + e->target->va_arg_(e->target, loc_from_operand(e, &in->opnds[0], + in->loc), + ap, ty); + return; + } /* The result must land in a register distinct from the va_list pointer; * fetch into a scratch register, then write to the real destination. */ - NativeLoc res = scratch_loc(e, ty, class_for_type(e, ty), ap.v.reg, - REG_NONE, in->loc); + res = scratch_loc(e, ty, class_for_type(e, ty), ap.v.reg, REG_NONE, + in->loc); e->target->va_arg_(e->target, res, ap, ty); write_loc(e, loc_from_operand(e, &in->opnds[0], in->loc), res, mem_for_type(e->c, ty), in->loc); diff --git a/test/parse/cases/builtin_24_atomic_lock_free.c b/test/parse/cases/builtin_24_atomic_lock_free.c @@ -1,20 +1,15 @@ +/* Lock-free atomics are sized against the native word, not a fixed 64 bits: + * an 8-byte atomic is lock-free on a 64-bit target but not on rv32 (which has + * no 64-bit AMO). 
Drive the size checks from the pointer width (the native + * atomic word: 4 on ilp32, 8 on lp64) so the same .expected holds on both. A + * width strictly wider than the word is never lock-free anywhere. */ int test_main(void) { int x = 0; - long y = 0; int score = 0; if (__atomic_always_lock_free(1, &x)) score += 1; if (__atomic_always_lock_free(2, &x)) score += 2; if (__atomic_always_lock_free(4, &x)) score += 4; - if (__atomic_is_lock_free(8, &y)) score += 8; - if (!__atomic_always_lock_free(16, &y)) score += 27; - if (__atomic_always_lock_free(32, 0)) { - unsigned __int128* wide = (unsigned __int128*)0; - score += 90 + (int)__atomic_load_n(wide, __ATOMIC_SEQ_CST); - } - if (__atomic_always_lock_free(32, 0) || - (__atomic_always_lock_free(32, 0) && ((unsigned long)&score % 16) == 0)) { - unsigned __int128* wide = (unsigned __int128*)0; - score += 90 + (int)__atomic_load_n(wide, __ATOMIC_SEQ_CST); - } + if (__atomic_is_lock_free(sizeof(void*), &x)) score += 8; + if (!__atomic_always_lock_free(2 * sizeof(void*), &x)) score += 27; return score; } diff --git a/test/parse/cases/ldbl128_01_layout_macros.rv32.skip b/test/parse/cases/ldbl128_01_layout_macros.rv32.skip @@ -0,0 +1 @@ +rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113) diff --git a/test/parse/cases/ldbl128_02_literal_to_int.rv32.skip b/test/parse/cases/ldbl128_02_literal_to_int.rv32.skip @@ -0,0 +1 @@ +rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113) diff --git a/test/parse/cases/ldbl128_03_arith.rv32.skip b/test/parse/cases/ldbl128_03_arith.rv32.skip @@ -0,0 +1 @@ +rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113) diff --git a/test/parse/cases/ldbl128_04_conversions.rv32.skip b/test/parse/cases/ldbl128_04_conversions.rv32.skip @@ -0,0 
+1 @@ +rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113) diff --git a/test/parse/cases/ldbl128_05_compare.rv32.skip b/test/parse/cases/ldbl128_05_compare.rv32.skip @@ -0,0 +1 @@ +rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113) diff --git a/test/parse/cases/ldbl128_06_call_return.rv32.skip b/test/parse/cases/ldbl128_06_call_return.rv32.skip @@ -0,0 +1 @@ +rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113) diff --git a/test/parse/cases/ldbl128_07_struct_storage.rv32.skip b/test/parse/cases/ldbl128_07_struct_storage.rv32.skip @@ -0,0 +1 @@ +rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113) diff --git a/test/parse/cases/ldbl128_08_literal_bits.rv32.skip b/test/parse/cases/ldbl128_08_literal_bits.rv32.skip @@ -0,0 +1 @@ +rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113) diff --git a/test/parse/cases/ldbl128_09_global_init.rv32.skip b/test/parse/cases/ldbl128_09_global_init.rv32.skip @@ -0,0 +1 @@ +rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113) diff --git a/test/parse/cases/ldbl128_10_unary_neg.rv32.skip b/test/parse/cases/ldbl128_10_unary_neg.rv32.skip @@ -0,0 +1 @@ +rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113) diff --git a/test/parse/cases/ldbl128_11_array_copy.rv32.skip b/test/parse/cases/ldbl128_11_array_copy.rv32.skip @@ -0,0 +1 @@ +rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts 
binary128 layout/semantics (sizeof==16, MANT_DIG==113) diff --git a/test/parse/cases/ldbl128_12_stack_args.rv32.skip b/test/parse/cases/ldbl128_12_stack_args.rv32.skip @@ -0,0 +1 @@ +rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113) diff --git a/test/parse/cases/ldbl128_13_mixed_arith.rv32.skip b/test/parse/cases/ldbl128_13_mixed_arith.rv32.skip @@ -0,0 +1 @@ +rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113) diff --git a/test/parse/cases/ldbl128_14_struct_return.rv32.skip b/test/parse/cases/ldbl128_14_struct_return.rv32.skip @@ -0,0 +1 @@ +rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113) diff --git a/test/parse/cases/ldbl128_15_arbitrary_mul.rv32.skip b/test/parse/cases/ldbl128_15_arbitrary_mul.rv32.skip @@ -0,0 +1 @@ +rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113)