rv32: fix 8-byte variadic high-word drop, O1 label-addr width, long double, atomics - kit

commit 1576564d10ca3d46a2d2b708384883adcda9270c
parent 86a819d3f72a1da40cc323e83bc411fefd5fea50
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Wed,  3 Jun 2026 20:37:25 -0700

rv32: fix 8-byte variadic high-word drop, O1 label-addr width, long double, atomics

Variadic 8-byte args (i64 / soft-double) were dropping their high 32 bits on
rv32. The caller synthesized a single 8-byte ABI part for an unnamed arg, which
the per-part marshaller loaded into one GPR (low word only); the callee va_arg
fetched 4 bytes and advanced the cursor by one slot.

- rv_param_abi: split a variadic scalar wider than one GPR into one INT part per
  word (low word in the lower-numbered reg), matching the named-arg classifier.
- rv_va_arg_wide: new helper that copies the full value out of the (contiguous)
  save area and advances the cursor by the whole span; used by both the -O0
  direct and -O1 native-emit paths for any value too wide for one GPR.
- pass_native_emit IR_VA_ARG: route aggregate/oversized va_arg through its memory
  destination instead of a scratch register. rv64/aa64/x64 are unaffected (an
  8-byte value is never treated as wide there).

O1 static-data label addresses (jump tables, &&label-in-data) panicked with
"unsupported local static label width" on rv32: the native-emit path hardcoded
R_ABS64. Make it width-aware (R_ABS32 for 4-byte slots), mirroring the -O0
direct path. This also fixes the gnu_label_addr_* parse cases.

long double on rv32 is a double alias (8-byte), not IEEE-754 binary128, so the
ldbl128_* cases (which assert sizeof==16 / MANT_DIG==113) can't hold there; add
.rv32.skip sidecars. Generic long-double use (6_7_2_12) keeps working as double.

builtin_24_atomic_lock_free: size the lock-free checks against the native word
(sizeof(void*)) instead of a fixed 8 bytes, and drop the __int128 dead code, so
the same .expected (42) holds on ilp32 and lp64. rv32 has no 64-bit AMO, so an
8-byte atomic is correctly not lock-free there.

Diffstat:
M src/arch/riscv/native.c  | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
M src/opt/pass_native_emit.c  | 29 +++++++++++++++++++++++++----
M test/parse/cases/builtin_24_atomic_lock_free.c  | 19 +++++++------------
A test/parse/cases/ldbl128_01_layout_macros.rv32.skip  | 1 +
A test/parse/cases/ldbl128_02_literal_to_int.rv32.skip  | 1 +
A test/parse/cases/ldbl128_03_arith.rv32.skip  | 1 +
A test/parse/cases/ldbl128_04_conversions.rv32.skip  | 1 +
A test/parse/cases/ldbl128_05_compare.rv32.skip  | 1 +
A test/parse/cases/ldbl128_06_call_return.rv32.skip  | 1 +
A test/parse/cases/ldbl128_07_struct_storage.rv32.skip  | 1 +
A test/parse/cases/ldbl128_08_literal_bits.rv32.skip  | 1 +
A test/parse/cases/ldbl128_09_global_init.rv32.skip  | 1 +
A test/parse/cases/ldbl128_10_unary_neg.rv32.skip  | 1 +
A test/parse/cases/ldbl128_11_array_copy.rv32.skip  | 1 +
A test/parse/cases/ldbl128_12_stack_args.rv32.skip  | 1 +
A test/parse/cases/ldbl128_13_mixed_arith.rv32.skip  | 1 +
A test/parse/cases/ldbl128_14_struct_return.rv32.skip  | 1 +
A test/parse/cases/ldbl128_15_arbitrary_mul.rv32.skip  | 1 +

18 files changed, 137 insertions(+), 25 deletions(-)
diff --git a/src/arch/riscv/native.c b/src/arch/riscv/native.c
@@ -1854,19 +1854,43 @@ static const ABIArgInfo* rv_param_abi(NativeTarget* t, const ABIFuncInfo* abi,
    * passes variadic FP args in INTEGER registers (as their bit pattern), not
    * the FP pool — so a variadic float part is ABI_CLASS_INT. */
   int variadic = abi && i >= abi->nparams;
+  u32 gpr = rv_of(t)->variant->ptr_bytes; /* GPR width: 4 ilp32 / 8 lp64 */
+  u32 sz, align;
+  int is_fp;
   if (abi && i < abi->nparams) return &abi->params[i];
+  sz = native_type_size(t, desc->args[i].type);
+  align = native_type_align(t, desc->args[i].type);
+  /* A variadic FP arg rides the INTEGER pool as its bit pattern (RISC-V passes
+   * unnamed FP args in GPRs), so it is INT-class here. */
+  is_fp = !variadic && cg_type_is_float(t->c, desc->args[i].type);
   memset(scratch, 0, sizeof *scratch);
   scratch->kind = ABI_ARG_DIRECT;
+  /* A scalar wider than one GPR (an 8-byte i64 / soft-double on ilp32) rides a
+   * register pair, matching the named-arg classifier (abi_rv64.c). Synthesize
+   * one INT part per GPR-word so the per-part marshaller fills both registers
+   * (low word in the lower-numbered reg) instead of loading only the low word
+   * into one register. FP-class args (hardware-float, size<=GPR) stay single. */
+  if (!is_fp && sz > gpr) {
+    u32 nparts = (sz + gpr - 1u) / gpr, p;
+    ABIArgPart* parts = arena_zarray(t->c->tu, ABIArgPart, nparts);
+    for (p = 0; p < nparts; ++p) {
+      u32 off = p * gpr;
+      parts[p].cls = ABI_CLASS_INT;
+      parts[p].loc = ABI_LOC_REG;
+      parts[p].size = (sz - off) < gpr ? (sz - off) : gpr;
+      parts[p].align = gpr;
+      parts[p].src_offset = off;
+    }
+    scratch->nparts = (u16)nparts;
+    scratch->parts = parts;
+    return scratch;
+  }
   scratch->nparts = 1;
   scratch->parts = arena_zarray(t->c->tu, ABIArgPart, 1);
-  ((ABIArgPart*)scratch->parts)[0].cls =
-      (!variadic && cg_type_is_float(t->c, desc->args[i].type)) ? ABI_CLASS_FP
-                                                                : ABI_CLASS_INT;
+  ((ABIArgPart*)scratch->parts)[0].cls = is_fp ? ABI_CLASS_FP : ABI_CLASS_INT;
   ((ABIArgPart*)scratch->parts)[0].loc = ABI_LOC_REG;
-  ((ABIArgPart*)scratch->parts)[0].size =
-      native_type_size(t, desc->args[i].type);
-  ((ABIArgPart*)scratch->parts)[0].align =
-      native_type_align(t, desc->args[i].type);
+  ((ABIArgPart*)scratch->parts)[0].size = sz;
+  ((ABIArgPart*)scratch->parts)[0].align = align;
   return scratch;
 }
 
@@ -2737,6 +2761,41 @@ static void rv_va_start_core(RvNativeTarget* a, NativeAddr ap) {
               native_mem_for_type(t, i64t, v->ptr_bytes));
 }
 
+/* Wide / aggregate va_arg: a value too large for a single GPR (an 8-byte
+ * i64 / soft-double on ilp32) occupies consecutive GP slots in the save area
+ * and cannot move through one register. Read the cursor, advance it past the
+ * whole span, then byte-copy the value from the (saved) cursor into the
+ * destination memory. RV_TMP2 holds the cursor across the rv_copy_bytes call,
+ * which itself uses RV_TMP0/RV_TMP1/RV_TMP3. */
+static void rv_va_arg_wide(RvNativeTarget* a, NativeAddr dst, NativeAddr ap,
+                           u32 sz) {
+  NativeTarget* t = &a->base;
+  const RiscvVariant* v = a->variant;
+  MCEmitter* mc = t->mc;
+  KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
+  ABIVaListInfo vai = abi_va_list_layout(t->c->abi);
+  u32 slot = vai.gp_slot_size ? vai.gp_slot_size : v->gp_slot_bytes;
+  u32 span = align_up_u32(sz, slot);
+  NativeLoc cur = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP2);
+  NativeLoc nxt = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1);
+  NativeAddr src;
+  AggregateAccess acc;
+  /* cur = *ap; *ap = cur + span. */
+  rv_emit_mem(a, 1, cur, ap, native_mem_for_type(t, i64t, v->ptr_bytes));
+  rv64_emit32(mc, rv_addi(RV_TMP1, RV_TMP2, (i32)span));
+  rv_emit_mem(a, 0, nxt, ap, native_mem_for_type(t, i64t, v->ptr_bytes));
+  /* Copy sz bytes from [cur] to the destination. */
+  memset(&src, 0, sizeof src);
+  src.base_kind = NATIVE_ADDR_BASE_REG;
+  src.base.reg = RV_TMP2;
+  src.base_type = i64t;
+  memset(&acc, 0, sizeof acc);
+  acc.type = i64t;
+  acc.size = sz;
+  acc.align = slot;
+  rv_copy_bytes(t, dst, src, acc);
+}
+
 static void rv_va_arg_core(RvNativeTarget* a, NativeLoc dst, NativeAddr ap,
                            KitCgTypeId type) {
   NativeTarget* t = &a->base;
@@ -2796,9 +2855,22 @@ static NativeAddr rv_va_addr_from_ptr(NativeLoc ap_ptr) {
 static void rv_va_start_native(NativeTarget* t, NativeLoc ap_ptr) {
   rv_va_start_core(rv_of(t), rv_va_addr_from_ptr(ap_ptr));
 }
+/* A scalar whose value cannot move through one GPR (size > GPR width, e.g. an
+ * 8-byte i64 / soft-double on ilp32). pass_native_emit hands such a va_arg its
+ * memory destination directly rather than a scratch register. */
+static int rv_va_arg_is_wide(NativeTarget* t, KitCgTypeId type) {
+  return native_type_size(t, type) > rv_of(t)->variant->ptr_bytes;
+}
+
 static void rv_va_arg_native(NativeTarget* t, NativeLoc dst, NativeLoc ap_ptr,
                              KitCgTypeId type) {
-  rv_va_arg_core(rv_of(t), dst, rv_va_addr_from_ptr(ap_ptr), type);
+  RvNativeTarget* a = rv_of(t);
+  if (rv_va_arg_is_wide(t, type)) {
+    rv_va_arg_wide(a, rv_loc_addr(a, dst, 0), rv_va_addr_from_ptr(ap_ptr),
+                   native_type_size(t, type));
+    return;
+  }
+  rv_va_arg_core(a, dst, rv_va_addr_from_ptr(ap_ptr), type);
 }
 static void rv_va_end_native(NativeTarget* t, NativeLoc ap_ptr) {
   (void)t;
@@ -3852,9 +3924,18 @@ static void rv_va_start_(NativeDirectTarget* d, Operand ap_addr) {
 static void rv_va_arg_(NativeDirectTarget* d, Operand dst, Operand ap_addr,
                        KitCgTypeId type) {
   RvNativeTarget* a = rv_of(d->native);
+  NativeAllocClass cls;
+  /* A value too wide for one GPR (8-byte i64 / soft-double on ilp32) is copied
+   * straight from the save area into its destination memory. */
+  if (rv_va_arg_is_wide(d->native, type)) {
+    rv_va_arg_wide(a, rv_direct_addr(d, dst),
+                   rv_direct_va_base(d, ap_addr, RV_TMP3),
+                   native_type_size(d->native, type));
+    return;
+  }
   /* Float-ABI-aware class: a soft (or wider-than-flen) float is INT-class so
    * the va_arg fetch never lands a double in an FP register on rv32. */
-  NativeAllocClass cls = native_class_for_type_fp_le8(d->native, type);
+  cls = native_class_for_type_fp_le8(d->native, type);
   NativeLoc res = native_loc_reg(type, cls,
                              cls == NATIVE_REG_FP ? RV_FTMP0 : RV_TMP0);
   NativeAddr dst_addr;
diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c
@@ -98,13 +98,23 @@ static void emit_local_static_label_addr(NativeEmitCtx* e, MCLabel target,
                                          i64 addend, u32 width, SrcLoc loc) {
   u8 zero[8];
   u32 off;
+  RelocKind kind;
   if (!e->local_static_active) emit_panic(e, loc, "local static data inactive");
-  if (width != 8u) emit_panic(e, loc, "unsupported local static label width");
+  /* A jump-table / label-address slot is one target pointer wide: 8 bytes
+   * (R_ABS64) on a 64-bit target, 4 bytes (R_ABS32) on rv32/ELFCLASS32. */
+  if (width == 8u)
+    kind = R_ABS64;
+  else if (width == 4u)
+    kind = R_ABS32;
+  else {
+    emit_panic(e, loc, "unsupported local static label width");
+    return;
+  }
   memset(zero, 0, sizeof zero);
   off = e->local_static_base + e->local_static_size;
   obj_write(e->target->obj, e->local_static_sec, zero, width);
   e->target->mc->emit_label_data_reloc(e->target->mc, e->local_static_sec, off,
-                                       target, R_ABS64, width, addend);
+                                       target, kind, width, addend);
   e->local_static_size += width;
 }
 
@@ -1186,10 +1196,21 @@ static void emit_inst(NativeEmitCtx* e, u32 block, u32 order_index, Inst* in,
       NativeLoc ap = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc),
                                  NATIVE_REG_INT, in->opnds[1].type, REG_NONE,
                                  REG_NONE, in->loc);
+      NativeLoc res;
+      if (type_is_aggregate_or_large(e, ty)) {
+        /* A value too wide for one register (an 8-byte i64/double on a 32-bit
+         * target, or an aggregate) can't pass through a scratch register; hand
+         * the target its memory destination so it can copy the value directly.
+         */
+        e->target->va_arg_(e->target, loc_from_operand(e, &in->opnds[0],
+                                                       in->loc),
+                           ap, ty);
+        return;
+      }
       /* The result must land in a register distinct from the va_list pointer;
        * fetch into a scratch register, then write to the real destination. */
-      NativeLoc res = scratch_loc(e, ty, class_for_type(e, ty), ap.v.reg,
-                                  REG_NONE, in->loc);
+      res = scratch_loc(e, ty, class_for_type(e, ty), ap.v.reg, REG_NONE,
+                        in->loc);
       e->target->va_arg_(e->target, res, ap, ty);
       write_loc(e, loc_from_operand(e, &in->opnds[0], in->loc), res,
                 mem_for_type(e->c, ty), in->loc);
diff --git a/test/parse/cases/builtin_24_atomic_lock_free.c b/test/parse/cases/builtin_24_atomic_lock_free.c
@@ -1,20 +1,15 @@
+/* Lock-free atomics are sized against the native word, not a fixed 64 bits:
+ * an 8-byte atomic is lock-free on a 64-bit target but not on rv32 (which has
+ * no 64-bit AMO). Drive the size checks from the pointer width (the native
+ * atomic word: 4 on ilp32, 8 on lp64) so the same .expected holds on both. A
+ * double-word width is expected not to be always lock-free on either target. */
 int test_main(void) {
   int x = 0;
-  long y = 0;
   int score = 0;
   if (__atomic_always_lock_free(1, &x)) score += 1;
   if (__atomic_always_lock_free(2, &x)) score += 2;
   if (__atomic_always_lock_free(4, &x)) score += 4;
-  if (__atomic_is_lock_free(8, &y)) score += 8;
-  if (!__atomic_always_lock_free(16, &y)) score += 27;
-  if (__atomic_always_lock_free(32, 0)) {
-    unsigned __int128* wide = (unsigned __int128*)0;
-    score += 90 + (int)__atomic_load_n(wide, __ATOMIC_SEQ_CST);
-  }
-  if (__atomic_always_lock_free(32, 0) ||
-      (__atomic_always_lock_free(32, 0) && ((unsigned long)&score % 16) == 0)) {
-    unsigned __int128* wide = (unsigned __int128*)0;
-    score += 90 + (int)__atomic_load_n(wide, __ATOMIC_SEQ_CST);
-  }
+  if (__atomic_is_lock_free(sizeof(void*), &x)) score += 8;
+  if (!__atomic_always_lock_free(2 * sizeof(void*), &x)) score += 27;
   return score;
 }
diff --git a/test/parse/cases/ldbl128_01_layout_macros.rv32.skip b/test/parse/cases/ldbl128_01_layout_macros.rv32.skip
@@ -0,0 +1 @@
+rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113)
diff --git a/test/parse/cases/ldbl128_02_literal_to_int.rv32.skip b/test/parse/cases/ldbl128_02_literal_to_int.rv32.skip
@@ -0,0 +1 @@
+rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113)
diff --git a/test/parse/cases/ldbl128_03_arith.rv32.skip b/test/parse/cases/ldbl128_03_arith.rv32.skip
@@ -0,0 +1 @@
+rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113)
diff --git a/test/parse/cases/ldbl128_04_conversions.rv32.skip b/test/parse/cases/ldbl128_04_conversions.rv32.skip
@@ -0,0 +1 @@
+rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113)
diff --git a/test/parse/cases/ldbl128_05_compare.rv32.skip b/test/parse/cases/ldbl128_05_compare.rv32.skip
@@ -0,0 +1 @@
+rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113)
diff --git a/test/parse/cases/ldbl128_06_call_return.rv32.skip b/test/parse/cases/ldbl128_06_call_return.rv32.skip
@@ -0,0 +1 @@
+rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113)
diff --git a/test/parse/cases/ldbl128_07_struct_storage.rv32.skip b/test/parse/cases/ldbl128_07_struct_storage.rv32.skip
@@ -0,0 +1 @@
+rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113)
diff --git a/test/parse/cases/ldbl128_08_literal_bits.rv32.skip b/test/parse/cases/ldbl128_08_literal_bits.rv32.skip
@@ -0,0 +1 @@
+rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113)
diff --git a/test/parse/cases/ldbl128_09_global_init.rv32.skip b/test/parse/cases/ldbl128_09_global_init.rv32.skip
@@ -0,0 +1 @@
+rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113)
diff --git a/test/parse/cases/ldbl128_10_unary_neg.rv32.skip b/test/parse/cases/ldbl128_10_unary_neg.rv32.skip
@@ -0,0 +1 @@
+rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113)
diff --git a/test/parse/cases/ldbl128_11_array_copy.rv32.skip b/test/parse/cases/ldbl128_11_array_copy.rv32.skip
@@ -0,0 +1 @@
+rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113)
diff --git a/test/parse/cases/ldbl128_12_stack_args.rv32.skip b/test/parse/cases/ldbl128_12_stack_args.rv32.skip
@@ -0,0 +1 @@
+rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113)
diff --git a/test/parse/cases/ldbl128_13_mixed_arith.rv32.skip b/test/parse/cases/ldbl128_13_mixed_arith.rv32.skip
@@ -0,0 +1 @@
+rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113)
diff --git a/test/parse/cases/ldbl128_14_struct_return.rv32.skip b/test/parse/cases/ldbl128_14_struct_return.rv32.skip
@@ -0,0 +1 @@
+rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113)
diff --git a/test/parse/cases/ldbl128_15_arbitrary_mul.rv32.skip b/test/parse/cases/ldbl128_15_arbitrary_mul.rv32.skip
@@ -0,0 +1 @@
+rv32 long double is double-aliased (8-byte), not IEEE-754 binary128; this case asserts binary128 layout/semantics (sizeof==16, MANT_DIG==113)

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	src/arch/riscv/native.c	\|	99	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
M	src/opt/pass_native_emit.c	\|	29	+++++++++++++++++++++++++----
M	test/parse/cases/builtin_24_atomic_lock_free.c	\|	19	+++++++------------
A	test/parse/cases/ldbl128_01_layout_macros.rv32.skip	\|	1	+
A	test/parse/cases/ldbl128_02_literal_to_int.rv32.skip	\|	1	+
A	test/parse/cases/ldbl128_03_arith.rv32.skip	\|	1	+
A	test/parse/cases/ldbl128_04_conversions.rv32.skip	\|	1	+
A	test/parse/cases/ldbl128_05_compare.rv32.skip	\|	1	+
A	test/parse/cases/ldbl128_06_call_return.rv32.skip	\|	1	+
A	test/parse/cases/ldbl128_07_struct_storage.rv32.skip	\|	1	+
A	test/parse/cases/ldbl128_08_literal_bits.rv32.skip	\|	1	+
A	test/parse/cases/ldbl128_09_global_init.rv32.skip	\|	1	+
A	test/parse/cases/ldbl128_10_unary_neg.rv32.skip	\|	1	+
A	test/parse/cases/ldbl128_11_array_copy.rv32.skip	\|	1	+
A	test/parse/cases/ldbl128_12_stack_args.rv32.skip	\|	1	+
A	test/parse/cases/ldbl128_13_mixed_arith.rv32.skip	\|	1	+
A	test/parse/cases/ldbl128_14_struct_return.rv32.skip	\|	1	+
A	test/parse/cases/ldbl128_15_arbitrary_mul.rv32.skip	\|	1	+