x64: integer va_arg does offset arithmetic in its own dst register - kit

commit 7f49849e76a96d275a93c7bf26b9700cbb8bb7bf
parent 5b6f8ec5fd7128a689d19ba5f2d4bbb5dd04b4a8
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Fri, 29 May 2026 11:59:50 -0700

x64: integer va_arg does offset arithmetic in its own dst register

pass_native_emit fetches va_arg into a scratch register and copies it to the
real destination, so for integer results the va_arg core can use that scratch
destination GPR itself for the gp_offset/address arithmetic. Besides r11, which
still holds the va_list base, it then touches no other register. Only FP
results (value kept in XMM) still borrow RAX as a GPR scratch.

This removes the RAX clobber that was destroying a returned accumulator living
in RAX across a va_arg loop (the classic 'int sum(int n,...)' reducer). The
machine-clobber hook now reports the RAX clobber only for FP va_arg (new
NativeMachineOp.result_is_fp), so integer va_arg imposes no allocator constraint.

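x64 parse -O1 passes: 888 -> 895 (fixes builtin_03_va_list, builtin_05_va_copy,
6_7_6_08_variadic_decl, 6_9_06_variadic_func, variadic_02_many_ints,
variadic_03_long, variadic_07_nested_call). Remaining non-long-double failures:
variadic_04_pointer, 2 VLA-param, funcptr_field_first_arg, sadd_overflow. No
regressions: toy x64/rv64 O0+O1 156/0.

This commit also carries the switch of rv64 `long double` from double to
IEEE-754 binary128 (type.c, cpp_support.h, pp.c, rt/Makefile, abi_rv64.c).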

Diffstat:
M lang/c/type/type.c  | 10 ++++------
M lang/cpp/cpp_support.h  | 14 ++++++++++++++
M lang/cpp/pp/pp.c  | 15 ++++++++-------
M rt/Makefile  | 7 +++++--
M src/abi/abi_rv64.c  | 8 ++++++--
M src/arch/native_target.h  | 1 +
M src/arch/x64/native.c  | 52 ++++++++++++++++++++++++++++++----------------------
M src/opt/pass_machinize.c  | 7 ++++++-

8 files changed, 74 insertions(+), 40 deletions(-)
diff --git a/lang/c/type/type.c b/lang/c/type/type.c
@@ -517,13 +517,11 @@ static CfreeCgTypeId type_cg_builtin(CfreeCompiler* c, TypeKind kind) {
     case TY_DOUBLE:
       return b.id[CFREE_CG_BUILTIN_F64];
     case TY_LDOUBLE:
-      /* binary128 long double: aarch64-linux, and wasm32 (matching
-       * clang/LLVM's wasm convention). RV64 long double = double per the
-       * locked decision; everything else uses double too. */
-      if ((target.arch == CFREE_ARCH_ARM_64 && target.os == CFREE_OS_LINUX) ||
-          target.arch == CFREE_ARCH_WASM) {
+      /* `long double` is IEEE-754 binary128 on targets that follow the quad
+       * psABI (RISC-V, aarch64-linux, wasm32); elsewhere it aliases `double`.
+       * See cfree_target_long_double_is_binary128. */
+      if (cfree_target_long_double_is_binary128(target))
         return b.id[CFREE_CG_BUILTIN_F128];
-      }
       return b.id[CFREE_CG_BUILTIN_F64];
     default:
       break;
diff --git a/lang/cpp/cpp_support.h b/lang/cpp/cpp_support.h
@@ -77,4 +77,18 @@ _Noreturn static inline void compiler_panicv(Compiler* c, SrcLoc loc,
   cfree_frontend_vfatal(c, loc, fmt, ap);
 }
 
+/* True when the C `long double` type is IEEE-754 binary128 (quad) on this
+ * target rather than an alias of `double`. RISC-V (LP64/LP64D) and
+ * aarch64-linux follow the quad psABI; wasm32 matches clang/LLVM's wasm
+ * convention. x86 (80-bit x87, not modeled), Apple, and Windows alias long
+ * double to double. Centralized so the preprocessor's __LDBL_* /
+ * __SIZEOF_LONG_DOUBLE__ macros and the C type system's long-double ->
+ * CG-builtin mapping cannot drift apart. */
+static inline int cfree_target_long_double_is_binary128(CfreeTarget t) {
+  if (t.arch == CFREE_ARCH_RV64) return 1;
+  if (t.arch == CFREE_ARCH_ARM_64 && t.os == CFREE_OS_LINUX) return 1;
+  if (t.arch == CFREE_ARCH_WASM) return 1;
+  return 0;
+}
+
 #endif
diff --git a/lang/cpp/pp/pp.c b/lang/cpp/pp/pp.c
@@ -378,7 +378,8 @@ static void pp_register_target_predefined(Pp* pp) {
   pp_define(pp, "__SIZEOF_WINT_T__", "4");
   pp_define(pp, "__SIZEOF_FLOAT__", "4");
   pp_define(pp, "__SIZEOF_DOUBLE__", "8");
-  pp_define(pp, "__SIZEOF_LONG_DOUBLE__", "8");
+  pp_define(pp, "__SIZEOF_LONG_DOUBLE__",
+            cfree_target_long_double_is_binary128(target) ? "16" : "8");
 
   /* Windows / mingw predefined macros. cfree targets the mingw
    * flavor (DWARF debug info, mingwex CRT) rather than MSVC, so we
@@ -669,12 +670,12 @@ static void pp_register_target_predefined(Pp* pp) {
   pp_define(pp, "__DBL_MIN__", "0x1p-1022");
   pp_define(pp, "__DBL_DENORM_MIN__", "0x1p-1074");
 
-  /* RV64 long double = double per the locked decision (matches RV64
-   * musl/glibc default). aarch64-linux and wasm32 (matching clang/LLVM's
-   * wasm convention) get binary128 long double; the wasm backend then
-   * reports it as unsupported when a value is actually materialized. */
-  if ((target.arch == CFREE_ARCH_ARM_64 && target.os == CFREE_OS_LINUX) ||
-      target.arch == CFREE_ARCH_WASM) {
+  /* Targets that follow the IEEE-754 binary128 quad psABI for `long double`
+   * (RISC-V, aarch64-linux, wasm32) get the 113-bit-mantissa characteristics;
+   * everything else aliases `double`. The wasm backend still reports f128 as
+   * unsupported when a value is actually materialized. See
+   * cfree_target_long_double_is_binary128. */
+  if (cfree_target_long_double_is_binary128(target)) {
     pp_define(pp, "__LDBL_HAS_DENORM__", "1");
     pp_define(pp, "__LDBL_MANT_DIG__", "113");
     pp_define(pp, "__LDBL_DECIMAL_DIG__", "36");
diff --git a/rt/Makefile b/rt/Makefile
@@ -106,14 +106,16 @@ RT_riscv64-linux_TARGET     = riscv64-linux-gnu
 RT_riscv64-linux_ABI        = lp64
 RT_riscv64-linux_INT128     = 1
 RT_riscv64-linux_CORO       = riscv64
-# long double = double per the locked rv64 decision; no binary128 runtime.
-RT_riscv64-linux_LDBL128    =
+# RISC-V `long double` is IEEE-754 binary128 per the psABI; ship the quad
+# soft-float / __int128 runtime (fp_tf, fp_ti).
+RT_riscv64-linux_LDBL128    = 1
 RT_riscv64-linux_ARCH_FLAGS = -mabi=lp64d -march=rv64imafd
 
 RT_riscv64-elf_TARGET     = riscv64-unknown-elf
 RT_riscv64-elf_ABI        = lp64
 RT_riscv64-elf_INT128     = 1
 RT_riscv64-elf_CORO       = riscv64
+RT_riscv64-elf_LDBL128    = 1
 RT_riscv64-elf_ARCH_FLAGS = -mabi=lp64 -march=rv64imafd
 
 RT_riscv64-elf-save-restore_TARGET       = riscv64-unknown-elf
@@ -121,6 +123,7 @@ RT_riscv64-elf-save-restore_ABI          = lp64
 RT_riscv64-elf-save-restore_INT128       = 1
 RT_riscv64-elf-save-restore_CORO         = riscv64
 RT_riscv64-elf-save-restore_SAVE_RESTORE = 1
+RT_riscv64-elf-save-restore_LDBL128      = 1
 RT_riscv64-elf-save-restore_ARCH_FLAGS   = -mabi=lp64 -march=rv64imafd
 
 RT_x86_64-pc-windows_TARGET = x86_64-pc-windows-msvc
diff --git a/src/abi/abi_rv64.c b/src/abi/abi_rv64.c
@@ -15,8 +15,12 @@
  *                    * otherwise INT parts up to 16 B (passed in up to 2 GPRs).
  *   large struct  -> INDIRECT (sret for return; byval for args)
  *
- * Long double is locked to `double` for rv64 (see RV64_PARITY_CHECKLIST);
- * binary128 / quad encoding is deferred.
+ * Long double is IEEE-754 binary128 (quad) on rv64. Like __int128 it is a
+ * 16-byte scalar passed/returned in an aligned pair of integer registers
+ * (a0:a1 .. a6:a7; low-order half in the lower-numbered register), split
+ * low-in-register / high-on-stack when only one register remains, and wholly
+ * on the stack otherwise. classify_scalar() handles both via the size==16
+ * INT-pair path below (there are no 128-bit FP registers in RV64GC).
  *
  * Variadic args bypass these rules entirely and always go through the
  * integer register file / stack (handled at the caller / callee sites). */
diff --git a/src/arch/native_target.h b/src/arch/native_target.h
@@ -240,6 +240,7 @@ typedef struct NativeMachineOp {
   u8 binop;         /* BinOp, when kind == NATIVE_MOP_BINOP */
   u8 intrin;        /* IntrinKind, when kind == NATIVE_MOP_INTRINSIC */
   u8 second_is_reg; /* binop's second operand is a register (not an immediate) */
+  u8 result_is_fp;  /* result lands in an FP register (e.g. va_arg of a double) */
 } NativeMachineOp;
 
 typedef struct NativeCallDesc {
diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c
@@ -2927,17 +2927,22 @@ static void x64_va_arg_core(X64NativeTarget* a, NativeLoc dst, NativeAddr ap,
   int is_fp = loc_is_fp(dst);
   u32 dr = loc_reg(dst);
   u32 ap_base = x64_va_base(a, ap, X64_TMP_INT2); /* r11 */
-  /* Uses only the reserved emit scratch: r11 (ap_base) and rax. The va_list
-   * offset/pointer fields are advanced in memory (x64_add_mem_imm) and the
-   * reg-save base is folded into rax (x64_add_reg_mem), so no third register is
-   * needed — leaving r10 (and rdx) free for the allocator. */
+  /* GPR scratch for the offset/address arithmetic. For integer results the
+   * destination is itself a (throwaway) scratch GPR — pass_native_emit fetches
+   * va_arg into a scratch and copies it to the real destination afterward — so
+   * we reuse `dr` and touch no allocable register at all. FP results keep their
+   * value in an XMM register, so they borrow the reserved RAX emit scratch.
+   * Either way only r11 (ap_base) and `gp` are used: the va_list fields are
+   * advanced in memory (x64_add_mem_imm) and the reg-save base is folded in
+   * with x64_add_reg_mem, so no third register is needed. */
+  u32 gp = is_fp ? X64_RAX : dr;
   if (a->abi->shadow_space) {
-    /* Win64: rax = *ap; load dr from [rax]; *ap += 8. */
-    emit_mov_load(mc, 8, 0, X64_RAX, ap_base, 0);
+    /* Win64: gp = *ap; load dr from [gp]; *ap += 8. */
+    emit_mov_load(mc, 8, 0, gp, ap_base, 0);
     if (is_fp)
-      emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_RAX, 0);
+      emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, gp, 0);
     else
-      emit_mov_load(mc, sz, 0, dr, X64_RAX, 0);
+      emit_mov_load(mc, sz, 0, dr, gp, 0);
     x64_add_mem_imm(mc, 1, ap_base, 0, 8);
     return;
   }
@@ -2947,26 +2952,26 @@ static void x64_va_arg_core(X64NativeTarget* a, NativeLoc dst, NativeAddr ap,
     i8 stride = is_fp ? 16 : 8;
     MCLabel L_stack = mc->label_new(mc);
     MCLabel L_done = mc->label_new(mc);
-    /* eax = ap[offs]; cmp eax, max; jae L_stack. */
-    emit_mov_load(mc, 4, 0, X64_RAX, ap_base, (i32)offs_field);
-    emit_alu_imm32(mc, 0, X64_ALU_SUB_CMP, X64_RAX, (i32)max_offs);
+    /* gp32 = ap[offs]; cmp gp32, max; jae L_stack. */
+    emit_mov_load(mc, 4, 0, gp, ap_base, (i32)offs_field);
+    emit_alu_imm32(mc, 0, X64_ALU_SUB_CMP, gp, (i32)max_offs);
     emit_jcc_rel32(mc, X64_CC_AE, L_stack);
-    /* reg path: ap[offs] += stride; rax = reg_save_area(ap[16]) + offset; load.
-     * (The memory increment leaves rax holding the old offset.) */
+    /* reg path: ap[offs] += stride; gp = reg_save_area(ap[16]) + offset; load.
+     * (The memory increment leaves gp holding the old offset.) */
     x64_add_mem_imm(mc, 0, ap_base, (i32)offs_field, stride);
-    x64_add_reg_mem(mc, X64_RAX, ap_base, 16);
+    x64_add_reg_mem(mc, gp, ap_base, 16);
     if (is_fp)
-      emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_RAX, 0);
+      emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, gp, 0);
     else
-      emit_mov_load(mc, sz, 0, dr, X64_RAX, 0);
+      emit_mov_load(mc, sz, 0, dr, gp, 0);
     emit_jmp_rel32(mc, L_done);
-    /* stack path: rax = ap[8] (overflow area); load; ap[8] += 8. */
+    /* stack path: gp = ap[8] (overflow area); load; ap[8] += 8. */
     mc->label_place(mc, L_stack);
-    emit_mov_load(mc, 8, 0, X64_RAX, ap_base, 8);
+    emit_mov_load(mc, 8, 0, gp, ap_base, 8);
     if (is_fp)
-      emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_RAX, 0);
+      emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, gp, 0);
     else
-      emit_mov_load(mc, sz, 0, dr, X64_RAX, 0);
+      emit_mov_load(mc, sz, 0, dr, gp, 0);
     x64_add_mem_imm(mc, 1, ap_base, 8, 8);
     mc->label_place(mc, L_done);
   }
@@ -3660,7 +3665,8 @@ static void x64_finalize(NativeTarget* t) {
  * effect, so the optimizer keeps values live across them out of those registers
  * (the backend is then free to use them). idiv/div write rax (quotient) and rdx
  * (remainder/sign); a variable shift uses cl; cmpxchg/xadd loops use rax/rcx/rdx;
- * va_arg uses rax for the gp/fp offset. */
+ * an FP va_arg borrows rax for the gp/fp offset (an integer va_arg does the
+ * offset arithmetic in its own destination register, so it clobbers nothing). */
 static int x64_machine_op_clobbers(NativeTarget* t, const NativeMachineOp* op,
                                    u32 mask[NATIVE_CALL_PLAN_CLASSES]) {
   (void)t;
@@ -3690,6 +3696,7 @@ static int x64_machine_op_clobbers(NativeTarget* t, const NativeMachineOp* op,
       mask[NATIVE_REG_INT] = (1u << X64_RAX) | (1u << X64_RCX) | (1u << X64_RDX);
       return 1;
     case NATIVE_MOP_VA_ARG:
+      if (!op->result_is_fp) return 0;
       mask[NATIVE_REG_INT] = (1u << X64_RAX);
       return 1;
     default:
@@ -3870,7 +3877,8 @@ static void x64_va_arg_(NativeDirectTarget* d, Operand dst, Operand ap_addr,
   NativeLoc res = x64_reg_loc(type, is_fp ? NATIVE_REG_FP : NATIVE_REG_INT,
                               is_fp ? X64_TMP_FP : (Reg)X64_RDX);
   NativeAddr dst_addr;
-  /* Base in R11 (not RAX/R10, which the va_arg core uses as scratch). */
+  /* Base in R11: the core uses one more GPR as scratch (the integer result reg
+   * itself, or RAX for FP results), so the va_list base must stay out of both. */
   x64_va_arg_core(a, res, x64_direct_va_base(d, ap_addr, X64_R11), type);
   dst_addr = x64_direct_addr(d, dst);
   if (dst_addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) {
diff --git a/src/opt/pass_machinize.c b/src/opt/pass_machinize.c
@@ -1,5 +1,6 @@
 #include <string.h>
 
+#include "cg/type.h"
 #include "core/pool.h"
 #include "core/slice.h"
 #include "opt/opt_internal.h"
@@ -165,7 +166,11 @@ static void machinize_inst_clobbers(Func* f, NativeTarget* target) {
           mop.second_is_reg =
               (u8)(in->nopnds > 2u && in->opnds[2].kind == OPK_REG);
           break;
-        case IR_VA_ARG: mop.kind = NATIVE_MOP_VA_ARG; break;
+        case IR_VA_ARG:
+          mop.kind = NATIVE_MOP_VA_ARG;
+          mop.result_is_fp =
+              (u8)(in->nopnds > 0u && cg_type_is_float(f->c, in->opnds[0].type));
+          break;
         case IR_ATOMIC_CAS: mop.kind = NATIVE_MOP_ATOMIC_CAS; break;
         case IR_ATOMIC_RMW: mop.kind = NATIVE_MOP_ATOMIC_RMW; break;
         default: continue;

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	lang/c/type/type.c	\|	10	++++------
M	lang/cpp/cpp_support.h	\|	14	++++++++++++++
M	lang/cpp/pp/pp.c	\|	15	++++++++-------
M	rt/Makefile	\|	7	+++++--
M	src/abi/abi_rv64.c	\|	8	++++++--
M	src/arch/native_target.h	\|	1	+
M	src/arch/x64/native.c	\|	52	++++++++++++++++++++++++++++++----------------------
M	src/opt/pass_machinize.c	\|	7	++++++-