commit 7f49849e76a96d275a93c7bf26b9700cbb8bb7bf
parent 5b6f8ec5fd7128a689d19ba5f2d4bbb5dd04b4a8
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 29 May 2026 11:59:50 -0700
x64: integer va_arg does offset arithmetic in its own dst register
pass_native_emit fetches va_arg into a scratch register and copies it to the
real destination, so for integer results the va_arg core can use that scratch
destination GPR itself for the gp_offset/address arithmetic and touch no other
register. Only FP results (value kept in XMM) still borrow RAX as a GPR scratch.
This removes the RAX clobber that was destroying a returned accumulator living
in RAX across a va_arg loop (the classic 'int sum(int n,...)' reducer). The
machine-clobber hook now reports the RAX clobber only for FP va_arg (new
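NativeMachineOp.result_is_fp), so integer va_arg imposes no allocator constraint.
Also bundled here: rv64 `long double` becomes IEEE-754 binary128. The target test
is centralized in cfree_target_long_double_is_binary128 (shared by the C type
mapping and the preprocessor's __SIZEOF_LONG_DOUBLE__ / __LDBL_* macros), and the
riscv64 runtime variants now set LDBL128.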
x64 parse -O1: 888 -> 895 (fixes builtin_03_va_list, builtin_05_va_copy,
6_7_6_08_variadic_decl, 6_9_06_variadic_func, variadic_02_many_ints,
variadic_03_long, variadic_07_nested_call). Remaining non-ldbl: variadic_04_pointer,
2 VLA-param, funcptr_field_first_arg, sadd_overflow. No regressions: toy x64/rv64
O0+O1 156/0.
Diffstat:
8 files changed, 74 insertions(+), 40 deletions(-)
diff --git a/lang/c/type/type.c b/lang/c/type/type.c
@@ -517,13 +517,11 @@ static CfreeCgTypeId type_cg_builtin(CfreeCompiler* c, TypeKind kind) {
case TY_DOUBLE:
return b.id[CFREE_CG_BUILTIN_F64];
case TY_LDOUBLE:
- /* binary128 long double: aarch64-linux, and wasm32 (matching
- * clang/LLVM's wasm convention). RV64 long double = double per the
- * locked decision; everything else uses double too. */
- if ((target.arch == CFREE_ARCH_ARM_64 && target.os == CFREE_OS_LINUX) ||
- target.arch == CFREE_ARCH_WASM) {
+ /* `long double` is IEEE-754 binary128 on targets that follow the quad
+ * psABI (RISC-V, aarch64-linux, wasm32); elsewhere it aliases `double`.
+ * See cfree_target_long_double_is_binary128. */
+ if (cfree_target_long_double_is_binary128(target))
return b.id[CFREE_CG_BUILTIN_F128];
- }
return b.id[CFREE_CG_BUILTIN_F64];
default:
break;
diff --git a/lang/cpp/cpp_support.h b/lang/cpp/cpp_support.h
@@ -77,4 +77,18 @@ _Noreturn static inline void compiler_panicv(Compiler* c, SrcLoc loc,
cfree_frontend_vfatal(c, loc, fmt, ap);
}
+/* Returns 1 when the C `long double` type is IEEE-754 binary128 (quad) on
+ * target `t`, and 0 when it is an alias of `double`. RISC-V (every
+ * CFREE_ARCH_RV64 target) and aarch64-linux follow the quad psABI; wasm
+ * matches clang/LLVM's wasm convention. x86 (80-bit x87, not modeled), Apple,
+ * and Windows alias long double to double. Centralized so the preprocessor's
+ * __LDBL_* / __SIZEOF_LONG_DOUBLE__ macros and the C type system's
+ * long-double -> CG-builtin mapping cannot drift apart. */
+static inline int cfree_target_long_double_is_binary128(CfreeTarget t) {
+ if (t.arch == CFREE_ARCH_RV64) return 1;
+ if (t.arch == CFREE_ARCH_ARM_64 && t.os == CFREE_OS_LINUX) return 1;
+ if (t.arch == CFREE_ARCH_WASM) return 1;
+ return 0;
+}
+
#endif
diff --git a/lang/cpp/pp/pp.c b/lang/cpp/pp/pp.c
@@ -378,7 +378,8 @@ static void pp_register_target_predefined(Pp* pp) {
pp_define(pp, "__SIZEOF_WINT_T__", "4");
pp_define(pp, "__SIZEOF_FLOAT__", "4");
pp_define(pp, "__SIZEOF_DOUBLE__", "8");
- pp_define(pp, "__SIZEOF_LONG_DOUBLE__", "8");
+ pp_define(pp, "__SIZEOF_LONG_DOUBLE__",
+ cfree_target_long_double_is_binary128(target) ? "16" : "8");
/* Windows / mingw predefined macros. cfree targets the mingw
* flavor (DWARF debug info, mingwex CRT) rather than MSVC, so we
@@ -669,12 +670,12 @@ static void pp_register_target_predefined(Pp* pp) {
pp_define(pp, "__DBL_MIN__", "0x1p-1022");
pp_define(pp, "__DBL_DENORM_MIN__", "0x1p-1074");
- /* RV64 long double = double per the locked decision (matches RV64
- * musl/glibc default). aarch64-linux and wasm32 (matching clang/LLVM's
- * wasm convention) get binary128 long double; the wasm backend then
- * reports it as unsupported when a value is actually materialized. */
- if ((target.arch == CFREE_ARCH_ARM_64 && target.os == CFREE_OS_LINUX) ||
- target.arch == CFREE_ARCH_WASM) {
+ /* Targets that follow the IEEE-754 binary128 quad psABI for `long double`
+ * (RISC-V, aarch64-linux, wasm32) get the 113-bit-mantissa characteristics;
+ * everything else aliases `double`. The wasm backend still reports f128 as
+ * unsupported when a value is actually materialized. See
+ * cfree_target_long_double_is_binary128. */
+ if (cfree_target_long_double_is_binary128(target)) {
pp_define(pp, "__LDBL_HAS_DENORM__", "1");
pp_define(pp, "__LDBL_MANT_DIG__", "113");
pp_define(pp, "__LDBL_DECIMAL_DIG__", "36");
diff --git a/rt/Makefile b/rt/Makefile
@@ -106,14 +106,16 @@ RT_riscv64-linux_TARGET = riscv64-linux-gnu
RT_riscv64-linux_ABI = lp64
RT_riscv64-linux_INT128 = 1
RT_riscv64-linux_CORO = riscv64
-# long double = double per the locked rv64 decision; no binary128 runtime.
-RT_riscv64-linux_LDBL128 =
+# RISC-V `long double` is IEEE-754 binary128 per the psABI; ship the quad
+# soft-float / __int128 runtime (fp_tf, fp_ti).
+RT_riscv64-linux_LDBL128 = 1
RT_riscv64-linux_ARCH_FLAGS = -mabi=lp64d -march=rv64imafd
RT_riscv64-elf_TARGET = riscv64-unknown-elf
RT_riscv64-elf_ABI = lp64
RT_riscv64-elf_INT128 = 1
RT_riscv64-elf_CORO = riscv64
+RT_riscv64-elf_LDBL128 = 1
RT_riscv64-elf_ARCH_FLAGS = -mabi=lp64 -march=rv64imafd
RT_riscv64-elf-save-restore_TARGET = riscv64-unknown-elf
@@ -121,6 +123,7 @@ RT_riscv64-elf-save-restore_ABI = lp64
RT_riscv64-elf-save-restore_INT128 = 1
RT_riscv64-elf-save-restore_CORO = riscv64
RT_riscv64-elf-save-restore_SAVE_RESTORE = 1
+RT_riscv64-elf-save-restore_LDBL128 = 1
RT_riscv64-elf-save-restore_ARCH_FLAGS = -mabi=lp64 -march=rv64imafd
RT_x86_64-pc-windows_TARGET = x86_64-pc-windows-msvc
diff --git a/src/abi/abi_rv64.c b/src/abi/abi_rv64.c
@@ -15,8 +15,12 @@
* * otherwise INT parts up to 16 B (passed in up to 2 GPRs).
* large struct -> INDIRECT (sret for return; byval for args)
*
- * Long double is locked to `double` for rv64 (see RV64_PARITY_CHECKLIST);
- * binary128 / quad encoding is deferred.
+ * Long double is IEEE-754 binary128 (quad) on rv64. Like __int128 it is a
+ * 16-byte scalar passed/returned in an aligned pair of integer registers
+ * (a0:a1 .. a6:a7; low-order half in the lower-numbered register), split
+ * low-in-register / high-on-stack when only one register remains, and wholly
+ * on the stack otherwise. classify_scalar() handles both via the size==16
+ * INT-pair path below (there are no 128-bit FP registers in RV64GC).
*
* Variadic args bypass these rules entirely and always go through the
* integer register file / stack (handled at the caller / callee sites). */
diff --git a/src/arch/native_target.h b/src/arch/native_target.h
@@ -240,6 +240,7 @@ typedef struct NativeMachineOp {
u8 binop; /* BinOp, when kind == NATIVE_MOP_BINOP */
u8 intrin; /* IntrinKind, when kind == NATIVE_MOP_INTRINSIC */
u8 second_is_reg; /* binop's second operand is a register (not an immediate) */
+ u8 result_is_fp; /* result lands in an FP register (e.g. va_arg of a double) */
} NativeMachineOp;
typedef struct NativeCallDesc {
diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c
@@ -2927,17 +2927,22 @@ static void x64_va_arg_core(X64NativeTarget* a, NativeLoc dst, NativeAddr ap,
int is_fp = loc_is_fp(dst);
u32 dr = loc_reg(dst);
u32 ap_base = x64_va_base(a, ap, X64_TMP_INT2); /* r11 */
- /* Uses only the reserved emit scratch: r11 (ap_base) and rax. The va_list
- * offset/pointer fields are advanced in memory (x64_add_mem_imm) and the
- * reg-save base is folded into rax (x64_add_reg_mem), so no third register is
- * needed — leaving r10 (and rdx) free for the allocator. */
+ /* GPR scratch for the offset/address arithmetic. For integer results the
+ * destination is itself a (throwaway) scratch GPR — pass_native_emit fetches
+ * va_arg into a scratch and copies it to the real destination afterward — so
+ * we reuse `dr` and touch no allocable register at all. FP results keep their
+ * value in an XMM register, so they borrow the reserved RAX emit scratch.
+ * Either way only r11 (ap_base) and `gp` are used: the va_list fields are
+ * advanced in memory (x64_add_mem_imm) and the reg-save base is folded in
+ * with x64_add_reg_mem, so no third register is needed. */
+ u32 gp = is_fp ? X64_RAX : dr;
if (a->abi->shadow_space) {
- /* Win64: rax = *ap; load dr from [rax]; *ap += 8. */
- emit_mov_load(mc, 8, 0, X64_RAX, ap_base, 0);
+ /* Win64: gp = *ap; load dr from [gp]; *ap += 8. */
+ emit_mov_load(mc, 8, 0, gp, ap_base, 0);
if (is_fp)
- emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_RAX, 0);
+ emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, gp, 0);
else
- emit_mov_load(mc, sz, 0, dr, X64_RAX, 0);
+ emit_mov_load(mc, sz, 0, dr, gp, 0);
x64_add_mem_imm(mc, 1, ap_base, 0, 8);
return;
}
@@ -2947,26 +2952,26 @@ static void x64_va_arg_core(X64NativeTarget* a, NativeLoc dst, NativeAddr ap,
i8 stride = is_fp ? 16 : 8;
MCLabel L_stack = mc->label_new(mc);
MCLabel L_done = mc->label_new(mc);
- /* eax = ap[offs]; cmp eax, max; jae L_stack. */
- emit_mov_load(mc, 4, 0, X64_RAX, ap_base, (i32)offs_field);
- emit_alu_imm32(mc, 0, X64_ALU_SUB_CMP, X64_RAX, (i32)max_offs);
+ /* gp32 = ap[offs]; cmp gp32, max; jae L_stack. */
+ emit_mov_load(mc, 4, 0, gp, ap_base, (i32)offs_field);
+ emit_alu_imm32(mc, 0, X64_ALU_SUB_CMP, gp, (i32)max_offs);
emit_jcc_rel32(mc, X64_CC_AE, L_stack);
- /* reg path: ap[offs] += stride; rax = reg_save_area(ap[16]) + offset; load.
- * (The memory increment leaves rax holding the old offset.) */
+ /* reg path: ap[offs] += stride; gp = reg_save_area(ap[16]) + offset; load.
+ * (The memory increment leaves gp holding the old offset.) */
x64_add_mem_imm(mc, 0, ap_base, (i32)offs_field, stride);
- x64_add_reg_mem(mc, X64_RAX, ap_base, 16);
+ x64_add_reg_mem(mc, gp, ap_base, 16);
if (is_fp)
- emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_RAX, 0);
+ emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, gp, 0);
else
- emit_mov_load(mc, sz, 0, dr, X64_RAX, 0);
+ emit_mov_load(mc, sz, 0, dr, gp, 0);
emit_jmp_rel32(mc, L_done);
- /* stack path: rax = ap[8] (overflow area); load; ap[8] += 8. */
+ /* stack path: gp = ap[8] (overflow area); load; ap[8] += 8. */
mc->label_place(mc, L_stack);
- emit_mov_load(mc, 8, 0, X64_RAX, ap_base, 8);
+ emit_mov_load(mc, 8, 0, gp, ap_base, 8);
if (is_fp)
- emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_RAX, 0);
+ emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, gp, 0);
else
- emit_mov_load(mc, sz, 0, dr, X64_RAX, 0);
+ emit_mov_load(mc, sz, 0, dr, gp, 0);
x64_add_mem_imm(mc, 1, ap_base, 8, 8);
mc->label_place(mc, L_done);
}
@@ -3660,7 +3665,8 @@ static void x64_finalize(NativeTarget* t) {
* effect, so the optimizer keeps values live across them out of those registers
* (the backend is then free to use them). idiv/div write rax (quotient) and rdx
* (remainder/sign); a variable shift uses cl; cmpxchg/xadd loops use rax/rcx/rdx;
- * va_arg uses rax for the gp/fp offset. */
+ * an FP va_arg borrows rax for the gp/fp offset (an integer va_arg does the
+ * offset arithmetic in its own destination register, so it clobbers nothing). */
static int x64_machine_op_clobbers(NativeTarget* t, const NativeMachineOp* op,
u32 mask[NATIVE_CALL_PLAN_CLASSES]) {
(void)t;
@@ -3690,6 +3696,7 @@ static int x64_machine_op_clobbers(NativeTarget* t, const NativeMachineOp* op,
mask[NATIVE_REG_INT] = (1u << X64_RAX) | (1u << X64_RCX) | (1u << X64_RDX);
return 1;
case NATIVE_MOP_VA_ARG:
+ if (!op->result_is_fp) return 0;
mask[NATIVE_REG_INT] = (1u << X64_RAX);
return 1;
default:
@@ -3870,7 +3877,8 @@ static void x64_va_arg_(NativeDirectTarget* d, Operand dst, Operand ap_addr,
NativeLoc res = x64_reg_loc(type, is_fp ? NATIVE_REG_FP : NATIVE_REG_INT,
is_fp ? X64_TMP_FP : (Reg)X64_RDX);
NativeAddr dst_addr;
- /* Base in R11 (not RAX/R10, which the va_arg core uses as scratch). */
+ /* Base in R11: the core addresses the va_list through R11 and uses one GPR
+ * scratch (the integer result reg, or RAX for FP), which the base must avoid. */
x64_va_arg_core(a, res, x64_direct_va_base(d, ap_addr, X64_R11), type);
dst_addr = x64_direct_addr(d, dst);
if (dst_addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) {
diff --git a/src/opt/pass_machinize.c b/src/opt/pass_machinize.c
@@ -1,5 +1,6 @@
#include <string.h>
+#include "cg/type.h"
#include "core/pool.h"
#include "core/slice.h"
#include "opt/opt_internal.h"
@@ -165,7 +166,11 @@ static void machinize_inst_clobbers(Func* f, NativeTarget* target) {
mop.second_is_reg =
(u8)(in->nopnds > 2u && in->opnds[2].kind == OPK_REG);
break;
- case IR_VA_ARG: mop.kind = NATIVE_MOP_VA_ARG; break;
+ case IR_VA_ARG:
+ mop.kind = NATIVE_MOP_VA_ARG;
+ mop.result_is_fp =
+ (u8)(in->nopnds > 0u && cg_type_is_float(f->c, in->opnds[0].type));
+ break;
case IR_ATOMIC_CAS: mop.kind = NATIVE_MOP_ATOMIC_CAS; break;
case IR_ATOMIC_RMW: mop.kind = NATIVE_MOP_ATOMIC_RMW; break;
default: continue;