kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 7f49849e76a96d275a93c7bf26b9700cbb8bb7bf
parent 5b6f8ec5fd7128a689d19ba5f2d4bbb5dd04b4a8
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Fri, 29 May 2026 11:59:50 -0700

x64: integer va_arg does offset arithmetic in its own dst register

pass_native_emit fetches va_arg into a scratch register and copies it to the
real destination, so for integer results the va_arg core can use that scratch
destination GPR itself for the gp_offset/address arithmetic and touch no
allocable register (only the reserved r11 va_list base is used alongside it).
Only FP results (value kept in XMM) still borrow RAX as a GPR scratch.

This removes the RAX clobber that was destroying a returned accumulator living
in RAX across a va_arg loop (the classic 'int sum(int n,...)' reducer). The
machine-clobber hook now reports the RAX clobber only for FP va_arg (new
NativeMachineOp.result_is_fp), so integer va_arg imposes no allocator constraint.

x64 parse -O1: 888 -> 895 (fixes builtin_03_va_list, builtin_05_va_copy,
6_7_6_08_variadic_decl, 6_9_06_variadic_func, variadic_02_many_ints,
variadic_03_long, variadic_07_nested_call). Remaining non-ldbl: variadic_04_pointer,
2 VLA-param, funcptr_field_first_arg, sadd_overflow. No regressions: toy x64/rv64
O0+O1 156/0.

Diffstat:
M lang/c/type/type.c | 10 ++++------
M lang/cpp/cpp_support.h | 14 ++++++++++++++
M lang/cpp/pp/pp.c | 15 ++++++++-------
M rt/Makefile | 7 +++++--
M src/abi/abi_rv64.c | 8 ++++++--
M src/arch/native_target.h | 1 +
M src/arch/x64/native.c | 52 ++++++++++++++++++++++++++++++----------------------
M src/opt/pass_machinize.c | 7 ++++++-
8 files changed, 74 insertions(+), 40 deletions(-)

diff --git a/lang/c/type/type.c b/lang/c/type/type.c @@ -517,13 +517,11 @@ static CfreeCgTypeId type_cg_builtin(CfreeCompiler* c, TypeKind kind) { case TY_DOUBLE: return b.id[CFREE_CG_BUILTIN_F64]; case TY_LDOUBLE: - /* binary128 long double: aarch64-linux, and wasm32 (matching - * clang/LLVM's wasm convention). RV64 long double = double per the - * locked decision; everything else uses double too. */ - if ((target.arch == CFREE_ARCH_ARM_64 && target.os == CFREE_OS_LINUX) || - target.arch == CFREE_ARCH_WASM) { + /* `long double` is IEEE-754 binary128 on targets that follow the quad + * psABI (RISC-V, aarch64-linux, wasm32); elsewhere it aliases `double`. + * See cfree_target_long_double_is_binary128. */ + if (cfree_target_long_double_is_binary128(target)) return b.id[CFREE_CG_BUILTIN_F128]; - } return b.id[CFREE_CG_BUILTIN_F64]; default: break; diff --git a/lang/cpp/cpp_support.h b/lang/cpp/cpp_support.h @@ -77,4 +77,18 @@ _Noreturn static inline void compiler_panicv(Compiler* c, SrcLoc loc, cfree_frontend_vfatal(c, loc, fmt, ap); } +/* True when the C `long double` type is IEEE-754 binary128 (quad) on this + * target rather than an alias of `double`. RISC-V (LP64/LP64D) and + * aarch64-linux follow the quad psABI; wasm32 matches clang/LLVM's wasm + * convention. x86 (80-bit x87, not modeled), Apple, and Windows alias long + * double to double. Centralized so the preprocessor's __LDBL_* / + * __SIZEOF_LONG_DOUBLE__ macros and the C type system's long-double -> + * CG-builtin mapping cannot drift apart. 
*/ +static inline int cfree_target_long_double_is_binary128(CfreeTarget t) { + if (t.arch == CFREE_ARCH_RV64) return 1; + if (t.arch == CFREE_ARCH_ARM_64 && t.os == CFREE_OS_LINUX) return 1; + if (t.arch == CFREE_ARCH_WASM) return 1; + return 0; +} + #endif diff --git a/lang/cpp/pp/pp.c b/lang/cpp/pp/pp.c @@ -378,7 +378,8 @@ static void pp_register_target_predefined(Pp* pp) { pp_define(pp, "__SIZEOF_WINT_T__", "4"); pp_define(pp, "__SIZEOF_FLOAT__", "4"); pp_define(pp, "__SIZEOF_DOUBLE__", "8"); - pp_define(pp, "__SIZEOF_LONG_DOUBLE__", "8"); + pp_define(pp, "__SIZEOF_LONG_DOUBLE__", + cfree_target_long_double_is_binary128(target) ? "16" : "8"); /* Windows / mingw predefined macros. cfree targets the mingw * flavor (DWARF debug info, mingwex CRT) rather than MSVC, so we @@ -669,12 +670,12 @@ static void pp_register_target_predefined(Pp* pp) { pp_define(pp, "__DBL_MIN__", "0x1p-1022"); pp_define(pp, "__DBL_DENORM_MIN__", "0x1p-1074"); - /* RV64 long double = double per the locked decision (matches RV64 - * musl/glibc default). aarch64-linux and wasm32 (matching clang/LLVM's - * wasm convention) get binary128 long double; the wasm backend then - * reports it as unsupported when a value is actually materialized. */ - if ((target.arch == CFREE_ARCH_ARM_64 && target.os == CFREE_OS_LINUX) || - target.arch == CFREE_ARCH_WASM) { + /* Targets that follow the IEEE-754 binary128 quad psABI for `long double` + * (RISC-V, aarch64-linux, wasm32) get the 113-bit-mantissa characteristics; + * everything else aliases `double`. The wasm backend still reports f128 as + * unsupported when a value is actually materialized. See + * cfree_target_long_double_is_binary128. 
*/ + if (cfree_target_long_double_is_binary128(target)) { pp_define(pp, "__LDBL_HAS_DENORM__", "1"); pp_define(pp, "__LDBL_MANT_DIG__", "113"); pp_define(pp, "__LDBL_DECIMAL_DIG__", "36"); diff --git a/rt/Makefile b/rt/Makefile @@ -106,14 +106,16 @@ RT_riscv64-linux_TARGET = riscv64-linux-gnu RT_riscv64-linux_ABI = lp64 RT_riscv64-linux_INT128 = 1 RT_riscv64-linux_CORO = riscv64 -# long double = double per the locked rv64 decision; no binary128 runtime. -RT_riscv64-linux_LDBL128 = +# RISC-V `long double` is IEEE-754 binary128 per the psABI; ship the quad +# soft-float / __int128 runtime (fp_tf, fp_ti). +RT_riscv64-linux_LDBL128 = 1 RT_riscv64-linux_ARCH_FLAGS = -mabi=lp64d -march=rv64imafd RT_riscv64-elf_TARGET = riscv64-unknown-elf RT_riscv64-elf_ABI = lp64 RT_riscv64-elf_INT128 = 1 RT_riscv64-elf_CORO = riscv64 +RT_riscv64-elf_LDBL128 = 1 RT_riscv64-elf_ARCH_FLAGS = -mabi=lp64 -march=rv64imafd RT_riscv64-elf-save-restore_TARGET = riscv64-unknown-elf @@ -121,6 +123,7 @@ RT_riscv64-elf-save-restore_ABI = lp64 RT_riscv64-elf-save-restore_INT128 = 1 RT_riscv64-elf-save-restore_CORO = riscv64 RT_riscv64-elf-save-restore_SAVE_RESTORE = 1 +RT_riscv64-elf-save-restore_LDBL128 = 1 RT_riscv64-elf-save-restore_ARCH_FLAGS = -mabi=lp64 -march=rv64imafd RT_x86_64-pc-windows_TARGET = x86_64-pc-windows-msvc diff --git a/src/abi/abi_rv64.c b/src/abi/abi_rv64.c @@ -15,8 +15,12 @@ * * otherwise INT parts up to 16 B (passed in up to 2 GPRs). * large struct -> INDIRECT (sret for return; byval for args) * - * Long double is locked to `double` for rv64 (see RV64_PARITY_CHECKLIST); - * binary128 / quad encoding is deferred. + * Long double is IEEE-754 binary128 (quad) on rv64. Like __int128 it is a + * 16-byte scalar passed/returned in an aligned pair of integer registers + * (a0:a1 .. a6:a7; low-order half in the lower-numbered register), split + * low-in-register / high-on-stack when only one register remains, and wholly + * on the stack otherwise. 
classify_scalar() handles both via the size==16 + * INT-pair path below (there are no 128-bit FP registers in RV64GC). * * Variadic args bypass these rules entirely and always go through the * integer register file / stack (handled at the caller / callee sites). */ diff --git a/src/arch/native_target.h b/src/arch/native_target.h @@ -240,6 +240,7 @@ typedef struct NativeMachineOp { u8 binop; /* BinOp, when kind == NATIVE_MOP_BINOP */ u8 intrin; /* IntrinKind, when kind == NATIVE_MOP_INTRINSIC */ u8 second_is_reg; /* binop's second operand is a register (not an immediate) */ + u8 result_is_fp; /* result lands in an FP register (e.g. va_arg of a double) */ } NativeMachineOp; typedef struct NativeCallDesc { diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c @@ -2927,17 +2927,22 @@ static void x64_va_arg_core(X64NativeTarget* a, NativeLoc dst, NativeAddr ap, int is_fp = loc_is_fp(dst); u32 dr = loc_reg(dst); u32 ap_base = x64_va_base(a, ap, X64_TMP_INT2); /* r11 */ - /* Uses only the reserved emit scratch: r11 (ap_base) and rax. The va_list - * offset/pointer fields are advanced in memory (x64_add_mem_imm) and the - * reg-save base is folded into rax (x64_add_reg_mem), so no third register is - * needed — leaving r10 (and rdx) free for the allocator. */ + /* GPR scratch for the offset/address arithmetic. For integer results the + * destination is itself a (throwaway) scratch GPR — pass_native_emit fetches + * va_arg into a scratch and copies it to the real destination afterward — so + * we reuse `dr` and touch no allocable register at all. FP results keep their + * value in an XMM register, so they borrow the reserved RAX emit scratch. + * Either way only r11 (ap_base) and `gp` are used: the va_list fields are + * advanced in memory (x64_add_mem_imm) and the reg-save base is folded in + * with x64_add_reg_mem, so no third register is needed. */ + u32 gp = is_fp ? X64_RAX : dr; if (a->abi->shadow_space) { - /* Win64: rax = *ap; load dr from [rax]; *ap += 8. 
*/ - emit_mov_load(mc, 8, 0, X64_RAX, ap_base, 0); + /* Win64: gp = *ap; load dr from [gp]; *ap += 8. */ + emit_mov_load(mc, 8, 0, gp, ap_base, 0); if (is_fp) - emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_RAX, 0); + emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, gp, 0); else - emit_mov_load(mc, sz, 0, dr, X64_RAX, 0); + emit_mov_load(mc, sz, 0, dr, gp, 0); x64_add_mem_imm(mc, 1, ap_base, 0, 8); return; } @@ -2947,26 +2952,26 @@ static void x64_va_arg_core(X64NativeTarget* a, NativeLoc dst, NativeAddr ap, i8 stride = is_fp ? 16 : 8; MCLabel L_stack = mc->label_new(mc); MCLabel L_done = mc->label_new(mc); - /* eax = ap[offs]; cmp eax, max; jae L_stack. */ - emit_mov_load(mc, 4, 0, X64_RAX, ap_base, (i32)offs_field); - emit_alu_imm32(mc, 0, X64_ALU_SUB_CMP, X64_RAX, (i32)max_offs); + /* gp32 = ap[offs]; cmp gp32, max; jae L_stack. */ + emit_mov_load(mc, 4, 0, gp, ap_base, (i32)offs_field); + emit_alu_imm32(mc, 0, X64_ALU_SUB_CMP, gp, (i32)max_offs); emit_jcc_rel32(mc, X64_CC_AE, L_stack); - /* reg path: ap[offs] += stride; rax = reg_save_area(ap[16]) + offset; load. - * (The memory increment leaves rax holding the old offset.) */ + /* reg path: ap[offs] += stride; gp = reg_save_area(ap[16]) + offset; load. + * (The memory increment leaves gp holding the old offset.) */ x64_add_mem_imm(mc, 0, ap_base, (i32)offs_field, stride); - x64_add_reg_mem(mc, X64_RAX, ap_base, 16); + x64_add_reg_mem(mc, gp, ap_base, 16); if (is_fp) - emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_RAX, 0); + emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, gp, 0); else - emit_mov_load(mc, sz, 0, dr, X64_RAX, 0); + emit_mov_load(mc, sz, 0, dr, gp, 0); emit_jmp_rel32(mc, L_done); - /* stack path: rax = ap[8] (overflow area); load; ap[8] += 8. */ + /* stack path: gp = ap[8] (overflow area); load; ap[8] += 8. 
*/ mc->label_place(mc, L_stack); - emit_mov_load(mc, 8, 0, X64_RAX, ap_base, 8); + emit_mov_load(mc, 8, 0, gp, ap_base, 8); if (is_fp) - emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, X64_RAX, 0); + emit_sse_load(mc, sse_scalar_prefix(sz), 0x10, dr, gp, 0); else - emit_mov_load(mc, sz, 0, dr, X64_RAX, 0); + emit_mov_load(mc, sz, 0, dr, gp, 0); x64_add_mem_imm(mc, 1, ap_base, 8, 8); mc->label_place(mc, L_done); } @@ -3660,7 +3665,8 @@ static void x64_finalize(NativeTarget* t) { * effect, so the optimizer keeps values live across them out of those registers * (the backend is then free to use them). idiv/div write rax (quotient) and rdx * (remainder/sign); a variable shift uses cl; cmpxchg/xadd loops use rax/rcx/rdx; - * va_arg uses rax for the gp/fp offset. */ + * an FP va_arg borrows rax for the gp/fp offset (an integer va_arg does the + * offset arithmetic in its own destination register, so it clobbers nothing). */ static int x64_machine_op_clobbers(NativeTarget* t, const NativeMachineOp* op, u32 mask[NATIVE_CALL_PLAN_CLASSES]) { (void)t; @@ -3690,6 +3696,7 @@ static int x64_machine_op_clobbers(NativeTarget* t, const NativeMachineOp* op, mask[NATIVE_REG_INT] = (1u << X64_RAX) | (1u << X64_RCX) | (1u << X64_RDX); return 1; case NATIVE_MOP_VA_ARG: + if (!op->result_is_fp) return 0; mask[NATIVE_REG_INT] = (1u << X64_RAX); return 1; default: @@ -3870,7 +3877,8 @@ static void x64_va_arg_(NativeDirectTarget* d, Operand dst, Operand ap_addr, NativeLoc res = x64_reg_loc(type, is_fp ? NATIVE_REG_FP : NATIVE_REG_INT, is_fp ? X64_TMP_FP : (Reg)X64_RDX); NativeAddr dst_addr; - /* Base in R11 (not RAX/R10, which the va_arg core uses as scratch). */ + /* Base in R11: the core advances/loads through R11 plus one GPR scratch (the + * integer result reg itself, or RAX for FP results), so R11 must not be RAX. 
*/ x64_va_arg_core(a, res, x64_direct_va_base(d, ap_addr, X64_R11), type); dst_addr = x64_direct_addr(d, dst); if (dst_addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) { diff --git a/src/opt/pass_machinize.c b/src/opt/pass_machinize.c @@ -1,5 +1,6 @@ #include <string.h> +#include "cg/type.h" #include "core/pool.h" #include "core/slice.h" #include "opt/opt_internal.h" @@ -165,7 +166,11 @@ static void machinize_inst_clobbers(Func* f, NativeTarget* target) { mop.second_is_reg = (u8)(in->nopnds > 2u && in->opnds[2].kind == OPK_REG); break; - case IR_VA_ARG: mop.kind = NATIVE_MOP_VA_ARG; break; + case IR_VA_ARG: + mop.kind = NATIVE_MOP_VA_ARG; + mop.result_is_fp = + (u8)(in->nopnds > 0u && cg_type_is_float(f->c, in->opnds[0].type)); + break; case IR_ATOMIC_CAS: mop.kind = NATIVE_MOP_ATOMIC_CAS; break; case IR_ATOMIC_RMW: mop.kind = NATIVE_MOP_ATOMIC_RMW; break; default: continue;