kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 97b868ff24f7d52083af2a924890ed48b14a5f6d
parent 0a722716448183a97354a873bcda572031919e19
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 28 May 2026 10:38:55 -0700

opt/aa64: resolve call-argument moves as a parallel copy

The aarch64 call lowering emitted argument setup as a single forward pass:
fixed/register args loaded left-to-right, then variadic stack args stored.
Both orderings could clobber a live argument source. When the register
allocator left a value in an ABI argument register (e.g. a prior call's
result still in x0, consumed as a later argument), loading an earlier
argument into that register destroyed the value before the later argument
read it -- miscompiling any call that passes a non-inlined call's result
alongside another argument. binary-trees -O1 hit this: ItemCheck(tree)
passed directly to printf printed the format-string pointer instead.

Emit stack stores first (their sources are read before any argument
register is written) and collect the register-argument moves, then emit
them with parallel-copy semantics: a register is read by every move that
sources it before any move overwrites it (topological order, with a
scratch to break true cycles). Fixes both the Apple (variadic-on-stack)
and AAPCS64 (variadic-in-register) variants.

Adds toy case 139 covering a non-inlined call result passed as a variadic
argument in several positions; green on R/L/C/W and the aa64 cross path.

Diffstat:
Msrc/arch/aa64/native.c | 142+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
Atest/toy/cases/139_variadic_stack_arg_call_result.expected | 1+
Atest/toy/cases/139_variadic_stack_arg_call_result.toy | 36++++++++++++++++++++++++++++++++++++
3 files changed, 166 insertions(+), 13 deletions(-)

diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c @@ -2270,6 +2270,104 @@ static u32 aa_signature_stack_bytes(NativeTarget* t, CfreeCgTypeId fn_type, return aa_call_stack_size(t, &d); } +/* One register-passed call argument: write `src` (or its address) into the + * argument register `dst`. Collected during planning and emitted as a batch so + * the backend can order them as a parallel copy (see aa_emit_reg_arg_moves). */ +typedef struct { + NativeLoc dst; + NativeLoc src; + u32 src_offset; + u32 size; + int is_addr; /* 1: write addr-of(src) into dst; 0: load the value part */ +} AAArgMove; + +/* AAPCS64/Apple permit at most 8 GP + 8 FP register-passed argument slots. */ +#define AA_MAX_REG_ARG_MOVES 16u + +/* The register a move reads, or REG_NONE if it reads none (immediate, frame + * slot, or an address derived from fp/sp/a global). Only a register source can + * be invalidated by another move writing its argument register, so only these + * constrain the emission order. */ +static Reg aa_arg_move_src_reg(const AAArgMove* m, NativeAllocClass* cls_out) { + if (!m->is_addr && m->src.kind == NATIVE_LOC_REG) { + *cls_out = (NativeAllocClass)m->src.cls; + return m->src.v.reg; + } + return REG_NONE; +} + +static void aa_emit_one_arg_move(NativeTarget* t, const AAArgMove* m) { + if (m->is_addr) + aa_addr_of_loc(t, m->dst, m->src); + else + aa_load_part(t, m->dst, m->src, m->src_offset, m->size); +} + +/* Emit register-argument moves with parallel-copy semantics: every register + * must be read by all moves that source it before any move overwrites it. The + * allocator usually arranges the sources so the moves are already conflict-free + * (it copies a value out of an about-to-be-clobbered argument register), but it + * does not always — notably for variadic arguments, where it can leave a prior + * call's result sitting in x0 even though x0 is this call's first arg register. + * So the backend must not assume a safe order. Emit any move whose destination + * is no longer needed as a source (topological order); when only a true cycle + * remains, save one member's register into a scratch and redirect its readers, + * then drain the now-acyclic remainder. */ +static void aa_emit_reg_arg_moves(NativeTarget* t, AAArgMove* moves, u32 n) { + u8 done[AA_MAX_REG_ARG_MOVES]; + u32 emitted = 0; + if (n > AA_MAX_REG_ARG_MOVES) + aa_panic(aa_of(t), "too many register arguments"); + memset(done, 0, sizeof done); + while (emitted < n) { + int progress = 0; + for (u32 i = 0; i < n; ++i) { + int blocked = 0; + if (done[i]) continue; + for (u32 j = 0; j < n && !blocked; ++j) { + NativeAllocClass sc; + Reg sr; + if (done[j] || j == i) continue; + sr = aa_arg_move_src_reg(&moves[j], &sc); + if (sr != REG_NONE && sr == moves[i].dst.v.reg && + sc == (NativeAllocClass)moves[i].dst.cls) + blocked = 1; + } + if (!blocked) { + aa_emit_one_arg_move(t, &moves[i]); + done[i] = 1; + emitted++; + progress = 1; + } + } + if (!progress) { + /* Every remaining move is part of a cycle; a cycle's ring is reg->reg, so + * at least one remaining move is register-sourced. Break on such a move: + * its destination register still holds a pending source value, so copy it + * to a scratch and point that value's readers at the scratch. Scratch + * avoids x16 (which may hold a stashed indirect callee) and the arg regs; + * cycle moves are reg->reg, so the scratch needs no address arithmetic. */ + u32 k = 0; + NativeAllocClass bc, sc; + Reg scratch_reg; + NativeLoc scratchloc; + while (k < n && (done[k] || aa_arg_move_src_reg(&moves[k], &sc) == REG_NONE)) + ++k; + bc = (NativeAllocClass)moves[k].dst.cls; + scratch_reg = bc == NATIVE_REG_FP ? 16u : AA_TMP1; + scratchloc = aa_reg_loc(moves[k].dst.type, bc, scratch_reg); + aa_move(t, scratchloc, moves[k].dst); + for (u32 j = 0; j < n; ++j) { + Reg sr = aa_arg_move_src_reg(&moves[j], &sc); + if (!done[j] && sr != REG_NONE && sr == moves[k].dst.v.reg && sc == bc) { + moves[j].src = scratchloc; + moves[j].src_offset = 0; + } + } + } + } +} + static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc, NativeCallPlan* plan) { NativeCallPlanRet* rets; @@ -2297,8 +2395,18 @@ static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc, plan->callee = scratch; } { - u32 next_int = 0, next_fp = 0, stack = 0; + u32 next_int = 0, next_fp = 0, stack = 0, nmoves = 0; int tail_call = (desc->flags & CG_CALL_TAIL) != 0; + AAArgMove moves[AA_MAX_REG_ARG_MOVES]; + /* Stack-passed arguments are stored inline as we walk, *before* any + * argument register is written, so a stack-arg source that the allocator + * left in an arg register (e.g. a prior call's result still in x0, consumed + * as a variadic stack arg) is read while it is still live. Stack stores + * only touch memory and the AA_TMP0/v16 scratch, never an arg-register + * source, so emitting them first cannot clobber a register-arg source. + * Register-passed arguments are collected and emitted afterward as a + * parallel copy (aa_emit_reg_arg_moves) so they likewise never overwrite a + * register another argument still needs to read. */ for (u32 i = 0; i < desc->nargs; ++i) { ABIArgInfo tmp; const ABIArgInfo* ai = aa_param_abi(t, abi, desc, i, &tmp); @@ -2319,14 +2427,18 @@ static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc, continue; } if (ai->kind == ABI_ARG_INDIRECT) { - NativeLoc ptr; if (next_int < 8u) { - ptr = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT, - next_int++); - aa_addr_of_loc(t, ptr, desc->args[i]); + AAArgMove* m = &moves[nmoves++]; + m->dst = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT, + next_int++); + m->src = desc->args[i]; + m->src_offset = 0; + m->size = 8; + m->is_addr = 1; } else { - ptr = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT, - AA_TMP0); + NativeLoc ptr = + aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT, + AA_TMP0); aa_addr_of_loc(t, ptr, desc->args[i]); aa_store_outgoing_part(t, tail_call, stack, ptr, 8); stack += 8u; @@ -2337,12 +2449,15 @@ static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc, const ABIArgPart* part = &ai->parts[p]; NativeAllocClass cls = part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT; - if (cls == NATIVE_REG_FP && next_fp < 8u) { - NativeLoc dst = aa_reg_loc(desc->args[i].type, cls, next_fp++); - aa_load_part(t, dst, desc->args[i], part->src_offset, part->size); - } else if (cls == NATIVE_REG_INT && next_int < 8u) { - NativeLoc dst = aa_reg_loc(desc->args[i].type, cls, next_int++); - aa_load_part(t, dst, desc->args[i], part->src_offset, part->size); + if ((cls == NATIVE_REG_FP && next_fp < 8u) || + (cls == NATIVE_REG_INT && next_int < 8u)) { + AAArgMove* m = &moves[nmoves++]; + m->dst = aa_reg_loc(desc->args[i].type, cls, + cls == NATIVE_REG_FP ? next_fp++ : next_int++); + m->src = desc->args[i]; + m->src_offset = part->src_offset; + m->size = part->size; + m->is_addr = 0; } else { NativeLoc tmpreg = aa_reg_loc(desc->args[i].type, cls, cls == NATIVE_REG_FP ? 16u : AA_TMP0); @@ -2353,6 +2468,7 @@ static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc, } } } + aa_emit_reg_arg_moves(t, moves, nmoves); /* Set the indirect-result register (x8) *after* the argument loads: an * argument source may have been allocated to x8, and the sret pointer load * would otherwise clobber it before it is moved into its argument diff --git a/test/toy/cases/139_variadic_stack_arg_call_result.expected b/test/toy/cases/139_variadic_stack_arg_call_result.expected @@ -0,0 +1 @@ +42 diff --git a/test/toy/cases/139_variadic_stack_arg_call_result.toy b/test/toy/cases/139_variadic_stack_arg_call_result.toy @@ -0,0 +1,36 @@ +// Regression: a variadic argument is passed on the stack (Apple arm64 ABI), +// and its source value is the result of a *non-inlined* call, which the +// register allocator leaves in the ABI return/arg register x0. The fixed +// parameter `n` also targets x0. A single forward pass over the call's +// arguments would load `n` into x0 first, clobbering the still-pending call +// result before the stack store reads it, printing garbage. The backend must +// read every stack-arg source before overwriting any argument register. +// +// `make` is marked noinline so its result genuinely comes back in x0 via a +// real `bl`; if it inlined, the value would be computed in a scratch reg and +// the hazard would never arise. + +fn @[.noinline] make(x: i64): i64 { + return x + x; +} + +fn vfn(n: i64, ...): i64 { + var ap: va_list; + @va_start(ap); + let a: i64 = @va_arg<i64>(ap); + let b: i64 = @va_arg<i64>(ap); + @va_end(ap); + return n * (100 as i64) + a * (10 as i64) + b; +} + +fn __user_main(): i64 { + // Call result is the last (second) variadic stack arg. + if vfn(7, 3, make(64)) != (858 as i64) { return 1; } + // Call result is the first variadic stack arg. + if vfn(7, make(64), 3) != (1983 as i64) { return 2; } + // Both variadic stack args are call results. + if vfn(1, make(10), make(20)) != (340 as i64) { return 3; } + return 42; +} + +fn main(): i32 { return __user_main() as i32; }