commit 97b868ff24f7d52083af2a924890ed48b14a5f6d
parent 0a722716448183a97354a873bcda572031919e19
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 28 May 2026 10:38:55 -0700
opt/aa64: resolve call-argument moves as a parallel copy
The aarch64 call lowering emitted argument setup as a single forward pass:
fixed/register args loaded left-to-right, then variadic stack args stored.
Both orderings could clobber a live argument source. When the register
allocator left a value in an ABI argument register (e.g. a prior call's
result still in x0, consumed as a later argument), loading an earlier
argument into that register destroyed the value before the later argument
read it -- miscompiling any call that passes a non-inlined call's result
alongside another argument. binary-trees -O1 hit this: ItemCheck(tree)
passed directly to printf printed the format-string pointer instead.
Emit stack stores first (their sources are read before any argument
register is written) and collect the register-argument moves, then emit
them with parallel-copy semantics: a register is read by every move that
sources it before any move overwrites it (topological order, with a
scratch to break true cycles). Fixes both the Apple (variadic-on-stack)
and AAPCS64 (variadic-in-register) variants.
Adds toy case 139 covering a non-inlined call result passed as a variadic
argument in several positions; green on R/L/C/W and the aa64 cross path.
Diffstat:
3 files changed, 166 insertions(+), 13 deletions(-)
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -2270,6 +2270,104 @@ static u32 aa_signature_stack_bytes(NativeTarget* t, CfreeCgTypeId fn_type,
return aa_call_stack_size(t, &d);
}
+/* One register-passed call argument: write `src` (or its address) into the
+ * argument register `dst`. Collected during planning and emitted as a batch so
+ * the backend can order them as a parallel copy (see aa_emit_reg_arg_moves). */
+typedef struct {
+ NativeLoc dst;
+ NativeLoc src;
+ u32 src_offset;
+ u32 size;
+ int is_addr; /* 1: write addr-of(src) into dst; 0: load the value part */
+} AAArgMove;
+
+/* AAPCS64/Apple permit at most 8 GP + 8 FP register-passed argument slots. */
+#define AA_MAX_REG_ARG_MOVES 16u
+
+/* The register a move reads, or REG_NONE if it reads none (immediate, frame
+ * slot, or an address derived from fp/sp/a global). Only a register source can
+ * be invalidated by another move writing its argument register, so only these
+ * constrain the emission order. */
+static Reg aa_arg_move_src_reg(const AAArgMove* m, NativeAllocClass* cls_out) {
+ if (!m->is_addr && m->src.kind == NATIVE_LOC_REG) {
+ *cls_out = (NativeAllocClass)m->src.cls;
+ return m->src.v.reg;
+ }
+ return REG_NONE;
+}
+
+static void aa_emit_one_arg_move(NativeTarget* t, const AAArgMove* m) {
+ if (m->is_addr)
+ aa_addr_of_loc(t, m->dst, m->src);
+ else
+ aa_load_part(t, m->dst, m->src, m->src_offset, m->size);
+}
+
+/* Emit register-argument moves with parallel-copy semantics: every register
+ * must be read by all moves that source it before any move overwrites it. The
+ * allocator usually arranges the sources so the moves are already conflict-free
+ * (it copies a value out of an about-to-be-clobbered argument register), but it
+ * does not always — notably for variadic arguments, where it can leave a prior
+ * call's result sitting in x0 even though x0 is this call's first arg register.
+ * So the backend must not assume a safe order. Emit any move whose destination
+ * is no longer needed as a source (topological order); when only a true cycle
+ * remains, save one member's register into a scratch and redirect its readers,
+ * then drain the now-acyclic remainder. */
+static void aa_emit_reg_arg_moves(NativeTarget* t, AAArgMove* moves, u32 n) {
+ u8 done[AA_MAX_REG_ARG_MOVES];
+ u32 emitted = 0;
+ if (n > AA_MAX_REG_ARG_MOVES)
+ aa_panic(aa_of(t), "too many register arguments");
+ memset(done, 0, sizeof done);
+ while (emitted < n) {
+ int progress = 0;
+ for (u32 i = 0; i < n; ++i) {
+ int blocked = 0;
+ if (done[i]) continue;
+ for (u32 j = 0; j < n && !blocked; ++j) {
+ NativeAllocClass sc;
+ Reg sr;
+ if (done[j] || j == i) continue;
+ sr = aa_arg_move_src_reg(&moves[j], &sc);
+ if (sr != REG_NONE && sr == moves[i].dst.v.reg &&
+ sc == (NativeAllocClass)moves[i].dst.cls)
+ blocked = 1;
+ }
+ if (!blocked) {
+ aa_emit_one_arg_move(t, &moves[i]);
+ done[i] = 1;
+ emitted++;
+ progress = 1;
+ }
+ }
+ if (!progress) {
+ /* Every remaining move is part of a cycle; a cycle's ring is reg->reg, so
+ * at least one remaining move is register-sourced. Break on such a move:
+ * its destination register still holds a pending source value, so copy it
+ * to a scratch and point that value's readers at the scratch. Scratch
+ * avoids x16 (which may hold a stashed indirect callee) and the arg regs;
+ * cycle moves are reg->reg, so the scratch needs no address arithmetic. */
+ u32 k = 0;
+ NativeAllocClass bc, sc;
+ Reg scratch_reg;
+ NativeLoc scratchloc;
+ while (k < n && (done[k] || aa_arg_move_src_reg(&moves[k], &sc) == REG_NONE))
+ ++k;
+ bc = (NativeAllocClass)moves[k].dst.cls;
+ scratch_reg = bc == NATIVE_REG_FP ? 16u : AA_TMP1;
+ scratchloc = aa_reg_loc(moves[k].dst.type, bc, scratch_reg);
+ aa_move(t, scratchloc, moves[k].dst);
+ for (u32 j = 0; j < n; ++j) {
+ Reg sr = aa_arg_move_src_reg(&moves[j], &sc);
+ if (!done[j] && sr != REG_NONE && sr == moves[k].dst.v.reg && sc == bc) {
+ moves[j].src = scratchloc;
+ moves[j].src_offset = 0;
+ }
+ }
+ }
+ }
+}
+
static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc,
NativeCallPlan* plan) {
NativeCallPlanRet* rets;
@@ -2297,8 +2395,18 @@ static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc,
plan->callee = scratch;
}
{
- u32 next_int = 0, next_fp = 0, stack = 0;
+ u32 next_int = 0, next_fp = 0, stack = 0, nmoves = 0;
int tail_call = (desc->flags & CG_CALL_TAIL) != 0;
+ AAArgMove moves[AA_MAX_REG_ARG_MOVES];
+ /* Stack-passed arguments are stored inline as we walk, *before* any
+ * argument register is written, so a stack-arg source that the allocator
+ * left in an arg register (e.g. a prior call's result still in x0, consumed
+ * as a variadic stack arg) is read while it is still live. Stack stores
+ * only touch memory and the AA_TMP0/v16 scratch, never an arg-register
+ * source, so emitting them first cannot clobber a register-arg source.
+ * Register-passed arguments are collected and emitted afterward as a
+ * parallel copy (aa_emit_reg_arg_moves) so they likewise never overwrite a
+ * register another argument still needs to read. */
for (u32 i = 0; i < desc->nargs; ++i) {
ABIArgInfo tmp;
const ABIArgInfo* ai = aa_param_abi(t, abi, desc, i, &tmp);
@@ -2319,14 +2427,18 @@ static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc,
continue;
}
if (ai->kind == ABI_ARG_INDIRECT) {
- NativeLoc ptr;
if (next_int < 8u) {
- ptr = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT,
- next_int++);
- aa_addr_of_loc(t, ptr, desc->args[i]);
+ AAArgMove* m = &moves[nmoves++];
+ m->dst = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT,
+ next_int++);
+ m->src = desc->args[i];
+ m->src_offset = 0;
+ m->size = 8;
+ m->is_addr = 1;
} else {
- ptr = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT,
- AA_TMP0);
+ NativeLoc ptr =
+ aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT,
+ AA_TMP0);
aa_addr_of_loc(t, ptr, desc->args[i]);
aa_store_outgoing_part(t, tail_call, stack, ptr, 8);
stack += 8u;
@@ -2337,12 +2449,15 @@ static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc,
const ABIArgPart* part = &ai->parts[p];
NativeAllocClass cls =
part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
- if (cls == NATIVE_REG_FP && next_fp < 8u) {
- NativeLoc dst = aa_reg_loc(desc->args[i].type, cls, next_fp++);
- aa_load_part(t, dst, desc->args[i], part->src_offset, part->size);
- } else if (cls == NATIVE_REG_INT && next_int < 8u) {
- NativeLoc dst = aa_reg_loc(desc->args[i].type, cls, next_int++);
- aa_load_part(t, dst, desc->args[i], part->src_offset, part->size);
+ if ((cls == NATIVE_REG_FP && next_fp < 8u) ||
+ (cls == NATIVE_REG_INT && next_int < 8u)) {
+ AAArgMove* m = &moves[nmoves++];
+ m->dst = aa_reg_loc(desc->args[i].type, cls,
+ cls == NATIVE_REG_FP ? next_fp++ : next_int++);
+ m->src = desc->args[i];
+ m->src_offset = part->src_offset;
+ m->size = part->size;
+ m->is_addr = 0;
} else {
NativeLoc tmpreg = aa_reg_loc(desc->args[i].type, cls,
cls == NATIVE_REG_FP ? 16u : AA_TMP0);
@@ -2353,6 +2468,7 @@ static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc,
}
}
}
+ aa_emit_reg_arg_moves(t, moves, nmoves);
/* Set the indirect-result register (x8) *after* the argument loads: an
* argument source may have been allocated to x8, and the sret pointer load
* would otherwise clobber it before it is moved into its argument
diff --git a/test/toy/cases/139_variadic_stack_arg_call_result.expected b/test/toy/cases/139_variadic_stack_arg_call_result.expected
@@ -0,0 +1 @@
+42
diff --git a/test/toy/cases/139_variadic_stack_arg_call_result.toy b/test/toy/cases/139_variadic_stack_arg_call_result.toy
@@ -0,0 +1,36 @@
+// Regression: a variadic argument is passed on the stack (Apple arm64 ABI),
+// and its source value is the result of a *non-inlined* call, which the
+// register allocator leaves in the ABI return/arg register x0. The fixed
+// parameter `n` also targets x0. A single forward pass over the call's
+// arguments would load `n` into x0 first, clobbering the still-pending call
+// result before the stack store reads it, printing garbage. The backend must
+// read every stack-arg source before overwriting any argument register.
+//
+// `make` is marked noinline so its result genuinely comes back in x0 via a
+// real `bl`; if it inlined, the value would be computed in a scratch reg and
+// the hazard would never arise.
+
+fn @[.noinline] make(x: i64): i64 {
+ return x + x;
+}
+
+fn vfn(n: i64, ...): i64 {
+ var ap: va_list;
+ @va_start(ap);
+ let a: i64 = @va_arg<i64>(ap);
+ let b: i64 = @va_arg<i64>(ap);
+ @va_end(ap);
+ return n * (100 as i64) + a * (10 as i64) + b;
+}
+
+fn __user_main(): i64 {
+ // Call result is the last (second) variadic stack arg.
+ if vfn(7, 3, make(64)) != (858 as i64) { return 1; }
+ // Call result is the first variadic stack arg.
+ if vfn(7, make(64), 3) != (1983 as i64) { return 2; }
+ // Both variadic stack args are call results.
+ if vfn(1, make(10), make(20)) != (340 as i64) { return 3; }
+ return 42;
+}
+
+fn main(): i32 { return __user_main() as i32; }