opt/aa64: resolve call-argument moves as a parallel copy - kit

commit 97b868ff24f7d52083af2a924890ed48b14a5f6d
parent 0a722716448183a97354a873bcda572031919e19
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 28 May 2026 10:38:55 -0700

opt/aa64: resolve call-argument moves as a parallel copy

The aarch64 call lowering emitted argument setup as a single forward pass:
fixed/register args loaded left-to-right, then variadic stack args stored.
Both orderings could clobber a live argument source. When the register
allocator left a value in an ABI argument register (e.g. a prior call's
result still in x0, consumed as a later argument), loading an earlier
argument into that register destroyed the value before the later argument
read it -- miscompiling any call that passes a non-inlined call's result
alongside another argument. binary-trees -O1 hit this: ItemCheck(tree)
passed directly to printf printed the format-string pointer instead.

Emit stack stores first (their sources are read before any argument
register is written) and collect the register-argument moves, then emit
them with parallel-copy semantics: a register is read by every move that
sources it before any move overwrites it (topological order, with a
scratch to break true cycles). Fixes both the Apple (variadic-on-stack)
and AAPCS64 (variadic-in-register) variants.

Adds toy case 139 covering a non-inlined call result passed as a variadic
argument in several positions; green on R/L/C/W and the aa64 cross path.

Diffstat:
M src/arch/aa64/native.c  | 142 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
A test/toy/cases/139_variadic_stack_arg_call_result.expected  | 1 +
A test/toy/cases/139_variadic_stack_arg_call_result.toy  | 36 ++++++++++++++++++++++++++++++++++++

3 files changed, 166 insertions(+), 13 deletions(-)
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -2270,6 +2270,104 @@ static u32 aa_signature_stack_bytes(NativeTarget* t, CfreeCgTypeId fn_type,
   return aa_call_stack_size(t, &d);
 }
 
+/* One register-passed call argument: write `src` (or its address) into the
+ * argument register `dst`. Collected during planning and emitted as a batch so
+ * the backend can order them as a parallel copy (see aa_emit_reg_arg_moves). */
+typedef struct {
+  NativeLoc dst;
+  NativeLoc src;
+  u32 src_offset;
+  u32 size;
+  int is_addr; /* 1: write addr-of(src) into dst; 0: load the value part */
+} AAArgMove;
+
+/* AAPCS64/Apple permit at most 8 GP + 8 FP register-passed argument slots. */
+#define AA_MAX_REG_ARG_MOVES 16u
+
+/* The register a move reads, or REG_NONE if it reads none (immediate, frame
+ * slot, or an address derived from fp/sp/a global). Only a register source can
+ * be invalidated by another move writing its argument register, so only these
+ * constrain the emission order. */
+static Reg aa_arg_move_src_reg(const AAArgMove* m, NativeAllocClass* cls_out) {
+  if (!m->is_addr && m->src.kind == NATIVE_LOC_REG) {
+    *cls_out = (NativeAllocClass)m->src.cls;
+    return m->src.v.reg;
+  }
+  return REG_NONE;
+}
+
+static void aa_emit_one_arg_move(NativeTarget* t, const AAArgMove* m) {
+  if (m->is_addr)
+    aa_addr_of_loc(t, m->dst, m->src);
+  else
+    aa_load_part(t, m->dst, m->src, m->src_offset, m->size);
+}
+
+/* Emit register-argument moves with parallel-copy semantics: every register
+ * must be read by all moves that source it before any move overwrites it. The
+ * allocator usually arranges the sources so the moves are already conflict-free
+ * (it copies a value out of an about-to-be-clobbered argument register), but it
+ * does not always — notably for variadic arguments, where it can leave a prior
+ * call's result sitting in x0 even though x0 is this call's first arg register.
+ * So the backend must not assume a safe order. Emit any move whose destination
+ * is no longer needed as a source (topological order); when only a true cycle
+ * remains, save one member's register into a scratch and redirect its readers,
+ * then drain the now-acyclic remainder. */
+static void aa_emit_reg_arg_moves(NativeTarget* t, AAArgMove* moves, u32 n) {
+  u8 done[AA_MAX_REG_ARG_MOVES];
+  u32 emitted = 0;
+  if (n > AA_MAX_REG_ARG_MOVES)
+    aa_panic(aa_of(t), "too many register arguments");
+  memset(done, 0, sizeof done);
+  while (emitted < n) {
+    int progress = 0;
+    for (u32 i = 0; i < n; ++i) {
+      int blocked = 0;
+      if (done[i]) continue;
+      for (u32 j = 0; j < n && !blocked; ++j) {
+        NativeAllocClass sc;
+        Reg sr;
+        if (done[j] || j == i) continue;
+        sr = aa_arg_move_src_reg(&moves[j], &sc);
+        if (sr != REG_NONE && sr == moves[i].dst.v.reg &&
+            sc == (NativeAllocClass)moves[i].dst.cls)
+          blocked = 1;
+      }
+      if (!blocked) {
+        aa_emit_one_arg_move(t, &moves[i]);
+        done[i] = 1;
+        emitted++;
+        progress = 1;
+      }
+    }
+    if (!progress) {
+      /* Every remaining move is part of a cycle; a cycle's ring is reg->reg, so
+       * at least one remaining move is register-sourced. Break on such a move:
+       * its destination register still holds a pending source value, so copy it
+       * to a scratch and point that value's readers at the scratch. Scratch
+       * avoids x16 (which may hold a stashed indirect callee) and the arg regs;
+       * cycle moves are reg->reg, so the scratch needs no address arithmetic. */
+      u32 k = 0;
+      NativeAllocClass bc, sc;
+      Reg scratch_reg;
+      NativeLoc scratchloc;
+      while (k < n && (done[k] || aa_arg_move_src_reg(&moves[k], &sc) == REG_NONE))
+        ++k;
+      bc = (NativeAllocClass)moves[k].dst.cls;
+      scratch_reg = bc == NATIVE_REG_FP ? 16u : AA_TMP1;
+      scratchloc = aa_reg_loc(moves[k].dst.type, bc, scratch_reg);
+      aa_move(t, scratchloc, moves[k].dst);
+      for (u32 j = 0; j < n; ++j) {
+        Reg sr = aa_arg_move_src_reg(&moves[j], &sc);
+        if (!done[j] && sr != REG_NONE && sr == moves[k].dst.v.reg && sc == bc) {
+          moves[j].src = scratchloc;
+          moves[j].src_offset = 0;
+        }
+      }
+    }
+  }
+}
+
 static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc,
                          NativeCallPlan* plan) {
   NativeCallPlanRet* rets;
@@ -2297,8 +2395,18 @@ static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc,
     plan->callee = scratch;
   }
   {
-    u32 next_int = 0, next_fp = 0, stack = 0;
+    u32 next_int = 0, next_fp = 0, stack = 0, nmoves = 0;
     int tail_call = (desc->flags & CG_CALL_TAIL) != 0;
+    AAArgMove moves[AA_MAX_REG_ARG_MOVES];
+    /* Stack-passed arguments are stored inline as we walk, *before* any
+     * argument register is written, so a stack-arg source that the allocator
+     * left in an arg register (e.g. a prior call's result still in x0, consumed
+     * as a variadic stack arg) is read while it is still live. Stack stores
+     * only touch memory and the AA_TMP0/v16 scratch, never an arg-register
+     * source, so emitting them first cannot clobber a register-arg source.
+     * Register-passed arguments are collected and emitted afterward as a
+     * parallel copy (aa_emit_reg_arg_moves) so they likewise never overwrite a
+     * register another argument still needs to read. */
     for (u32 i = 0; i < desc->nargs; ++i) {
       ABIArgInfo tmp;
       const ABIArgInfo* ai = aa_param_abi(t, abi, desc, i, &tmp);
@@ -2319,14 +2427,18 @@ static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc,
         continue;
       }
       if (ai->kind == ABI_ARG_INDIRECT) {
-        NativeLoc ptr;
         if (next_int < 8u) {
-          ptr = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT,
-                           next_int++);
-          aa_addr_of_loc(t, ptr, desc->args[i]);
+          AAArgMove* m = &moves[nmoves++];
+          m->dst = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT,
+                              next_int++);
+          m->src = desc->args[i];
+          m->src_offset = 0;
+          m->size = 8;
+          m->is_addr = 1;
         } else {
-          ptr = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT,
-                           AA_TMP0);
+          NativeLoc ptr =
+              aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT,
+                         AA_TMP0);
           aa_addr_of_loc(t, ptr, desc->args[i]);
           aa_store_outgoing_part(t, tail_call, stack, ptr, 8);
           stack += 8u;
@@ -2337,12 +2449,15 @@ static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc,
         const ABIArgPart* part = &ai->parts[p];
         NativeAllocClass cls =
             part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
-        if (cls == NATIVE_REG_FP && next_fp < 8u) {
-          NativeLoc dst = aa_reg_loc(desc->args[i].type, cls, next_fp++);
-          aa_load_part(t, dst, desc->args[i], part->src_offset, part->size);
-        } else if (cls == NATIVE_REG_INT && next_int < 8u) {
-          NativeLoc dst = aa_reg_loc(desc->args[i].type, cls, next_int++);
-          aa_load_part(t, dst, desc->args[i], part->src_offset, part->size);
+        if ((cls == NATIVE_REG_FP && next_fp < 8u) ||
+            (cls == NATIVE_REG_INT && next_int < 8u)) {
+          AAArgMove* m = &moves[nmoves++];
+          m->dst = aa_reg_loc(desc->args[i].type, cls,
+                              cls == NATIVE_REG_FP ? next_fp++ : next_int++);
+          m->src = desc->args[i];
+          m->src_offset = part->src_offset;
+          m->size = part->size;
+          m->is_addr = 0;
         } else {
           NativeLoc tmpreg = aa_reg_loc(desc->args[i].type, cls,
                                         cls == NATIVE_REG_FP ? 16u : AA_TMP0);
@@ -2353,6 +2468,7 @@ static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc,
         }
       }
     }
+    aa_emit_reg_arg_moves(t, moves, nmoves);
     /* Set the indirect-result register (x8) *after* the argument loads: an
      * argument source may have been allocated to x8, and the sret pointer load
      * would otherwise clobber it before it is moved into its argument
diff --git a/test/toy/cases/139_variadic_stack_arg_call_result.expected b/test/toy/cases/139_variadic_stack_arg_call_result.expected
@@ -0,0 +1 @@
+42
diff --git a/test/toy/cases/139_variadic_stack_arg_call_result.toy b/test/toy/cases/139_variadic_stack_arg_call_result.toy
@@ -0,0 +1,36 @@
+// Regression: a variadic argument is passed on the stack (Apple arm64 ABI),
+// and its source value is the result of a *non-inlined* call, which the
+// register allocator leaves in the ABI return/arg register x0. The fixed
+// parameter `n` also targets x0. A single forward pass over the call's
+// arguments would load `n` into x0 first, clobbering the still-pending call
+// result before the stack store reads it, printing garbage. The backend must
+// read every stack-arg source before overwriting any argument register.
+//
+// `make` is marked noinline so its result genuinely comes back in x0 via a
+// real `bl`; if it inlined, the value would be computed in a scratch reg and
+// the hazard would never arise.
+
+fn @[.noinline] make(x: i64): i64 {
+  return x + x;
+}
+
+fn vfn(n: i64, ...): i64 {
+  var ap: va_list;
+  @va_start(ap);
+  let a: i64 = @va_arg<i64>(ap);
+  let b: i64 = @va_arg<i64>(ap);
+  @va_end(ap);
+  return n * (100 as i64) + a * (10 as i64) + b;
+}
+
+fn __user_main(): i64 {
+  // Call result is the last (second) variadic stack arg.
+  if vfn(7, 3, make(64)) != (858 as i64) { return 1; }
+  // Call result is the first variadic stack arg.
+  if vfn(7, make(64), 3) != (1983 as i64) { return 2; }
+  // Both variadic stack args are call results.
+  if vfn(1, make(10), make(20)) != (340 as i64) { return 3; }
+  return 42;
+}
+
+fn main(): i32 { return __user_main() as i32; }

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	src/arch/aa64/native.c	\|	142	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
A	test/toy/cases/139_variadic_stack_arg_call_result.expected	\|	1	+
A	test/toy/cases/139_variadic_stack_arg_call_result.toy	\|	36	++++++++++++++++++++++++++++++++++++