Share the parallel-copy arg-move scheduler; x64 NativeFrame + partial O1 - kit

commit dc46a3135f40231846840a807f477f5d082525c3
parent 1bfb41a5fc5c5da9f233bcf2d6846192694639e3
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Fri, 29 May 2026 09:40:53 -0700

Share the parallel-copy arg-move scheduler; x64 NativeFrame + partial O1

New src/cg/native_argmove.{h,c}: the parallel-copy register-move scheduler
(topological emission + cycle-break through a scratch) was duplicated nearly
verbatim in all three backends (aa_/rv_/x64_emit_reg_arg_moves). The scheduling
is arch-independent; only the leaf ops differ (emit one move, the scratch reg).
Extracted to one native_arg_shuffle() taking a tiny ops struct; aa64/rv64/x64
each now build their move list and call it. Behavior-identical: aa64 toy 1034/0
(O0+O1), rv64 toy O0 156/0 + O1 156/0, x64 toy O0 156/0.

x64 also adopts the shared NativeFrame (mirroring aa64/rv64) and implements its
-O1 known-frame path (func_begin_known_frame derives the callee-saved set from
the optimizer masks; reserve_callee_saves non-NULL; tail-site restores
callee-saves before leave, indirect callee already staged in r11). x64
compute_frame_size already counted callee-save bytes, so no frame-size fix
needed. x64 -O1 is 137 pass / 19 fail: the 9 tail failures are root-caused to
naive per-parameter entry binds (bind_param) clobbering when the allocator
rotates a tail function's params across the incoming arg regs — the same cycle
the shuffle breaks, but the entry-bind path doesn't use it yet. Fix (route entry
binds through the shared scheduler) and the atomic/varargs failures are next.

Diffstat:
M src/arch/aa64/native.c  | 101 ++++++++++++++++---------------------------------------------------------------
M src/arch/rv64/native.c  | 82 ++++++++++++++-----------------------------------------------------------------
M src/arch/x64/native.c  | 262 +++++++++++++++++++++++++++++++++++--------------------------------------------
A src/cg/native_argmove.c  | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/cg/native_argmove.h  | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

5 files changed, 290 insertions(+), 294 deletions(-)
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -36,6 +36,7 @@
 #include "arch/aa64/regs.h"
 #include "asm/asm.h"
 #include "asm/asm_lex.h"
+#include "cg/native_argmove.h"
 #include "cg/native_direct_target.h"
 #include "cg/native_frame.h"
 #include "cg/type.h"
@@ -2446,99 +2447,37 @@ static u32 aa_call_stack_bytes(NativeTarget* t, const NativeCallDesc* desc) {
 /* One register-passed call argument: write `src` (or its address) into the
  * argument register `dst`. Collected during planning and emitted as a batch so
  * the backend can order them as a parallel copy (see aa_emit_reg_arg_moves). */
-typedef struct {
-  NativeLoc dst;
-  NativeLoc src;
-  u32 src_offset;
-  u32 size;
-  int is_addr; /* 1: write addr-of(src) into dst; 0: load the value part */
-} AAArgMove;
+typedef NativeArgMove AAArgMove;
 
 /* AAPCS64/Apple permit at most 8 GP + 8 FP register-passed argument slots. */
 #define AA_MAX_REG_ARG_MOVES 16u
 
-/* The register a move reads, or REG_NONE if it reads none (immediate, frame
- * slot, or an address derived from fp/sp/a global). Only a register source can
- * be invalidated by another move writing its argument register, so only these
- * constrain the emission order. */
-static Reg aa_arg_move_src_reg(const AAArgMove* m, NativeAllocClass* cls_out) {
-  if (!m->is_addr && m->src.kind == NATIVE_LOC_REG) {
-    *cls_out = (NativeAllocClass)m->src.cls;
-    return m->src.v.reg;
-  }
-  return REG_NONE;
-}
-
-static void aa_emit_one_arg_move(NativeTarget* t, const AAArgMove* m) {
+static void aa_emit_one_arg_move(NativeTarget* t, const NativeArgMove* m) {
   if (m->is_addr)
     aa_addr_of_loc(t, m->dst, m->src);
   else
     aa_load_part(t, m->dst, m->src, m->src_offset, m->size);
 }
 
-/* Emit register-argument moves with parallel-copy semantics: every register
- * must be read by all moves that source it before any move overwrites it. The
- * allocator usually arranges the sources so the moves are already conflict-free
- * (it copies a value out of an about-to-be-clobbered argument register), but it
- * does not always — notably for variadic arguments, where it can leave a prior
- * call's result sitting in x0 even though x0 is this call's first arg register.
- * So the backend must not assume a safe order. Emit any move whose destination
- * is no longer needed as a source (topological order); when only a true cycle
- * remains, save one member's register into a scratch and redirect its readers,
- * then drain the now-acyclic remainder. */
-static void aa_emit_reg_arg_moves(NativeTarget* t, AAArgMove* moves, u32 n) {
-  u8 done[AA_MAX_REG_ARG_MOVES];
-  u32 emitted = 0;
+/* Emit register-argument moves as a parallel copy via the shared scheduler:
+ * every register is read by all moves that source it before any move overwrites
+ * it; a true cycle is broken through a scratch. The allocator usually arranges
+ * a conflict-free order, but not always (notably variadic args, where it can
+ * leave a prior call's result in x0 even though x0 is this call's first arg
+ * register), so the backend must not assume a safe order. Cycle scratch is
+ * AA_TMP1 (x17) for int and v16 for fp — distinct from x16 (AA_TMP0), which may
+ * hold a stashed indirect callee. */
+static void aa_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves, u32 n) {
+  NativeArgShuffle s;
   if (n > AA_MAX_REG_ARG_MOVES)
     aa_panic(aa_of(t), "too many register arguments");
-  memset(done, 0, sizeof done);
-  while (emitted < n) {
-    int progress = 0;
-    for (u32 i = 0; i < n; ++i) {
-      int blocked = 0;
-      if (done[i]) continue;
-      for (u32 j = 0; j < n && !blocked; ++j) {
-        NativeAllocClass sc;
-        Reg sr;
-        if (done[j] || j == i) continue;
-        sr = aa_arg_move_src_reg(&moves[j], &sc);
-        if (sr != REG_NONE && sr == moves[i].dst.v.reg &&
-            sc == (NativeAllocClass)moves[i].dst.cls)
-          blocked = 1;
-      }
-      if (!blocked) {
-        aa_emit_one_arg_move(t, &moves[i]);
-        done[i] = 1;
-        emitted++;
-        progress = 1;
-      }
-    }
-    if (!progress) {
-      /* Every remaining move is part of a cycle; a cycle's ring is reg->reg, so
-       * at least one remaining move is register-sourced. Break on such a move:
-       * its destination register still holds a pending source value, so copy it
-       * to a scratch and point that value's readers at the scratch. Scratch
-       * avoids x16 (which may hold a stashed indirect callee) and the arg regs;
-       * cycle moves are reg->reg, so the scratch needs no address arithmetic. */
-      u32 k = 0;
-      NativeAllocClass bc, sc;
-      Reg scratch_reg;
-      NativeLoc scratchloc;
-      while (k < n && (done[k] || aa_arg_move_src_reg(&moves[k], &sc) == REG_NONE))
-        ++k;
-      bc = (NativeAllocClass)moves[k].dst.cls;
-      scratch_reg = bc == NATIVE_REG_FP ? 16u : AA_TMP1;
-      scratchloc = aa_reg_loc(moves[k].dst.type, bc, scratch_reg);
-      aa_move(t, scratchloc, moves[k].dst);
-      for (u32 j = 0; j < n; ++j) {
-        Reg sr = aa_arg_move_src_reg(&moves[j], &sc);
-        if (!done[j] && sr != REG_NONE && sr == moves[k].dst.v.reg && sc == bc) {
-          moves[j].src = scratchloc;
-          moves[j].src_offset = 0;
-        }
-      }
-    }
-  }
+  memset(&s, 0, sizeof s);
+  s.t = t;
+  s.emit_one = aa_emit_one_arg_move;
+  s.reg_move = aa_move;
+  s.scratch[NATIVE_REG_INT] = AA_TMP1;
+  s.scratch[NATIVE_REG_FP] = 16u;
+  native_arg_shuffle(&s, moves, n);
 }
 
 static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc,
diff --git a/src/arch/rv64/native.c b/src/arch/rv64/native.c
@@ -23,6 +23,7 @@
 #include "asm/asm_lex.h"
 #include "arch/rv64/regs.h"
 #include "arch/rv64/rv64.h"
+#include "cg/native_argmove.h"
 #include "cg/native_direct_target.h"
 #include "cg/native_frame.h"
 #include "cg/type.h"
@@ -1794,82 +1795,27 @@ static void rv_bind_native_param(NativeTarget* t, const CGParamDesc* p,
 
 /* ============================ calls / returns ============================ */
 
-typedef struct {
-  NativeLoc dst;
-  NativeLoc src;
-  u32 src_offset;
-  u32 size;
-  int is_addr;
-} RvArgMove;
+typedef NativeArgMove RvArgMove;
 
-static Reg rv_arg_move_src_reg(const RvArgMove* m, NativeAllocClass* cls_out) {
-  if (!m->is_addr && m->src.kind == NATIVE_LOC_REG) {
-    *cls_out = (NativeAllocClass)m->src.cls;
-    return m->src.v.reg;
-  }
-  return REG_NONE;
-}
-
-static void rv_emit_one_arg_move(NativeTarget* t, const RvArgMove* m) {
+static void rv_emit_one_arg_move(NativeTarget* t, const NativeArgMove* m) {
   if (m->is_addr)
     rv_addr_of_loc(t, m->dst, m->src);
   else
     rv_load_part(t, m->dst, m->src, m->src_offset, m->size);
 }
 
-/* Parallel-copy register arg moves with cycle breaking. */
-static void rv_emit_reg_arg_moves(NativeTarget* t, RvArgMove* moves, u32 n) {
-  u8 done[RV_MAX_REG_ARG_MOVES];
-  u32 emitted = 0;
+/* Parallel-copy register arg moves via the shared scheduler; cycles break
+ * through the int/fp emit scratch (t1 / ft1). */
+static void rv_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves, u32 n) {
+  NativeArgShuffle s;
   if (n > RV_MAX_REG_ARG_MOVES) rv_panic(rv_of(t), "too many register args");
-  memset(done, 0, sizeof done);
-  while (emitted < n) {
-    int progress = 0;
-    u32 i, j;
-    for (i = 0; i < n; ++i) {
-      int blocked = 0;
-      if (done[i]) continue;
-      for (j = 0; j < n && !blocked; ++j) {
-        NativeAllocClass sc;
-        Reg sr;
-        if (done[j] || j == i) continue;
-        sr = rv_arg_move_src_reg(&moves[j], &sc);
-        if (sr != REG_NONE && sr == moves[i].dst.v.reg &&
-            sc == (NativeAllocClass)moves[i].dst.cls)
-          blocked = 1;
-      }
-      if (!blocked) {
-        rv_emit_one_arg_move(t, &moves[i]);
-        done[i] = 1;
-        emitted++;
-        progress = 1;
-      }
-    }
-    if (!progress) {
-      u32 k = 0;
-      NativeAllocClass bc, sc;
-      Reg scratch_reg;
-      NativeLoc scratchloc;
-      while (k < n &&
-             (done[k] || rv_arg_move_src_reg(&moves[k], &sc) == REG_NONE))
-        ++k;
-      bc = (NativeAllocClass)moves[k].dst.cls;
-      scratch_reg = bc == NATIVE_REG_FP ? RV_FTMP1 : RV_TMP1;
-      scratchloc = rv_reg_loc(moves[k].dst.type, bc, scratch_reg);
-      rv_move(t, scratchloc, moves[k].dst);
-      {
-        u32 jj;
-        for (jj = 0; jj < n; ++jj) {
-          Reg sr = rv_arg_move_src_reg(&moves[jj], &sc);
-          if (!done[jj] && sr != REG_NONE && sr == moves[k].dst.v.reg &&
-              sc == bc) {
-            moves[jj].src = scratchloc;
-            moves[jj].src_offset = 0;
-          }
-        }
-      }
-    }
-  }
+  memset(&s, 0, sizeof s);
+  s.t = t;
+  s.emit_one = rv_emit_one_arg_move;
+  s.reg_move = rv_move;
+  s.scratch[NATIVE_REG_INT] = RV_TMP1;
+  s.scratch[NATIVE_REG_FP] = RV_FTMP1;
+  native_arg_shuffle(&s, moves, n);
 }
 
 static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc,
diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c
@@ -34,7 +34,9 @@
 #include "arch/x64/x64.h"
 #include "asm/asm.h"
 #include "asm/asm_lex.h"
+#include "cg/native_argmove.h"
 #include "cg/native_direct_target.h"
+#include "cg/native_frame.h"
 #include "cg/type.h"
 #include "core/arena.h"
 #include "core/bytes.h"
@@ -53,18 +55,12 @@ enum {
 
 /* ============================ target state ============================ */
 
-typedef struct X64NativeSlot {
-  u32 off; /* bytes below rbp (positive); address = rbp - off */
-  u32 size;
-  u32 align;
-  u8 kind; /* NativeFrameSlotKind */
-  u8 pad[3];
-} X64NativeSlot;
-
-typedef struct X64CalleeSave {
-  Reg reg;
-  u8 cls; /* NativeAllocClass */
-} X64CalleeSave;
+/* Frame slots and callee-save records live in the shared NativeFrame
+ * bookkeeping (cg/native_frame.h); these aliases keep the x64-local spellings.
+ * x64 reads only .reg/.cls of a callee-save (it computes save offsets below the
+ * locals rather than homing them in frame slots, so .slot/.type stay unused). */
+typedef NativeFrameSlotEntry X64NativeSlot;
+typedef NativeFrameCalleeSave X64CalleeSave;
 
 typedef enum X64PatchKind { X64_PATCH_ALLOCA } X64PatchKind;
 
@@ -78,11 +74,9 @@ typedef struct X64NativeTarget {
   SrcLoc loc;
   const CGFuncDesc* func;
 
-  X64NativeSlot* slots;
-  u32 nslots;
-  u32 slots_cap;
-  u32 cum_off;      /* sum of frame-slot reservations below rbp */
-  u32 max_outgoing; /* max outgoing-arg bytes across all calls */
+  /* Shared frame bookkeeping: slot table, cum_off, max_outgoing, callee-save
+   * set, and the known_frame / has_alloca / frame_final flags. */
+  NativeFrame frame;
   u32 frame_size_final;
 
   u32 incoming_stack_size; /* fixed-param stack bytes (tail-call check) */
@@ -104,13 +98,6 @@ typedef struct X64NativeTarget {
   u32 prologue_nbytes;
   MCLabel epilogue_label;
 
-  X64CalleeSave callee_saves[X64_MAX_CS_INT_REGS + X64_MAX_CS_FP_REGS];
-  u32 ncallee_saves;
-
-  u8 known_frame;
-  u8 has_alloca;
-  u8 frame_final;
-
   const X64ABIRegs* abi;
 } X64NativeTarget;
 
@@ -121,9 +108,7 @@ static _Noreturn void x64_panic(X64NativeTarget* a, const char* msg) {
 }
 
 static X64NativeSlot* x64_slot_get(X64NativeTarget* a, NativeFrameSlot fs) {
-  if (fs == NATIVE_FRAME_SLOT_NONE || fs > a->nslots)
-    x64_panic(a, "bad frame slot");
-  return &a->slots[fs - 1u];
+  return native_frame_slot_at(&a->frame, fs);
 }
 
 static u32 align_up_u32(u32 v, u32 align) {
@@ -1343,37 +1328,19 @@ static void x64_load_label_addr(NativeTarget* t, NativeLoc dst, MCLabel l) {
 
 static NativeFrameSlot x64_frame_slot(NativeTarget* t,
                                       const NativeFrameSlotDesc* d) {
-  X64NativeTarget* a = x64_of(t);
-  X64NativeSlot* s;
-  u32 size = d->size ? d->size : 8u;
-  u32 align = d->align ? d->align : 1u;
-  if (a->frame_final) x64_panic(a, "frame slot requested after prologue");
-  if (a->nslots == a->slots_cap) {
-    u32 cap = a->slots_cap ? a->slots_cap * 2u : 16u;
-    X64NativeSlot* nb = arena_zarray(t->c->tu, X64NativeSlot, cap);
-    if (a->slots) memcpy(nb, a->slots, sizeof(*nb) * a->nslots);
-    a->slots = nb;
-    a->slots_cap = cap;
-  }
-  a->cum_off = align_up_u32(a->cum_off + size, align);
-  s = &a->slots[a->nslots++];
-  s->off = a->cum_off;
-  s->size = size;
-  s->align = align;
-  s->kind = d->kind;
-  return (NativeFrameSlot)a->nslots;
+  return native_frame_slot_alloc(&x64_of(t)->frame, d);
 }
 
 /* xmm save area base (rbp-relative). XMM saves are 16-aligned. */
 static u32 x64_xmm_base(const X64NativeTarget* a, u32 cs_fp) {
-  if (cs_fp == 0) return a->cum_off;
-  return align_up_u32(a->cum_off, 16u);
+  if (cs_fp == 0) return a->frame.cum_off;
+  return align_up_u32(a->frame.cum_off, 16u);
 }
 
 static u32 x64_compute_frame_size(const X64NativeTarget* a, u32 cs_int,
                                   u32 cs_fp) {
   u32 xmm_base = x64_xmm_base(a, cs_fp);
-  u32 raw = a->max_outgoing + cs_int * 8u + cs_fp * 16u + xmm_base;
+  u32 raw = a->frame.max_outgoing + cs_int * 8u + cs_fp * 16u + xmm_base;
   u32 fs = align_up_u32(raw, 16u);
   return fs ? fs : 16u;
 }
@@ -1381,16 +1348,16 @@ static u32 x64_compute_frame_size(const X64NativeTarget* a, u32 cs_int,
 /* Collect the callee-saves the body actually used. */
 static u32 x64_collect_int_saves(X64NativeTarget* a, Reg* regs) {
   u32 n = 0, i;
-  for (i = 0; i < a->ncallee_saves; ++i)
-    if (a->callee_saves[i].cls == NATIVE_REG_INT)
-      regs[n++] = a->callee_saves[i].reg;
+  for (i = 0; i < a->frame.ncallee_saves; ++i)
+    if (a->frame.callee_saves[i].cls == NATIVE_REG_INT)
+      regs[n++] = a->frame.callee_saves[i].reg;
   return n;
 }
 static u32 x64_collect_fp_saves(X64NativeTarget* a, Reg* regs) {
   u32 n = 0, i;
-  for (i = 0; i < a->ncallee_saves; ++i)
-    if (a->callee_saves[i].cls == NATIVE_REG_FP)
-      regs[n++] = a->callee_saves[i].reg;
+  for (i = 0; i < a->frame.ncallee_saves; ++i)
+    if (a->frame.callee_saves[i].cls == NATIVE_REG_FP)
+      regs[n++] = a->frame.callee_saves[i].reg;
   return n;
 }
 
@@ -1486,9 +1453,9 @@ static void x64_func_begin_common(NativeTarget* t, const CGFuncDesc* fd) {
   a->func = fd;
   a->loc = fd->loc;
   a->abi = x64_abi_for_os(t->c->target.os);
-  a->nslots = 0;
-  a->cum_off = 0;
-  a->max_outgoing = 0;
+  /* Shared frame bookkeeping: clears the slot table, cum_off, max_outgoing,
+   * callee-save set, and known_frame/has_alloca/frame_final. */
+  native_frame_reset(&a->frame);
   a->incoming_stack_size = 0;
   a->next_param_int = 0;
   a->next_param_fp = 0;
@@ -1499,10 +1466,6 @@ static void x64_func_begin_common(NativeTarget* t, const CGFuncDesc* fd) {
   a->reg_save_slot = NATIVE_FRAME_SLOT_NONE;
   a->npatches = 0;
   a->nalloca = 0;
-  a->ncallee_saves = 0;
-  a->known_frame = 0;
-  a->has_alloca = 0;
-  a->frame_final = 0;
   a->prologue_nbytes =
       a->abi->shadow_space ? X64_PROLOGUE_BYTES_WIN64 : X64_PROLOGUE_BYTES;
 
@@ -1573,13 +1536,58 @@ static void x64_func_begin(NativeTarget* t, const CGFuncDesc* fd) {
   x64_emit_variadic_reg_saves(a);
 }
 
+/* x64 homes callee-saves below the locals (offsets computed in
+ * x64_compute_frame_size / x64_build_prologue), not in frame slots, so
+ * alloc_slots=0: native_frame just records the {reg,cls} set from the masks. */
+static void x64_reserve_callee_saves(NativeTarget* t, const u32* used,
+                                     u32 nclasses) {
+  native_frame_set_callee_saves(&x64_of(t)->frame, used, nclasses, NULL, 0, 0);
+}
+
+/* Optimizer entry point: the full frame is supplied up front, so the prologue
+ * is emitted final the moment it is built — no NOP region, no func_end patch
+ * (x64_func_end skips patching when known_frame). x64_build_prologue emits the
+ * push rbp / sub rsp / sret spill / callee-save spills; the variadic
+ * register-save stores are emitted separately, as on the single-pass path. */
 static void x64_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd,
                                        const NativeKnownFrameDesc* frame,
                                        NativeFrameSlot* out_slots) {
-  (void)fd;
-  (void)frame;
-  (void)out_slots;
-  x64_panic(x64_of(t), "known-frame path not implemented yet");
+  X64NativeTarget* a = x64_of(t);
+  MCEmitter* mc = t->mc;
+  Reg cs_int[X64_MAX_CS_INT_REGS], cs_fp[X64_MAX_CS_FP_REGS];
+  u32 n_int, n_fp, frame_size, nbytes, chkstk_disp_pos, i;
+  u8 buf[X64_PROLOGUE_BYTES_WIN64];
+  x64_func_begin_common(t, fd);
+  a->frame.known_frame = 1;
+  if (frame) {
+    a->frame.has_alloca = frame->has_alloca;
+    if (frame->callee_saved_used && frame->ncallee_classes)
+      x64_reserve_callee_saves(t, frame->callee_saved_used,
+                               frame->ncallee_classes);
+    for (i = 0; i < frame->nslots; ++i) {
+      NativeFrameSlot slot = x64_frame_slot(t, &frame->slots[i]);
+      if (out_slots) out_slots[i] = slot;
+    }
+    x64_reserve_entry_saves(a);
+    native_frame_note_outgoing(&a->frame, frame->max_outgoing);
+  }
+  /* Frame is final: size and offsets are settled, so emit the exact prologue. */
+  n_int = x64_collect_int_saves(a, cs_int);
+  n_fp = x64_collect_fp_saves(a, cs_fp);
+  frame_size = x64_compute_frame_size(a, n_int, n_fp);
+  a->frame_size_final = frame_size;
+  a->prologue_pos = mc->pos(mc);
+  nbytes = x64_build_prologue(a, buf, sizeof buf, frame_size, cs_int, n_int,
+                              cs_fp, n_fp, &chkstk_disp_pos);
+  mc->emit_bytes(mc, buf, nbytes);
+  if (chkstk_disp_pos != (u32)-1) {
+    ObjSymId chk = x64_chkstk_sym(t);
+    mc->emit_reloc_at(mc, mc->section_id, a->prologue_pos + chkstk_disp_pos,
+                      R_X64_PLT32, chk, -4, 1, 0);
+  }
+  a->prologue_nbytes = nbytes; /* exact length: used for the CFI post offset */
+  x64_emit_variadic_reg_saves(a);
+  native_frame_set_final(&a->frame);
 }
 
 static void x64_func_end(NativeTarget* t) {
@@ -1610,7 +1618,7 @@ static void x64_func_end(NativeTarget* t) {
   emit_ret(mc);
 
   /* Patch the single-pass prologue placeholder. */
-  if (!a->known_frame) {
+  if (!a->frame.known_frame) {
     u8 buf[X64_PROLOGUE_BYTES_WIN64];
     u32 chkstk_disp_pos;
     u32 nbytes;
@@ -1629,7 +1637,7 @@ static void x64_func_end(NativeTarget* t) {
 
   /* Patch alloca disp32s: lea dst, [rsp + max_outgoing]. */
   {
-    u32 mo = align_up_u32(a->max_outgoing, 16u);
+    u32 mo = align_up_u32(a->frame.max_outgoing, 16u);
     u32 k;
     for (k = 0; k < a->npatches; ++k) {
       u8 dbuf[4];
@@ -1640,8 +1648,10 @@ static void x64_func_end(NativeTarget* t) {
 
   /* CFI: after the prologue, CFA = rbp + 16; rbp at cfa-16, ra at cfa-8. */
   if (mc->cfi_set_next_pc_offset && mc->cfi_def_cfa && mc->cfi_offset) {
-    u32 post =
-        a->prologue_pos + (a->known_frame ? 0u : a->prologue_nbytes);
+    /* Body starts past the prologue. prologue_nbytes is the reserved NOP-region
+     * size on the single-pass path and the exact prologue length on the
+     * known-frame path (set in x64_func_begin_known_frame). */
+    u32 post = a->prologue_pos + a->prologue_nbytes;
     u32 k;
     mc->cfi_set_next_pc_offset(mc, post - a->func_start);
     mc->cfi_def_cfa(mc, X64_RBP, 16);
@@ -1991,25 +2001,9 @@ static void x64_bind_native_param(NativeTarget* t, const CGParamDesc* p,
 
 /* ============================ calls / returns ============================ */
 
-typedef struct {
-  NativeLoc dst;
-  NativeLoc src;
-  u32 src_offset;
-  u32 size;
-  int is_addr;
-  int dup_to_gpr;  /* Win64: also place into the matching GPR */
-  Reg dup_gpr;
-} X64ArgMove;
-
-static Reg x64_arg_move_src_reg(const X64ArgMove* m, NativeAllocClass* cls_out) {
-  if (!m->is_addr && m->src.kind == NATIVE_LOC_REG) {
-    *cls_out = (NativeAllocClass)m->src.cls;
-    return m->src.v.reg;
-  }
-  return REG_NONE;
-}
+typedef NativeArgMove X64ArgMove;
 
-static void x64_emit_one_arg_move(NativeTarget* t, const X64ArgMove* m) {
+static void x64_emit_one_arg_move(NativeTarget* t, const NativeArgMove* m) {
   if (m->is_addr) {
     x64_addr_of_loc(t, m->dst, m->src);
   } else {
@@ -2021,57 +2015,19 @@ static void x64_emit_one_arg_move(NativeTarget* t, const X64ArgMove* m) {
   }
 }
 
-/* Parallel-copy register arg moves with cycle breaking through scratch. */
-static void x64_emit_reg_arg_moves(NativeTarget* t, X64ArgMove* moves, u32 n) {
-  u8 done[X64_MAX_REG_ARG_MOVES];
-  u32 emitted = 0;
+/* Parallel-copy register arg moves via the shared scheduler; cycles break
+ * through the int/fp emit scratch (r11 / xmm14). */
+static void x64_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves,
+                                   u32 n) {
+  NativeArgShuffle s;
   if (n > X64_MAX_REG_ARG_MOVES) x64_panic(x64_of(t), "too many register args");
-  memset(done, 0, sizeof done);
-  while (emitted < n) {
-    int progress = 0;
-    u32 i, j;
-    for (i = 0; i < n; ++i) {
-      int blocked = 0;
-      if (done[i]) continue;
-      for (j = 0; j < n && !blocked; ++j) {
-        NativeAllocClass sc;
-        Reg sr;
-        if (done[j] || j == i) continue;
-        sr = x64_arg_move_src_reg(&moves[j], &sc);
-        if (sr != REG_NONE && sr == moves[i].dst.v.reg &&
-            sc == (NativeAllocClass)moves[i].dst.cls)
-          blocked = 1;
-      }
-      if (!blocked) {
-        x64_emit_one_arg_move(t, &moves[i]);
-        done[i] = 1;
-        emitted++;
-        progress = 1;
-      }
-    }
-    if (!progress) {
-      u32 k = 0;
-      NativeAllocClass bc, sc;
-      Reg scratch_reg;
-      NativeLoc scratchloc;
-      u32 jj;
-      while (k < n &&
-             (done[k] || x64_arg_move_src_reg(&moves[k], &sc) == REG_NONE))
-        ++k;
-      bc = (NativeAllocClass)moves[k].dst.cls;
-      scratch_reg = bc == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT2;
-      scratchloc = x64_reg_loc(moves[k].dst.type, bc, scratch_reg);
-      x64_move(t, scratchloc, moves[k].dst);
-      for (jj = 0; jj < n; ++jj) {
-        Reg sr = x64_arg_move_src_reg(&moves[jj], &sc);
-        if (!done[jj] && sr != REG_NONE && sr == moves[k].dst.v.reg &&
-            sc == bc) {
-          moves[jj].src = scratchloc;
-          moves[jj].src_offset = 0;
-        }
-      }
-    }
-  }
+  memset(&s, 0, sizeof s);
+  s.t = t;
+  s.emit_one = x64_emit_one_arg_move;
+  s.reg_move = x64_move;
+  s.scratch[NATIVE_REG_INT] = X64_TMP_INT2;
+  s.scratch[NATIVE_REG_FP] = X64_TMP_FP;
+  native_arg_shuffle(&s, moves, n);
 }
 
 /* Clobber masks: per-call all caller-saved regs are clobbered. */
@@ -2122,8 +2078,8 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc,
   plan->has_sret = abi && abi->has_sret;
   plan->is_variadic = abi && abi->variadic;
   plan->stack_arg_size = x64_call_stack_size(t, desc);
-  if (plan->stack_arg_size > a->max_outgoing)
-    a->max_outgoing = plan->stack_arg_size;
+  if (plan->stack_arg_size > a->frame.max_outgoing)
+    a->frame.max_outgoing = plan->stack_arg_size;
   for (c = 0; c < NATIVE_CALL_PLAN_CLASSES; ++c) {
     plan->clobber_mask[c] = x64_clobber_mask(aregs, (NativeAllocClass)c);
     plan->return_mask[c] = x64_return_mask(abi, (NativeAllocClass)c);
@@ -2275,8 +2231,21 @@ static void x64_emit_tail_site(NativeTarget* t, NativeLoc callee) {
   X64NativeTarget* a = x64_of(t);
   MCEmitter* mc = t->mc;
   ObjSecId sec = mc->section_id;
-  if (a->ncallee_saves)
-    x64_panic(a, "tail call with callee-saves (O1 path) not implemented");
+  /* Restore callee-saves before the frame teardown (O1 path; none at -O0).
+   * Their rbp-relative offsets are frame-size-independent, and the indirect
+   * callee was staged in r11 by plan_call — a caller-saved scratch — so these
+   * restores never clobber it. Mirrors the x64_func_end epilogue. */
+  Reg cs_int[X64_MAX_CS_INT_REGS], cs_fp[X64_MAX_CS_FP_REGS];
+  u32 n_int = x64_collect_int_saves(a, cs_int);
+  u32 n_fp = x64_collect_fp_saves(a, cs_fp);
+  u32 xmm_base = x64_xmm_base(a, n_fp);
+  i32 i;
+  for (i = (i32)n_fp - 1; i >= 0; --i)
+    emit_sse_load(mc, 0, 0x28, cs_fp[i], X64_RBP,
+                  -(i32)xmm_base - (i32)(i + 1) * 16); /* movaps */
+  for (i = (i32)n_int - 1; i >= 0; --i)
+    emit_mov_load(mc, 8, 0, cs_int[i], X64_RBP,
+                  -(i32)xmm_base - (i32)n_fp * 16 - (i32)(i + 1) * 8);
   emit_leave(mc);
   if (callee.kind == NATIVE_LOC_GLOBAL) {
     u8 op = X64_OPC_JMP_REL32;
@@ -2437,7 +2406,7 @@ static void x64_alloca(NativeTarget* t, NativeLoc dst, NativeLoc size,
     }
     emit_alu_rr(mc, 1, X64_OPC_ALU_SUB, X64_RSP, X64_RAX);
   }
-  a->has_alloca = 1;
+  a->frame.has_alloca = 1;
   /* lea dst, [rsp + max_outgoing] — disp32 patched in func_end. */
   if (a->npatches == a->patches_cap) {
     u32 cap = a->patches_cap ? a->patches_cap * 2u : 8u;
@@ -3545,6 +3514,7 @@ NativeTarget* x64_native_target_new(Compiler* c, ObjBuilder* obj,
   t->c = c;
   t->obj = obj;
   t->mc = mc;
+  native_frame_init(&a->frame, c);
   t->regs = &x64_reg_info;
   t->class_for_type = x64_class_for_type;
   t->imm_legal = x64_imm_legal;
@@ -3552,7 +3522,9 @@ NativeTarget* x64_native_target_new(Compiler* c, ObjBuilder* obj,
   t->func_begin = x64_func_begin;
   t->func_begin_known_frame = x64_func_begin_known_frame;
   t->note_frame_state = NULL;
-  t->reserve_callee_saves = NULL;
+  /* Non-NULL so the optimizer emit path (plan_frame) computes the callee-saved
+   * set; x64_func_begin_known_frame derives the records from the masks. */
+  t->reserve_callee_saves = x64_reserve_callee_saves;
   t->signature_stack_bytes = x64_signature_stack_bytes;
   t->call_stack_bytes = x64_call_stack_bytes;
   t->has_store_zero_reg = 0;
@@ -3627,7 +3599,7 @@ static const char* x64_no_tail(NativeDirectTarget* d, const CGCallDesc* call) {
   NativeLoc* args = NULL;
   NativeLoc* results = NULL;
   u32 i, stack;
-  if (a->ncallee_saves)
+  if (a->frame.ncallee_saves)
     return "x64 tail call: callee-saved registers in use";
   memset(&nd, 0, sizeof nd);
   if (call->nargs) args = arena_zarray(d->base.c->tu, NativeLoc, call->nargs);
diff --git a/src/cg/native_argmove.c b/src/cg/native_argmove.c
@@ -0,0 +1,79 @@
+#include "cg/native_argmove.h"
+
+/* Upper bound on moves in one shuffle. Callers (per-call register args, entry
+ * param binds) are bounded by the ABI register pools (<= 16 per class); 32
+ * covers both classes with headroom. */
+#define NATIVE_ARG_SHUFFLE_MAX 32u
+
+/* The register a move reads, or REG_NONE if it reads none (immediate, frame
+ * slot, or an address derived from fp/sp/a global). Only a register source can
+ * be invalidated by another move writing its destination register, so only
+ * these constrain the emission order. */
+static Reg nam_src_reg(const NativeArgMove* m, NativeAllocClass* cls_out) {
+  if (!m->is_addr && m->src.kind == NATIVE_LOC_REG) {
+    *cls_out = (NativeAllocClass)m->src.cls;
+    return m->src.v.reg;
+  }
+  return REG_NONE;
+}
+
+void native_arg_shuffle(const NativeArgShuffle* s, NativeArgMove* moves, u32 n) {
+  u8 done[NATIVE_ARG_SHUFFLE_MAX];
+  u32 emitted = 0;
+  if (n > NATIVE_ARG_SHUFFLE_MAX)
+    compiler_panic(s->t->c, (SrcLoc){0, 0, 0},
+                   "native arg shuffle: too many moves");
+  memset(done, 0, sizeof done);
+  /* Mark-and-sweep over the move set. Each pass emits every move whose
+   * destination is no longer needed as a source (topological order); when only
+   * a true cycle remains, break one member and continue. */
+  while (emitted < n) {
+    int progress = 0;
+    u32 i, j;
+    for (i = 0; i < n; ++i) {
+      int blocked = 0;
+      if (done[i]) continue;
+      for (j = 0; j < n && !blocked; ++j) {
+        NativeAllocClass sc;
+        Reg sr;
+        if (j == i || done[j]) continue;
+        sr = nam_src_reg(&moves[j], &sc);
+        if (sr != REG_NONE && sr == moves[i].dst.v.reg &&
+            sc == (NativeAllocClass)moves[i].dst.cls)
+          blocked = 1;
+      }
+      if (!blocked) {
+        s->emit_one(s->t, &moves[i]);
+        done[i] = 1;
+        emitted++;
+        progress = 1;
+      }
+    }
+    if (!progress) {
+      /* Every remaining move is part of a cycle; a cycle's ring is reg->reg, so
+       * at least one remaining move is register-sourced. Break on such a move:
+       * its destination register still holds a pending source value, so copy it
+       * to the class scratch and point that value's readers at the scratch. */
+      u32 k = 0;
+      NativeAllocClass bc, sc;
+      NativeLoc scratchloc;
+      while (k < n &&
+             (done[k] || nam_src_reg(&moves[k], &sc) == REG_NONE))
+        ++k;
+      bc = (NativeAllocClass)moves[k].dst.cls;
+      memset(&scratchloc, 0, sizeof scratchloc);
+      scratchloc.kind = NATIVE_LOC_REG;
+      scratchloc.cls = (u8)bc;
+      scratchloc.type = moves[k].dst.type;
+      scratchloc.v.reg = s->scratch[bc];
+      s->reg_move(s->t, scratchloc, moves[k].dst);
+      for (j = 0; j < n; ++j) {
+        Reg sr = nam_src_reg(&moves[j], &sc);
+        if (!done[j] && sr != REG_NONE && sr == moves[k].dst.v.reg && sc == bc) {
+          moves[j].src = scratchloc;
+          moves[j].src_offset = 0;
+        }
+      }
+    }
+  }
+}
diff --git a/src/cg/native_argmove.h b/src/cg/native_argmove.h
@@ -0,0 +1,60 @@
+#ifndef CFREE_CG_NATIVE_ARGMOVE_H
+#define CFREE_CG_NATIVE_ARGMOVE_H
+
+/* Shared parallel-copy register-move scheduler for the native backends.
+ *
+ * Marshalling call arguments (and, on the optimizer path, binding incoming
+ * parameters) means realizing a set of register `dst <- src` moves as a
+ * *parallel* copy: every register must be read by all moves that source it
+ * before any move overwrites it. The allocator usually arranges a conflict-free
+ * order, but not always (notably variadic args, and tail-call / entry
+ * permutations the allocator is free to rotate). When a true cycle remains —
+ * a rotation like rdi<-rdx, rsi<-rdi, rdx<-rsi — it is broken by stashing one
+ * member's register into a scratch and redirecting that value's readers.
+ *
+ * The scheduling (topological emission + cycle detection + the scratch break)
+ * is identical across aa64/rv64/x64; only the leaf operations differ — how one
+ * move is emitted and which register is the scratch. Those come in via the ops
+ * struct, so all three backends (and the entry-bind path) share one scheduler. */
+
+#include <string.h>
+
+#include "arch/native_target.h"
+#include "core/core.h"
+
+/* A single register-destination move. `src` may be a register (the common,
+ * order-constraining case), a frame/stack slot, an immediate, or — when
+ * `is_addr` — an address to materialize into `dst`. Identical to the per-arch
+ * arg-move structs it replaces. */
+typedef struct NativeArgMove {
+  NativeLoc dst;
+  NativeLoc src;
+  u32 src_offset;
+  u32 size;
+  int is_addr;
+  /* Optional emit hints, opaque to the scheduler (it only reads dst/src/is_addr
+   * for ordering). x64 uses these for the Win64 "variadic FP arg also duplicated
+   * into the matching GPR" rule; other backends leave them zero. */
+  int dup_to_gpr;
+  Reg dup_gpr;
+} NativeArgMove;
+
+typedef struct NativeArgShuffle {
+  NativeTarget* t;
+  /* Emit one move as-is: an address-of for is_addr, else a load/copy of `size`
+   * bytes at `src_offset` from `src` into `dst`. */
+  void (*emit_one)(NativeTarget* t, const NativeArgMove* m);
+  /* A plain register-to-register copy, used to stash a cycle victim into the
+   * scratch register before redirecting its readers. */
+  void (*reg_move)(NativeTarget* t, NativeLoc dst, NativeLoc src);
+  /* Scratch register per NativeAllocClass (index by class id) for breaking
+   * cycles. A cycle's ring is register-to-register, so the scratch needs no
+   * address arithmetic — a bare register suffices. */
+  Reg scratch[NATIVE_REG_VEC + 1];
+} NativeArgShuffle;
+
+/* Emit `moves[0..n)` as a parallel register copy via `s`. Mutates `moves`
+ * (redirects a broken cycle's readers to the scratch register). */
+void native_arg_shuffle(const NativeArgShuffle* s, NativeArgMove* moves, u32 n);
+
+#endif

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	src/arch/aa64/native.c	\|	101	++++++++++++++++---------------------------------------------------------------
M	src/arch/rv64/native.c	\|	82	++++++++++++++-----------------------------------------------------------------
M	src/arch/x64/native.c	\|	262	+++++++++++++++++++++++++++++++++++--------------------------------------------
A	src/cg/native_argmove.c	\|	79	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	src/cg/native_argmove.h	\|	60	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++