x64: make -O1 entry param-binds cycle-safe via the shared shuffle - kit

commit 6cedc8f3b379c3369f243742bc1e3e23a5474e84
parent dc46a3135f40231846840a807f477f5d082525c3
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Fri, 29 May 2026 09:50:15 -0700

x64: make -O1 entry param-binds cycle-safe via the shared shuffle

At -O1 the allocator may place a tail function's params in a rotated register
set (pass_lower skips arg-reg hinting for tail functions). x64_bind_native_param
emitted the incoming->home moves one param at a time, so a rotation clobbered a
register a later bind still needed — e.g. caller(x,y,z) tail-calling
target(z,x,y) collapsed all three args to one value.

Defer register-destination binds into a pending list (each entry records its
source: the incoming arg reg, or a NATIVE_LOC_ADDR for an incoming stack slot)
and resolve them together as a parallel copy in a new bind_params_end hook, via
the shared native_arg_shuffle.
Frame-destination and indirect binds stay eager: they only read incoming
registers, so they correctly precede the shuffle. New optional
NativeTarget.bind_params_end is called by the optimizer emit path after the last
bind; aa64/rv64 leave it NULL (their allocator doesn't rotate, so they were
already correct).

x64 toy X-O1: 137 -> 148 pass (all 9 tail cases + many-params fixed); 8 remain
(atomics, varargs, a couple others). No regressions: x64 O0 156/0, rv64 O1
156/0, aa64 1034/0.

x64: keep atomic cmpxchg/rmw address out of the fixed operand registers

At -O1 the optimizer can materialize the atomic operand address into rax/rcx
(cmpxchg) or rax/rcx/rdx (rmw) — exactly the registers x64_atomic_cas/_rmw
hardcode for expected/desired/prior/new/val. x64_addr_to_base_reg returns the
existing base register as-is when there is no index/displacement, so the address
landed in one of those and the operand setup then clobbered it: e.g.
`leaq -8(%rbp),%rcx; movq %r12,%rcx; lock cmpxchgq %rcx,(%rcx)` dereferenced the
operand value 42 (copied from %r12 over the address in %rcx) instead of the
address.

Stage the address into r11 (the reserved int emit scratch, never an allocated
operand) whenever it occupies a fixed operand register, before any operand move.
Also place expected->rax / desired->rcx as a parallel copy (xchg on a full swap;
when desired sits in rax, move it to rcx before rax is overwritten) rather than
two naive moves. x64 toy X-O1: 148 -> 151 (all 3 atomic cmpxchg/rmw cases). No
-O0 change (the address is already in r11 there).

x64 -O1: fix va-list base + indirect-callee scratch register clobbers

Two -O1 register-aliasing bugs, both because the optimizer may place a value in
a register the codegen hardcodes:

- va_start/va_arg/va_copy use rax/r10/rdx for field values, but x64_addr_to_base_reg
  returns the va_list pointer's own register when it's a simple base — which at
  -O1 can be rax/rdx, so the field stores clobbered the pointer and wrote to
  garbage. New x64_va_base() forces the va_list base into a reserved scratch
  (r11, and rax for va_copy's source) that the field values never alias.

- For an indirect call the callee is staged in r11, but x64_emit_reg_arg_moves
  used r11 as its integer cycle-break scratch, clobbering the callee with an arg
  value (SIGSEGV on call *r11). The shuffle's int scratch is now a parameter:
  rax when the callee occupies r11 (rax is never a SysV int arg reg and the
  variadic AL count is written after the moves), r11 otherwise.

x64 toy X-O1: 151 -> 154 (fixes 112 indirect-many-args, 19 variadic+asm, 123
spec_demo). Remaining: 132 (bswap roundtrip), 133 (varargs mixed types). No
regressions: x64 O0 156/0, rv64 O0/O1 156/156.

Diffstat:
M src/arch/native_target.h  | 7 +++++++
M src/arch/x64/native.c  | 205 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
M src/cg/native_frame.h  | 1 +
M src/opt/pass_native_emit.c  | 3 +++

4 files changed, 170 insertions(+), 46 deletions(-)
diff --git a/src/arch/native_target.h b/src/arch/native_target.h
@@ -342,6 +342,13 @@ struct NativeTarget {
    * registers are never allocable, so reg destinations never alias an incoming
    * arg register and ordering across params is unconstrained. */
   void (*bind_param)(NativeTarget*, const CGParamDesc*, NativeLoc dst);
+  /* Optional. Called once by the optimizer emit path after the last bind_param,
+   * before the body. Lets a backend that defers register-destination param
+   * binds (to resolve them as a parallel copy, since the allocator may rotate
+   * params across the incoming arg registers — a permutation the naive
+   * per-param move order cannot realize) flush them now. Backends that bind
+   * eagerly leave this NULL. */
+  void (*bind_params_end)(NativeTarget*);
 
   MCLabel (*label_new)(NativeTarget*);
   void (*label_place)(NativeTarget*, MCLabel);
diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c
@@ -50,6 +50,9 @@ enum {
   X64_TMP_FP = X64_XMM0 + 14, /* emit-internal fp scratch (reserved) */
   X64_TMP_FP2 = X64_XMM15,    /* emit-internal fp scratch (reserved) */
   X64_MAX_REG_ARG_MOVES = 16u,
+  /* Deferred entry register-binds (-O1): bounded by simultaneously-live
+   * register-homed param parts, i.e. the allocable register count. */
+  X64_MAX_BIND_MOVES = 32u,
   X64_MAX_CS_FP_REGS = 10u, /* Win64 xmm6..xmm15 */
 };
 
@@ -98,6 +101,13 @@ typedef struct X64NativeTarget {
   u32 prologue_nbytes;
   MCLabel epilogue_label;
 
+  /* Optimizer (-O1) entry binds: register-destination param binds are deferred
+   * here and resolved as a parallel copy in x64_bind_params_end, since the
+   * allocator may rotate params across the incoming arg registers — a
+   * permutation the naive per-param move order would clobber. */
+  NativeArgMove bind_moves[X64_MAX_BIND_MOVES];
+  u32 nbind_moves;
+
   const X64ABIRegs* abi;
 } X64NativeTarget;
 
@@ -1466,6 +1476,7 @@ static void x64_func_begin_common(NativeTarget* t, const CGFuncDesc* fd) {
   a->reg_save_slot = NATIVE_FRAME_SLOT_NONE;
   a->npatches = 0;
   a->nalloca = 0;
+  a->nbind_moves = 0;
   a->prologue_nbytes =
       a->abi->shadow_space ? X64_PROLOGUE_BYTES_WIN64 : X64_PROLOGUE_BYTES;
 
@@ -1883,6 +1894,39 @@ static void x64_store_outgoing_part(NativeTarget* t, int tail_call,
 }
 
 /* NativeTarget bind_param: route incoming param (ABI loc) into dst. */
+static void x64_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves, u32 n,
+                                   Reg int_scratch);
+
+/* Defer a register-destination param bind for the parallel-copy flush in
+ * x64_bind_params_end. `src` is the incoming location (an arg register, or a
+ * NATIVE_LOC_ADDR for an incoming stack slot).
+ *
+ * Appends one NativeArgMove {dst, src, size} to a->bind_moves and bumps
+ * a->nbind_moves; the callers in x64_bind_native_param pass the ABI part size
+ * as `size`. Panics via x64_panic when X64_MAX_BIND_MOVES entries are already
+ * queued. Emits no code itself. */
+static void x64_defer_reg_bind(X64NativeTarget* a, NativeLoc dst, NativeLoc src,
+                               u32 size) {
+  NativeArgMove* m;
+  if (a->nbind_moves >= X64_MAX_BIND_MOVES)
+    x64_panic(a, "too many register parameter binds");
+  m = &a->bind_moves[a->nbind_moves++];
+  memset(m, 0, sizeof *m); /* any field not assigned below stays zero */
+  m->dst = dst;
+  m->src = src;
+  m->size = size;
+}
+
+/* Incoming stack-arg source as a NATIVE_LOC_ADDR ([rbp + bias + stack_off]).
+ *
+ * `off` is the complete rbp-relative displacement: the callers pass
+ * incoming_bias + next_param_stack, so the bias is not added here. `type` is
+ * recorded both as the location's type and as the address base_type; `cls` is
+ * stored narrowed to u8. Builds the descriptor only; no code is emitted. */
+static NativeLoc x64_incoming_stack_loc(CfreeCgTypeId type, NativeAllocClass cls,
+                                        i32 off) {
+  NativeLoc l;
+  memset(&l, 0, sizeof l); /* fields not set below stay zero */
+  l.kind = NATIVE_LOC_ADDR;
+  l.cls = (u8)cls;
+  l.type = type;
+  l.v.addr.base_kind = NATIVE_ADDR_BASE_REG;
+  l.v.addr.base.reg = X64_RBP;
+  l.v.addr.base_type = type;
+  l.v.addr.offset = off;
+  return l;
+}
+
 static void x64_bind_native_param(NativeTarget* t, const CGParamDesc* p,
                                   NativeLoc dst) {
   X64NativeTarget* a = x64_of(t);
@@ -1934,26 +1978,26 @@ static void x64_bind_native_param(NativeTarget* t, const CGParamDesc* p,
       const ABIArgPart* part = &ai->parts[i];
       NativeAllocClass cls =
           part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
-      Reg tmp = cls == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT;
-      NativeLoc src = x64_reg_loc(p->type, cls, tmp);
-      NativeAddr sa;
-      memset(&sa, 0, sizeof sa);
-      sa.base_kind = NATIVE_ADDR_BASE_REG;
-      sa.base.reg = X64_RBP;
-      sa.base_type = p->type;
-      sa.offset = incoming_bias + (i32)a->next_param_stack;
+      NativeLoc isrc = x64_incoming_stack_loc(
+          p->type, cls, incoming_bias + (i32)a->next_param_stack);
       a->next_param_stack += 8u;
-      x64_emit_mem(a, 1, src, sa, x64_mem_for_type(t, p->type, part->size));
       if (dst.kind == NATIVE_LOC_NONE) {
         /* unused */
       } else if (to_reg) {
-        x64_move(t, x64_reg_loc(dst.type ? dst.type : p->type,
-                                (NativeAllocClass)dst.cls, (Reg)dst.v.reg),
-                 src);
+        /* Defer: a register dst may be another param's incoming reg. */
+        x64_defer_reg_bind(a,
+                           x64_reg_loc(dst.type ? dst.type : p->type,
+                                       (NativeAllocClass)dst.cls, (Reg)dst.v.reg),
+                           isrc, part->size);
       } else {
+        /* Frame dst: load to scratch then store (memory dst is never a cycle
+         * source, so emit eagerly — it only reads the incoming slot). */
+        Reg tmp = cls == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT;
+        NativeLoc tloc = x64_reg_loc(p->type, cls, tmp);
+        x64_load_part(t, tloc, isrc, 0, part->size);
         x64_store_part(t,
                        x64_stack_loc(p->type, dst.v.frame, (i32)part->src_offset),
-                       src, 0, part->size);
+                       tloc, 0, part->size);
       }
     }
     return;
@@ -1963,42 +2007,58 @@ static void x64_bind_native_param(NativeTarget* t, const CGParamDesc* p,
     const ABIArgPart* part = &ai->parts[i];
     NativeAllocClass cls =
         part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
-    NativeLoc src;
+    NativeLoc src; /* incoming: arg register, or NATIVE_LOC_ADDR for a stack arg */
     if (cls == NATIVE_REG_FP && a->next_param_fp < a->abi->n_fp_args) {
       src = x64_reg_loc(p->type, cls, (Reg)(X64_XMM0 + a->next_param_fp++));
     } else if (cls == NATIVE_REG_INT &&
                a->next_param_int < a->abi->n_int_args) {
       src = x64_reg_loc(p->type, cls, a->abi->int_args[a->next_param_int++]);
     } else {
-      Reg tmp = cls == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT;
-      NativeAddr sa;
-      src = x64_reg_loc(p->type, cls, tmp);
-      memset(&sa, 0, sizeof sa);
-      sa.base_kind = NATIVE_ADDR_BASE_REG;
-      sa.base.reg = X64_RBP;
-      sa.base_type = p->type;
-      sa.offset = incoming_bias + (i32)a->next_param_stack;
-      x64_emit_mem(a, 1, src, sa, x64_mem_for_type(t, p->type, part->size));
+      src = x64_incoming_stack_loc(p->type, cls,
+                                   incoming_bias + (i32)a->next_param_stack);
       a->next_param_stack += 8u;
     }
     x64_sync_slot(a->abi, &a->next_param_int, &a->next_param_fp);
     if (dst.kind == NATIVE_LOC_NONE) {
       /* unused parameter; cursors advanced */
     } else if (to_reg) {
-      NativeLoc d = x64_reg_loc(dst.type ? dst.type : p->type,
-                                (NativeAllocClass)dst.cls, (Reg)dst.v.reg);
-      if (!(src.kind == NATIVE_LOC_REG && loc_reg(src) == loc_reg(d) &&
-            (NativeAllocClass)src.cls == (NativeAllocClass)d.cls))
-        x64_move(t, d, src);
-    } else {
+      /* Defer the register bind: the allocator may rotate params across the
+       * incoming arg registers, so a per-param move could clobber a register
+       * another bind still needs. x64_bind_params_end resolves them together as
+       * a parallel copy. */
+      x64_defer_reg_bind(a,
+                         x64_reg_loc(dst.type ? dst.type : p->type,
+                                     (NativeAllocClass)dst.cls, (Reg)dst.v.reg),
+                         src, part->size);
+    } else if (src.kind == NATIVE_LOC_REG) {
       x64_store_part(t,
                      x64_stack_loc(p->type, dst.v.frame, (i32)part->src_offset),
                      src, 0, part->size);
+    } else {
+      /* Stack source -> frame dst: load to scratch, then store. */
+      Reg tmp = cls == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT;
+      NativeLoc tloc = x64_reg_loc(p->type, cls, tmp);
+      x64_load_part(t, tloc, src, 0, part->size);
+      x64_store_part(t,
+                     x64_stack_loc(p->type, dst.v.frame, (i32)part->src_offset),
+                     tloc, 0, part->size);
     }
   }
   a->incoming_stack_size = align_up_u32(a->next_param_stack, 16u);
 }
 
+/* Flush the deferred register-destination param binds as a parallel copy (the
+ * shared scheduler breaks any cycle the allocator's rotation created through
+ * the int/fp emit scratch). Frame-dst and indirect binds were emitted eagerly
+ * in bind_param — they only read incoming registers, so they precede this.
+ *
+ * Does nothing when no bind was deferred; always leaves nbind_moves at 0 so the
+ * queue is empty afterwards.
+ *
+ * NOTE(review): the queue accepts up to X64_MAX_BIND_MOVES (32) entries, but
+ * x64_emit_reg_arg_moves panics ("too many register args") when n exceeds
+ * X64_MAX_REG_ARG_MOVES (16). A function with 17..32 deferred binds therefore
+ * panics here rather than in x64_defer_reg_bind — confirm which limit is
+ * intended and align the two. */
+static void x64_bind_params_end(NativeTarget* t) {
+  X64NativeTarget* a = x64_of(t);
+  /* No callee is staged during entry binds, so r11 is free as the cycle scratch. */
+  if (a->nbind_moves)
+    x64_emit_reg_arg_moves(t, a->bind_moves, a->nbind_moves, X64_TMP_INT2);
+  a->nbind_moves = 0;
+}
+
 /* ============================ calls / returns ============================ */
 
 typedef NativeArgMove X64ArgMove;
@@ -2015,17 +2075,19 @@ static void x64_emit_one_arg_move(NativeTarget* t, const NativeArgMove* m) {
   }
 }
 
-/* Parallel-copy register arg moves via the shared scheduler; cycles break
- * through the int/fp emit scratch (r11 / xmm14). */
-static void x64_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves,
-                                   u32 n) {
+/* Parallel-copy register arg moves via the shared scheduler. `int_scratch` is
+ * the register used to break an integer cycle: normally r11, but rax when an
+ * indirect callee is staged in r11 (rax is never a SysV int arg register and
+ * the variadic AL count is written only after the moves). */
+static void x64_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves, u32 n,
+                                   Reg int_scratch) {
   NativeArgShuffle s;
   if (n > X64_MAX_REG_ARG_MOVES) x64_panic(x64_of(t), "too many register args");
   memset(&s, 0, sizeof s);
   s.t = t;
   s.emit_one = x64_emit_one_arg_move;
   s.reg_move = x64_move;
-  s.scratch[NATIVE_REG_INT] = X64_TMP_INT2;
+  s.scratch[NATIVE_REG_INT] = int_scratch;
   s.scratch[NATIVE_REG_FP] = X64_TMP_FP;
   native_arg_shuffle(&s, moves, n);
 }
@@ -2173,7 +2235,13 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc,
         }
       }
     }
-    x64_emit_reg_arg_moves(t, moves, nmoves);
+    /* If an indirect callee was staged in r11 above, the cycle scratch must
+     * avoid it; rax is free here (not an int arg reg; AL count comes later). */
+    x64_emit_reg_arg_moves(
+        t, moves, nmoves,
+        (plan->callee.kind == NATIVE_LOC_REG && plan->callee.v.reg == X64_R11)
+            ? X64_TMP_INT
+            : X64_TMP_INT2);
     if (abi && abi->has_sret && desc->nresults) {
       /* sret pointer in the first int-arg reg. A tail call forwards the
        * caller's own incoming sret pointer (spilled at entry); otherwise pass
@@ -2634,11 +2702,18 @@ static void x64_atomic_rmw(NativeTarget* t, AtomicOp op, NativeLoc dst,
   MCEmitter* mc = t->mc;
   u32 sz = mem.size ? mem.size : x64_type_size(t, dst.type);
   int w = sz == 8u ? 1 : 0;
-  u32 base = x64_atomic_base(a, addr); /* r11 */
+  u32 base = x64_atomic_base(a, addr);
   u32 dr = loc_reg(dst);
   u32 vr = loc_reg(val);
   (void)mo; /* LOCK ops are full barriers. */
-  /* val staged in rdx (base owns r11; rax/rcx used by the cmpxchg loop). */
+  /* The rmw uses fixed rax (prior), rcx (new), rdx (val); the optimizer may have
+   * materialized the address into one of them, so keep it out (r11 is the int
+   * emit scratch, never an allocated operand). Stage before rdx is loaded. */
+  if (base == X64_RAX || base == X64_RCX || base == X64_RDX) {
+    emit_mov_rr(mc, 1, X64_TMP_INT2, base);
+    base = X64_TMP_INT2;
+  }
+  /* val staged in rdx (rax/rcx used by the cmpxchg loop). */
   emit_mov_rr(mc, w, X64_RDX, vr);
   if (op == AO_ADD || op == AO_SUB) {
     if (op == AO_SUB) emit_f7_rm(mc, w, X64_F7_SUB_NEG, X64_RDX);
@@ -2698,16 +2773,38 @@ static void x64_atomic_cas(NativeTarget* t, NativeLoc prior, NativeLoc ok,
   MCEmitter* mc = t->mc;
   u32 sz = mem.size ? mem.size : x64_type_size(t, prior.type);
   int w = sz == 8u ? 1 : 0;
-  u32 base = x64_atomic_base(a, addr); /* r11 */
+  u32 base = x64_atomic_base(a, addr);
   u32 rprior = loc_reg(prior);
   u32 rok = loc_reg(ok);
   u32 rexp = loc_reg(expected);
   u32 rdes = loc_reg(desired);
   (void)success;
   (void)failure;
-  /* rax = expected; rcx = desired. */
-  if (rexp != X64_RAX) emit_mov_rr(mc, w, X64_RAX, rexp);
-  if (rdes != X64_RCX) emit_mov_rr(mc, w, X64_RCX, rdes);
+  /* cmpxchg uses fixed rax (expected) and rcx (desired). The optimizer may have
+   * materialized the address into either; keep it out of both (r11 is the int
+   * emit scratch, never an allocated operand). */
+  if (base == X64_RAX || base == X64_RCX) {
+    emit_mov_rr(mc, 1, X64_TMP_INT2, base);
+    base = X64_TMP_INT2;
+  }
+  /* Place expected -> rax and desired -> rcx as a parallel copy: the allocator
+   * may have them in each other's target register (full swap) or desired in rax
+   * (expected's target), either of which a naive two-move order would clobber. */
+  if (rexp == X64_RCX && rdes == X64_RAX) {
+    /* Swap rax <-> rcx (xchg needs no temp; base is not rax/rcx here). */
+    emit_rex(mc, w, X64_RCX, 0, X64_RAX);
+    {
+      u8 xchg[2] = {0x87, modrm(3u, X64_RCX, X64_RAX)};
+      mc->emit_bytes(mc, xchg, 2);
+    }
+  } else if (rdes == X64_RAX) {
+    /* desired sits in rax; move it to rcx before rax is overwritten. */
+    if (rdes != X64_RCX) emit_mov_rr(mc, w, X64_RCX, rdes);
+    if (rexp != X64_RAX) emit_mov_rr(mc, w, X64_RAX, rexp);
+  } else {
+    if (rexp != X64_RAX) emit_mov_rr(mc, w, X64_RAX, rexp);
+    if (rdes != X64_RCX) emit_mov_rr(mc, w, X64_RCX, rdes);
+  }
   emit_lock_prefix(mc);
   emit_rex(mc, w, X64_RCX, 0, base);
   {
@@ -2731,12 +2828,25 @@ static void x64_fence(NativeTarget* t, MemOrder mo) {
  * into the matching GPR slot at the call site. `ap` addresses the va_list
  * object. */
 
+/* Resolve a va_list address into `scratch`, materializing it there if it is not
+ * already, so the va field-value scratch registers (rax / r10 / rdx) never alias
+ * it. At -O1 the optimizer may place the va_list pointer in any register —
+ * including those — and the va code would then clobber the pointer mid-sequence.
+ *
+ * Returns the register that holds the address, which is always `scratch`: when
+ * x64_addr_to_base_reg hands back a different register, a register-to-register
+ * move into `scratch` is emitted first. */
+static u32 x64_va_base(X64NativeTarget* a, NativeAddr ap, u32 scratch) {
+  u32 base = x64_addr_to_base_reg(a, ap, scratch);
+  if (base != scratch) {
+    emit_mov_rr(a->base.mc, 1, scratch, base);
+    base = scratch;
+  }
+  return base;
+}
+
 static void x64_va_start_core(X64NativeTarget* a, NativeAddr ap) {
   NativeTarget* t = &a->base;
   MCEmitter* mc = t->mc;
   u32 ap_base;
   if (!a->is_variadic) x64_panic(a, "va_start: function not variadic");
-  ap_base = x64_addr_to_base_reg(a, ap, X64_TMP_INT2);
+  ap_base = x64_va_base(a, ap, X64_TMP_INT2);
   if (a->abi->shadow_space) {
     /* Win64: *ap = rbp + 16 + named_int*8 + named_stack. */
     u32 first = 16u + a->next_param_int * 8u + a->next_param_stack;
@@ -2768,7 +2878,7 @@ static void x64_va_arg_core(X64NativeTarget* a, NativeLoc dst, NativeAddr ap,
   u32 sz = x64_type_size(t, type);
   int is_fp = loc_is_fp(dst);
   u32 dr = loc_reg(dst);
-  u32 ap_base = x64_addr_to_base_reg(a, ap, X64_TMP_INT2);
+  u32 ap_base = x64_va_base(a, ap, X64_TMP_INT2);
   if (a->abi->shadow_space) {
     /* Win64: r10 = *ap; load; *ap += 8. (r10 is caller-saved scratch here.) */
     emit_mov_load(mc, 8, 0, X64_R10, ap_base, 0);
@@ -2826,9 +2936,11 @@ static void x64_va_copy_core(X64NativeTarget* a, NativeAddr dst_ap,
                              NativeAddr src_ap) {
   NativeTarget* t = &a->base;
   MCEmitter* mc = t->mc;
-  /* Resolve dst into r11 first, src into rax (disjoint). */
-  u32 dst_base = x64_addr_to_base_reg(a, dst_ap, X64_TMP_INT2);
-  u32 src_base = x64_addr_to_base_reg(a, src_ap, X64_TMP_INT);
+  /* Resolve dst into r11, src into rax (disjoint from each other and from the
+   * rdx copy scratch); force both so the optimizer's register choice for a
+   * va_list pointer can't alias the copy scratch. */
+  u32 dst_base = x64_va_base(a, dst_ap, X64_TMP_INT2);
+  u32 src_base = x64_va_base(a, src_ap, X64_TMP_INT);
   u32 n = a->abi->shadow_space ? 8u : 24u, i;
   for (i = 0; i < n; i += 8u) {
     emit_mov_load(mc, 8, 0, X64_RDX, src_base, (i32)i);
@@ -3521,6 +3633,7 @@ NativeTarget* x64_native_target_new(Compiler* c, ObjBuilder* obj,
   t->addr_legal = x64_addr_legal;
   t->func_begin = x64_func_begin;
   t->func_begin_known_frame = x64_func_begin_known_frame;
+  t->bind_params_end = x64_bind_params_end;
   t->note_frame_state = NULL;
   /* Non-NULL so the optimizer emit path (plan_frame) computes the callee-saved
    * set; x64_func_begin_known_frame derives the records from the masks. */
diff --git a/src/cg/native_frame.h b/src/cg/native_frame.h
@@ -1,3 +1,4 @@
+/* see also: cg/native_argmove.h (shared parallel-copy register shuffle) */
 #ifndef CFREE_CG_NATIVE_FRAME_H
 #define CFREE_CG_NATIVE_FRAME_H
 
diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c
@@ -596,6 +596,9 @@ static void bind_params(NativeEmitCtx* e) {
                       map_slot(e, p->storage.v.frame_slot, p->loc));
     if (e->target->bind_param) e->target->bind_param(e->target, &sd, dst);
   }
+  /* Let a backend that defers register-destination binds resolve them now (as a
+   * parallel copy), once every param's incoming location has been read. */
+  if (e->target->bind_params_end) e->target->bind_params_end(e->target);
 }
 
 /* The parameter value is placed into its allocated location by bind_param at

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	src/arch/native_target.h	\|	7	+++++++
M	src/arch/x64/native.c	\|	205	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
M	src/cg/native_frame.h	\|	1	+
M	src/opt/pass_native_emit.c	\|	3	+++