opt+aa64: unify on AAPCS64 frame layout; hint params to incoming regs - kit

commit 0a722716448183a97354a873bcda572031919e19
parent 678aa2c2c1d76c07ae26fecbd5b4c2128564012d
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 28 May 2026 10:12:05 -0700

opt+aa64: unify on AAPCS64 frame layout; hint params to incoming regs

aa64 prologue/epilogue had three layouts (slim_prologue, slim_small_frame,
fat) that did not anchor fp consistently. bind_param assumed fp == caller's
original sp, which only held for two of them; the slim_prologue layout left
fp 16 bytes low, so incoming stack args were read from the saved-pair slots.
The bug activated the moment any optimization let a function skip its
callee-save spills.

Centralize the layout in a single AAFrameLayout struct with typed accessors
(aa_fp_off_in_arg, aa_fp_off_slot, aa_fp_off_saved_fp/lr, aa_sp_off_saved_pair,
aa_fp_off_tail_out_arg, aa_sp_off_out_arg) documented by one ASCII diagram.
Every site that addresses the frame now goes through a helper — no bare
arithmetic on AA_FP/AA_SP for frame addressing. All three prologue variants
anchor fp at the saved-pair address (true AAPCS64); CFI unified.

Param-hint: set_preg_pref_for_params hints each scalar param PReg toward
its own incoming ABI reg, gated by func_has_tail_call (tail shuffles can't
break cycles imposed by pinned param positions). Drives add(a,b) to
'add x0, x0, x1; ret'. Requires p->abi to be populated, so lower_params
resolves it via abi_cg_func_info (scoped to the param side; doesn't activate
the dormant f->desc.abi path).

Fix latent indirect-call bug exposed by the hint: when the callee lives in
x0..x7 (e.g. a param hinted to x0), the arg-load loop would clobber it
before blr. aa_plan_call now stashes the callee into AA_TMP0 first.

Delete apply_param_incoming_register_hazards and the verifier's
"left-in-incoming-reg" pre-check. Both were dormant via the f->desc.abi
gate; both had the tail-call bug. The hint mechanism + opt_ranges_overlap_kind
unit-overlap check is the real safety net.

Diffstat:
M src/arch/aa64/native.c  | 301 +++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
M src/opt/cg_ir_lower.c  | 11 +++++++++++
M src/opt/pass_lower.c  | 222 ++++++++++++++++++++++++++++++++-----------------------------------------------

3 files changed, 294 insertions(+), 240 deletions(-)
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -61,6 +61,72 @@ enum {
   AA_TAIL_WORDS = 32u,
 };
 
+/* ============================================================================
+ * AAPCS64 frame layout
+ *
+ * fp anchors at the caller's saved-pair address; sp anchors at the bottom of
+ * the outgoing-arg area. Every fp- or sp-relative offset in this file is
+ * computed via one of the helpers below — no site should do bare arithmetic
+ * on AA_FP / AA_SP for addressing the frame.
+ *
+ *   high addr   caller's stack frame
+ *               +------------------------------+
+ *               | incoming stack args          |  aa_fp_off_in_arg(i)
+ *               +------------------------------+
+ *               | saved x30 (prev lr)          |  aa_fp_off_saved_lr()
+ *      fp  -->  | saved x29 (prev fp)          |  aa_fp_off_saved_fp()
+ *               +------------------------------+
+ *               | frame slots                  |  aa_fp_off_slot(s->off)
+ *               |   (callee-saves + locals     |
+ *               |    + spills + sret/variadic) |
+ *               +------------------------------+
+ *               | outgoing args                |  aa_sp_off_out_arg(i)
+ *      sp  -->  +------------------------------+
+ *   low addr
+ *
+ * frame_size = align16(AA_FRAME_SAVE_SIZE + slot_bytes + out_stack).
+ * Tail calls write outgoing args into the caller's incoming-args window —
+ * physically the same address, expressed via aa_fp_off_tail_out_arg.
+ * ========================================================================== */
+
+static u32 align_up_u32(u32 v, u32 align);
+
+typedef struct AAFrameLayout {
+  u32 slot_bytes; /* sum of aa_frame_slot reservations (callee-saves + locals
+                   * + spills + sret/variadic) */
+  u32 out_stack;  /* max outgoing-arg bytes across all calls in this function */
+  u32 frame_size; /* align16(AA_FRAME_SAVE_SIZE + slot_bytes + out_stack) */
+} AAFrameLayout;
+
+static inline AAFrameLayout aa_build_layout(u32 slot_bytes, u32 out_stack) {
+  AAFrameLayout L;
+  L.slot_bytes = slot_bytes;
+  L.out_stack = out_stack;
+  L.frame_size =
+      align_up_u32(AA_FRAME_SAVE_SIZE + slot_bytes + out_stack, 16u);
+  return L;
+}
+
+/* FP-relative byte offsets. */
+static inline i32 aa_fp_off_saved_fp(void) { return 0; }
+static inline i32 aa_fp_off_saved_lr(void) { return 8; }
+static inline i32 aa_fp_off_in_arg(u32 byte_off) {
+  return (i32)(AA_FRAME_SAVE_SIZE + byte_off);
+}
+static inline i32 aa_fp_off_slot(u32 slot_off) { return -(i32)slot_off; }
+/* Outgoing stack args on a tail call land in the caller's incoming-arg
+ * window — same physical address the tail-callee will read via
+ * aa_fp_off_in_arg. Same helper, distinct name for site-side intent. */
+static inline i32 aa_fp_off_tail_out_arg(u32 byte_off) {
+  return aa_fp_off_in_arg(byte_off);
+}
+
+/* SP-relative byte offsets. */
+static inline i32 aa_sp_off_out_arg(u32 byte_off) { return (i32)byte_off; }
+static inline u32 aa_sp_off_saved_pair(const AAFrameLayout* L) {
+  return L->frame_size - AA_FRAME_SAVE_SIZE;
+}
+
 typedef struct AANativeSlot {
   u32 off;
   u32 size;
@@ -545,7 +611,7 @@ static void aa_addr_base(AANativeTarget* a, NativeAddr addr, u32* base_out,
     case NATIVE_ADDR_BASE_FRAME: {
       AANativeSlot* s = aa_slot(a, addr.base.frame);
       *base_out = AA_FP;
-      *off_out = -(i32)s->off + addr.offset;
+      *off_out = aa_fp_off_slot(s->off) + addr.offset;
       return;
     }
     case NATIVE_ADDR_BASE_GLOBAL: {
@@ -819,7 +885,7 @@ static u32 aa_ldst_q_simm9(int load, u32 rt, u32 rn, i32 byte_off) {
 static void aa_emit_q_frame(AANativeTarget* a, int load, u32 qreg,
                             NativeFrameSlot slot, u32 offset) {
   AANativeSlot* s = aa_slot(a, slot);
-  i32 off = -(i32)s->off + (i32)offset;
+  i32 off = aa_fp_off_slot(s->off) + (i32)offset;
   MCEmitter* mc = a->base.mc;
   if (off >= 0 && ((u32)off & 15u) == 0 && ((u32)off >> 4) <= 0xfffu) {
     aa_emit32(mc, aa_ldst_q_uimm(load, qreg, AA_FP, (u32)off));
@@ -873,11 +939,18 @@ static void aa_func_begin(NativeTarget* t, const CGFuncDesc* fd) {
   MCEmitter* mc = t->mc;
   a->func = fd;
   a->nslots = 0;
-  a->cum_off = AA_FRAME_SAVE_SIZE;
+  /* cum_off counts frame-slot bytes below fp (see AAFrameLayout above).
+   * The saved fp/lr pair (16 bytes at [fp, fp+8]) is *not* part of cum_off;
+   * the frame-size computation in aa_func_end adds it via aa_build_layout. */
+  a->cum_off = 0;
   a->max_outgoing = 0;
   a->incoming_stack_size = 0;
   a->next_param_int = 0;
   a->next_param_fp = 0;
+  /* 0-based byte cursor for incoming stack args (also reported as the
+   * caller's incoming_stack_size for tail-call realizability). bind_param
+   * forms its fp-relative address via aa_fp_off_in_arg(next_param_stack),
+   * which adds the saved-pair offset. */
   a->next_param_stack = 0;
   a->sret_ptr_slot = NATIVE_FRAME_SLOT_NONE;
   a->saved_tmp_slot = NATIVE_FRAME_SLOT_NONE;
@@ -1015,23 +1088,30 @@ static void aa_words_sub_sp_frame(AANativeTarget* a, u32* words, u32 cap,
   words[(*n)++] = aa64_add_imm(1, AA_SP, AA_TMP1, 0, 0);
 }
 
+/* Anchor fp at the AAPCS64 saved-pair address (= sp + saved-pair offset).
+ * The slim_prologue path achieves the same anchor in a single insn via
+ * `add x29, sp, #0` after the pre-decrement stp moves sp to the saved-pair. */
 static void aa_words_frame_ptr_from_sp(AANativeTarget* a, u32* words, u32 cap,
-                                       u32* n, u32 frame_size) {
+                                       u32* n, const AAFrameLayout* L) {
   u32 imm12, sh;
-  if (aa64_addsub_imm_fits(frame_size, &imm12, &sh)) {
+  u32 anchor = aa_sp_off_saved_pair(L);
+  if (aa64_addsub_imm_fits(anchor, &imm12, &sh)) {
     if (*n >= cap) aa_panic(a, "instruction patch too small");
     words[(*n)++] = aa64_add_imm(1, AA_FP, AA_SP, imm12, sh);
     return;
   }
-  aa_words_load_imm(a, words, cap, n, AA_TMP0, frame_size);
+  aa_words_load_imm(a, words, cap, n, AA_TMP0, anchor);
   if (*n + 2u > cap) aa_panic(a, "instruction patch too small");
   words[(*n)++] = aa64_add_imm(1, AA_TMP1, AA_SP, 0, 0);
   words[(*n)++] = aa64_add(1, AA_FP, AA_TMP1, AA_TMP0);
 }
 
+/* x17 = address of the saved-pair slot (= sp + saved-pair offset). Used by
+ * the fat prologue to materialize the stp destination when the offset
+ * doesn't fit stp's signed-7-bit-scaled immediate. */
 static void aa_words_saved_pair_addr(AANativeTarget* a, u32* words, u32 cap,
-                                     u32* n, u32 frame_size) {
-  u32 save_off = frame_size - AA_FRAME_SAVE_SIZE;
+                                     u32* n, const AAFrameLayout* L) {
+  u32 save_off = aa_sp_off_saved_pair(L);
   u32 imm12, sh;
   if (aa64_addsub_imm_fits(save_off, &imm12, &sh)) {
     if (*n >= cap) aa_panic(a, "instruction patch too small");
@@ -1045,33 +1125,34 @@ static void aa_words_saved_pair_addr(AANativeTarget* a, u32* words, u32 cap,
 }
 
 static void aa_words_restore_frame(AANativeTarget* a, u32* words, u32 cap,
-                                   u32* n, u32 frame_size) {
-  if (!frame_size) return;
+                                   u32* n, const AAFrameLayout* L) {
+  if (!L->frame_size) return;
   if (a->slim_prologue) {
     if (*n + 1u > cap) aa_panic(a, "instruction patch too small");
-    /* `ldp x29, x30, [sp], #16` — pops the saved pair and restores sp. */
+    /* `ldp x29, x30, [sp], #16` — pop saved pair, restore sp. */
     words[(*n)++] = aa64_ldp64_post(AA_FP, AA_LR, AA_SP, 2);
     return;
   }
   if (a->slim_small_frame) {
-    /* `ldp x29,x30,[sp,#N-16] ; add sp,sp,#N` — skip the `add x10,fp,#0`
-     * scratch the fat path uses. Restoring fp,lr through sp+offset avoids
-     * the scratch entirely; the subsequent `add sp` then unwinds the frame
-     * without depending on the (now-clobbered) old fp. */
-    u32 save_off = frame_size - AA_FRAME_SAVE_SIZE;
+    /* `ldp x29,x30,[sp,#saved_pair] ; add sp,sp,#frame_size` — load through
+     * sp avoids the fat path's `add x10, fp, #0` scratch, and the subsequent
+     * `add sp` unwinds without depending on the (now-clobbered) old fp. */
+    u32 save_off = aa_sp_off_saved_pair(L);
     u32 imm12, sh;
     if (*n + 2u > cap) aa_panic(a, "instruction patch too small");
     words[(*n)++] =
         aa64_ldp64_soff(AA_FP, AA_LR, AA_SP, (i32)(save_off / 8u));
-    if (!aa64_addsub_imm_fits(frame_size, &imm12, &sh))
+    if (!aa64_addsub_imm_fits(L->frame_size, &imm12, &sh))
       aa_panic(a, "slim_small_frame: frame_size out of addsub imm range");
     words[(*n)++] = aa64_add_imm(1, AA_SP, AA_SP, imm12, sh);
     return;
   }
   if (*n + 3u > cap) aa_panic(a, "instruction patch too small");
+  /* AAPCS64: fp is the saved-pair address. Reload pair from [fp], then
+   * restore sp to fp + AA_FRAME_SAVE_SIZE (= caller's original sp = CFA). */
   words[(*n)++] = aa64_add_imm(1, AA_TMP0, AA_FP, 0, 0);
-  words[(*n)++] = aa64_ldp64_soff(AA_FP, AA_LR, AA_TMP0, -2); /* fp,lr @ -16 */
-  words[(*n)++] = aa64_add_imm(1, AA_SP, AA_TMP0, 0, 0);
+  words[(*n)++] = aa64_ldp64_soff(AA_FP, AA_LR, AA_TMP0, 0);
+  words[(*n)++] = aa64_add_imm(1, AA_SP, AA_TMP0, AA_FRAME_SAVE_SIZE, 0);
 }
 
 /* Emit callee-save store (save=1) or restore (save=0) words into `words`,
@@ -1084,13 +1165,14 @@ static void aa_words_callee_saves(AANativeTarget* a, int save, u32* words,
                                   u32 cap, u32* n) {
   for (u32 i = 0; i < a->ncallee_saves;) {
     const AACalleeSave* cs = &a->callee_saves[i];
-    i32 off = -(i32)aa_slot(a, cs->slot)->off;
+    i32 off = aa_fp_off_slot(aa_slot(a, cs->slot)->off);
     if (off < -256 || off > 255)
       aa_panic(a, "callee-save offset out of prologue range");
     if (i + 1u < a->ncallee_saves && cs->cls == (u8)NATIVE_REG_INT &&
         a->callee_saves[i + 1u].cls == (u8)NATIVE_REG_INT) {
       const AACalleeSave* cs2 = &a->callee_saves[i + 1u];
-      i32 off2 = -(i32)aa_slot(a, cs2->slot)->off; /* off - 8, lower address */
+      i32 off2 = aa_fp_off_slot(aa_slot(a, cs2->slot)->off);
+      /* off2 = off - 8 (lower address; reserve allocates downward). */
       if (*n >= cap) aa_panic(a, "prologue too large");
       words[(*n)++] = save
                           ? aa64_stp64_soff(cs2->reg, cs->reg, AA_FP, off2 / 8)
@@ -1106,57 +1188,63 @@ static void aa_words_callee_saves(AANativeTarget* a, int save, u32* words,
   }
 }
 
-/* Build the prologue instruction words for `frame_size` into `words` (capacity
- * `cap`), returning the count. Shared by the NativeDirectTarget patch path
- * (reserves a fixed worst-case region, then patches it here) and the optimizer
- * path (emits an exact-size region up front; see aa_emit_prologue). */
-static u32 aa_build_prologue_words(AANativeTarget* a, u32 frame_size, u32* words,
-                                   u32 cap) {
+/* Build the prologue instruction words for `L` into `words` (capacity `cap`),
+ * returning the count. Shared by the NativeDirectTarget patch path (reserves
+ * a fixed worst-case region, then patches it here) and the optimizer path
+ * (emits an exact-size region up front; see aa_emit_prologue).
+ *
+ * All three variants establish the same post-prologue state defined by L:
+ *   sp = caller's sp - L->frame_size
+ *   fp = sp + aa_sp_off_saved_pair(L)  (saved-pair address)
+ *   saved x29/x30 at [fp], [fp+8]
+ *   callee-saves at [fp - s->off] for each. */
+static u32 aa_build_prologue_words(AANativeTarget* a, const AAFrameLayout* L,
+                                   u32* words, u32 cap) {
   u32 n = 0;
-  if (!frame_size) return 0;
+  if (!L->frame_size) return 0;
   if (a->slim_prologue) {
     if (cap < 2u) aa_panic(a, "prologue too large");
-    /* `stp x29, x30, [sp, #-16]!` — push the saved pair and adjust sp in
-     * one instruction. `mov x29, sp` keeps the AAPCS64 backtrace chain
-     * intact (some unwinders walk x29 directly rather than via DWARF). */
+    /* `stp x29, x30, [sp, #-16]!; add x29, sp, #0` — the pre-decrement stp
+     * moves sp down to the saved-pair address, so a no-op add anchors fp
+     * there directly. AAPCS64 frame record. */
     words[n++] = aa64_stp64_pre(AA_FP, AA_LR, AA_SP, -2);
     words[n++] = aa64_add_imm(1, AA_FP, AA_SP, 0, 0);
     return n;
   }
-  aa_words_sub_sp_frame(a, words, cap, &n, frame_size);
+  aa_words_sub_sp_frame(a, words, cap, &n, L->frame_size);
   if (a->slim_small_frame) {
-    /* `stp x29, x30, [sp, #(frame_size-16)]` — skip the `add x17, sp, #N-16`
-     * scratch step the fat path emits. Valid when (frame_size - 16) fits the
-     * stp signed-7-bit scaled immediate (i.e. frame_size <= 520). */
-    u32 save_off = frame_size - AA_FRAME_SAVE_SIZE;
+    /* `stp x29, x30, [sp, #saved_pair_off]` — skip the `add x17, sp, #...`
+     * scratch the fat path needs. Valid when the offset fits stp's
+     * signed-7-bit scaled immediate (saved_pair_off <= 504). */
+    u32 save_off = aa_sp_off_saved_pair(L);
     if (n >= cap) aa_panic(a, "prologue too large");
     words[n++] =
         aa64_stp64_soff(AA_FP, AA_LR, AA_SP, (i32)(save_off / 8u));
   } else {
-    aa_words_saved_pair_addr(a, words, cap, &n, frame_size);
+    aa_words_saved_pair_addr(a, words, cap, &n, L);
     if (n >= cap) aa_panic(a, "prologue too large");
     words[n++] = aa64_stp64_soff(AA_FP, AA_LR, AA_TMP1, 0); /* fp,lr @ [x17] */
   }
-  aa_words_frame_ptr_from_sp(a, words, cap, &n, frame_size);
-  /* Save callee-saved registers the allocator used, FP-relative. Their slots
-   * were reserved first (aa_reserve_callee_saves), so offsets fit stur's
-   * signed-9-bit immediate. */
+  aa_words_frame_ptr_from_sp(a, words, cap, &n, L);
+  /* Save callee-saved registers the allocator used (fp-relative; their slots
+   * were reserved first by aa_reserve_callee_saves so offsets fit stur). */
   aa_words_callee_saves(a, 1, words, cap, &n);
   return n;
 }
 
 /* Patch the reserved prologue region (`region` words at prologue_pos) with the
- * real prologue for `frame_size`. Used by the NativeDirectTarget single-pass
- * path, which reserves AA_PROLOGUE_WORDS up front before the frame is known.
- * The optimizer path reserves exactly the words it needs, so `region` equals
+ * real prologue for `L`. Used by the NativeDirectTarget single-pass path,
+ * which reserves AA_PROLOGUE_WORDS up front before the frame is known. The
+ * optimizer path reserves exactly the words it needs, so `region` equals
  * the real prologue length and no tail remains. */
-static void aa_patch_prologue(AANativeTarget* a, u32 frame_size, u32 region) {
+static void aa_patch_prologue(AANativeTarget* a, const AAFrameLayout* L,
+                              u32 region) {
   u32 words[AA_PROLOGUE_WORDS];
   u32 n;
   ObjSecId sec = a->func->text_section_id;
   if (region > AA_PROLOGUE_WORDS) aa_panic(a, "prologue region too large");
   memset(words, 0, sizeof words);
-  n = aa_build_prologue_words(a, frame_size, words, region);
+  n = aa_build_prologue_words(a, L, words, region);
   /* If the real prologue is shorter than the reserved region (the worst-case
    * NDT reservation), branch straight to the body rather than leaving the
    * trailing slots as NOPs that fall through and execute on every call. */
@@ -1171,41 +1259,27 @@ static void aa_patch_prologue(AANativeTarget* a, u32 frame_size, u32 region) {
 /* Optimizer path: emit an exact-size prologue in place (no reserved NOP
  * region). The callee-save set and the static frame slots are final by now, so
  * the prologue's instruction count is fixed; only the frame-size immediates
- * (sub sp / save-area address / fp = sp+frame) still depend on body-emitted
- * temporaries and are patched in func_end. We size the region with a frame that
- * fits add/sub's imm12 (the real frame must too, or func_end's rebuild — capped
- * at this length — panics). The sret/variadic entry saves follow, as on the
- * single-pass path. */
+ * (sub sp / save-area address / fp = sp+saved_pair) still depend on body-
+ * emitted temporaries and are patched in func_end. We size the region with
+ * a frame that fits add/sub's imm12 (the real frame must too, or func_end's
+ * rebuild — capped at this length — panics). */
 static void aa_emit_prologue(NativeTarget* t) {
   AANativeTarget* a = aa_of(t);
   u32 words[AA_PROLOGUE_WORDS];
-  u32 est_frame = align_up_u32(a->cum_off + a->max_outgoing, 16u);
-  u32 n = aa_build_prologue_words(a, est_frame, words, AA_PROLOGUE_WORDS);
+  AAFrameLayout L = aa_build_layout(a->cum_off, a->max_outgoing);
+  u32 n = aa_build_prologue_words(a, &L, words, AA_PROLOGUE_WORDS);
   for (u32 i = 0; i < n; ++i) aa_emit32(t->mc, words[i]);
   a->minimal_prologue_words = n;
   aa_emit_entry_saves(a);
 }
 
-static void aa_emit_restore_frame(AANativeTarget* a, u32 frame_size) {
+static void aa_emit_restore_frame(AANativeTarget* a, const AAFrameLayout* L) {
   MCEmitter* mc = a->base.mc;
-  if (!frame_size) return;
-  if (a->slim_prologue) {
-    aa_emit32(mc, aa64_ldp64_post(AA_FP, AA_LR, AA_SP, 2));
-    return;
-  }
-  if (a->slim_small_frame) {
-    u32 save_off = frame_size - AA_FRAME_SAVE_SIZE;
-    u32 imm12, sh;
-    aa_emit32(mc,
-              aa64_ldp64_soff(AA_FP, AA_LR, AA_SP, (i32)(save_off / 8u)));
-    if (!aa64_addsub_imm_fits(frame_size, &imm12, &sh))
-      aa_panic(a, "slim_small_frame: frame_size out of addsub imm range");
-    aa_emit32(mc, aa64_add_imm(1, AA_SP, AA_SP, imm12, sh));
-    return;
-  }
-  aa_emit32(mc, aa64_add_imm(1, AA_TMP0, AA_FP, 0, 0));
-  aa_emit32(mc, aa64_ldp64_soff(AA_FP, AA_LR, AA_TMP0, -2)); /* fp,lr @ -16 */
-  aa_emit32(mc, aa64_add_imm(1, AA_SP, AA_TMP0, 0, 0));
+  u32 words[AA_PROLOGUE_WORDS];
+  u32 n = 0;
+  if (!L->frame_size) return;
+  aa_words_restore_frame(a, words, AA_PROLOGUE_WORDS, &n, L);
+  for (u32 i = 0; i < n; ++i) aa_emit32(mc, words[i]);
 }
 
 static void aa_patch_allocas(AANativeTarget* a) {
@@ -1228,7 +1302,7 @@ static void aa_words_callee_restores(AANativeTarget* a, u32* words, u32 cap,
   aa_words_callee_saves(a, 0, words, cap, n);
 }
 
-static void aa_patch_tail_sites(AANativeTarget* a, u32 frame_size) {
+static void aa_patch_tail_sites(AANativeTarget* a, const AAFrameLayout* L) {
   ObjSecId sec = a->func->text_section_id;
   for (u32 i = 0; i < a->ntail_sites; ++i) {
     AATailSite* site = &a->tail_sites[i];
@@ -1236,7 +1310,7 @@ static void aa_patch_tail_sites(AANativeTarget* a, u32 frame_size) {
     u32 n = 0;
     memset(words, 0, sizeof words);
     aa_words_callee_restores(a, words, AA_TAIL_WORDS, &n);
-    aa_words_restore_frame(a, words, AA_TAIL_WORDS, &n, frame_size);
+    aa_words_restore_frame(a, words, AA_TAIL_WORDS, &n, L);
     if (n >= AA_TAIL_WORDS) aa_panic(a, "tail patch too small");
     if (site->callee.kind == NATIVE_LOC_REG) {
       words[n++] = aa64_br(loc_reg(site->callee));
@@ -1255,7 +1329,7 @@ static void aa_patch_tail_sites(AANativeTarget* a, u32 frame_size) {
 static void aa_func_end(NativeTarget* t) {
   AANativeTarget* a = aa_of(t);
   MCEmitter* mc = t->mc;
-  u32 frame_size = align_up_u32(a->cum_off + a->max_outgoing, 16u);
+  AAFrameLayout L = aa_build_layout(a->cum_off, a->max_outgoing);
   /* Optimizer path emitted an exact-size prologue (minimal_prologue_words);
    * the single-pass path reserved a fixed worst-case region. Either way the
    * frame-size immediates are only final now, so patch the region in place. */
@@ -1264,45 +1338,36 @@ static void aa_func_end(NativeTarget* t) {
   /* Slim Tier A eligibility (set before emitting the epilogue / patching the
    * prologue so the *_restore_frame / *_build_prologue_words helpers pick the
    * slim form). Conditions: no callee-saves needed, no alloca, no body
-   * locals/spills (cum_off untouched past the reserved fp/lr save area), no
-   * outgoing stack args, and only on the optimizer path (the NDT reserves a
-   * much larger prologue region and isn't on the bench path). sret/variadic
-   * disqualify naturally because their entry-save slots advance cum_off. */
+   * slots (locals/spills/sret/variadic — all counted in slot_bytes), no
+   * outgoing stack args, optimizer path only (the NDT reserves a much
+   * larger prologue region). */
   a->slim_prologue =
       t->emit_minimal_prologue && a->ncallee_saves == 0 &&
-      a->nalloca_patches == 0 && a->cum_off == AA_FRAME_SAVE_SIZE &&
-      a->max_outgoing == 0;
+      a->nalloca_patches == 0 && L.slot_bytes == 0 && L.out_stack == 0;
   /* Universal small-frame fast path: skip the x17/x10 scratch when the
-   * saved-pair offset (frame_size - 16) fits stp's signed 7-bit scaled
-   * immediate. Mutually exclusive with the Tier A slim form (Tier A is
-   * strictly tighter — 2-insn prologue, 1-insn restore). Disqualify alloca:
-   * alloca dynamically moves sp during the body, and the fat epilogue's
-   * `add sp, fp, #0` (via x10) is what restores sp from fp. The slim
-   * epilogue's `add sp, sp, #N` only undoes the static frame, leaving sp
-   * pointing into the alloca area. */
+   * saved-pair offset fits stp's signed 7-bit scaled immediate. Mutually
+   * exclusive with the Tier A slim form (Tier A is strictly tighter).
+   * Disqualify alloca: alloca dynamically moves sp during the body, and the
+   * fat epilogue (sp = fp + 16 via x10) is what restores sp from fp; the
+   * slim_small_frame epilogue's `add sp, sp, #N` only undoes the static
+   * frame, leaving sp pointing into the alloca area. */
   a->slim_small_frame =
       !a->slim_prologue && a->nalloca_patches == 0 &&
-      frame_size >= AA_FRAME_SAVE_SIZE &&
-      (frame_size - AA_FRAME_SAVE_SIZE) <= 504u;
+      aa_sp_off_saved_pair(&L) <= 504u;
   mc->label_place(mc, a->epilogue_label);
   aa_emit_callee_restores(a);
-  aa_emit_restore_frame(a, frame_size);
+  aa_emit_restore_frame(a, &L);
   aa_emit32(mc, aa64_ret(AA_LR));
-  aa_patch_prologue(a, frame_size, prologue_region);
+  aa_patch_prologue(a, &L, prologue_region);
   aa_patch_allocas(a);
-  aa_patch_tail_sites(a, frame_size);
+  aa_patch_tail_sites(a, &L);
   if (mc->cfi_set_next_pc_offset && mc->cfi_def_cfa && mc->cfi_offset) {
     mc->cfi_set_next_pc_offset(mc, prologue_region * 4u);
-    if (a->slim_prologue) {
-      /* After `stp x29,x30,[sp,#-16]!; mov x29,sp`: CFA = sp + 16, fp at
-       * CFA-16, lr at CFA-8. SP-anchored stays correct for the entire body
-       * since slim Tier A has no further sp moves. */
-      mc->cfi_def_cfa(mc, AA_SP, 16);
-    } else {
-      mc->cfi_def_cfa(mc, AA_FP, 0);
-    }
-    mc->cfi_offset(mc, AA_FP, -16);
-    mc->cfi_offset(mc, AA_LR, -8);
+    /* CFA = caller's sp = fp + AA_FRAME_SAVE_SIZE. saved fp/lr at fp/fp+8
+     * (= CFA-16, CFA-8). Unified across all three prologue layouts. */
+    mc->cfi_def_cfa(mc, AA_FP, AA_FRAME_SAVE_SIZE);
+    mc->cfi_offset(mc, AA_FP, aa_fp_off_saved_fp() - (i32)AA_FRAME_SAVE_SIZE);
+    mc->cfi_offset(mc, AA_LR, aa_fp_off_saved_lr() - (i32)AA_FRAME_SAVE_SIZE);
   }
   obj_symbol_define(t->obj, a->func->sym, a->func->text_section_id,
                     a->func_start, mc->pos(mc) - a->func_start);
@@ -1467,7 +1532,7 @@ static void aa_load_addr(NativeTarget* t, NativeLoc dst, NativeAddr addr) {
   switch ((NativeAddrBaseKind)addr.base_kind) {
     case NATIVE_ADDR_BASE_FRAME: {
       AANativeSlot* s = aa_slot(a, addr.base.frame);
-      aa_emit_add_imm(a, rd, AA_FP, -(i32)s->off + addr.offset);
+      aa_emit_add_imm(a, rd, AA_FP, aa_fp_off_slot(s->off) + addr.offset);
       aa_apply_index(a, rd, &addr);
       return;
     }
@@ -2073,7 +2138,12 @@ static void aa_store_outgoing_part(NativeTarget* t, int tail_call,
   addr.base_kind = NATIVE_ADDR_BASE_REG;
   addr.base.reg = tail_call ? AA_FP : AA_SP;
   addr.base_type = src.type;
-  addr.offset = (i32)stack_off;
+  /* Tail calls write outgoing args into the caller's incoming-args window
+   * (= [fp + 16 + off], same address the tail-callee will read via
+   * aa_fp_off_in_arg). Non-tail calls write to the sp-anchored outgoing
+   * area at the bottom of the caller's frame. */
+  addr.offset = tail_call ? aa_fp_off_tail_out_arg(stack_off)
+                          : aa_sp_off_out_arg(stack_off);
   aa_emit_mem(aa_of(t), 0, src, addr, mem);
 }
 
@@ -2214,6 +2284,18 @@ static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc,
   plan->stack_arg_size = aa_call_stack_size(t, desc);
   if (plan->stack_arg_size > aa_of(t)->max_outgoing)
     aa_of(t)->max_outgoing = plan->stack_arg_size;
+  /* Indirect call whose callee lives in x0..x7: the upcoming arg-load loop
+   * writes those same registers and would clobber the function pointer
+   * before blr reads it. Stash callee into AA_TMP0 (x16) up front and
+   * retarget the call. (AA_TMP0 is a backend scratch, never an arg reg.) */
+  if (plan->callee.kind == NATIVE_LOC_REG &&
+      (NativeAllocClass)plan->callee.cls == NATIVE_REG_INT &&
+      plan->callee.v.reg < 8u) {
+    NativeLoc scratch =
+        aa_reg_loc(plan->callee.type, NATIVE_REG_INT, AA_TMP0);
+    aa_move(t, scratch, plan->callee);
+    plan->callee = scratch;
+  }
   {
     u32 next_int = 0, next_fp = 0, stack = 0;
     int tail_call = (desc->flags & CG_CALL_TAIL) != 0;
@@ -3172,7 +3254,7 @@ static void aa_bind_native_param(NativeTarget* t, const CGParamDesc* p,
       memset(&saddr, 0, sizeof saddr);
       saddr.base_kind = NATIVE_ADDR_BASE_REG;
       saddr.base.reg = AA_FP;
-      saddr.offset = (i32)a->next_param_stack;
+      saddr.offset = aa_fp_off_in_arg(a->next_param_stack);
       aa_emit_mem(a, 1, src, saddr, aa_mem_for_type(t, p->type, 8));
       a->next_param_stack += 8u;
     }
@@ -3215,7 +3297,7 @@ static void aa_bind_native_param(NativeTarget* t, const CGParamDesc* p,
       saddr.base_kind = NATIVE_ADDR_BASE_REG;
       saddr.base.reg = AA_FP;
       saddr.base_type = p->type;
-      saddr.offset = (i32)a->next_param_stack;
+      saddr.offset = aa_fp_off_in_arg(a->next_param_stack);
       aa_emit_mem(a, 1, src, saddr, aa_mem_for_type(t, p->type, part->size));
       a->next_param_stack += aa_part_stack_size(part);
     }
@@ -3376,7 +3458,10 @@ static void aa_va_start_core(AANativeTarget* a, NativeAddr ap) {
   NativeLoc ptr =
       aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT, AA_TMP0);
   if (vai.kind == ABI_VA_LIST_POINTER) {
-    aa_emit_add_imm(a, AA_TMP0, AA_FP, (i32)a->next_param_stack);
+    /* `va_list = &<first vararg>`. Variadic stack args follow the fixed
+     * incoming params in the same caller window, so the offset is the
+     * current next_param_stack cursor. */
+    aa_emit_add_imm(a, AA_TMP0, AA_FP, aa_fp_off_in_arg(a->next_param_stack));
     aa_emit_mem(a, 0, ptr, ap, aa_mem_for_type(t, ptr.type, 8));
     return;
   }
diff --git a/src/opt/cg_ir_lower.c b/src/opt/cg_ir_lower.c
@@ -192,6 +192,16 @@ static const CgIrParam* find_param(const CgIrFunc* f, CGLocal local) {
 }
 
 static void lower_params(CgIrLower* l) {
+  /* Resolve the function-level ABI info once so we can attach per-param
+   * ABIArgInfo to each IRParam. Consumers (set_preg_pref_for_params, the
+   * native bind_param emit path) read p->abi without going through
+   * f->desc.abi, so this stays scoped to the param plumbing and does not
+   * activate the dormant f->desc.abi-gated passes (e.g.
+   * apply_param_incoming_register_hazards, opt_verify_alloc's incoming
+   * check), which have known issues with tail-call shuffles. */
+  const ABIFuncInfo* fi = NULL;
+  if (l->c && l->c->abi && l->f->desc.fn_type)
+    fi = abi_cg_func_info(l->c->abi, l->f->desc.fn_type);
   for (u32 i = 0; i < l->src->nlocals; ++i) {
     const CgIrLocal* loc = &l->src->locals[i];
     if (!loc->is_param) continue;
@@ -217,6 +227,7 @@ static void lower_params(CgIrLower* l) {
       d.loc = loc->desc.loc;
     }
     d.storage = m->storage;
+    if (fi && d.index < fi->nparams) d.abi = &fi->params[d.index];
     ir_param_add(l->f, &d);
   }
 }
diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c
@@ -218,104 +218,8 @@ static int phys_arg_reg_for_index(Func* f, u8 cls, u32 abi_index, Reg* out) {
   return 0;
 }
 
-typedef struct ParamIncomingRegs {
-  Reg regs[64];
-  u8 cls[64];
-  u8 has[64];
-  u32 nparams;
-  Reg all_regs[128];
-  u8 all_cls[128];
-  u32 nall;
-} ParamIncomingRegs;
-
-static void param_incoming_add(ParamIncomingRegs* out, u8 cls, Reg r) {
-  if (r >= 32) return;
-  for (u32 i = 0; i < out->nall; ++i)
-    if (out->all_cls[i] == cls && out->all_regs[i] == r) return;
-  if (out->nall < 128u) {
-    out->all_cls[out->nall] = cls;
-    out->all_regs[out->nall] = r;
-    ++out->nall;
-  }
-}
-
-static void collect_param_incoming_regs(Func* f, ParamIncomingRegs* out) {
-  memset(out, 0, sizeof *out);
-  if (!f || !f->desc.abi || !f->nparams) return;
-
-  u32 next_int = 0;
-  u32 next_fp = 0;
-  if (f->desc.abi->has_sret && f->opt_target.arch != CFREE_ARCH_ARM_64)
-    next_int = 1;
-
-  out->nparams = f->nparams < 64u ? f->nparams : 64u;
-  for (u32 i = 0; i < out->nparams; ++i) {
-    IRParam* p = &f->params[i];
-    const ABIArgInfo* ai = p->abi;
-    if (!ai || ai->kind == ABI_ARG_IGNORE) continue;
-    if (ai->kind == ABI_ARG_INDIRECT) {
-      Reg r = REG_NONE;
-      if (phys_arg_reg_for_index(f, RC_INT, next_int, &r)) {
-        out->regs[i] = r;
-        out->cls[i] = RC_INT;
-        out->has[i] = 1;
-        param_incoming_add(out, RC_INT, r);
-      }
-      ++next_int;
-      continue;
-    }
-    if (ai->kind != ABI_ARG_DIRECT) continue;
-    for (u16 j = 0; j < ai->nparts; ++j) {
-      const ABIArgPart* part = &ai->parts[j];
-      if (part->cls == ABI_CLASS_FP) {
-        Reg r = REG_NONE;
-        if (phys_arg_reg_for_index(f, RC_FP, next_fp, &r)) {
-          param_incoming_add(out, RC_FP, r);
-          if (ai->nparts == 1) {
-            out->regs[i] = r;
-            out->cls[i] = RC_FP;
-            out->has[i] = 1;
-          }
-        }
-        ++next_fp;
-      } else if (part->cls == ABI_CLASS_INT) {
-        Reg r = REG_NONE;
-        if (phys_arg_reg_for_index(f, RC_INT, next_int, &r)) {
-          param_incoming_add(out, RC_INT, r);
-          if (ai->nparts == 1) {
-            out->regs[i] = r;
-            out->cls[i] = RC_INT;
-            out->has[i] = 1;
-          }
-        }
-        ++next_int;
-      }
-    }
-  }
-}
-
 static int hard_available(Func* f, u8 cls, Reg r);
 
-static void apply_param_incoming_register_hazards(Func* f) {
-  if (!f || !f->preg_info || !f->desc.abi || !f->nparams) return;
-  ParamIncomingRegs incoming;
-  collect_param_incoming_regs(f, &incoming);
-
-  /* O1 replays parameter materialization before the body, but values left in
-   * their ABI incoming registers are not represented as live from function
-   * entry to first use. Keep those incoming registers out of virtual
-   * allocation so the backend emits explicit entry moves/stores before body
-   * code can reuse them. Fixed asm constraints still use tied_hard_reg. */
-  for (PReg v = 1; v < opt_reg_count(f); ++v) {
-    u8 cls = f->preg_info[v].cls;
-    for (u32 j = 0; j < incoming.nall; ++j) {
-      if (incoming.all_cls[j] != cls) continue;
-      if (f->preg_info[v].tied_hard_reg == (i32)incoming.all_regs[j]) continue;
-      forbid_preg_reg(f, v, cls, incoming.all_regs[j]);
-    }
-  }
-}
-
 static int is_caller_saved(Func* f, u8 cls, Reg r) {
   if (cls >= OPT_REG_CLASSES || r >= 32) return 0;
   return (f->opt_caller_saved[cls] & (1u << r)) != 0;
@@ -339,20 +243,11 @@ static void set_preg_pref_to_ret_reg(Func* f, const Operand* op) {
   if (hint == REG_NONE || hint >= 32) return;
   /* Don't override a real pin. */
   if (f->preg_info[v].tied_hard_reg >= 0) return;
-  /* apply_param_incoming_register_hazards conservatively forbids incoming
-   * param regs (e.g. x0) for every body PReg, because liveness doesn't
-   * model the implicit entry-move from x0 -> the param's home reg. That
-   * forbid is overly broad for call-result and ret-value PRegs:
-   *   - A call-result PReg's def is mid-function (the call writes x0); its
-   *     live range starts after every entry move, so it can't alias the
-   *     entry-window use of x0.
-   *   - A ret-value PReg is consumed at IR_RET (function exit); its live
-   *     range can extend through the body but the entry-move into the
-   *     param's home reg has already completed by the time any body inst
-   *     could define this PReg.
-   * Clear the forbid for the hinted reg so the allocator can actually
-   * pick it. The general conflict check (alloc_group_conflicts_bit) still
-   * excludes intervening clobbers like other calls. */
+  /* The hint reg may not be in opt_hard_regs (e.g. x0 on aa64 is reserved
+   * as the ABI ret reg, outside aa_int_allocable); the allocator's
+   * preferred-reg branch will still consider it via the unit-overlap
+   * precision check. Clear any leftover forbid bit so the hint isn't
+   * silently blocked. */
   f->preg_info[v].forbidden_hard_regs &= ~(1u << hint);
   f->preg_info[v].preferred_hard_reg = (i8)hint;
 }
@@ -366,9 +261,7 @@ static void set_preg_pref_for_abivalue(Func* f, const CGABIValue* v) {
 
 /* Soft hint: prefer a specific ABI register for `op`'s PReg. Symmetric to
  * set_preg_pref_to_ret_reg but takes an arbitrary hint reg (the matching
- * arg reg for the i-th call argument). Same rationale for clearing the
- * apply_param_incoming_register_hazards forbid: the source operand of a
- * call arg is defined in the body, after every entry-bind has run. */
+ * arg reg for the i-th call argument). */
 static void set_preg_pref_to_arg_reg(Func* f, const Operand* op, Reg hint) {
   if (!op || op->kind != OPK_REG) return;
   if (hint == REG_NONE || hint >= 32) return;
@@ -382,9 +275,86 @@ static void set_preg_pref_to_arg_reg(Func* f, const Operand* op, Reg hint) {
   f->preg_info[v].preferred_hard_reg = (i8)hint;
 }
 
+/* set_preg_pref_for_params (below): hint each single-PReg-stored param
+ * toward its own incoming ABI reg. When the allocator picks that reg,
+ * bind_param sees src==dst and emits no entry move (aa_bind_native_param
+ * checks at native.c:3227). Body-use live-range conflicts still hit the normal
+ * allocator check, so cross-call params that need a callee-save get one. */
+/* True iff `f` contains any IR_CALL flagged as a tail call. Tail-call arg
+ * routing goes through the backend shuffle which can permute the caller's
+ * incoming arg regs into different positions for the callee — pinning each
+ * param PReg to its own incoming reg turns those permutations into multi-reg
+ * cycles the shuffle can't break. Symmetric to the per-call tail skip in
+ * set_preg_pref_for_call_args. */
+static int func_has_tail_call(const Func* f) {
+  if (!f) return 0;
+  for (u32 b = 0; b < f->nblocks; ++b) {
+    const Block* bl = &f->blocks[b];
+    for (u32 i = 0; i < bl->ninsts; ++i) {
+      const Inst* in = &bl->insts[i];
+      if ((IROp)in->op != IR_CALL) continue;
+      const IRCallAux* aux = (const IRCallAux*)in->extra.aux;
+      if (aux && (aux->desc.flags & CG_CALL_TAIL)) return 1;
+    }
+  }
+  return 0;
+}
+
+static void set_preg_pref_for_params(Func* f) {
+  if (!f || !f->preg_info || !f->nparams) return;
+  if (func_has_tail_call(f)) return;
+  /* Per-class ABI arg cursors. Drives from per-param ABI info rather than
+   * f->desc.abi so this fires on paths where only f->params[i].abi is set. */
+  u32 next_int = 0;
+  u32 next_fp = 0;
+  /* sret on non-aa64 targets consumes the first int arg slot. Only consult
+   * f->desc.abi for this when it's available; aa64 (the only arch where this
+   * hint targets x0..x7 today) doesn't have the sret-takes-arg0 quirk. */
+  if (f->desc.abi && f->desc.abi->has_sret &&
+      f->opt_target.arch != CFREE_ARCH_ARM_64)
+    next_int = 1;
+  for (u32 i = 0; i < f->nparams; ++i) {
+    IRParam* p = &f->params[i];
+    const ABIArgInfo* ai = p->abi;
+    if (!ai || ai->kind == ABI_ARG_IGNORE) continue;
+    if (ai->kind == ABI_ARG_INDIRECT) {
+      ++next_int;
+      continue;
+    }
+    if (ai->kind != ABI_ARG_DIRECT) continue;
+    /* Only hint single-part DIRECT params whose home is a single PReg.
+     * Aggregate / split params take the bind_param frame-store path. */
+    int single_part_to_preg = (ai->nparts == 1) &&
+                              (p->storage.kind == CG_LOCAL_STORAGE_REG);
+    if (single_part_to_preg) {
+      const ABIArgPart* part = &ai->parts[0];
+      u8 cls = (part->cls == ABI_CLASS_FP) ? RC_FP : RC_INT;
+      u32* counter = (cls == RC_FP) ? &next_fp : &next_int;
+      Reg hint = REG_NONE;
+      if (*counter < 8u && phys_arg_reg_for_index(f, cls, *counter, &hint)) {
+        PReg v = (PReg)p->storage.v.reg;
+        if (v != PREG_NONE && v != 0 && v < opt_reg_count(f) &&
+            f->preg_info[v].cls == cls &&
+            f->preg_info[v].tied_hard_reg < 0 &&
+            f->preg_info[v].preferred_hard_reg < 0 &&
+            hint != REG_NONE && hint < 32) {
+          f->preg_info[v].forbidden_hard_regs &= ~(1u << hint);
+          f->preg_info[v].preferred_hard_reg = (i8)hint;
+        }
+      }
+    }
+    /* Advance the ABI cursors for every part of this param's home, regardless
+     * of whether we hinted, so subsequent params see the right slot. */
+    for (u16 j = 0; j < ai->nparts; ++j) {
+      u32* c = (ai->parts[j].cls == ABI_CLASS_FP) ? &next_fp : &next_int;
+      *c += 1u;
+    }
+  }
+}
+
 /* For each IR_CALL arg whose source storage is a single OPK_REG, hint that
  * PReg to the matching ABI arg register. Sequential int/fp counters mirror
- * the per-class arg slot assignment in collect_param_incoming_regs. Skips
+ * the per-class arg slot assignment used by set_preg_pref_for_params. Skips
  * variadic, has_sret, and indirect/aggregate args: they need per-target
  * counter logic that hasn't been factored out of plan_call. */
 static void set_preg_pref_for_call_args(Func* f, const CGCallDesc* desc) {
@@ -473,6 +443,7 @@ static void propagate_hint_through_copies(Func* f) {
  * a result PReg live across another call cannot pick x0). */
 static void apply_abi_aliasing_hints(Func* f) {
   if (!f || !f->preg_info) return;
+  set_preg_pref_for_params(f);
   for (u32 b = 0; b < f->nblocks; ++b) {
     Block* bl = &f->blocks[b];
     for (u32 i = 0; i < bl->ninsts; ++i) {
@@ -1946,23 +1917,11 @@ static void opt_verify_alloc(Func* f, const OptLiveInfo* live) {
   u32 nregs = opt_reg_count(f);
   u8* cur;
   if (nregs <= 1u || !live) return;
-  ParamIncomingRegs incoming;
-  collect_param_incoming_regs(f, &incoming);
-  for (PReg v = 1; v < nregs; ++v) {
-    OptPRegInfo* vi = &f->preg_info[v];
-    if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_HARD || vi->use_freq == 0)
-      continue;
-    u8 cls = opt_preg_loc_cls(f, v);
-    Reg hard = opt_preg_hard_reg(f, v);
-    for (u32 i = 0; i < incoming.nall; ++i) {
-      if (cls == incoming.all_cls[i] && hard == incoming.all_regs[i]) {
-        SrcLoc loc = {0, 0, 0};
-        compiler_panic(f->c, loc,
-                       "opt regalloc: O1 preg %u left in incoming cls%u reg%u",
-                       (unsigned)v, (unsigned)cls, (unsigned)hard);
-      }
-    }
-  }
+  /* No "left in incoming reg" pre-check: the hint path's
+   * opt_ranges_overlap_kind precision check already permits the unit-overlap
+   * between a param PReg and its own incoming reg (= "no entry move"), and
+   * the standard allocator's bitmap rejects every other overlap. The
+   * per-instruction interference scan below is the residual safety net. */
   cur = arena_array(f->arena, u8, nregs);
   for (u32 b = 0; b < f->nblocks; ++b) {
     Block* bl = &f->blocks[b];
@@ -2025,7 +1984,6 @@ static void opt_regalloc_place(Func* f, int allow_live_range_split,
   opt_live_ranges_build(f, &live, &ranges);
   opt_init_preg_info_from_ranges(f, &ranges);
   opt_apply_asm_constraints_from_live(f, &live);
-  apply_param_incoming_register_hazards(f);
   apply_abi_aliasing_hints(f);
   /* MIR coalesces only at -O2 (mir-gen.c:9431); match that here. At O1 the
    * point-bitmap allocator emits copies through the natural conflict-free

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	src/arch/aa64/native.c	\|	301	+++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
M	src/opt/cg_ir_lower.c	\|	11	+++++++++++
M	src/opt/pass_lower.c	\|	222	++++++++++++++++++++++++++++++++-----------------------------------------------