commit 0a722716448183a97354a873bcda572031919e19
parent 678aa2c2c1d76c07ae26fecbd5b4c2128564012d
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 28 May 2026 10:12:05 -0700
opt+aa64: unify on AAPCS64 frame layout; hint params to incoming regs
aa64 prologue/epilogue had three layouts (slim_prologue, slim_small_frame,
fat) that each anchored fp differently. Bind_param assumed fp == caller's
original sp, which only held for two of them; the slim layout left fp 16
bytes low, so incoming stack args were read from the saved-pair slots. The bug
fired the moment any optimization let a function skip its callee-save spills.
Centralize the layout in a single AAFrameLayout struct with typed accessors
(aa_fp_off_in_arg, aa_fp_off_slot, aa_fp_off_saved_fp/lr, aa_sp_off_saved_pair,
aa_fp_off_tail_out_arg, aa_sp_off_out_arg) documented by one ASCII diagram.
Every site that addresses the frame now goes through a helper — no bare
arithmetic on AA_FP/AA_SP for frame addressing. All three prologue variants
anchor fp at the saved-pair address (true AAPCS64); CFI unified.
Param-hint: set_preg_pref_for_params hints each scalar param PReg toward
its own incoming ABI reg, gated by func_has_tail_call (tail shuffles can't
break cycles imposed by pinned param positions). Drives add(a,b) to
'add x0, x0, x1; ret'. Requires p->abi to be populated, so lower_params
resolves it via abi_cg_func_info (scoped to the param side; doesn't activate
the dormant f->desc.abi path).
Fix latent indirect-call bug exposed by the hint: when the callee lives in
x0..x7 (e.g. a param hinted to x0), the arg-load loop would clobber it
before blr. aa_plan_call now stashes the callee into AA_TMP0 first.
Delete apply_param_incoming_register_hazards and the verifier's
"left-in-incoming-reg" pre-check. Both were dormant via the f->desc.abi
gate; both had the tail-call bug. The hint mechanism + opt_ranges_overlap_kind
unit-overlap check is the real safety net.
Diffstat:
3 files changed, 294 insertions(+), 240 deletions(-)
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -61,6 +61,72 @@ enum {
AA_TAIL_WORDS = 32u,
};
+/* ============================================================================
+ * AAPCS64 frame layout
+ *
+ * fp anchors at this function's frame record (the saved caller x29/x30 pair);
+ * sp anchors at the bottom of the outgoing-arg area. Every fp- or sp-relative
+ * offset in this file is computed via one of the helpers below — no site
+ * should do bare arithmetic on AA_FP / AA_SP for addressing the frame.
+ *
+ * high addr caller's stack frame
+ * +------------------------------+
+ * | incoming stack args | aa_fp_off_in_arg(i)
+ * +------------------------------+
+ * fp --> | saved x29 (prev fp) | aa_fp_off_saved_fp()
+ * | saved x30 (prev lr) | aa_fp_off_saved_lr()
+ * +------------------------------+
+ * | frame slots | aa_fp_off_slot(s->off)
+ * | (callee-saves + locals |
+ * | + spills + sret/variadic) |
+ * +------------------------------+
+ * | outgoing args | aa_sp_off_out_arg(i)
+ * sp --> +------------------------------+
+ * low addr
+ *
+ * frame_size = align16(AA_FRAME_SAVE_SIZE + slot_bytes + out_stack).
+ * Tail calls write outgoing args into the caller's incoming-args window —
+ * physically the same address, expressed via aa_fp_off_tail_out_arg.
+ * ========================================================================== */
+
+static u32 align_up_u32(u32 v, u32 align);
+
+typedef struct AAFrameLayout {
+ u32 slot_bytes; /* sum of aa_frame_slot reservations (callee-saves + locals
+ * + spills + sret/variadic) */
+ u32 out_stack; /* max outgoing-arg bytes across all calls in this function */
+ u32 frame_size; /* align16(AA_FRAME_SAVE_SIZE + slot_bytes + out_stack) */
+} AAFrameLayout;
+
+static inline AAFrameLayout aa_build_layout(u32 slot_bytes, u32 out_stack) {
+ AAFrameLayout L;
+ L.slot_bytes = slot_bytes;
+ L.out_stack = out_stack;
+ L.frame_size =
+ align_up_u32(AA_FRAME_SAVE_SIZE + slot_bytes + out_stack, 16u);
+ return L;
+}
+
+/* FP-relative byte offsets. */
+static inline i32 aa_fp_off_saved_fp(void) { return 0; }
+static inline i32 aa_fp_off_saved_lr(void) { return 8; }
+static inline i32 aa_fp_off_in_arg(u32 byte_off) {
+ return (i32)(AA_FRAME_SAVE_SIZE + byte_off);
+}
+static inline i32 aa_fp_off_slot(u32 slot_off) { return -(i32)slot_off; }
+/* Outgoing stack args on a tail call land in the caller's incoming-arg
+ * window — same physical address the tail-callee will read via
+ * aa_fp_off_in_arg. Same helper, distinct name for site-side intent. */
+static inline i32 aa_fp_off_tail_out_arg(u32 byte_off) {
+ return aa_fp_off_in_arg(byte_off);
+}
+
+/* SP-relative byte offsets. */
+static inline i32 aa_sp_off_out_arg(u32 byte_off) { return (i32)byte_off; }
+static inline u32 aa_sp_off_saved_pair(const AAFrameLayout* L) {
+ return L->frame_size - AA_FRAME_SAVE_SIZE;
+}
+
typedef struct AANativeSlot {
u32 off;
u32 size;
@@ -545,7 +611,7 @@ static void aa_addr_base(AANativeTarget* a, NativeAddr addr, u32* base_out,
case NATIVE_ADDR_BASE_FRAME: {
AANativeSlot* s = aa_slot(a, addr.base.frame);
*base_out = AA_FP;
- *off_out = -(i32)s->off + addr.offset;
+ *off_out = aa_fp_off_slot(s->off) + addr.offset;
return;
}
case NATIVE_ADDR_BASE_GLOBAL: {
@@ -819,7 +885,7 @@ static u32 aa_ldst_q_simm9(int load, u32 rt, u32 rn, i32 byte_off) {
static void aa_emit_q_frame(AANativeTarget* a, int load, u32 qreg,
NativeFrameSlot slot, u32 offset) {
AANativeSlot* s = aa_slot(a, slot);
- i32 off = -(i32)s->off + (i32)offset;
+ i32 off = aa_fp_off_slot(s->off) + (i32)offset;
MCEmitter* mc = a->base.mc;
if (off >= 0 && ((u32)off & 15u) == 0 && ((u32)off >> 4) <= 0xfffu) {
aa_emit32(mc, aa_ldst_q_uimm(load, qreg, AA_FP, (u32)off));
@@ -873,11 +939,18 @@ static void aa_func_begin(NativeTarget* t, const CGFuncDesc* fd) {
MCEmitter* mc = t->mc;
a->func = fd;
a->nslots = 0;
- a->cum_off = AA_FRAME_SAVE_SIZE;
+ /* cum_off counts frame-slot bytes below fp (see AAFrameLayout above).
+ * The saved fp/lr pair (16 bytes at [fp, fp+8]) is *not* part of cum_off;
+ * the frame-size computation in aa_func_end adds it via aa_build_layout. */
+ a->cum_off = 0;
a->max_outgoing = 0;
a->incoming_stack_size = 0;
a->next_param_int = 0;
a->next_param_fp = 0;
+ /* 0-based byte cursor for incoming stack args (also reported as the
+ * caller's incoming_stack_size for tail-call realizability). bind_param
+ * forms its fp-relative address via aa_fp_off_in_arg(next_param_stack),
+ * which adds the saved-pair offset. */
a->next_param_stack = 0;
a->sret_ptr_slot = NATIVE_FRAME_SLOT_NONE;
a->saved_tmp_slot = NATIVE_FRAME_SLOT_NONE;
@@ -1015,23 +1088,30 @@ static void aa_words_sub_sp_frame(AANativeTarget* a, u32* words, u32 cap,
words[(*n)++] = aa64_add_imm(1, AA_SP, AA_TMP1, 0, 0);
}
+/* Anchor fp at the AAPCS64 saved-pair address (= sp + saved-pair offset).
+ * The slim_prologue path achieves the same anchor in a single insn via
+ * `add x29, sp, #0` after the pre-decrement stp moves sp to the saved-pair. */
static void aa_words_frame_ptr_from_sp(AANativeTarget* a, u32* words, u32 cap,
- u32* n, u32 frame_size) {
+ u32* n, const AAFrameLayout* L) {
u32 imm12, sh;
- if (aa64_addsub_imm_fits(frame_size, &imm12, &sh)) {
+ u32 anchor = aa_sp_off_saved_pair(L);
+ if (aa64_addsub_imm_fits(anchor, &imm12, &sh)) {
if (*n >= cap) aa_panic(a, "instruction patch too small");
words[(*n)++] = aa64_add_imm(1, AA_FP, AA_SP, imm12, sh);
return;
}
- aa_words_load_imm(a, words, cap, n, AA_TMP0, frame_size);
+ aa_words_load_imm(a, words, cap, n, AA_TMP0, anchor);
if (*n + 2u > cap) aa_panic(a, "instruction patch too small");
words[(*n)++] = aa64_add_imm(1, AA_TMP1, AA_SP, 0, 0);
words[(*n)++] = aa64_add(1, AA_FP, AA_TMP1, AA_TMP0);
}
+/* x17 = address of the saved-pair slot (= sp + saved-pair offset). Used by
+ * the fat prologue to materialize the stp destination when the offset
+ * doesn't fit stp's signed-7-bit-scaled immediate. */
static void aa_words_saved_pair_addr(AANativeTarget* a, u32* words, u32 cap,
- u32* n, u32 frame_size) {
- u32 save_off = frame_size - AA_FRAME_SAVE_SIZE;
+ u32* n, const AAFrameLayout* L) {
+ u32 save_off = aa_sp_off_saved_pair(L);
u32 imm12, sh;
if (aa64_addsub_imm_fits(save_off, &imm12, &sh)) {
if (*n >= cap) aa_panic(a, "instruction patch too small");
@@ -1045,33 +1125,34 @@ static void aa_words_saved_pair_addr(AANativeTarget* a, u32* words, u32 cap,
}
static void aa_words_restore_frame(AANativeTarget* a, u32* words, u32 cap,
- u32* n, u32 frame_size) {
- if (!frame_size) return;
+ u32* n, const AAFrameLayout* L) {
+ if (!L->frame_size) return;
if (a->slim_prologue) {
if (*n + 1u > cap) aa_panic(a, "instruction patch too small");
- /* `ldp x29, x30, [sp], #16` — pops the saved pair and restores sp. */
+ /* `ldp x29, x30, [sp], #16` — pop saved pair, restore sp. */
words[(*n)++] = aa64_ldp64_post(AA_FP, AA_LR, AA_SP, 2);
return;
}
if (a->slim_small_frame) {
- /* `ldp x29,x30,[sp,#N-16] ; add sp,sp,#N` — skip the `add x10,fp,#0`
- * scratch the fat path uses. Restoring fp,lr through sp+offset avoids
- * the scratch entirely; the subsequent `add sp` then unwinds the frame
- * without depending on the (now-clobbered) old fp. */
- u32 save_off = frame_size - AA_FRAME_SAVE_SIZE;
+ /* `ldp x29,x30,[sp,#saved_pair] ; add sp,sp,#frame_size` — load through
+ * sp avoids the fat path's `add AA_TMP0, fp, #0` scratch, and the subsequent
+ * `add sp` unwinds without depending on the (now-clobbered) old fp. */
+ u32 save_off = aa_sp_off_saved_pair(L);
u32 imm12, sh;
if (*n + 2u > cap) aa_panic(a, "instruction patch too small");
words[(*n)++] =
aa64_ldp64_soff(AA_FP, AA_LR, AA_SP, (i32)(save_off / 8u));
- if (!aa64_addsub_imm_fits(frame_size, &imm12, &sh))
+ if (!aa64_addsub_imm_fits(L->frame_size, &imm12, &sh))
aa_panic(a, "slim_small_frame: frame_size out of addsub imm range");
words[(*n)++] = aa64_add_imm(1, AA_SP, AA_SP, imm12, sh);
return;
}
if (*n + 3u > cap) aa_panic(a, "instruction patch too small");
+ /* AAPCS64: fp is the saved-pair address. Reload pair from [fp], then
+ * restore sp to fp + AA_FRAME_SAVE_SIZE (= caller's original sp = CFA). */
words[(*n)++] = aa64_add_imm(1, AA_TMP0, AA_FP, 0, 0);
- words[(*n)++] = aa64_ldp64_soff(AA_FP, AA_LR, AA_TMP0, -2); /* fp,lr @ -16 */
- words[(*n)++] = aa64_add_imm(1, AA_SP, AA_TMP0, 0, 0);
+ words[(*n)++] = aa64_ldp64_soff(AA_FP, AA_LR, AA_TMP0, 0);
+ words[(*n)++] = aa64_add_imm(1, AA_SP, AA_TMP0, AA_FRAME_SAVE_SIZE, 0);
}
/* Emit callee-save store (save=1) or restore (save=0) words into `words`,
@@ -1084,13 +1165,14 @@ static void aa_words_callee_saves(AANativeTarget* a, int save, u32* words,
u32 cap, u32* n) {
for (u32 i = 0; i < a->ncallee_saves;) {
const AACalleeSave* cs = &a->callee_saves[i];
- i32 off = -(i32)aa_slot(a, cs->slot)->off;
+ i32 off = aa_fp_off_slot(aa_slot(a, cs->slot)->off);
if (off < -256 || off > 255)
aa_panic(a, "callee-save offset out of prologue range");
if (i + 1u < a->ncallee_saves && cs->cls == (u8)NATIVE_REG_INT &&
a->callee_saves[i + 1u].cls == (u8)NATIVE_REG_INT) {
const AACalleeSave* cs2 = &a->callee_saves[i + 1u];
- i32 off2 = -(i32)aa_slot(a, cs2->slot)->off; /* off - 8, lower address */
+ i32 off2 = aa_fp_off_slot(aa_slot(a, cs2->slot)->off);
+ /* off2 = off - 8 (lower address; reserve allocates downward). */
if (*n >= cap) aa_panic(a, "prologue too large");
words[(*n)++] = save
? aa64_stp64_soff(cs2->reg, cs->reg, AA_FP, off2 / 8)
@@ -1106,57 +1188,63 @@ static void aa_words_callee_saves(AANativeTarget* a, int save, u32* words,
}
}
-/* Build the prologue instruction words for `frame_size` into `words` (capacity
- * `cap`), returning the count. Shared by the NativeDirectTarget patch path
- * (reserves a fixed worst-case region, then patches it here) and the optimizer
- * path (emits an exact-size region up front; see aa_emit_prologue). */
-static u32 aa_build_prologue_words(AANativeTarget* a, u32 frame_size, u32* words,
- u32 cap) {
+/* Build the prologue instruction words for `L` into `words` (capacity `cap`),
+ * returning the count. Shared by the NativeDirectTarget patch path (reserves
+ * a fixed worst-case region, then patches it here) and the optimizer path
+ * (emits an exact-size region up front; see aa_emit_prologue).
+ *
+ * All three variants establish the same post-prologue state defined by L:
+ * sp = caller's sp - L->frame_size
+ * fp = sp + aa_sp_off_saved_pair(L) (saved-pair address)
+ * saved x29/x30 at [fp], [fp+8]
+ * callee-saves at [fp - s->off] for each. */
+static u32 aa_build_prologue_words(AANativeTarget* a, const AAFrameLayout* L,
+ u32* words, u32 cap) {
u32 n = 0;
- if (!frame_size) return 0;
+ if (!L->frame_size) return 0;
if (a->slim_prologue) {
if (cap < 2u) aa_panic(a, "prologue too large");
- /* `stp x29, x30, [sp, #-16]!` — push the saved pair and adjust sp in
- * one instruction. `mov x29, sp` keeps the AAPCS64 backtrace chain
- * intact (some unwinders walk x29 directly rather than via DWARF). */
+ /* `stp x29, x30, [sp, #-16]!; add x29, sp, #0` — the pre-decrement stp
+ * moves sp down to the saved-pair address, so a no-op add anchors fp
+ * there directly. AAPCS64 frame record. */
words[n++] = aa64_stp64_pre(AA_FP, AA_LR, AA_SP, -2);
words[n++] = aa64_add_imm(1, AA_FP, AA_SP, 0, 0);
return n;
}
- aa_words_sub_sp_frame(a, words, cap, &n, frame_size);
+ aa_words_sub_sp_frame(a, words, cap, &n, L->frame_size);
if (a->slim_small_frame) {
- /* `stp x29, x30, [sp, #(frame_size-16)]` — skip the `add x17, sp, #N-16`
- * scratch step the fat path emits. Valid when (frame_size - 16) fits the
- * stp signed-7-bit scaled immediate (i.e. frame_size <= 520). */
- u32 save_off = frame_size - AA_FRAME_SAVE_SIZE;
+ /* `stp x29, x30, [sp, #saved_pair_off]` — skip the `add x17, sp, #...`
+ * scratch the fat path needs. Valid when the offset fits stp's
+ * signed-7-bit scaled immediate (saved_pair_off <= 504). */
+ u32 save_off = aa_sp_off_saved_pair(L);
if (n >= cap) aa_panic(a, "prologue too large");
words[n++] =
aa64_stp64_soff(AA_FP, AA_LR, AA_SP, (i32)(save_off / 8u));
} else {
- aa_words_saved_pair_addr(a, words, cap, &n, frame_size);
+ aa_words_saved_pair_addr(a, words, cap, &n, L);
if (n >= cap) aa_panic(a, "prologue too large");
words[n++] = aa64_stp64_soff(AA_FP, AA_LR, AA_TMP1, 0); /* fp,lr @ [x17] */
}
- aa_words_frame_ptr_from_sp(a, words, cap, &n, frame_size);
- /* Save callee-saved registers the allocator used, FP-relative. Their slots
- * were reserved first (aa_reserve_callee_saves), so offsets fit stur's
- * signed-9-bit immediate. */
+ aa_words_frame_ptr_from_sp(a, words, cap, &n, L);
+ /* Save callee-saved registers the allocator used (fp-relative; their slots
+ * were reserved first by aa_reserve_callee_saves so offsets fit stur). */
aa_words_callee_saves(a, 1, words, cap, &n);
return n;
}
/* Patch the reserved prologue region (`region` words at prologue_pos) with the
- * real prologue for `frame_size`. Used by the NativeDirectTarget single-pass
- * path, which reserves AA_PROLOGUE_WORDS up front before the frame is known.
- * The optimizer path reserves exactly the words it needs, so `region` equals
+ * real prologue for `L`. Used by the NativeDirectTarget single-pass path,
+ * which reserves AA_PROLOGUE_WORDS up front before the frame is known. The
+ * optimizer path reserves exactly the words it needs, so `region` equals
* the real prologue length and no tail remains. */
-static void aa_patch_prologue(AANativeTarget* a, u32 frame_size, u32 region) {
+static void aa_patch_prologue(AANativeTarget* a, const AAFrameLayout* L,
+ u32 region) {
u32 words[AA_PROLOGUE_WORDS];
u32 n;
ObjSecId sec = a->func->text_section_id;
if (region > AA_PROLOGUE_WORDS) aa_panic(a, "prologue region too large");
memset(words, 0, sizeof words);
- n = aa_build_prologue_words(a, frame_size, words, region);
+ n = aa_build_prologue_words(a, L, words, region);
/* If the real prologue is shorter than the reserved region (the worst-case
* NDT reservation), branch straight to the body rather than leaving the
* trailing slots as NOPs that fall through and execute on every call. */
@@ -1171,41 +1259,27 @@ static void aa_patch_prologue(AANativeTarget* a, u32 frame_size, u32 region) {
/* Optimizer path: emit an exact-size prologue in place (no reserved NOP
* region). The callee-save set and the static frame slots are final by now, so
* the prologue's instruction count is fixed; only the frame-size immediates
- * (sub sp / save-area address / fp = sp+frame) still depend on body-emitted
- * temporaries and are patched in func_end. We size the region with a frame that
- * fits add/sub's imm12 (the real frame must too, or func_end's rebuild — capped
- * at this length — panics). The sret/variadic entry saves follow, as on the
- * single-pass path. */
+ * (sub sp / save-area address / fp = sp+saved_pair) still depend on body-
+ * emitted temporaries and are patched in func_end. We size the region with
+ * a frame that fits add/sub's imm12 (the real frame must too, or func_end's
+ * rebuild — capped at this length — panics). */
static void aa_emit_prologue(NativeTarget* t) {
AANativeTarget* a = aa_of(t);
u32 words[AA_PROLOGUE_WORDS];
- u32 est_frame = align_up_u32(a->cum_off + a->max_outgoing, 16u);
- u32 n = aa_build_prologue_words(a, est_frame, words, AA_PROLOGUE_WORDS);
+ AAFrameLayout L = aa_build_layout(a->cum_off, a->max_outgoing);
+ u32 n = aa_build_prologue_words(a, &L, words, AA_PROLOGUE_WORDS);
for (u32 i = 0; i < n; ++i) aa_emit32(t->mc, words[i]);
a->minimal_prologue_words = n;
aa_emit_entry_saves(a);
}
-static void aa_emit_restore_frame(AANativeTarget* a, u32 frame_size) {
+static void aa_emit_restore_frame(AANativeTarget* a, const AAFrameLayout* L) {
MCEmitter* mc = a->base.mc;
- if (!frame_size) return;
- if (a->slim_prologue) {
- aa_emit32(mc, aa64_ldp64_post(AA_FP, AA_LR, AA_SP, 2));
- return;
- }
- if (a->slim_small_frame) {
- u32 save_off = frame_size - AA_FRAME_SAVE_SIZE;
- u32 imm12, sh;
- aa_emit32(mc,
- aa64_ldp64_soff(AA_FP, AA_LR, AA_SP, (i32)(save_off / 8u)));
- if (!aa64_addsub_imm_fits(frame_size, &imm12, &sh))
- aa_panic(a, "slim_small_frame: frame_size out of addsub imm range");
- aa_emit32(mc, aa64_add_imm(1, AA_SP, AA_SP, imm12, sh));
- return;
- }
- aa_emit32(mc, aa64_add_imm(1, AA_TMP0, AA_FP, 0, 0));
- aa_emit32(mc, aa64_ldp64_soff(AA_FP, AA_LR, AA_TMP0, -2)); /* fp,lr @ -16 */
- aa_emit32(mc, aa64_add_imm(1, AA_SP, AA_TMP0, 0, 0));
+ u32 words[AA_PROLOGUE_WORDS];
+ u32 n = 0;
+ if (!L->frame_size) return;
+ aa_words_restore_frame(a, words, AA_PROLOGUE_WORDS, &n, L);
+ for (u32 i = 0; i < n; ++i) aa_emit32(mc, words[i]);
}
static void aa_patch_allocas(AANativeTarget* a) {
@@ -1228,7 +1302,7 @@ static void aa_words_callee_restores(AANativeTarget* a, u32* words, u32 cap,
aa_words_callee_saves(a, 0, words, cap, n);
}
-static void aa_patch_tail_sites(AANativeTarget* a, u32 frame_size) {
+static void aa_patch_tail_sites(AANativeTarget* a, const AAFrameLayout* L) {
ObjSecId sec = a->func->text_section_id;
for (u32 i = 0; i < a->ntail_sites; ++i) {
AATailSite* site = &a->tail_sites[i];
@@ -1236,7 +1310,7 @@ static void aa_patch_tail_sites(AANativeTarget* a, u32 frame_size) {
u32 n = 0;
memset(words, 0, sizeof words);
aa_words_callee_restores(a, words, AA_TAIL_WORDS, &n);
- aa_words_restore_frame(a, words, AA_TAIL_WORDS, &n, frame_size);
+ aa_words_restore_frame(a, words, AA_TAIL_WORDS, &n, L);
if (n >= AA_TAIL_WORDS) aa_panic(a, "tail patch too small");
if (site->callee.kind == NATIVE_LOC_REG) {
words[n++] = aa64_br(loc_reg(site->callee));
@@ -1255,7 +1329,7 @@ static void aa_patch_tail_sites(AANativeTarget* a, u32 frame_size) {
static void aa_func_end(NativeTarget* t) {
AANativeTarget* a = aa_of(t);
MCEmitter* mc = t->mc;
- u32 frame_size = align_up_u32(a->cum_off + a->max_outgoing, 16u);
+ AAFrameLayout L = aa_build_layout(a->cum_off, a->max_outgoing);
/* Optimizer path emitted an exact-size prologue (minimal_prologue_words);
* the single-pass path reserved a fixed worst-case region. Either way the
* frame-size immediates are only final now, so patch the region in place. */
@@ -1264,45 +1338,36 @@ static void aa_func_end(NativeTarget* t) {
/* Slim Tier A eligibility (set before emitting the epilogue / patching the
* prologue so the *_restore_frame / *_build_prologue_words helpers pick the
* slim form). Conditions: no callee-saves needed, no alloca, no body
- * locals/spills (cum_off untouched past the reserved fp/lr save area), no
- * outgoing stack args, and only on the optimizer path (the NDT reserves a
- * much larger prologue region and isn't on the bench path). sret/variadic
- * disqualify naturally because their entry-save slots advance cum_off. */
+ * slots (locals/spills/sret/variadic — all counted in slot_bytes), no
+ * outgoing stack args, optimizer path only (the NDT reserves a much
+ * larger prologue region). */
a->slim_prologue =
t->emit_minimal_prologue && a->ncallee_saves == 0 &&
- a->nalloca_patches == 0 && a->cum_off == AA_FRAME_SAVE_SIZE &&
- a->max_outgoing == 0;
+ a->nalloca_patches == 0 && L.slot_bytes == 0 && L.out_stack == 0;
/* Universal small-frame fast path: skip the x17/x10 scratch when the
- * saved-pair offset (frame_size - 16) fits stp's signed 7-bit scaled
- * immediate. Mutually exclusive with the Tier A slim form (Tier A is
- * strictly tighter — 2-insn prologue, 1-insn restore). Disqualify alloca:
- * alloca dynamically moves sp during the body, and the fat epilogue's
- * `add sp, fp, #0` (via x10) is what restores sp from fp. The slim
- * epilogue's `add sp, sp, #N` only undoes the static frame, leaving sp
- * pointing into the alloca area. */
+ * saved-pair offset fits stp's signed 7-bit scaled immediate. Mutually
+ * exclusive with the Tier A slim form (Tier A is strictly tighter).
+ * Disqualify alloca: alloca dynamically moves sp during the body, and the
+ * fat epilogue (sp = fp + 16 via AA_TMP0) is what restores sp from fp; the
+ * slim_small_frame epilogue's `add sp, sp, #N` only undoes the static
+ * frame, leaving sp pointing into the alloca area. */
a->slim_small_frame =
!a->slim_prologue && a->nalloca_patches == 0 &&
- frame_size >= AA_FRAME_SAVE_SIZE &&
- (frame_size - AA_FRAME_SAVE_SIZE) <= 504u;
+ aa_sp_off_saved_pair(&L) <= 504u;
mc->label_place(mc, a->epilogue_label);
aa_emit_callee_restores(a);
- aa_emit_restore_frame(a, frame_size);
+ aa_emit_restore_frame(a, &L);
aa_emit32(mc, aa64_ret(AA_LR));
- aa_patch_prologue(a, frame_size, prologue_region);
+ aa_patch_prologue(a, &L, prologue_region);
aa_patch_allocas(a);
- aa_patch_tail_sites(a, frame_size);
+ aa_patch_tail_sites(a, &L);
if (mc->cfi_set_next_pc_offset && mc->cfi_def_cfa && mc->cfi_offset) {
mc->cfi_set_next_pc_offset(mc, prologue_region * 4u);
- if (a->slim_prologue) {
- /* After `stp x29,x30,[sp,#-16]!; mov x29,sp`: CFA = sp + 16, fp at
- * CFA-16, lr at CFA-8. SP-anchored stays correct for the entire body
- * since slim Tier A has no further sp moves. */
- mc->cfi_def_cfa(mc, AA_SP, 16);
- } else {
- mc->cfi_def_cfa(mc, AA_FP, 0);
- }
- mc->cfi_offset(mc, AA_FP, -16);
- mc->cfi_offset(mc, AA_LR, -8);
+ /* CFA = caller's sp = fp + AA_FRAME_SAVE_SIZE. saved fp/lr at fp/fp+8
+ * (= CFA-16, CFA-8). Unified across all three prologue layouts. */
+ mc->cfi_def_cfa(mc, AA_FP, AA_FRAME_SAVE_SIZE);
+ mc->cfi_offset(mc, AA_FP, aa_fp_off_saved_fp() - (i32)AA_FRAME_SAVE_SIZE);
+ mc->cfi_offset(mc, AA_LR, aa_fp_off_saved_lr() - (i32)AA_FRAME_SAVE_SIZE);
}
obj_symbol_define(t->obj, a->func->sym, a->func->text_section_id,
a->func_start, mc->pos(mc) - a->func_start);
@@ -1467,7 +1532,7 @@ static void aa_load_addr(NativeTarget* t, NativeLoc dst, NativeAddr addr) {
switch ((NativeAddrBaseKind)addr.base_kind) {
case NATIVE_ADDR_BASE_FRAME: {
AANativeSlot* s = aa_slot(a, addr.base.frame);
- aa_emit_add_imm(a, rd, AA_FP, -(i32)s->off + addr.offset);
+ aa_emit_add_imm(a, rd, AA_FP, aa_fp_off_slot(s->off) + addr.offset);
aa_apply_index(a, rd, &addr);
return;
}
@@ -2073,7 +2138,12 @@ static void aa_store_outgoing_part(NativeTarget* t, int tail_call,
addr.base_kind = NATIVE_ADDR_BASE_REG;
addr.base.reg = tail_call ? AA_FP : AA_SP;
addr.base_type = src.type;
- addr.offset = (i32)stack_off;
+ /* Tail calls write outgoing args into the caller's incoming-args window
+ * (= [fp + 16 + off], same address the tail-callee will read via
+ * aa_fp_off_in_arg). Non-tail calls write to the sp-anchored outgoing
+ * area at the bottom of the caller's frame. */
+ addr.offset = tail_call ? aa_fp_off_tail_out_arg(stack_off)
+ : aa_sp_off_out_arg(stack_off);
aa_emit_mem(aa_of(t), 0, src, addr, mem);
}
@@ -2214,6 +2284,18 @@ static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc,
plan->stack_arg_size = aa_call_stack_size(t, desc);
if (plan->stack_arg_size > aa_of(t)->max_outgoing)
aa_of(t)->max_outgoing = plan->stack_arg_size;
+ /* Indirect call whose callee lives in x0..x7: the upcoming arg-load loop
+ * writes those same registers and would clobber the function pointer
+ * before blr reads it. Stash it in AA_TMP0 (x16; scratch, never an arg reg)
+ * and retarget. NOTE(review): fat restore_frame clobbers AA_TMP0 on tails? */
+ if (plan->callee.kind == NATIVE_LOC_REG &&
+ (NativeAllocClass)plan->callee.cls == NATIVE_REG_INT &&
+ plan->callee.v.reg < 8u) {
+ NativeLoc scratch =
+ aa_reg_loc(plan->callee.type, NATIVE_REG_INT, AA_TMP0);
+ aa_move(t, scratch, plan->callee);
+ plan->callee = scratch;
+ }
{
u32 next_int = 0, next_fp = 0, stack = 0;
int tail_call = (desc->flags & CG_CALL_TAIL) != 0;
@@ -3172,7 +3254,7 @@ static void aa_bind_native_param(NativeTarget* t, const CGParamDesc* p,
memset(&saddr, 0, sizeof saddr);
saddr.base_kind = NATIVE_ADDR_BASE_REG;
saddr.base.reg = AA_FP;
- saddr.offset = (i32)a->next_param_stack;
+ saddr.offset = aa_fp_off_in_arg(a->next_param_stack);
aa_emit_mem(a, 1, src, saddr, aa_mem_for_type(t, p->type, 8));
a->next_param_stack += 8u;
}
@@ -3215,7 +3297,7 @@ static void aa_bind_native_param(NativeTarget* t, const CGParamDesc* p,
saddr.base_kind = NATIVE_ADDR_BASE_REG;
saddr.base.reg = AA_FP;
saddr.base_type = p->type;
- saddr.offset = (i32)a->next_param_stack;
+ saddr.offset = aa_fp_off_in_arg(a->next_param_stack);
aa_emit_mem(a, 1, src, saddr, aa_mem_for_type(t, p->type, part->size));
a->next_param_stack += aa_part_stack_size(part);
}
@@ -3376,7 +3458,10 @@ static void aa_va_start_core(AANativeTarget* a, NativeAddr ap) {
NativeLoc ptr =
aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT, AA_TMP0);
if (vai.kind == ABI_VA_LIST_POINTER) {
- aa_emit_add_imm(a, AA_TMP0, AA_FP, (i32)a->next_param_stack);
+ /* `va_list = &<first vararg>`. Variadic stack args follow the fixed
+ * incoming params in the same caller window, so the offset is the
+ * current next_param_stack cursor. */
+ aa_emit_add_imm(a, AA_TMP0, AA_FP, aa_fp_off_in_arg(a->next_param_stack));
aa_emit_mem(a, 0, ptr, ap, aa_mem_for_type(t, ptr.type, 8));
return;
}
diff --git a/src/opt/cg_ir_lower.c b/src/opt/cg_ir_lower.c
@@ -192,6 +192,16 @@ static const CgIrParam* find_param(const CgIrFunc* f, CGLocal local) {
}
static void lower_params(CgIrLower* l) {
+ /* Resolve the function-level ABI info once so we can attach per-param
+ * ABIArgInfo to each IRParam. Consumers (set_preg_pref_for_params, the
+ * native bind_param emit path) read p->abi without going through
+ * f->desc.abi, so this stays scoped to the param plumbing and does not
+ * populate f->desc.abi; anything still gated on f->desc.abi stays
+ * dormant. (The passes formerly behind that gate were removed because
+ * they mishandled tail-call shuffles.) */
+ const ABIFuncInfo* fi = NULL;
+ if (l->c && l->c->abi && l->f->desc.fn_type)
+ fi = abi_cg_func_info(l->c->abi, l->f->desc.fn_type);
for (u32 i = 0; i < l->src->nlocals; ++i) {
const CgIrLocal* loc = &l->src->locals[i];
if (!loc->is_param) continue;
@@ -217,6 +227,7 @@ static void lower_params(CgIrLower* l) {
d.loc = loc->desc.loc;
}
d.storage = m->storage;
+ if (fi && d.index < fi->nparams) d.abi = &fi->params[d.index];
ir_param_add(l->f, &d);
}
}
diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c
@@ -218,104 +218,8 @@ static int phys_arg_reg_for_index(Func* f, u8 cls, u32 abi_index, Reg* out) {
return 0;
}
-typedef struct ParamIncomingRegs {
- Reg regs[64];
- u8 cls[64];
- u8 has[64];
- u32 nparams;
- Reg all_regs[128];
- u8 all_cls[128];
- u32 nall;
-} ParamIncomingRegs;
-
-static void param_incoming_add(ParamIncomingRegs* out, u8 cls, Reg r) {
- if (r >= 32) return;
- for (u32 i = 0; i < out->nall; ++i)
- if (out->all_cls[i] == cls && out->all_regs[i] == r) return;
- if (out->nall < 128u) {
- out->all_cls[out->nall] = cls;
- out->all_regs[out->nall] = r;
- ++out->nall;
- }
-}
-
-static void collect_param_incoming_regs(Func* f, ParamIncomingRegs* out) {
- memset(out, 0, sizeof *out);
- if (!f || !f->desc.abi || !f->nparams) return;
-
- u32 next_int = 0;
- u32 next_fp = 0;
- if (f->desc.abi->has_sret && f->opt_target.arch != CFREE_ARCH_ARM_64)
- next_int = 1;
-
- out->nparams = f->nparams < 64u ? f->nparams : 64u;
- for (u32 i = 0; i < out->nparams; ++i) {
- IRParam* p = &f->params[i];
- const ABIArgInfo* ai = p->abi;
- if (!ai || ai->kind == ABI_ARG_IGNORE) continue;
- if (ai->kind == ABI_ARG_INDIRECT) {
- Reg r = REG_NONE;
- if (phys_arg_reg_for_index(f, RC_INT, next_int, &r)) {
- out->regs[i] = r;
- out->cls[i] = RC_INT;
- out->has[i] = 1;
- param_incoming_add(out, RC_INT, r);
- }
- ++next_int;
- continue;
- }
- if (ai->kind != ABI_ARG_DIRECT) continue;
- for (u16 j = 0; j < ai->nparts; ++j) {
- const ABIArgPart* part = &ai->parts[j];
- if (part->cls == ABI_CLASS_FP) {
- Reg r = REG_NONE;
- if (phys_arg_reg_for_index(f, RC_FP, next_fp, &r)) {
- param_incoming_add(out, RC_FP, r);
- if (ai->nparts == 1) {
- out->regs[i] = r;
- out->cls[i] = RC_FP;
- out->has[i] = 1;
- }
- }
- ++next_fp;
- } else if (part->cls == ABI_CLASS_INT) {
- Reg r = REG_NONE;
- if (phys_arg_reg_for_index(f, RC_INT, next_int, &r)) {
- param_incoming_add(out, RC_INT, r);
- if (ai->nparts == 1) {
- out->regs[i] = r;
- out->cls[i] = RC_INT;
- out->has[i] = 1;
- }
- }
- ++next_int;
- }
- }
- }
-}
-
static int hard_available(Func* f, u8 cls, Reg r);
-static void apply_param_incoming_register_hazards(Func* f) {
- if (!f || !f->preg_info || !f->desc.abi || !f->nparams) return;
- ParamIncomingRegs incoming;
- collect_param_incoming_regs(f, &incoming);
-
- /* O1 replays parameter materialization before the body, but values left in
- * their ABI incoming registers are not represented as live from function
- * entry to first use. Keep those incoming registers out of virtual
- * allocation so the backend emits explicit entry moves/stores before body
- * code can reuse them. Fixed asm constraints still use tied_hard_reg. */
- for (PReg v = 1; v < opt_reg_count(f); ++v) {
- u8 cls = f->preg_info[v].cls;
- for (u32 j = 0; j < incoming.nall; ++j) {
- if (incoming.all_cls[j] != cls) continue;
- if (f->preg_info[v].tied_hard_reg == (i32)incoming.all_regs[j]) continue;
- forbid_preg_reg(f, v, cls, incoming.all_regs[j]);
- }
- }
-}
-
static int is_caller_saved(Func* f, u8 cls, Reg r) {
if (cls >= OPT_REG_CLASSES || r >= 32) return 0;
return (f->opt_caller_saved[cls] & (1u << r)) != 0;
@@ -339,20 +243,11 @@ static void set_preg_pref_to_ret_reg(Func* f, const Operand* op) {
if (hint == REG_NONE || hint >= 32) return;
/* Don't override a real pin. */
if (f->preg_info[v].tied_hard_reg >= 0) return;
- /* apply_param_incoming_register_hazards conservatively forbids incoming
- * param regs (e.g. x0) for every body PReg, because liveness doesn't
- * model the implicit entry-move from x0 -> the param's home reg. That
- * forbid is overly broad for call-result and ret-value PRegs:
- * - A call-result PReg's def is mid-function (the call writes x0); its
- * live range starts after every entry move, so it can't alias the
- * entry-window use of x0.
- * - A ret-value PReg is consumed at IR_RET (function exit); its live
- * range can extend through the body but the entry-move into the
- * param's home reg has already completed by the time any body inst
- * could define this PReg.
- * Clear the forbid for the hinted reg so the allocator can actually
- * pick it. The general conflict check (alloc_group_conflicts_bit) still
- * excludes intervening clobbers like other calls. */
+ /* The hint reg may not be in opt_hard_regs (e.g. x0 on aa64 is reserved
+ * as the ABI ret reg, outside aa_int_allocable); the allocator's
+ * preferred-reg branch will still consider it via the unit-overlap
+ * precision check. Clear any leftover forbid bit so the hint isn't
+ * silently blocked. */
f->preg_info[v].forbidden_hard_regs &= ~(1u << hint);
f->preg_info[v].preferred_hard_reg = (i8)hint;
}
@@ -366,9 +261,7 @@ static void set_preg_pref_for_abivalue(Func* f, const CGABIValue* v) {
/* Soft hint: prefer a specific ABI register for `op`'s PReg. Symmetric to
* set_preg_pref_to_ret_reg but takes an arbitrary hint reg (the matching
- * arg reg for the i-th call argument). Same rationale for clearing the
- * apply_param_incoming_register_hazards forbid: the source operand of a
- * call arg is defined in the body, after every entry-bind has run. */
+ * arg reg for the i-th call argument). */
static void set_preg_pref_to_arg_reg(Func* f, const Operand* op, Reg hint) {
if (!op || op->kind != OPK_REG) return;
if (hint == REG_NONE || hint >= 32) return;
@@ -382,9 +275,86 @@ static void set_preg_pref_to_arg_reg(Func* f, const Operand* op, Reg hint) {
f->preg_info[v].preferred_hard_reg = (i8)hint;
}
+/* True iff `f` contains any IR_CALL flagged as a tail call. Tail-call arg
+ * routing goes through the backend shuffle which can permute the caller's
+ * incoming arg regs into different positions for the callee — pinning each
+ * param PReg to its own incoming reg turns those permutations into multi-reg
+ * cycles the shuffle can't break. Symmetric to the per-call tail skip in
+ * set_preg_pref_for_call_args. */
+static int func_has_tail_call(const Func* f) {
+  if (!f) return 0;
+  for (u32 b = 0; b < f->nblocks; ++b) {
+    const Block* bl = &f->blocks[b];
+    for (u32 i = 0; i < bl->ninsts; ++i) {
+      const Inst* in = &bl->insts[i];
+      if ((IROp)in->op != IR_CALL) continue;
+      const IRCallAux* aux = (const IRCallAux*)in->extra.aux;
+      if (aux && (aux->desc.flags & CG_CALL_TAIL)) return 1;
+    }
+  }
+  return 0;
+}
+
+/* Hint each single-PReg-stored param toward its own incoming ABI reg. When
+ * the allocator picks the incoming reg, bind_param sees src==dst and emits
+ * no entry move (see aa_bind_native_param). Live-range conflicts at body use
+ * sites still go through the normal allocator check, so cross-call params
+ * that need a callee-save get one. Skipped when `f` contains a tail call. */
+static void set_preg_pref_for_params(Func* f) {
+  if (!f || !f->preg_info || !f->nparams) return;
+  if (func_has_tail_call(f)) return;
+  /* Per-class ABI arg cursors. Drives from per-param ABI info rather than
+   * f->desc.abi so this fires on paths where only f->params[i].abi is set. */
+  u32 next_int = 0;
+  u32 next_fp = 0;
+  /* sret on non-aa64 targets consumes the first int arg slot; aa64 (the only
+   * arch where this hint targets x0..x7 today) has no such quirk. */
+  if (f->desc.abi && f->desc.abi->has_sret &&
+      f->opt_target.arch != CFREE_ARCH_ARM_64)
+    next_int = 1;
+  for (u32 i = 0; i < f->nparams; ++i) {
+    IRParam* p = &f->params[i];
+    const ABIArgInfo* ai = p->abi;
+    /* No ABI info: the cursors can't advance past this param, so later hints
+     * would name the wrong incoming reg. Stop; earlier hints stay valid. */
+    if (!ai) return;
+    if (ai->kind == ABI_ARG_IGNORE) continue;
+    if (ai->kind == ABI_ARG_INDIRECT) {
+      ++next_int;
+      continue;
+    }
+    if (ai->kind != ABI_ARG_DIRECT) continue;
+    /* Only hint single-part DIRECT params whose home is a single PReg.
+     * Aggregate / split params take the bind_param frame-store path. */
+    int single_part_to_preg = (ai->nparts == 1) &&
+                              (p->storage.kind == CG_LOCAL_STORAGE_REG);
+    if (single_part_to_preg) {
+      const ABIArgPart* part = &ai->parts[0];
+      u8 cls = (part->cls == ABI_CLASS_FP) ? RC_FP : RC_INT;
+      u32* counter = (cls == RC_FP) ? &next_fp : &next_int;
+      Reg hint = REG_NONE;
+      if (*counter < 8u && phys_arg_reg_for_index(f, cls, *counter, &hint)) {
+        PReg v = (PReg)p->storage.v.reg;
+        if (v != PREG_NONE && v != 0 && v < opt_reg_count(f) &&
+            f->preg_info[v].cls == cls && f->preg_info[v].tied_hard_reg < 0 &&
+            f->preg_info[v].preferred_hard_reg < 0 &&
+            hint != REG_NONE && hint < 32) {
+          f->preg_info[v].forbidden_hard_regs &= ~(1u << hint);
+          f->preg_info[v].preferred_hard_reg = (i8)hint;
+        }
+      }
+    }
+    /* Advance the cursors for every part, hinted or not, to stay in sync. */
+    for (u16 j = 0; j < ai->nparts; ++j) {
+      u32* c = (ai->parts[j].cls == ABI_CLASS_FP) ? &next_fp : &next_int;
+      *c += 1u;
+    }
+  }
+}
+
/* For each IR_CALL arg whose source storage is a single OPK_REG, hint that
* PReg to the matching ABI arg register. Sequential int/fp counters mirror
- * the per-class arg slot assignment in collect_param_incoming_regs. Skips
+ * the per-class arg slot assignment used by set_preg_pref_for_params. Skips
* variadic, has_sret, and indirect/aggregate args: they need per-target
* counter logic that hasn't been factored out of plan_call. */
static void set_preg_pref_for_call_args(Func* f, const CGCallDesc* desc) {
@@ -473,6 +443,7 @@ static void propagate_hint_through_copies(Func* f) {
* a result PReg live across another call cannot pick x0). */
static void apply_abi_aliasing_hints(Func* f) {
if (!f || !f->preg_info) return;
+ set_preg_pref_for_params(f);
for (u32 b = 0; b < f->nblocks; ++b) {
Block* bl = &f->blocks[b];
for (u32 i = 0; i < bl->ninsts; ++i) {
@@ -1946,23 +1917,11 @@ static void opt_verify_alloc(Func* f, const OptLiveInfo* live) {
u32 nregs = opt_reg_count(f);
u8* cur;
if (nregs <= 1u || !live) return;
- ParamIncomingRegs incoming;
- collect_param_incoming_regs(f, &incoming);
- for (PReg v = 1; v < nregs; ++v) {
- OptPRegInfo* vi = &f->preg_info[v];
- if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_HARD || vi->use_freq == 0)
- continue;
- u8 cls = opt_preg_loc_cls(f, v);
- Reg hard = opt_preg_hard_reg(f, v);
- for (u32 i = 0; i < incoming.nall; ++i) {
- if (cls == incoming.all_cls[i] && hard == incoming.all_regs[i]) {
- SrcLoc loc = {0, 0, 0};
- compiler_panic(f->c, loc,
- "opt regalloc: O1 preg %u left in incoming cls%u reg%u",
- (unsigned)v, (unsigned)cls, (unsigned)hard);
- }
- }
- }
+ /* No "left in incoming reg" pre-check: the hint path's
+ * opt_ranges_overlap_kind precision check already permits the unit-overlap
+ * between a param PReg and its own incoming reg (= "no entry move"), and
+ * the standard allocator's bitmap rejects every other overlap. The
+ * per-instruction interference scan below is the residual safety net. */
cur = arena_array(f->arena, u8, nregs);
for (u32 b = 0; b < f->nblocks; ++b) {
Block* bl = &f->blocks[b];
@@ -2025,7 +1984,6 @@ static void opt_regalloc_place(Func* f, int allow_live_range_split,
opt_live_ranges_build(f, &live, &ranges);
opt_init_preg_info_from_ranges(f, &ranges);
opt_apply_asm_constraints_from_live(f, &live);
- apply_param_incoming_register_hazards(f);
apply_abi_aliasing_hints(f);
/* MIR coalesces only at -O2 (mir-gen.c:9431); match that here. At O1 the
* point-bitmap allocator emits copies through the natural conflict-free