kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 0a722716448183a97354a873bcda572031919e19
parent 678aa2c2c1d76c07ae26fecbd5b4c2128564012d
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 28 May 2026 10:12:05 -0700

opt+aa64: unify on AAPCS64 frame layout; hint params to incoming regs

aa64 prologue/epilogue had three layouts (slim_prologue, slim_small_frame,
fat) that each anchored fp differently. Bind_param assumed fp == caller's
original sp, which only held for two of them; the slim layout left fp 16
bytes low, so incoming stack args were read from the saved-pair slots. The bug
triggered the moment any optimization let a function skip its callee-save spills.

Centralize the layout in a single AAFrameLayout struct with typed accessors
(aa_fp_off_in_arg, aa_fp_off_slot, aa_fp_off_saved_fp/lr, aa_sp_off_saved_pair,
aa_fp_off_tail_out_arg, aa_sp_off_out_arg) documented by one ASCII diagram.
Every site that addresses the frame now goes through a helper — no bare
arithmetic on AA_FP/AA_SP for frame addressing. All three prologue variants
anchor fp at the saved-pair address (true AAPCS64); CFI unified.

Param-hint: set_preg_pref_for_params hints each scalar param PReg toward
its own incoming ABI reg; skipped when func_has_tail_call finds a tail call (tail
shuffles can't break cycles imposed by pinned param positions). Drives add(a,b) to
'add x0, x0, x1; ret'. Requires p->abi to be populated, so lower_params
resolves it via abi_cg_func_info (scoped to the param side; doesn't activate
the dormant f->desc.abi path).

Fix latent indirect-call bug exposed by the hint: when the callee lives in
x0..x7 (e.g. a param hinted to x0), the arg-load loop would clobber it
before blr. aa_plan_call now stashes the callee into AA_TMP0 first.

Delete apply_param_incoming_register_hazards and the verifier's
"left-in-incoming-reg" pre-check. Both were dormant via the f->desc.abi
gate; both had the tail-call bug. The hint mechanism + opt_ranges_overlap_kind
unit-overlap check is the real safety net.

Diffstat:
Msrc/arch/aa64/native.c | 301+++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
Msrc/opt/cg_ir_lower.c | 11+++++++++++
Msrc/opt/pass_lower.c | 222++++++++++++++++++++++++++++++++-----------------------------------------------
3 files changed, 294 insertions(+), 240 deletions(-)

diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c @@ -61,6 +61,72 @@ enum { AA_TAIL_WORDS = 32u, }; +/* ============================================================================ + * AAPCS64 frame layout + * + * fp anchors at the caller's saved-pair address; sp anchors at the bottom of + * the outgoing-arg area. Every fp- or sp-relative offset in this file is + * computed via one of the helpers below — no site should do bare arithmetic + * on AA_FP / AA_SP for addressing the frame. + * + * high addr caller's stack frame + * +------------------------------+ + * | incoming stack args | aa_fp_off_in_arg(i) + * +------------------------------+ + * fp --> | saved x29 (prev fp) | aa_fp_off_saved_fp() + * | saved x30 (prev lr) | aa_fp_off_saved_lr() + * +------------------------------+ + * | frame slots | aa_fp_off_slot(s->off) + * | (callee-saves + locals | + * | + spills + sret/variadic) | + * +------------------------------+ + * | outgoing args | aa_sp_off_out_arg(i) + * sp --> +------------------------------+ + * low addr + * + * frame_size = align16(AA_FRAME_SAVE_SIZE + slot_bytes + out_stack). + * Tail calls write outgoing args into the caller's incoming-args window — + * physically the same address, expressed via aa_fp_off_tail_out_arg. 
+ * ========================================================================== */ + +static u32 align_up_u32(u32 v, u32 align); + +typedef struct AAFrameLayout { + u32 slot_bytes; /* sum of aa_frame_slot reservations (callee-saves + locals + * + spills + sret/variadic) */ + u32 out_stack; /* max outgoing-arg bytes across all calls in this function */ + u32 frame_size; /* align16(AA_FRAME_SAVE_SIZE + slot_bytes + out_stack) */ +} AAFrameLayout; + +static inline AAFrameLayout aa_build_layout(u32 slot_bytes, u32 out_stack) { + AAFrameLayout L; + L.slot_bytes = slot_bytes; + L.out_stack = out_stack; + L.frame_size = + align_up_u32(AA_FRAME_SAVE_SIZE + slot_bytes + out_stack, 16u); + return L; +} + +/* FP-relative byte offsets. */ +static inline i32 aa_fp_off_saved_fp(void) { return 0; } +static inline i32 aa_fp_off_saved_lr(void) { return 8; } +static inline i32 aa_fp_off_in_arg(u32 byte_off) { + return (i32)(AA_FRAME_SAVE_SIZE + byte_off); +} +static inline i32 aa_fp_off_slot(u32 slot_off) { return -(i32)slot_off; } +/* Outgoing stack args on a tail call land in the caller's incoming-arg + * window — same physical address the tail-callee will read via + * aa_fp_off_in_arg. Same helper, distinct name for site-side intent. */ +static inline i32 aa_fp_off_tail_out_arg(u32 byte_off) { + return aa_fp_off_in_arg(byte_off); +} + +/* SP-relative byte offsets. 
*/ +static inline i32 aa_sp_off_out_arg(u32 byte_off) { return (i32)byte_off; } +static inline u32 aa_sp_off_saved_pair(const AAFrameLayout* L) { + return L->frame_size - AA_FRAME_SAVE_SIZE; +} + typedef struct AANativeSlot { u32 off; u32 size; @@ -545,7 +611,7 @@ static void aa_addr_base(AANativeTarget* a, NativeAddr addr, u32* base_out, case NATIVE_ADDR_BASE_FRAME: { AANativeSlot* s = aa_slot(a, addr.base.frame); *base_out = AA_FP; - *off_out = -(i32)s->off + addr.offset; + *off_out = aa_fp_off_slot(s->off) + addr.offset; return; } case NATIVE_ADDR_BASE_GLOBAL: { @@ -819,7 +885,7 @@ static u32 aa_ldst_q_simm9(int load, u32 rt, u32 rn, i32 byte_off) { static void aa_emit_q_frame(AANativeTarget* a, int load, u32 qreg, NativeFrameSlot slot, u32 offset) { AANativeSlot* s = aa_slot(a, slot); - i32 off = -(i32)s->off + (i32)offset; + i32 off = aa_fp_off_slot(s->off) + (i32)offset; MCEmitter* mc = a->base.mc; if (off >= 0 && ((u32)off & 15u) == 0 && ((u32)off >> 4) <= 0xfffu) { aa_emit32(mc, aa_ldst_q_uimm(load, qreg, AA_FP, (u32)off)); @@ -873,11 +939,18 @@ static void aa_func_begin(NativeTarget* t, const CGFuncDesc* fd) { MCEmitter* mc = t->mc; a->func = fd; a->nslots = 0; - a->cum_off = AA_FRAME_SAVE_SIZE; + /* cum_off counts frame-slot bytes below fp (see AAFrameLayout above). + * The saved fp/lr pair (16 bytes at [fp, fp+8]) is *not* part of cum_off; + * the frame-size computation in aa_func_end adds it via aa_build_layout. */ + a->cum_off = 0; a->max_outgoing = 0; a->incoming_stack_size = 0; a->next_param_int = 0; a->next_param_fp = 0; + /* 0-based byte cursor for incoming stack args (also reported as the + * caller's incoming_stack_size for tail-call realizability). bind_param + * forms its fp-relative address via aa_fp_off_in_arg(next_param_stack), + * which adds the saved-pair offset. 
*/ a->next_param_stack = 0; a->sret_ptr_slot = NATIVE_FRAME_SLOT_NONE; a->saved_tmp_slot = NATIVE_FRAME_SLOT_NONE; @@ -1015,23 +1088,30 @@ static void aa_words_sub_sp_frame(AANativeTarget* a, u32* words, u32 cap, words[(*n)++] = aa64_add_imm(1, AA_SP, AA_TMP1, 0, 0); } +/* Anchor fp at the AAPCS64 saved-pair address (= sp + saved-pair offset). + * The slim_prologue path achieves the same anchor in a single insn via + * `add x29, sp, #0` after the pre-decrement stp moves sp to the saved-pair. */ static void aa_words_frame_ptr_from_sp(AANativeTarget* a, u32* words, u32 cap, - u32* n, u32 frame_size) { + u32* n, const AAFrameLayout* L) { u32 imm12, sh; - if (aa64_addsub_imm_fits(frame_size, &imm12, &sh)) { + u32 anchor = aa_sp_off_saved_pair(L); + if (aa64_addsub_imm_fits(anchor, &imm12, &sh)) { if (*n >= cap) aa_panic(a, "instruction patch too small"); words[(*n)++] = aa64_add_imm(1, AA_FP, AA_SP, imm12, sh); return; } - aa_words_load_imm(a, words, cap, n, AA_TMP0, frame_size); + aa_words_load_imm(a, words, cap, n, AA_TMP0, anchor); if (*n + 2u > cap) aa_panic(a, "instruction patch too small"); words[(*n)++] = aa64_add_imm(1, AA_TMP1, AA_SP, 0, 0); words[(*n)++] = aa64_add(1, AA_FP, AA_TMP1, AA_TMP0); } +/* x17 = address of the saved-pair slot (= sp + saved-pair offset). Used by + * the fat prologue to materialize the stp destination when the offset + * doesn't fit stp's signed-7-bit-scaled immediate. 
*/ static void aa_words_saved_pair_addr(AANativeTarget* a, u32* words, u32 cap, - u32* n, u32 frame_size) { - u32 save_off = frame_size - AA_FRAME_SAVE_SIZE; + u32* n, const AAFrameLayout* L) { + u32 save_off = aa_sp_off_saved_pair(L); u32 imm12, sh; if (aa64_addsub_imm_fits(save_off, &imm12, &sh)) { if (*n >= cap) aa_panic(a, "instruction patch too small"); @@ -1045,33 +1125,34 @@ static void aa_words_saved_pair_addr(AANativeTarget* a, u32* words, u32 cap, } static void aa_words_restore_frame(AANativeTarget* a, u32* words, u32 cap, - u32* n, u32 frame_size) { - if (!frame_size) return; + u32* n, const AAFrameLayout* L) { + if (!L->frame_size) return; if (a->slim_prologue) { if (*n + 1u > cap) aa_panic(a, "instruction patch too small"); - /* `ldp x29, x30, [sp], #16` — pops the saved pair and restores sp. */ + /* `ldp x29, x30, [sp], #16` — pop saved pair, restore sp. */ words[(*n)++] = aa64_ldp64_post(AA_FP, AA_LR, AA_SP, 2); return; } if (a->slim_small_frame) { - /* `ldp x29,x30,[sp,#N-16] ; add sp,sp,#N` — skip the `add x10,fp,#0` - * scratch the fat path uses. Restoring fp,lr through sp+offset avoids - * the scratch entirely; the subsequent `add sp` then unwinds the frame - * without depending on the (now-clobbered) old fp. */ - u32 save_off = frame_size - AA_FRAME_SAVE_SIZE; + /* `ldp x29,x30,[sp,#saved_pair] ; add sp,sp,#frame_size` — load through + * sp avoids the fat path's `add x10, fp, #0` scratch, and the subsequent + * `add sp` unwinds without depending on the (now-clobbered) old fp. 
*/ + u32 save_off = aa_sp_off_saved_pair(L); u32 imm12, sh; if (*n + 2u > cap) aa_panic(a, "instruction patch too small"); words[(*n)++] = aa64_ldp64_soff(AA_FP, AA_LR, AA_SP, (i32)(save_off / 8u)); - if (!aa64_addsub_imm_fits(frame_size, &imm12, &sh)) + if (!aa64_addsub_imm_fits(L->frame_size, &imm12, &sh)) aa_panic(a, "slim_small_frame: frame_size out of addsub imm range"); words[(*n)++] = aa64_add_imm(1, AA_SP, AA_SP, imm12, sh); return; } if (*n + 3u > cap) aa_panic(a, "instruction patch too small"); + /* AAPCS64: fp is the saved-pair address. Reload pair from [fp], then + * restore sp to fp + AA_FRAME_SAVE_SIZE (= caller's original sp = CFA). */ words[(*n)++] = aa64_add_imm(1, AA_TMP0, AA_FP, 0, 0); - words[(*n)++] = aa64_ldp64_soff(AA_FP, AA_LR, AA_TMP0, -2); /* fp,lr @ -16 */ - words[(*n)++] = aa64_add_imm(1, AA_SP, AA_TMP0, 0, 0); + words[(*n)++] = aa64_ldp64_soff(AA_FP, AA_LR, AA_TMP0, 0); + words[(*n)++] = aa64_add_imm(1, AA_SP, AA_TMP0, AA_FRAME_SAVE_SIZE, 0); } /* Emit callee-save store (save=1) or restore (save=0) words into `words`, @@ -1084,13 +1165,14 @@ static void aa_words_callee_saves(AANativeTarget* a, int save, u32* words, u32 cap, u32* n) { for (u32 i = 0; i < a->ncallee_saves;) { const AACalleeSave* cs = &a->callee_saves[i]; - i32 off = -(i32)aa_slot(a, cs->slot)->off; + i32 off = aa_fp_off_slot(aa_slot(a, cs->slot)->off); if (off < -256 || off > 255) aa_panic(a, "callee-save offset out of prologue range"); if (i + 1u < a->ncallee_saves && cs->cls == (u8)NATIVE_REG_INT && a->callee_saves[i + 1u].cls == (u8)NATIVE_REG_INT) { const AACalleeSave* cs2 = &a->callee_saves[i + 1u]; - i32 off2 = -(i32)aa_slot(a, cs2->slot)->off; /* off - 8, lower address */ + i32 off2 = aa_fp_off_slot(aa_slot(a, cs2->slot)->off); + /* off2 = off - 8 (lower address; reserve allocates downward). */ if (*n >= cap) aa_panic(a, "prologue too large"); words[(*n)++] = save ? 
aa64_stp64_soff(cs2->reg, cs->reg, AA_FP, off2 / 8) @@ -1106,57 +1188,63 @@ static void aa_words_callee_saves(AANativeTarget* a, int save, u32* words, } } -/* Build the prologue instruction words for `frame_size` into `words` (capacity - * `cap`), returning the count. Shared by the NativeDirectTarget patch path - * (reserves a fixed worst-case region, then patches it here) and the optimizer - * path (emits an exact-size region up front; see aa_emit_prologue). */ -static u32 aa_build_prologue_words(AANativeTarget* a, u32 frame_size, u32* words, - u32 cap) { +/* Build the prologue instruction words for `L` into `words` (capacity `cap`), + * returning the count. Shared by the NativeDirectTarget patch path (reserves + * a fixed worst-case region, then patches it here) and the optimizer path + * (emits an exact-size region up front; see aa_emit_prologue). + * + * All three variants establish the same post-prologue state defined by L: + * sp = caller's sp - L->frame_size + * fp = sp + aa_sp_off_saved_pair(L) (saved-pair address) + * saved x29/x30 at [fp], [fp+8] + * callee-saves at [fp - s->off] for each. */ +static u32 aa_build_prologue_words(AANativeTarget* a, const AAFrameLayout* L, + u32* words, u32 cap) { u32 n = 0; - if (!frame_size) return 0; + if (!L->frame_size) return 0; if (a->slim_prologue) { if (cap < 2u) aa_panic(a, "prologue too large"); - /* `stp x29, x30, [sp, #-16]!` — push the saved pair and adjust sp in - * one instruction. `mov x29, sp` keeps the AAPCS64 backtrace chain - * intact (some unwinders walk x29 directly rather than via DWARF). */ + /* `stp x29, x30, [sp, #-16]!; add x29, sp, #0` — the pre-decrement stp + * moves sp down to the saved-pair address, so a no-op add anchors fp + * there directly. AAPCS64 frame record. 
*/ words[n++] = aa64_stp64_pre(AA_FP, AA_LR, AA_SP, -2); words[n++] = aa64_add_imm(1, AA_FP, AA_SP, 0, 0); return n; } - aa_words_sub_sp_frame(a, words, cap, &n, frame_size); + aa_words_sub_sp_frame(a, words, cap, &n, L->frame_size); if (a->slim_small_frame) { - /* `stp x29, x30, [sp, #(frame_size-16)]` — skip the `add x17, sp, #N-16` - * scratch step the fat path emits. Valid when (frame_size - 16) fits the - * stp signed-7-bit scaled immediate (i.e. frame_size <= 520). */ - u32 save_off = frame_size - AA_FRAME_SAVE_SIZE; + /* `stp x29, x30, [sp, #saved_pair_off]` — skip the `add x17, sp, #...` + * scratch the fat path needs. Valid when the offset fits stp's + * signed-7-bit scaled immediate (saved_pair_off <= 504). */ + u32 save_off = aa_sp_off_saved_pair(L); if (n >= cap) aa_panic(a, "prologue too large"); words[n++] = aa64_stp64_soff(AA_FP, AA_LR, AA_SP, (i32)(save_off / 8u)); } else { - aa_words_saved_pair_addr(a, words, cap, &n, frame_size); + aa_words_saved_pair_addr(a, words, cap, &n, L); if (n >= cap) aa_panic(a, "prologue too large"); words[n++] = aa64_stp64_soff(AA_FP, AA_LR, AA_TMP1, 0); /* fp,lr @ [x17] */ } - aa_words_frame_ptr_from_sp(a, words, cap, &n, frame_size); - /* Save callee-saved registers the allocator used, FP-relative. Their slots - * were reserved first (aa_reserve_callee_saves), so offsets fit stur's - * signed-9-bit immediate. */ + aa_words_frame_ptr_from_sp(a, words, cap, &n, L); + /* Save callee-saved registers the allocator used (fp-relative; their slots + * were reserved first by aa_reserve_callee_saves so offsets fit stur). */ aa_words_callee_saves(a, 1, words, cap, &n); return n; } /* Patch the reserved prologue region (`region` words at prologue_pos) with the - * real prologue for `frame_size`. Used by the NativeDirectTarget single-pass - * path, which reserves AA_PROLOGUE_WORDS up front before the frame is known. - * The optimizer path reserves exactly the words it needs, so `region` equals + * real prologue for `L`. 
Used by the NativeDirectTarget single-pass path, + * which reserves AA_PROLOGUE_WORDS up front before the frame is known. The + * optimizer path reserves exactly the words it needs, so `region` equals * the real prologue length and no tail remains. */ -static void aa_patch_prologue(AANativeTarget* a, u32 frame_size, u32 region) { +static void aa_patch_prologue(AANativeTarget* a, const AAFrameLayout* L, + u32 region) { u32 words[AA_PROLOGUE_WORDS]; u32 n; ObjSecId sec = a->func->text_section_id; if (region > AA_PROLOGUE_WORDS) aa_panic(a, "prologue region too large"); memset(words, 0, sizeof words); - n = aa_build_prologue_words(a, frame_size, words, region); + n = aa_build_prologue_words(a, L, words, region); /* If the real prologue is shorter than the reserved region (the worst-case * NDT reservation), branch straight to the body rather than leaving the * trailing slots as NOPs that fall through and execute on every call. */ @@ -1171,41 +1259,27 @@ static void aa_patch_prologue(AANativeTarget* a, u32 frame_size, u32 region) { /* Optimizer path: emit an exact-size prologue in place (no reserved NOP * region). The callee-save set and the static frame slots are final by now, so * the prologue's instruction count is fixed; only the frame-size immediates - * (sub sp / save-area address / fp = sp+frame) still depend on body-emitted - * temporaries and are patched in func_end. We size the region with a frame that - * fits add/sub's imm12 (the real frame must too, or func_end's rebuild — capped - * at this length — panics). The sret/variadic entry saves follow, as on the - * single-pass path. */ + * (sub sp / save-area address / fp = sp+saved_pair) still depend on body- + * emitted temporaries and are patched in func_end. We size the region with + * a frame that fits add/sub's imm12 (the real frame must too, or func_end's + * rebuild — capped at this length — panics). 
*/ static void aa_emit_prologue(NativeTarget* t) { AANativeTarget* a = aa_of(t); u32 words[AA_PROLOGUE_WORDS]; - u32 est_frame = align_up_u32(a->cum_off + a->max_outgoing, 16u); - u32 n = aa_build_prologue_words(a, est_frame, words, AA_PROLOGUE_WORDS); + AAFrameLayout L = aa_build_layout(a->cum_off, a->max_outgoing); + u32 n = aa_build_prologue_words(a, &L, words, AA_PROLOGUE_WORDS); for (u32 i = 0; i < n; ++i) aa_emit32(t->mc, words[i]); a->minimal_prologue_words = n; aa_emit_entry_saves(a); } -static void aa_emit_restore_frame(AANativeTarget* a, u32 frame_size) { +static void aa_emit_restore_frame(AANativeTarget* a, const AAFrameLayout* L) { MCEmitter* mc = a->base.mc; - if (!frame_size) return; - if (a->slim_prologue) { - aa_emit32(mc, aa64_ldp64_post(AA_FP, AA_LR, AA_SP, 2)); - return; - } - if (a->slim_small_frame) { - u32 save_off = frame_size - AA_FRAME_SAVE_SIZE; - u32 imm12, sh; - aa_emit32(mc, - aa64_ldp64_soff(AA_FP, AA_LR, AA_SP, (i32)(save_off / 8u))); - if (!aa64_addsub_imm_fits(frame_size, &imm12, &sh)) - aa_panic(a, "slim_small_frame: frame_size out of addsub imm range"); - aa_emit32(mc, aa64_add_imm(1, AA_SP, AA_SP, imm12, sh)); - return; - } - aa_emit32(mc, aa64_add_imm(1, AA_TMP0, AA_FP, 0, 0)); - aa_emit32(mc, aa64_ldp64_soff(AA_FP, AA_LR, AA_TMP0, -2)); /* fp,lr @ -16 */ - aa_emit32(mc, aa64_add_imm(1, AA_SP, AA_TMP0, 0, 0)); + u32 words[AA_PROLOGUE_WORDS]; + u32 n = 0; + if (!L->frame_size) return; + aa_words_restore_frame(a, words, AA_PROLOGUE_WORDS, &n, L); + for (u32 i = 0; i < n; ++i) aa_emit32(mc, words[i]); } static void aa_patch_allocas(AANativeTarget* a) { @@ -1228,7 +1302,7 @@ static void aa_words_callee_restores(AANativeTarget* a, u32* words, u32 cap, aa_words_callee_saves(a, 0, words, cap, n); } -static void aa_patch_tail_sites(AANativeTarget* a, u32 frame_size) { +static void aa_patch_tail_sites(AANativeTarget* a, const AAFrameLayout* L) { ObjSecId sec = a->func->text_section_id; for (u32 i = 0; i < a->ntail_sites; ++i) { 
AATailSite* site = &a->tail_sites[i]; @@ -1236,7 +1310,7 @@ static void aa_patch_tail_sites(AANativeTarget* a, u32 frame_size) { u32 n = 0; memset(words, 0, sizeof words); aa_words_callee_restores(a, words, AA_TAIL_WORDS, &n); - aa_words_restore_frame(a, words, AA_TAIL_WORDS, &n, frame_size); + aa_words_restore_frame(a, words, AA_TAIL_WORDS, &n, L); if (n >= AA_TAIL_WORDS) aa_panic(a, "tail patch too small"); if (site->callee.kind == NATIVE_LOC_REG) { words[n++] = aa64_br(loc_reg(site->callee)); @@ -1255,7 +1329,7 @@ static void aa_patch_tail_sites(AANativeTarget* a, u32 frame_size) { static void aa_func_end(NativeTarget* t) { AANativeTarget* a = aa_of(t); MCEmitter* mc = t->mc; - u32 frame_size = align_up_u32(a->cum_off + a->max_outgoing, 16u); + AAFrameLayout L = aa_build_layout(a->cum_off, a->max_outgoing); /* Optimizer path emitted an exact-size prologue (minimal_prologue_words); * the single-pass path reserved a fixed worst-case region. Either way the * frame-size immediates are only final now, so patch the region in place. */ @@ -1264,45 +1338,36 @@ static void aa_func_end(NativeTarget* t) { /* Slim Tier A eligibility (set before emitting the epilogue / patching the * prologue so the *_restore_frame / *_build_prologue_words helpers pick the * slim form). Conditions: no callee-saves needed, no alloca, no body - * locals/spills (cum_off untouched past the reserved fp/lr save area), no - * outgoing stack args, and only on the optimizer path (the NDT reserves a - * much larger prologue region and isn't on the bench path). sret/variadic - * disqualify naturally because their entry-save slots advance cum_off. */ + * slots (locals/spills/sret/variadic — all counted in slot_bytes), no + * outgoing stack args, optimizer path only (the NDT reserves a much + * larger prologue region). 
*/ a->slim_prologue = t->emit_minimal_prologue && a->ncallee_saves == 0 && - a->nalloca_patches == 0 && a->cum_off == AA_FRAME_SAVE_SIZE && - a->max_outgoing == 0; + a->nalloca_patches == 0 && L.slot_bytes == 0 && L.out_stack == 0; /* Universal small-frame fast path: skip the x17/x10 scratch when the - * saved-pair offset (frame_size - 16) fits stp's signed 7-bit scaled - * immediate. Mutually exclusive with the Tier A slim form (Tier A is - * strictly tighter — 2-insn prologue, 1-insn restore). Disqualify alloca: - * alloca dynamically moves sp during the body, and the fat epilogue's - * `add sp, fp, #0` (via x10) is what restores sp from fp. The slim - * epilogue's `add sp, sp, #N` only undoes the static frame, leaving sp - * pointing into the alloca area. */ + * saved-pair offset fits stp's signed 7-bit scaled immediate. Mutually + * exclusive with the Tier A slim form (Tier A is strictly tighter). + * Disqualify alloca: alloca dynamically moves sp during the body, and the + * fat epilogue (sp = fp + 16 via x10) is what restores sp from fp; the + * slim_small_frame epilogue's `add sp, sp, #N` only undoes the static + * frame, leaving sp pointing into the alloca area. */ a->slim_small_frame = !a->slim_prologue && a->nalloca_patches == 0 && - frame_size >= AA_FRAME_SAVE_SIZE && - (frame_size - AA_FRAME_SAVE_SIZE) <= 504u; + aa_sp_off_saved_pair(&L) <= 504u; mc->label_place(mc, a->epilogue_label); aa_emit_callee_restores(a); - aa_emit_restore_frame(a, frame_size); + aa_emit_restore_frame(a, &L); aa_emit32(mc, aa64_ret(AA_LR)); - aa_patch_prologue(a, frame_size, prologue_region); + aa_patch_prologue(a, &L, prologue_region); aa_patch_allocas(a); - aa_patch_tail_sites(a, frame_size); + aa_patch_tail_sites(a, &L); if (mc->cfi_set_next_pc_offset && mc->cfi_def_cfa && mc->cfi_offset) { mc->cfi_set_next_pc_offset(mc, prologue_region * 4u); - if (a->slim_prologue) { - /* After `stp x29,x30,[sp,#-16]!; mov x29,sp`: CFA = sp + 16, fp at - * CFA-16, lr at CFA-8. 
SP-anchored stays correct for the entire body - * since slim Tier A has no further sp moves. */ - mc->cfi_def_cfa(mc, AA_SP, 16); - } else { - mc->cfi_def_cfa(mc, AA_FP, 0); - } - mc->cfi_offset(mc, AA_FP, -16); - mc->cfi_offset(mc, AA_LR, -8); + /* CFA = caller's sp = fp + AA_FRAME_SAVE_SIZE. saved fp/lr at fp/fp+8 + * (= CFA-16, CFA-8). Unified across all three prologue layouts. */ + mc->cfi_def_cfa(mc, AA_FP, AA_FRAME_SAVE_SIZE); + mc->cfi_offset(mc, AA_FP, aa_fp_off_saved_fp() - (i32)AA_FRAME_SAVE_SIZE); + mc->cfi_offset(mc, AA_LR, aa_fp_off_saved_lr() - (i32)AA_FRAME_SAVE_SIZE); } obj_symbol_define(t->obj, a->func->sym, a->func->text_section_id, a->func_start, mc->pos(mc) - a->func_start); @@ -1467,7 +1532,7 @@ static void aa_load_addr(NativeTarget* t, NativeLoc dst, NativeAddr addr) { switch ((NativeAddrBaseKind)addr.base_kind) { case NATIVE_ADDR_BASE_FRAME: { AANativeSlot* s = aa_slot(a, addr.base.frame); - aa_emit_add_imm(a, rd, AA_FP, -(i32)s->off + addr.offset); + aa_emit_add_imm(a, rd, AA_FP, aa_fp_off_slot(s->off) + addr.offset); aa_apply_index(a, rd, &addr); return; } @@ -2073,7 +2138,12 @@ static void aa_store_outgoing_part(NativeTarget* t, int tail_call, addr.base_kind = NATIVE_ADDR_BASE_REG; addr.base.reg = tail_call ? AA_FP : AA_SP; addr.base_type = src.type; - addr.offset = (i32)stack_off; + /* Tail calls write outgoing args into the caller's incoming-args window + * (= [fp + 16 + off], same address the tail-callee will read via + * aa_fp_off_in_arg). Non-tail calls write to the sp-anchored outgoing + * area at the bottom of the caller's frame. */ + addr.offset = tail_call ? 
aa_fp_off_tail_out_arg(stack_off) + : aa_sp_off_out_arg(stack_off); aa_emit_mem(aa_of(t), 0, src, addr, mem); } @@ -2214,6 +2284,18 @@ static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc, plan->stack_arg_size = aa_call_stack_size(t, desc); if (plan->stack_arg_size > aa_of(t)->max_outgoing) aa_of(t)->max_outgoing = plan->stack_arg_size; + /* Indirect call whose callee lives in x0..x7: the upcoming arg-load loop + * writes those same registers and would clobber the function pointer + * before blr reads it. Stash callee into AA_TMP0 (x16) up front and + * retarget the call. (AA_TMP0 is a backend scratch, never an arg reg.) */ + if (plan->callee.kind == NATIVE_LOC_REG && + (NativeAllocClass)plan->callee.cls == NATIVE_REG_INT && + plan->callee.v.reg < 8u) { + NativeLoc scratch = + aa_reg_loc(plan->callee.type, NATIVE_REG_INT, AA_TMP0); + aa_move(t, scratch, plan->callee); + plan->callee = scratch; + } { u32 next_int = 0, next_fp = 0, stack = 0; int tail_call = (desc->flags & CG_CALL_TAIL) != 0; @@ -3172,7 +3254,7 @@ static void aa_bind_native_param(NativeTarget* t, const CGParamDesc* p, memset(&saddr, 0, sizeof saddr); saddr.base_kind = NATIVE_ADDR_BASE_REG; saddr.base.reg = AA_FP; - saddr.offset = (i32)a->next_param_stack; + saddr.offset = aa_fp_off_in_arg(a->next_param_stack); aa_emit_mem(a, 1, src, saddr, aa_mem_for_type(t, p->type, 8)); a->next_param_stack += 8u; } @@ -3215,7 +3297,7 @@ static void aa_bind_native_param(NativeTarget* t, const CGParamDesc* p, saddr.base_kind = NATIVE_ADDR_BASE_REG; saddr.base.reg = AA_FP; saddr.base_type = p->type; - saddr.offset = (i32)a->next_param_stack; + saddr.offset = aa_fp_off_in_arg(a->next_param_stack); aa_emit_mem(a, 1, src, saddr, aa_mem_for_type(t, p->type, part->size)); a->next_param_stack += aa_part_stack_size(part); } @@ -3376,7 +3458,10 @@ static void aa_va_start_core(AANativeTarget* a, NativeAddr ap) { NativeLoc ptr = aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT, AA_TMP0); if (vai.kind 
== ABI_VA_LIST_POINTER) { - aa_emit_add_imm(a, AA_TMP0, AA_FP, (i32)a->next_param_stack); + /* `va_list = &<first vararg>`. Variadic stack args follow the fixed + * incoming params in the same caller window, so the offset is the + * current next_param_stack cursor. */ + aa_emit_add_imm(a, AA_TMP0, AA_FP, aa_fp_off_in_arg(a->next_param_stack)); aa_emit_mem(a, 0, ptr, ap, aa_mem_for_type(t, ptr.type, 8)); return; } diff --git a/src/opt/cg_ir_lower.c b/src/opt/cg_ir_lower.c @@ -192,6 +192,16 @@ static const CgIrParam* find_param(const CgIrFunc* f, CGLocal local) { } static void lower_params(CgIrLower* l) { + /* Resolve the function-level ABI info once so we can attach per-param + * ABIArgInfo to each IRParam. Consumers (set_preg_pref_for_params, the + * native bind_param emit path) read p->abi without going through + * f->desc.abi, so this stays scoped to the param plumbing and does not + * activate the dormant f->desc.abi-gated passes (e.g. + * apply_param_incoming_register_hazards, opt_verify_alloc's incoming + * check), which have known issues with tail-call shuffles. 
*/ + const ABIFuncInfo* fi = NULL; + if (l->c && l->c->abi && l->f->desc.fn_type) + fi = abi_cg_func_info(l->c->abi, l->f->desc.fn_type); for (u32 i = 0; i < l->src->nlocals; ++i) { const CgIrLocal* loc = &l->src->locals[i]; if (!loc->is_param) continue; @@ -217,6 +227,7 @@ static void lower_params(CgIrLower* l) { d.loc = loc->desc.loc; } d.storage = m->storage; + if (fi && d.index < fi->nparams) d.abi = &fi->params[d.index]; ir_param_add(l->f, &d); } } diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c @@ -218,104 +218,8 @@ static int phys_arg_reg_for_index(Func* f, u8 cls, u32 abi_index, Reg* out) { return 0; } -typedef struct ParamIncomingRegs { - Reg regs[64]; - u8 cls[64]; - u8 has[64]; - u32 nparams; - Reg all_regs[128]; - u8 all_cls[128]; - u32 nall; -} ParamIncomingRegs; - -static void param_incoming_add(ParamIncomingRegs* out, u8 cls, Reg r) { - if (r >= 32) return; - for (u32 i = 0; i < out->nall; ++i) - if (out->all_cls[i] == cls && out->all_regs[i] == r) return; - if (out->nall < 128u) { - out->all_cls[out->nall] = cls; - out->all_regs[out->nall] = r; - ++out->nall; - } -} - -static void collect_param_incoming_regs(Func* f, ParamIncomingRegs* out) { - memset(out, 0, sizeof *out); - if (!f || !f->desc.abi || !f->nparams) return; - - u32 next_int = 0; - u32 next_fp = 0; - if (f->desc.abi->has_sret && f->opt_target.arch != CFREE_ARCH_ARM_64) - next_int = 1; - - out->nparams = f->nparams < 64u ? 
f->nparams : 64u; - for (u32 i = 0; i < out->nparams; ++i) { - IRParam* p = &f->params[i]; - const ABIArgInfo* ai = p->abi; - if (!ai || ai->kind == ABI_ARG_IGNORE) continue; - if (ai->kind == ABI_ARG_INDIRECT) { - Reg r = REG_NONE; - if (phys_arg_reg_for_index(f, RC_INT, next_int, &r)) { - out->regs[i] = r; - out->cls[i] = RC_INT; - out->has[i] = 1; - param_incoming_add(out, RC_INT, r); - } - ++next_int; - continue; - } - if (ai->kind != ABI_ARG_DIRECT) continue; - for (u16 j = 0; j < ai->nparts; ++j) { - const ABIArgPart* part = &ai->parts[j]; - if (part->cls == ABI_CLASS_FP) { - Reg r = REG_NONE; - if (phys_arg_reg_for_index(f, RC_FP, next_fp, &r)) { - param_incoming_add(out, RC_FP, r); - if (ai->nparts == 1) { - out->regs[i] = r; - out->cls[i] = RC_FP; - out->has[i] = 1; - } - } - ++next_fp; - } else if (part->cls == ABI_CLASS_INT) { - Reg r = REG_NONE; - if (phys_arg_reg_for_index(f, RC_INT, next_int, &r)) { - param_incoming_add(out, RC_INT, r); - if (ai->nparts == 1) { - out->regs[i] = r; - out->cls[i] = RC_INT; - out->has[i] = 1; - } - } - ++next_int; - } - } - } -} - static int hard_available(Func* f, u8 cls, Reg r); -static void apply_param_incoming_register_hazards(Func* f) { - if (!f || !f->preg_info || !f->desc.abi || !f->nparams) return; - ParamIncomingRegs incoming; - collect_param_incoming_regs(f, &incoming); - - /* O1 replays parameter materialization before the body, but values left in - * their ABI incoming registers are not represented as live from function - * entry to first use. Keep those incoming registers out of virtual - * allocation so the backend emits explicit entry moves/stores before body - * code can reuse them. Fixed asm constraints still use tied_hard_reg. 
*/ - for (PReg v = 1; v < opt_reg_count(f); ++v) { - u8 cls = f->preg_info[v].cls; - for (u32 j = 0; j < incoming.nall; ++j) { - if (incoming.all_cls[j] != cls) continue; - if (f->preg_info[v].tied_hard_reg == (i32)incoming.all_regs[j]) continue; - forbid_preg_reg(f, v, cls, incoming.all_regs[j]); - } - } -} - static int is_caller_saved(Func* f, u8 cls, Reg r) { if (cls >= OPT_REG_CLASSES || r >= 32) return 0; return (f->opt_caller_saved[cls] & (1u << r)) != 0; @@ -339,20 +243,11 @@ static void set_preg_pref_to_ret_reg(Func* f, const Operand* op) { if (hint == REG_NONE || hint >= 32) return; /* Don't override a real pin. */ if (f->preg_info[v].tied_hard_reg >= 0) return; - /* apply_param_incoming_register_hazards conservatively forbids incoming - * param regs (e.g. x0) for every body PReg, because liveness doesn't - * model the implicit entry-move from x0 -> the param's home reg. That - * forbid is overly broad for call-result and ret-value PRegs: - * - A call-result PReg's def is mid-function (the call writes x0); its - * live range starts after every entry move, so it can't alias the - * entry-window use of x0. - * - A ret-value PReg is consumed at IR_RET (function exit); its live - * range can extend through the body but the entry-move into the - * param's home reg has already completed by the time any body inst - * could define this PReg. - * Clear the forbid for the hinted reg so the allocator can actually - * pick it. The general conflict check (alloc_group_conflicts_bit) still - * excludes intervening clobbers like other calls. */ + /* The hint reg may not be in opt_hard_regs (e.g. x0 on aa64 is reserved + * as the ABI ret reg, outside aa_int_allocable); the allocator's + * preferred-reg branch will still consider it via the unit-overlap + * precision check. Clear any leftover forbid bit so the hint isn't + * silently blocked. 
*/ f->preg_info[v].forbidden_hard_regs &= ~(1u << hint); f->preg_info[v].preferred_hard_reg = (i8)hint; } @@ -366,9 +261,7 @@ static void set_preg_pref_for_abivalue(Func* f, const CGABIValue* v) { /* Soft hint: prefer a specific ABI register for `op`'s PReg. Symmetric to * set_preg_pref_to_ret_reg but takes an arbitrary hint reg (the matching - * arg reg for the i-th call argument). Same rationale for clearing the - * apply_param_incoming_register_hazards forbid: the source operand of a - * call arg is defined in the body, after every entry-bind has run. */ + * arg reg for the i-th call argument). */ static void set_preg_pref_to_arg_reg(Func* f, const Operand* op, Reg hint) { if (!op || op->kind != OPK_REG) return; if (hint == REG_NONE || hint >= 32) return; @@ -382,9 +275,86 @@ static void set_preg_pref_to_arg_reg(Func* f, const Operand* op, Reg hint) { f->preg_info[v].preferred_hard_reg = (i8)hint; } +/* Hint each single-PReg-stored param toward its own incoming ABI reg. When + * the allocator picks the incoming reg, bind_param sees src==dst and emits + * no entry move (aa_bind_native_param checks at native.c:3227). Live-range + * conflicts at body use sites still go through the normal allocator check, + * so cross-call params that need a callee-save get one. */ +/* True iff `f` contains any IR_CALL flagged as a tail call. Tail-call arg + * routing goes through the backend shuffle which can permute the caller's + * incoming arg regs into different positions for the callee — pinning each + * param PReg to its own incoming reg turns those permutations into multi-reg + * cycles the shuffle can't break. Symmetric to the per-call tail skip in + * set_preg_pref_for_call_args. 
*/ +static int func_has_tail_call(const Func* f) { + if (!f) return 0; + for (u32 b = 0; b < f->nblocks; ++b) { + const Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) { + const Inst* in = &bl->insts[i]; + if ((IROp)in->op != IR_CALL) continue; + const IRCallAux* aux = (const IRCallAux*)in->extra.aux; + if (aux && (aux->desc.flags & CG_CALL_TAIL)) return 1; + } + } + return 0; +} + +static void set_preg_pref_for_params(Func* f) { + if (!f || !f->preg_info || !f->nparams) return; + if (func_has_tail_call(f)) return; + /* Per-class ABI arg cursors. Drives from per-param ABI info rather than + * f->desc.abi so this fires on paths where only f->params[i].abi is set. */ + u32 next_int = 0; + u32 next_fp = 0; + /* sret on non-aa64 targets consumes the first int arg slot. Only consult + * f->desc.abi for this when it's available; aa64 (the only arch where this + * hint targets x0..x7 today) doesn't have the sret-takes-arg0 quirk. */ + if (f->desc.abi && f->desc.abi->has_sret && + f->opt_target.arch != CFREE_ARCH_ARM_64) + next_int = 1; + for (u32 i = 0; i < f->nparams; ++i) { + IRParam* p = &f->params[i]; + const ABIArgInfo* ai = p->abi; + if (!ai || ai->kind == ABI_ARG_IGNORE) continue; + if (ai->kind == ABI_ARG_INDIRECT) { + ++next_int; + continue; + } + if (ai->kind != ABI_ARG_DIRECT) continue; + /* Only hint single-part DIRECT params whose home is a single PReg. + * Aggregate / split params take the bind_param frame-store path. */ + int single_part_to_preg = (ai->nparts == 1) && + (p->storage.kind == CG_LOCAL_STORAGE_REG); + if (single_part_to_preg) { + const ABIArgPart* part = &ai->parts[0]; + u8 cls = (part->cls == ABI_CLASS_FP) ? RC_FP : RC_INT; + u32* counter = (cls == RC_FP) ? 
&next_fp : &next_int; + Reg hint = REG_NONE; + if (*counter < 8u && phys_arg_reg_for_index(f, cls, *counter, &hint)) { + PReg v = (PReg)p->storage.v.reg; + if (v != PREG_NONE && v != 0 && v < opt_reg_count(f) && + f->preg_info[v].cls == cls && + f->preg_info[v].tied_hard_reg < 0 && + f->preg_info[v].preferred_hard_reg < 0 && + hint != REG_NONE && hint < 32) { + f->preg_info[v].forbidden_hard_regs &= ~(1u << hint); + f->preg_info[v].preferred_hard_reg = (i8)hint; + } + } + } + /* Advance the ABI cursors for every part of this param's home, regardless + * of whether we hinted, so subsequent params see the right slot. */ + for (u16 j = 0; j < ai->nparts; ++j) { + u32* c = (ai->parts[j].cls == ABI_CLASS_FP) ? &next_fp : &next_int; + *c += 1u; + } + } +} + /* For each IR_CALL arg whose source storage is a single OPK_REG, hint that * PReg to the matching ABI arg register. Sequential int/fp counters mirror - * the per-class arg slot assignment in collect_param_incoming_regs. Skips + * the per-class arg slot assignment used by set_preg_pref_for_params. Skips * variadic, has_sret, and indirect/aggregate args: they need per-target * counter logic that hasn't been factored out of plan_call. */ static void set_preg_pref_for_call_args(Func* f, const CGCallDesc* desc) { @@ -473,6 +443,7 @@ static void propagate_hint_through_copies(Func* f) { * a result PReg live across another call cannot pick x0). 
*/ static void apply_abi_aliasing_hints(Func* f) { if (!f || !f->preg_info) return; + set_preg_pref_for_params(f); for (u32 b = 0; b < f->nblocks; ++b) { Block* bl = &f->blocks[b]; for (u32 i = 0; i < bl->ninsts; ++i) { @@ -1946,23 +1917,11 @@ static void opt_verify_alloc(Func* f, const OptLiveInfo* live) { u32 nregs = opt_reg_count(f); u8* cur; if (nregs <= 1u || !live) return; - ParamIncomingRegs incoming; - collect_param_incoming_regs(f, &incoming); - for (PReg v = 1; v < nregs; ++v) { - OptPRegInfo* vi = &f->preg_info[v]; - if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_HARD || vi->use_freq == 0) - continue; - u8 cls = opt_preg_loc_cls(f, v); - Reg hard = opt_preg_hard_reg(f, v); - for (u32 i = 0; i < incoming.nall; ++i) { - if (cls == incoming.all_cls[i] && hard == incoming.all_regs[i]) { - SrcLoc loc = {0, 0, 0}; - compiler_panic(f->c, loc, - "opt regalloc: O1 preg %u left in incoming cls%u reg%u", - (unsigned)v, (unsigned)cls, (unsigned)hard); - } - } - } + /* No "left in incoming reg" pre-check: the hint path's + * opt_ranges_overlap_kind precision check already permits the unit-overlap + * between a param PReg and its own incoming reg (= "no entry move"), and + * the standard allocator's bitmap rejects every other overlap. The + * per-instruction interference scan below is the residual safety net. */ cur = arena_array(f->arena, u8, nregs); for (u32 b = 0; b < f->nblocks; ++b) { Block* bl = &f->blocks[b]; @@ -2025,7 +1984,6 @@ static void opt_regalloc_place(Func* f, int allow_live_range_split, opt_live_ranges_build(f, &live, &ranges); opt_init_preg_info_from_ranges(f, &ranges); opt_apply_asm_constraints_from_live(f, &live); - apply_param_incoming_register_hazards(f); apply_abi_aliasing_hints(f); /* MIR coalesces only at -O2 (mir-gen.c:9431); match that here. At O1 the * point-bitmap allocator emits copies through the natural conflict-free