commit 2a9bbeef597e44132e18c68c5f44eaaf679e643b
parent a3d48fb69c42dd07b58715354a1a6bda6673bbf4
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 28 May 2026 13:00:35 -0700
opt/aa64: unify deferred in-function patches into one tagged-union list
NDT Phase 4. aa64 had two ad-hoc deferred-patch arrays (alloca_patches,
tail_sites) drained by two separate calls in aa_func_end. Collapse them
into a single arch-private AAPatch tagged union (AA_PATCH_ALLOCA /
AA_PATCH_TAIL) on one growable list, appended via aa_patch_alloc and
drained by one aa_apply_patches switch. Each entry targets a disjoint,
fixed code position, so insertion order is irrelevant and output stays
byte-identical. A dedicated nalloca counter gates the slim prologue/frame
conditions that previously read nalloca_patches. The prologue region stays
separately patched (exactly one per function, fixed position).
Diffstat:
1 file changed, 82 insertions(+), 76 deletions(-)
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -135,15 +135,24 @@ typedef struct AANativeSlot {
u8 pad[3];
} AANativeSlot;
-typedef struct AATailSite {
+/* Deferred in-function patches, all resolved in aa_func_end once the frame
+ * layout (max_outgoing, callee-saves) is final. One growable list carries both
+ * kinds; each entry patches a disjoint, fixed code position, so insertion order
+ * is irrelevant. The prologue region is patched separately (exactly one per
+ * function, fixed position) and is not a list entry. */
+typedef enum AAPatchKind {
+ AA_PATCH_ALLOCA, /* single instr: add dst, sp, #max_outgoing */
+ AA_PATCH_TAIL, /* AA_TAIL_WORDS region: callee restores + frame + br/b */
+} AAPatchKind;
+
+typedef struct AAPatch {
+ AAPatchKind kind;
u32 pos;
- NativeLoc callee;
-} AATailSite;
-
-typedef struct AAAllocaPatch {
- u32 pos;
- u32 dst_reg;
-} AAAllocaPatch;
+ union {
+ u32 dst_reg; /* AA_PATCH_ALLOCA */
+ NativeLoc callee; /* AA_PATCH_TAIL */
+ } u;
+} AAPatch;
/* x19..x28 (10) + v8..v15 (8) is the maximum the allocator can assign. */
#define AA_MAX_CALLEE_SAVES 18u
@@ -174,12 +183,10 @@ typedef struct AANativeTarget {
NativeFrameSlot va_gr_slot;
NativeFrameSlot va_vr_slot;
- AATailSite* tail_sites;
- u32 ntail_sites;
- u32 tail_sites_cap;
- AAAllocaPatch* alloca_patches;
- u32 nalloca_patches;
- u32 alloca_patches_cap;
+ AAPatch* patches;
+ u32 npatches;
+ u32 patches_cap;
+ u32 nalloca; /* count of AA_PATCH_ALLOCA entries; gates slim prologue/frame */
u32 func_start;
u32 prologue_pos;
@@ -956,8 +963,8 @@ static void aa_func_begin(NativeTarget* t, const CGFuncDesc* fd) {
a->saved_tmp_slot = NATIVE_FRAME_SLOT_NONE;
a->va_gr_slot = NATIVE_FRAME_SLOT_NONE;
a->va_vr_slot = NATIVE_FRAME_SLOT_NONE;
- a->ntail_sites = 0;
- a->nalloca_patches = 0;
+ a->npatches = 0;
+ a->nalloca = 0;
a->ncallee_saves = 0;
a->slim_prologue = 0;
a->slim_small_frame = 0;
@@ -1282,16 +1289,17 @@ static void aa_emit_restore_frame(AANativeTarget* a, const AAFrameLayout* L) {
for (u32 i = 0; i < n; ++i) aa_emit32(mc, words[i]);
}
-static void aa_patch_allocas(AANativeTarget* a) {
- ObjSecId sec = a->func->text_section_id;
- u32 imm12, sh;
- for (u32 i = 0; i < a->nalloca_patches; ++i) {
- AAAllocaPatch* p = &a->alloca_patches[i];
- if (!aa64_addsub_imm_fits(a->max_outgoing, &imm12, &sh))
- aa_panic(a, "outgoing area too large for alloca result");
- aa_patch32(a->base.obj, sec, p->pos,
- aa64_add_imm(1, p->dst_reg, AA_SP, imm12, sh));
+/* Reserve one entry in the deferred-patch list, growing (arena-doubling) as
+ * needed. The returned pointer is stable until the next aa_patch_alloc. */
+static AAPatch* aa_patch_alloc(AANativeTarget* a) {
+ if (a->npatches == a->patches_cap) {
+ u32 cap = a->patches_cap ? a->patches_cap * 2u : 8u;
+ AAPatch* nb = arena_zarray(a->base.c->tu, AAPatch, cap);
+ if (a->patches) memcpy(nb, a->patches, sizeof(*nb) * a->npatches);
+ a->patches = nb;
+ a->patches_cap = cap;
}
+ return &a->patches[a->npatches++];
}
/* Append FP-relative loads that restore the saved callee registers (stp/ldp
@@ -1302,27 +1310,38 @@ static void aa_words_callee_restores(AANativeTarget* a, u32* words, u32 cap,
aa_words_callee_saves(a, 0, words, cap, n);
}
-static void aa_patch_tail_sites(AANativeTarget* a, const AAFrameLayout* L) {
+/* Drain the deferred-patch list. Each entry targets a disjoint, fixed code
+ * position, so insertion order does not affect output. */
+static void aa_apply_patches(AANativeTarget* a, const AAFrameLayout* L) {
ObjSecId sec = a->func->text_section_id;
- for (u32 i = 0; i < a->ntail_sites; ++i) {
- AATailSite* site = &a->tail_sites[i];
- u32 words[AA_TAIL_WORDS];
- u32 n = 0;
- memset(words, 0, sizeof words);
- aa_words_callee_restores(a, words, AA_TAIL_WORDS, &n);
- aa_words_restore_frame(a, words, AA_TAIL_WORDS, &n, L);
- if (n >= AA_TAIL_WORDS) aa_panic(a, "tail patch too small");
- if (site->callee.kind == NATIVE_LOC_REG) {
- words[n++] = aa64_br(loc_reg(site->callee));
- } else if (site->callee.kind == NATIVE_LOC_GLOBAL) {
- while (n + 1u < AA_TAIL_WORDS) words[n++] = 0xd503201fu;
- words[n++] = aa64_b(0);
- } else {
- aa_panic(a, "unsupported tail target");
+ for (u32 i = 0; i < a->npatches; ++i) {
+ AAPatch* p = &a->patches[i];
+ if (p->kind == AA_PATCH_ALLOCA) {
+ u32 imm12, sh;
+ if (!aa64_addsub_imm_fits(a->max_outgoing, &imm12, &sh))
+ aa_panic(a, "outgoing area too large for alloca result");
+ aa_patch32(a->base.obj, sec, p->pos,
+ aa64_add_imm(1, p->u.dst_reg, AA_SP, imm12, sh));
+ } else { /* AA_PATCH_TAIL */
+ NativeLoc callee = p->u.callee;
+ u32 words[AA_TAIL_WORDS];
+ u32 n = 0;
+ memset(words, 0, sizeof words);
+ aa_words_callee_restores(a, words, AA_TAIL_WORDS, &n);
+ aa_words_restore_frame(a, words, AA_TAIL_WORDS, &n, L);
+ if (n >= AA_TAIL_WORDS) aa_panic(a, "tail patch too small");
+ if (callee.kind == NATIVE_LOC_REG) {
+ words[n++] = aa64_br(loc_reg(callee));
+ } else if (callee.kind == NATIVE_LOC_GLOBAL) {
+ while (n + 1u < AA_TAIL_WORDS) words[n++] = 0xd503201fu;
+ words[n++] = aa64_b(0);
+ } else {
+ aa_panic(a, "unsupported tail target");
+ }
+ while (n < AA_TAIL_WORDS) words[n++] = 0xd503201fu;
+ for (u32 w = 0; w < AA_TAIL_WORDS; ++w)
+ aa_patch32(a->base.obj, sec, p->pos + w * 4u, words[w]);
}
- while (n < AA_TAIL_WORDS) words[n++] = 0xd503201fu;
- for (u32 w = 0; w < AA_TAIL_WORDS; ++w)
- aa_patch32(a->base.obj, sec, site->pos + w * 4u, words[w]);
}
}
@@ -1342,8 +1361,8 @@ static void aa_func_end(NativeTarget* t) {
* outgoing stack args, optimizer path only (the NDT reserves a much
* larger prologue region). */
a->slim_prologue =
- t->emit_minimal_prologue && a->ncallee_saves == 0 &&
- a->nalloca_patches == 0 && L.slot_bytes == 0 && L.out_stack == 0;
+ t->emit_minimal_prologue && a->ncallee_saves == 0 && a->nalloca == 0 &&
+ L.slot_bytes == 0 && L.out_stack == 0;
/* Universal small-frame fast path: skip the x17/x10 scratch when the
* saved-pair offset fits stp's signed 7-bit scaled immediate. Mutually
* exclusive with the Tier A slim form (Tier A is strictly tighter).
@@ -1351,16 +1370,14 @@ static void aa_func_end(NativeTarget* t) {
* fat epilogue (sp = fp + 16 via x10) is what restores sp from fp; the
* slim_small_frame epilogue's `add sp, sp, #N` only undoes the static
* frame, leaving sp pointing into the alloca area. */
- a->slim_small_frame =
- !a->slim_prologue && a->nalloca_patches == 0 &&
- aa_sp_off_saved_pair(&L) <= 504u;
+ a->slim_small_frame = !a->slim_prologue && a->nalloca == 0 &&
+ aa_sp_off_saved_pair(&L) <= 504u;
mc->label_place(mc, a->epilogue_label);
aa_emit_callee_restores(a);
aa_emit_restore_frame(a, &L);
aa_emit32(mc, aa64_ret(AA_LR));
aa_patch_prologue(a, &L, prologue_region);
- aa_patch_allocas(a);
- aa_patch_tail_sites(a, &L);
+ aa_apply_patches(a, &L);
if (mc->cfi_set_next_pc_offset && mc->cfi_def_cfa && mc->cfi_offset) {
mc->cfi_set_next_pc_offset(mc, prologue_region * 4u);
/* CFA = caller's sp = fp + AA_FRAME_SAVE_SIZE. saved fp/lr at fp/fp+8
@@ -2005,23 +2022,19 @@ static void aa_alloca(NativeTarget* t, NativeLoc dst, NativeLoc size,
AANativeTarget* a = aa_of(t);
u32 use_align = align < 16u ? 16u : align;
if (use_align & (use_align - 1u)) aa_panic(a, "alloca alignment not pow2");
- if (a->nalloca_patches == a->alloca_patches_cap) {
- u32 cap = a->alloca_patches_cap ? a->alloca_patches_cap * 2u : 8u;
- AAAllocaPatch* nb = arena_zarray(t->c->tu, AAAllocaPatch, cap);
- if (a->alloca_patches)
- memcpy(nb, a->alloca_patches, sizeof(*nb) * a->nalloca_patches);
- a->alloca_patches = nb;
- a->alloca_patches_cap = cap;
- }
aa_emit_add_imm(a, AA_TMP0, loc_reg(size), (i32)(use_align - 1u));
aa_emit_load_imm(t->mc, 1, AA_TMP1, -(i64)use_align);
aa_emit32(t->mc, aa64_and(1, AA_TMP0, AA_TMP0, AA_TMP1));
aa_emit32(t->mc, aa64_add_imm(1, AA_TMP1, AA_SP, 0, 0));
aa_emit32(t->mc, aa64_sub(1, AA_TMP1, AA_TMP1, AA_TMP0));
aa_emit32(t->mc, aa64_add_imm(1, AA_SP, AA_TMP1, 0, 0));
- a->alloca_patches[a->nalloca_patches].pos = t->mc->pos(t->mc);
- a->alloca_patches[a->nalloca_patches].dst_reg = loc_reg(dst);
- a->nalloca_patches++;
+ {
+ AAPatch* p = aa_patch_alloc(a);
+ p->kind = AA_PATCH_ALLOCA;
+ p->pos = t->mc->pos(t->mc);
+ p->u.dst_reg = loc_reg(dst);
+ a->nalloca++;
+ }
aa_emit32(t->mc, aa64_add_imm(1, loc_reg(dst), AA_SP, 0, 0));
}
@@ -2522,22 +2535,15 @@ static void aa_ret(NativeTarget* t);
static void aa_emit_tail_site(NativeTarget* t, NativeLoc callee) {
AANativeTarget* a = aa_of(t);
- if (a->ntail_sites == a->tail_sites_cap) {
- u32 cap = a->tail_sites_cap ? a->tail_sites_cap * 2u : 8u;
- AATailSite* nb = arena_zarray(t->c->tu, AATailSite, cap);
- if (a->tail_sites) memcpy(nb, a->tail_sites, sizeof(*nb) * a->ntail_sites);
- a->tail_sites = nb;
- a->tail_sites_cap = cap;
- }
- a->tail_sites[a->ntail_sites].pos = t->mc->pos(t->mc);
- a->tail_sites[a->ntail_sites].callee = callee;
- a->ntail_sites++;
+ AAPatch* p = aa_patch_alloc(a);
+ p->kind = AA_PATCH_TAIL;
+ p->pos = t->mc->pos(t->mc);
+ p->u.callee = callee;
for (u32 i = 0; i < AA_TAIL_WORDS; ++i) aa_emit32(t->mc, 0xd503201fu);
if (callee.kind == NATIVE_LOC_GLOBAL) {
- t->mc->emit_reloc_at(
- t->mc, t->mc->section_id,
- a->tail_sites[a->ntail_sites - 1u].pos + (AA_TAIL_WORDS - 1u) * 4u,
- R_AARCH64_JUMP26, callee.v.global.sym, callee.v.global.addend, 0, 0);
+ t->mc->emit_reloc_at(t->mc, t->mc->section_id,
+ p->pos + (AA_TAIL_WORDS - 1u) * 4u, R_AARCH64_JUMP26,
+ callee.v.global.sym, callee.v.global.addend, 0, 0);
}
}