kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 2a9bbeef597e44132e18c68c5f44eaaf679e643b
parent a3d48fb69c42dd07b58715354a1a6bda6673bbf4
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 28 May 2026 13:00:35 -0700

opt/aa64: unify deferred in-function patches into one tagged-union list

NDT Phase 4. aa64 had two ad-hoc deferred-patch arrays (alloca_patches,
tail_sites) drained by two separate calls in aa_func_end. Collapse them
into a single arch-private AAPatch tagged union (AA_PATCH_ALLOCA /
AA_PATCH_TAIL) on one growable list, appended via aa_patch_alloc and
drained by one aa_apply_patches loop that branches on the entry kind. Each
entry targets a disjoint,
fixed code position, so insertion order is irrelevant and output stays
byte-identical. A dedicated nalloca counter gates the slim prologue/frame
conditions that previously read nalloca_patches. The prologue region stays
separately patched (exactly one per function, fixed position).

Diffstat:
M src/arch/aa64/native.c | 158 +++++++++++++++++++++++++++++++++++++++++--------------------------------------
1 file changed, 82 insertions(+), 76 deletions(-)

diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c @@ -135,15 +135,24 @@ typedef struct AANativeSlot { u8 pad[3]; } AANativeSlot; -typedef struct AATailSite { +/* Deferred in-function patches, all resolved in aa_func_end once the frame + * layout (max_outgoing, callee-saves) is final. One growable list carries both + * kinds; each entry patches a disjoint, fixed code position, so insertion order + * is irrelevant. The prologue region is patched separately (exactly one per + * function, fixed position) and is not a list entry. */ +typedef enum AAPatchKind { + AA_PATCH_ALLOCA, /* single instr: add dst, sp, #max_outgoing */ + AA_PATCH_TAIL, /* AA_TAIL_WORDS region: callee restores + frame + br/b */ +} AAPatchKind; + +typedef struct AAPatch { + AAPatchKind kind; u32 pos; - NativeLoc callee; -} AATailSite; - -typedef struct AAAllocaPatch { - u32 pos; - u32 dst_reg; -} AAAllocaPatch; + union { + u32 dst_reg; /* AA_PATCH_ALLOCA */ + NativeLoc callee; /* AA_PATCH_TAIL */ + } u; +} AAPatch; /* x19..x28 (10) + v8..v15 (8) is the maximum the allocator can assign. 
*/ #define AA_MAX_CALLEE_SAVES 18u @@ -174,12 +183,10 @@ typedef struct AANativeTarget { NativeFrameSlot va_gr_slot; NativeFrameSlot va_vr_slot; - AATailSite* tail_sites; - u32 ntail_sites; - u32 tail_sites_cap; - AAAllocaPatch* alloca_patches; - u32 nalloca_patches; - u32 alloca_patches_cap; + AAPatch* patches; + u32 npatches; + u32 patches_cap; + u32 nalloca; /* count of AA_PATCH_ALLOCA entries; gates slim prologue/frame */ u32 func_start; u32 prologue_pos; @@ -956,8 +963,8 @@ static void aa_func_begin(NativeTarget* t, const CGFuncDesc* fd) { a->saved_tmp_slot = NATIVE_FRAME_SLOT_NONE; a->va_gr_slot = NATIVE_FRAME_SLOT_NONE; a->va_vr_slot = NATIVE_FRAME_SLOT_NONE; - a->ntail_sites = 0; - a->nalloca_patches = 0; + a->npatches = 0; + a->nalloca = 0; a->ncallee_saves = 0; a->slim_prologue = 0; a->slim_small_frame = 0; @@ -1282,16 +1289,17 @@ static void aa_emit_restore_frame(AANativeTarget* a, const AAFrameLayout* L) { for (u32 i = 0; i < n; ++i) aa_emit32(mc, words[i]); } -static void aa_patch_allocas(AANativeTarget* a) { - ObjSecId sec = a->func->text_section_id; - u32 imm12, sh; - for (u32 i = 0; i < a->nalloca_patches; ++i) { - AAAllocaPatch* p = &a->alloca_patches[i]; - if (!aa64_addsub_imm_fits(a->max_outgoing, &imm12, &sh)) - aa_panic(a, "outgoing area too large for alloca result"); - aa_patch32(a->base.obj, sec, p->pos, - aa64_add_imm(1, p->dst_reg, AA_SP, imm12, sh)); +/* Reserve one entry in the deferred-patch list, growing (arena-doubling) as + * needed. The returned pointer is stable until the next aa_patch_alloc. */ +static AAPatch* aa_patch_alloc(AANativeTarget* a) { + if (a->npatches == a->patches_cap) { + u32 cap = a->patches_cap ? 
a->patches_cap * 2u : 8u; + AAPatch* nb = arena_zarray(a->base.c->tu, AAPatch, cap); + if (a->patches) memcpy(nb, a->patches, sizeof(*nb) * a->npatches); + a->patches = nb; + a->patches_cap = cap; } + return &a->patches[a->npatches++]; } /* Append FP-relative loads that restore the saved callee registers (stp/ldp @@ -1302,27 +1310,38 @@ static void aa_words_callee_restores(AANativeTarget* a, u32* words, u32 cap, aa_words_callee_saves(a, 0, words, cap, n); } -static void aa_patch_tail_sites(AANativeTarget* a, const AAFrameLayout* L) { +/* Drain the deferred-patch list. Each entry targets a disjoint, fixed code + * position, so insertion order does not affect output. */ +static void aa_apply_patches(AANativeTarget* a, const AAFrameLayout* L) { ObjSecId sec = a->func->text_section_id; - for (u32 i = 0; i < a->ntail_sites; ++i) { - AATailSite* site = &a->tail_sites[i]; - u32 words[AA_TAIL_WORDS]; - u32 n = 0; - memset(words, 0, sizeof words); - aa_words_callee_restores(a, words, AA_TAIL_WORDS, &n); - aa_words_restore_frame(a, words, AA_TAIL_WORDS, &n, L); - if (n >= AA_TAIL_WORDS) aa_panic(a, "tail patch too small"); - if (site->callee.kind == NATIVE_LOC_REG) { - words[n++] = aa64_br(loc_reg(site->callee)); - } else if (site->callee.kind == NATIVE_LOC_GLOBAL) { - while (n + 1u < AA_TAIL_WORDS) words[n++] = 0xd503201fu; - words[n++] = aa64_b(0); - } else { - aa_panic(a, "unsupported tail target"); + for (u32 i = 0; i < a->npatches; ++i) { + AAPatch* p = &a->patches[i]; + if (p->kind == AA_PATCH_ALLOCA) { + u32 imm12, sh; + if (!aa64_addsub_imm_fits(a->max_outgoing, &imm12, &sh)) + aa_panic(a, "outgoing area too large for alloca result"); + aa_patch32(a->base.obj, sec, p->pos, + aa64_add_imm(1, p->u.dst_reg, AA_SP, imm12, sh)); + } else { /* AA_PATCH_TAIL */ + NativeLoc callee = p->u.callee; + u32 words[AA_TAIL_WORDS]; + u32 n = 0; + memset(words, 0, sizeof words); + aa_words_callee_restores(a, words, AA_TAIL_WORDS, &n); + aa_words_restore_frame(a, words, AA_TAIL_WORDS, 
&n, L); + if (n >= AA_TAIL_WORDS) aa_panic(a, "tail patch too small"); + if (callee.kind == NATIVE_LOC_REG) { + words[n++] = aa64_br(loc_reg(callee)); + } else if (callee.kind == NATIVE_LOC_GLOBAL) { + while (n + 1u < AA_TAIL_WORDS) words[n++] = 0xd503201fu; + words[n++] = aa64_b(0); + } else { + aa_panic(a, "unsupported tail target"); + } + while (n < AA_TAIL_WORDS) words[n++] = 0xd503201fu; + for (u32 w = 0; w < AA_TAIL_WORDS; ++w) + aa_patch32(a->base.obj, sec, p->pos + w * 4u, words[w]); } - while (n < AA_TAIL_WORDS) words[n++] = 0xd503201fu; - for (u32 w = 0; w < AA_TAIL_WORDS; ++w) - aa_patch32(a->base.obj, sec, site->pos + w * 4u, words[w]); } } @@ -1342,8 +1361,8 @@ static void aa_func_end(NativeTarget* t) { * outgoing stack args, optimizer path only (the NDT reserves a much * larger prologue region). */ a->slim_prologue = - t->emit_minimal_prologue && a->ncallee_saves == 0 && - a->nalloca_patches == 0 && L.slot_bytes == 0 && L.out_stack == 0; + t->emit_minimal_prologue && a->ncallee_saves == 0 && a->nalloca == 0 && + L.slot_bytes == 0 && L.out_stack == 0; /* Universal small-frame fast path: skip the x17/x10 scratch when the * saved-pair offset fits stp's signed 7-bit scaled immediate. Mutually * exclusive with the Tier A slim form (Tier A is strictly tighter). @@ -1351,16 +1370,14 @@ static void aa_func_end(NativeTarget* t) { * fat epilogue (sp = fp + 16 via x10) is what restores sp from fp; the * slim_small_frame epilogue's `add sp, sp, #N` only undoes the static * frame, leaving sp pointing into the alloca area. 
*/ - a->slim_small_frame = - !a->slim_prologue && a->nalloca_patches == 0 && - aa_sp_off_saved_pair(&L) <= 504u; + a->slim_small_frame = !a->slim_prologue && a->nalloca == 0 && + aa_sp_off_saved_pair(&L) <= 504u; mc->label_place(mc, a->epilogue_label); aa_emit_callee_restores(a); aa_emit_restore_frame(a, &L); aa_emit32(mc, aa64_ret(AA_LR)); aa_patch_prologue(a, &L, prologue_region); - aa_patch_allocas(a); - aa_patch_tail_sites(a, &L); + aa_apply_patches(a, &L); if (mc->cfi_set_next_pc_offset && mc->cfi_def_cfa && mc->cfi_offset) { mc->cfi_set_next_pc_offset(mc, prologue_region * 4u); /* CFA = caller's sp = fp + AA_FRAME_SAVE_SIZE. saved fp/lr at fp/fp+8 @@ -2005,23 +2022,19 @@ static void aa_alloca(NativeTarget* t, NativeLoc dst, NativeLoc size, AANativeTarget* a = aa_of(t); u32 use_align = align < 16u ? 16u : align; if (use_align & (use_align - 1u)) aa_panic(a, "alloca alignment not pow2"); - if (a->nalloca_patches == a->alloca_patches_cap) { - u32 cap = a->alloca_patches_cap ? a->alloca_patches_cap * 2u : 8u; - AAAllocaPatch* nb = arena_zarray(t->c->tu, AAAllocaPatch, cap); - if (a->alloca_patches) - memcpy(nb, a->alloca_patches, sizeof(*nb) * a->nalloca_patches); - a->alloca_patches = nb; - a->alloca_patches_cap = cap; - } aa_emit_add_imm(a, AA_TMP0, loc_reg(size), (i32)(use_align - 1u)); aa_emit_load_imm(t->mc, 1, AA_TMP1, -(i64)use_align); aa_emit32(t->mc, aa64_and(1, AA_TMP0, AA_TMP0, AA_TMP1)); aa_emit32(t->mc, aa64_add_imm(1, AA_TMP1, AA_SP, 0, 0)); aa_emit32(t->mc, aa64_sub(1, AA_TMP1, AA_TMP1, AA_TMP0)); aa_emit32(t->mc, aa64_add_imm(1, AA_SP, AA_TMP1, 0, 0)); - a->alloca_patches[a->nalloca_patches].pos = t->mc->pos(t->mc); - a->alloca_patches[a->nalloca_patches].dst_reg = loc_reg(dst); - a->nalloca_patches++; + { + AAPatch* p = aa_patch_alloc(a); + p->kind = AA_PATCH_ALLOCA; + p->pos = t->mc->pos(t->mc); + p->u.dst_reg = loc_reg(dst); + a->nalloca++; + } aa_emit32(t->mc, aa64_add_imm(1, loc_reg(dst), AA_SP, 0, 0)); } @@ -2522,22 +2535,15 @@ static void 
aa_ret(NativeTarget* t); static void aa_emit_tail_site(NativeTarget* t, NativeLoc callee) { AANativeTarget* a = aa_of(t); - if (a->ntail_sites == a->tail_sites_cap) { - u32 cap = a->tail_sites_cap ? a->tail_sites_cap * 2u : 8u; - AATailSite* nb = arena_zarray(t->c->tu, AATailSite, cap); - if (a->tail_sites) memcpy(nb, a->tail_sites, sizeof(*nb) * a->ntail_sites); - a->tail_sites = nb; - a->tail_sites_cap = cap; - } - a->tail_sites[a->ntail_sites].pos = t->mc->pos(t->mc); - a->tail_sites[a->ntail_sites].callee = callee; - a->ntail_sites++; + AAPatch* p = aa_patch_alloc(a); + p->kind = AA_PATCH_TAIL; + p->pos = t->mc->pos(t->mc); + p->u.callee = callee; for (u32 i = 0; i < AA_TAIL_WORDS; ++i) aa_emit32(t->mc, 0xd503201fu); if (callee.kind == NATIVE_LOC_GLOBAL) { - t->mc->emit_reloc_at( - t->mc, t->mc->section_id, - a->tail_sites[a->ntail_sites - 1u].pos + (AA_TAIL_WORDS - 1u) * 4u, - R_AARCH64_JUMP26, callee.v.global.sym, callee.v.global.addend, 0, 0); + t->mc->emit_reloc_at(t->mc, t->mc->section_id, + p->pos + (AA_TAIL_WORDS - 1u) * 4u, R_AARCH64_JUMP26, + callee.v.global.sym, callee.v.global.addend, 0, 0); } }