kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 042552da5134e5ade4b7183f419ef24220a873ea
parent 53115ba9e2fd1150957b4cbc870ded5295b6682d
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Wed, 27 May 2026 06:37:16 -0700

aa64: make callee-saved registers allocable at O1

Expand the aa64 O1 allocable set with x19..x28 and v8..v15 (caller-saved
registers stay first so they are preferred; callee-saved are chosen only under
register pressure). The optimizer now finishes allocation before emission, so
pass_native_emit scans the lowered MIR for the callee-saved hard registers the
allocator assigned and hands them to a new NativeTarget reserve_callee_saves
hook before frame-slot mapping.

The aa64 backend reserves a save slot per used callee register first (so the
offsets stay within stur's signed-9-bit range), saves them in the back-patched
prologue (FP-relative, after FP is established), and restores them in the
epilogue. The tail-call patch restores them too before tearing down the frame;
AA_TAIL_WORDS is widened to fit. The FP class gains a callee_saved_mask
(v8..v15, low 64 bits per AAPCS64).

Verified: O0 and O1 agree on register-pressure int and fp programs; saves and
restores are balanced. Full toy suite 1333 pass / 0 fail.

Diffstat:
M src/arch/aa64/native.c | 134 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
M src/arch/native_target.h | 8 ++++++++
M src/opt/pass_native_emit.c | 40 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 166 insertions(+), 16 deletions(-)

diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c @@ -58,7 +58,7 @@ enum { AA_SP = 31u, AA_FRAME_SAVE_SIZE = 16u, AA_PROLOGUE_WORDS = 24u, - AA_TAIL_WORDS = 16u, + AA_TAIL_WORDS = 32u, }; typedef struct AANativeSlot { @@ -79,6 +79,16 @@ typedef struct AAAllocaPatch { u32 dst_reg; } AAAllocaPatch; +/* x19..x28 (10) + v8..v15 (8) is the maximum the allocator can assign. */ +#define AA_MAX_CALLEE_SAVES 18u + +typedef struct AACalleeSave { + NativeFrameSlot slot; + CfreeCgTypeId type; + u8 cls; /* NativeAllocClass */ + Reg reg; +} AACalleeSave; + typedef struct AANativeTarget { NativeTarget base; SrcLoc loc; @@ -108,6 +118,9 @@ typedef struct AANativeTarget { u32 func_start; u32 prologue_pos; MCLabel epilogue_label; + + AACalleeSave callee_saves[AA_MAX_CALLEE_SAVES]; + u32 ncallee_saves; } AANativeTarget; static AANativeTarget* aa_of(NativeTarget* t) { return (AANativeTarget*)t; } @@ -771,6 +784,7 @@ static void aa_func_begin(NativeTarget* t, const CGFuncDesc* fd) { a->va_vr_slot = NATIVE_FRAME_SLOT_NONE; a->ntail_sites = 0; a->nalloca_patches = 0; + a->ncallee_saves = 0; mc->set_section(mc, fd->text_section_id); mc->emit_align(mc, 4, 0); a->func_start = mc->pos(mc); @@ -815,6 +829,56 @@ static void aa_note_frame_state(NativeTarget* t, a->max_outgoing = state->max_outgoing; } +/* Reserve a save slot for each callee-saved register the allocator used. Runs + * before frame-slot mapping so these slots get the lowest offsets, keeping the + * prologue stores within stur's signed-9-bit range. The prologue/epilogue + * save/restore is emitted from this list in aa_patch_prologue / aa_func_end. 
*/ +static void aa_reserve_callee_saves(NativeTarget* t, const u32* used, + u32 nclasses) { + AANativeTarget* a = aa_of(t); + CfreeCgTypeId i64 = builtin_id(CFREE_CG_BUILTIN_I64); + CfreeCgTypeId f64 = builtin_id(CFREE_CG_BUILTIN_F64); + a->ncallee_saves = 0; + for (u32 cls = 0; cls < nclasses; ++cls) { + u32 mask = used[cls]; + for (Reg r = 0; r < 32u && mask; ++r) { + NativeFrameSlotDesc sd; + AACalleeSave* cs; + if ((mask & (1u << r)) == 0) continue; + mask &= ~(1u << r); + if (a->ncallee_saves >= AA_MAX_CALLEE_SAVES) + aa_panic(a, "too many callee-saved registers"); + memset(&sd, 0, sizeof sd); + sd.type = (cls == (u32)NATIVE_REG_FP) ? f64 : i64; + sd.size = 8; + sd.align = 8; + sd.kind = NATIVE_FRAME_SLOT_SAVE; + cs = &a->callee_saves[a->ncallee_saves++]; + cs->cls = (u8)cls; + cs->reg = r; + cs->type = sd.type; + cs->slot = t->frame_slot(t, &sd); + } + } +} + +static MemAccess aa_mem_for_type(NativeTarget* t, CfreeCgTypeId type, u32 size); + +static void aa_emit_callee_restores(AANativeTarget* a) { + MemAccess mem; + for (u32 i = a->ncallee_saves; i > 0; --i) { + const AACalleeSave* cs = &a->callee_saves[i - 1u]; + NativeAddr addr; + NativeLoc reg = aa_reg_loc(cs->type, (NativeAllocClass)cs->cls, cs->reg); + memset(&addr, 0, sizeof addr); + addr.base_kind = NATIVE_ADDR_BASE_FRAME; + addr.base.frame = cs->slot; + addr.base_type = cs->type; + mem = aa_mem_for_type(&a->base, cs->type, 8); + aa_emit_mem(a, 1, reg, addr, mem); + } +} + static void aa_words_load_imm(AANativeTarget* a, u32* words, u32 cap, u32* n, u32 rd, i64 imm) { u32 tmp[4]; @@ -889,6 +953,18 @@ static void aa_patch_prologue(AANativeTarget* a, u32 frame_size) { words[n++] = aa_stur_v(3, 0, AA_FP, AA_TMP1, 0); words[n++] = aa_stur_v(3, 0, AA_LR, AA_TMP1, 8); aa_words_frame_ptr_from_sp(a, words, AA_PROLOGUE_WORDS, &n, frame_size); + /* Save callee-saved registers the allocator used, FP-relative. 
Their slots + * were reserved first (aa_reserve_callee_saves), so offsets fit stur's + * signed-9-bit immediate. */ + for (u32 i = 0; i < a->ncallee_saves; ++i) { + const AACalleeSave* cs = &a->callee_saves[i]; + i32 off = -(i32)aa_slot(a, cs->slot)->off; + if (n >= AA_PROLOGUE_WORDS) aa_panic(a, "prologue too large"); + if (off < -256 || off > 255) + aa_panic(a, "callee-save offset out of prologue range"); + words[n++] = aa_stur_v(3, cs->cls == (u8)NATIVE_REG_FP ? 1u : 0u, cs->reg, + AA_FP, off); + } } while (n < AA_PROLOGUE_WORDS) words[n++] = 0xd503201fu; for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) @@ -916,6 +992,21 @@ static void aa_patch_allocas(AANativeTarget* a) { } } +/* Append FP-relative loads that restore the saved callee registers. Shared by + * the tail-call patch; the function epilogue uses aa_emit_callee_restores. */ +static void aa_words_callee_restores(AANativeTarget* a, u32* words, u32 cap, + u32* n) { + for (u32 i = a->ncallee_saves; i > 0; --i) { + const AACalleeSave* cs = &a->callee_saves[i - 1u]; + i32 off = -(i32)aa_slot(a, cs->slot)->off; + if (*n >= cap) aa_panic(a, "patch too small for callee restores"); + if (off < -256 || off > 255) + aa_panic(a, "callee-save offset out of restore range"); + words[(*n)++] = aa_ldur_v(3, cs->cls == (u8)NATIVE_REG_FP ? 
1u : 0u, cs->reg, + AA_FP, off); + } +} + static void aa_patch_tail_sites(AANativeTarget* a, u32 frame_size) { ObjSecId sec = a->func->text_section_id; for (u32 i = 0; i < a->ntail_sites; ++i) { @@ -923,6 +1014,7 @@ static void aa_patch_tail_sites(AANativeTarget* a, u32 frame_size) { u32 words[AA_TAIL_WORDS]; u32 n = 0; memset(words, 0, sizeof words); + aa_words_callee_restores(a, words, AA_TAIL_WORDS, &n); aa_words_restore_frame(a, words, AA_TAIL_WORDS, &n, frame_size); if (n >= AA_TAIL_WORDS) aa_panic(a, "tail patch too small"); if (site->callee.kind == NATIVE_LOC_REG) { @@ -944,6 +1036,7 @@ static void aa_func_end(NativeTarget* t) { MCEmitter* mc = t->mc; u32 frame_size = align_up_u32(a->cum_off + a->max_outgoing, 16u); mc->label_place(mc, a->epilogue_label); + aa_emit_callee_restores(a); aa_emit_restore_frame(a, frame_size); aa_emit32(mc, aa64_ret(AA_LR)); aa_patch_prologue(a, frame_size); @@ -2424,9 +2517,14 @@ static void aa_finalize(NativeTarget* t) { static void aa_bind_native_param(NativeTarget* t, const CGParamDesc* p, NativeFrameSlot home); -static const Reg aa_int_allocable[] = {8u, 11u, 12u, 13u, 14u, 15u}; +/* Caller-saved allocables come first so the allocator prefers them (lower + * spill_cost); callee-saved x19..x28 / v8..v15 are appended and only chosen + * under register pressure, after which the prologue saves/restores them. */ +static const Reg aa_int_allocable[] = {8u, 11u, 12u, 13u, 14u, 15u, 19u, 20u, + 21u, 22u, 23u, 24u, 25u, 26u, 27u, 28u}; static const Reg aa_int_scratch[] = {9u, 10u}; -static const Reg aa_fp_allocable[] = {18u, 19u}; +static const Reg aa_fp_allocable[] = {18u, 19u, 8u, 9u, 10u, + 11u, 12u, 13u, 14u, 15u}; static const Reg aa_fp_scratch[] = {20u, 21u}; #define AA_PHYS_INT_ALLOC(r) \ @@ -2451,12 +2549,12 @@ static const Reg aa_fp_scratch[] = {20u, 21u}; ((r) < 2u ? 
NATIVE_REG_RET : 0), \ .spill_cost = 1u, \ .copy_cost = 1u} -#define AA_PHYS_INT_CALLEE(r) \ - {.reg = (r), \ - .cls = NATIVE_REG_INT, \ - .abi_index = 0xffu, \ - .flags = NATIVE_REG_CALLEE_SAVED, \ - .spill_cost = 4u, \ +#define AA_PHYS_INT_CALLEE(r) \ + {.reg = (r), \ + .cls = NATIVE_REG_INT, \ + .abi_index = 0xffu, \ + .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLEE_SAVED, \ + .spill_cost = 4u, \ .copy_cost = 1u} #define AA_PHYS_INT_RESERVED(r) \ {.reg = (r), \ @@ -2507,12 +2605,12 @@ static const NativePhysRegInfo aa_int_phys[] = { ((r) < 4u ? NATIVE_REG_RET : 0), \ .spill_cost = 1u, \ .copy_cost = 1u} -#define AA_PHYS_FP_CALLEE(r) \ - {.reg = (r), \ - .cls = NATIVE_REG_FP, \ - .abi_index = 0xffu, \ - .flags = NATIVE_REG_CALLEE_SAVED, \ - .spill_cost = 4u, \ +#define AA_PHYS_FP_CALLEE(r) \ + {.reg = (r), \ + .cls = NATIVE_REG_FP, \ + .abi_index = 0xffu, \ + .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLEE_SAVED, \ + .spill_cost = 4u, \ .copy_cost = 1u} #define AA_PHYS_FP_RESERVED(r) \ {.reg = (r), \ @@ -2557,7 +2655,10 @@ static const NativeAllocClassInfo aa_classes[] = { .nscratch = sizeof aa_fp_scratch / sizeof aa_fp_scratch[0], .phys = aa_fp_phys, .nphys = sizeof aa_fp_phys / sizeof aa_fp_phys[0], - .caller_saved_mask = 0xffffffffu, + /* v8..v15 are callee-saved (low 64 bits per AAPCS64); the rest are + * caller-saved. 
*/ + .caller_saved_mask = 0xffff00ffu, + .callee_saved_mask = 0x0000ff00u, .arg_mask = 0x000000ffu, .ret_mask = 0x0000000fu}, }; @@ -2594,6 +2695,7 @@ NativeTarget* aa64_native_target_new(Compiler* c, ObjBuilder* obj, t->func_begin = aa_func_begin; t->func_begin_known_frame = aa_func_begin_known_frame; t->note_frame_state = aa_note_frame_state; + t->reserve_callee_saves = aa_reserve_callee_saves; t->func_end = aa_func_end; t->frame_slot = aa_frame_slot; t->bind_param = aa_bind_native_param; diff --git a/src/arch/native_target.h b/src/arch/native_target.h @@ -271,6 +271,14 @@ struct NativeTarget { const NativeKnownFrameDesc*, NativeFrameSlot* out_slots); void (*note_frame_state)(NativeTarget*, const NativeFramePatchState*); + /* Optional. Called once after func_begin and before frame-slot mapping, with + * the set of callee-saved hard registers the allocator assigned (one bitmask + * per NativeAllocClass, indexed by class id). The target reserves save slots + * and emits the prologue save / epilogue restore for each. Register + * allocation is complete before emission, so the caller knows the full set + * up front. */ + void (*reserve_callee_saves)(NativeTarget*, const u32* used_by_class, + u32 nclasses); void (*func_end)(NativeTarget*); NativeFrameSlot (*frame_slot)(NativeTarget*, const NativeFrameSlotDesc*); diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c @@ -1292,6 +1292,45 @@ static void map_frame_slots(NativeEmitCtx* e) { } } +#define EMIT_MAX_REG_CLASSES 4u + +static void collect_used_reg(Func* f, Inst* in, OptOperand* op, int is_def, + void* ctx) { + u32* used = (u32*)ctx; + (void)f; + (void)in; + (void)is_def; + if (op && op->kind == OPT_OPK_REG && op->cls < EMIT_MAX_REG_CLASSES && + op->v.reg < 32u) + used[op->cls] |= 1u << op->v.reg; +} + +/* After register allocation the MIR names hard registers directly, so we can + * scan it for the callee-saved registers the allocator assigned and ask the + * target to save/restore them. 
Must run after func_begin and before frame-slot + * mapping so the target can place the save slots first. */ +static void reserve_callee_saves(NativeEmitCtx* e) { + NativeTarget* t = e->target; + const NativeRegInfo* ri = t->regs; + u32 used[EMIT_MAX_REG_CLASSES]; + u32 nclasses; + if (!t->reserve_callee_saves || !ri) return; + memset(used, 0, sizeof used); + for (u32 b = 0; b < e->f->nblocks; ++b) { + Block* bl = &e->f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) + opt_walk_inst_operands(e->f, &bl->insts[i], collect_used_reg, used); + } + nclasses = ri->nclasses < EMIT_MAX_REG_CLASSES ? ri->nclasses + : EMIT_MAX_REG_CLASSES; + for (u32 i = 0; i < ri->nclasses; ++i) { + const NativeAllocClassInfo* ci = &ri->classes[i]; + if (ci->cls < EMIT_MAX_REG_CLASSES) + used[ci->cls] &= ci->callee_saved_mask; + } + t->reserve_callee_saves(t, used, nclasses); +} + void opt_emit_native(Compiler* c, Func* f, NativeTarget* target) { NativeEmitCtx e; Func view; @@ -1325,6 +1364,7 @@ void opt_emit_native(Compiler* c, Func* f, NativeTarget* target) { metrics_scope_begin(c, "opt.native_emit.func_begin"); target->func_begin(target, &fd); + reserve_callee_saves(&e); map_frame_slots(&e); bind_params(&e); metrics_scope_end(c, "opt.native_emit.func_begin");