commit 042552da5134e5ade4b7183f419ef24220a873ea
parent 53115ba9e2fd1150957b4cbc870ded5295b6682d
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Wed, 27 May 2026 06:37:16 -0700
aa64: make callee-saved registers allocable at O1
Expand the aa64 O1 allocable set with x19..x28 and v8..v15 (caller-saved
registers stay first so they are preferred; callee-saved are chosen only under
register pressure). The optimizer now finishes allocation before emission, so
pass_native_emit scans the lowered MIR for the callee-saved hard registers the
allocator assigned and hands them to a new NativeTarget reserve_callee_saves
hook before frame-slot mapping.
The aa64 backend first reserves one save slot per callee-saved register in use
(so the offsets stay within stur's signed-9-bit range), saves those registers in
the back-patched prologue (FP-relative, after FP is established), and restores
them in the epilogue. The tail-call patch also restores them before tearing
down the frame; AA_TAIL_WORDS is widened to fit. The FP class gains a
callee_saved_mask (v8..v15, low 64 bits per AAPCS64).
Verified: O0 and O1 agree on register-pressure int and fp programs; saves and
restores are balanced. Full toy suite 1333 pass / 0 fail.
Diffstat:
3 files changed, 166 insertions(+), 16 deletions(-)
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -58,7 +58,7 @@ enum {
AA_SP = 31u,
AA_FRAME_SAVE_SIZE = 16u,
AA_PROLOGUE_WORDS = 24u,
- AA_TAIL_WORDS = 16u,
+ AA_TAIL_WORDS = 32u,
};
typedef struct AANativeSlot {
@@ -79,6 +79,16 @@ typedef struct AAAllocaPatch {
u32 dst_reg;
} AAAllocaPatch;
+/* x19..x28 (10) + v8..v15 (8) is the maximum the allocator can assign.
+ * This is the capacity of the callee_saves array in AANativeTarget;
+ * aa_reserve_callee_saves panics if asked to record more than this. */
+#define AA_MAX_CALLEE_SAVES 18u
+
+typedef struct AACalleeSave { /* one used callee-saved register and its save slot */
+ NativeFrameSlot slot; /* 8-byte NATIVE_FRAME_SLOT_SAVE slot obtained from frame_slot */
+ CfreeCgTypeId type; /* builtin F64 for the FP class, builtin I64 otherwise */
+ u8 cls; /* NativeAllocClass */
+ Reg reg; /* hard register number (bit index in the class's used mask) */
+} AACalleeSave;
+
typedef struct AANativeTarget {
NativeTarget base;
SrcLoc loc;
@@ -108,6 +118,9 @@ typedef struct AANativeTarget {
u32 func_start;
u32 prologue_pos;
MCLabel epilogue_label;
+
+ AACalleeSave callee_saves[AA_MAX_CALLEE_SAVES];
+ u32 ncallee_saves;
} AANativeTarget;
static AANativeTarget* aa_of(NativeTarget* t) { return (AANativeTarget*)t; }
@@ -771,6 +784,7 @@ static void aa_func_begin(NativeTarget* t, const CGFuncDesc* fd) {
a->va_vr_slot = NATIVE_FRAME_SLOT_NONE;
a->ntail_sites = 0;
a->nalloca_patches = 0;
+ a->ncallee_saves = 0;
mc->set_section(mc, fd->text_section_id);
mc->emit_align(mc, 4, 0);
a->func_start = mc->pos(mc);
@@ -815,6 +829,56 @@ static void aa_note_frame_state(NativeTarget* t,
a->max_outgoing = state->max_outgoing;
}
+/* Record every callee-saved register the allocator used and give each one an
+ * 8-byte save slot.
+ *
+ * `used` holds `nclasses` bitmasks indexed by register class id; bit r set
+ * means hard register r of that class must be preserved. Registers are
+ * recorded in ascending class order, then ascending register number.
+ *
+ * This runs before frame-slot mapping so these slots get the lowest offsets,
+ * which keeps the prologue stores within stur's signed-9-bit range. The
+ * save/restore code itself is emitted later from a->callee_saves
+ * (aa_patch_prologue / aa_func_end). Panics if more than AA_MAX_CALLEE_SAVES
+ * registers are requested. */
+static void aa_reserve_callee_saves(NativeTarget* t, const u32* used,
+                                    u32 nclasses) {
+  AANativeTarget* a = aa_of(t);
+  const CfreeCgTypeId int_type = builtin_id(CFREE_CG_BUILTIN_I64);
+  const CfreeCgTypeId fp_type = builtin_id(CFREE_CG_BUILTIN_F64);
+  u32 cls;
+
+  a->ncallee_saves = 0;
+  for (cls = 0; cls < nclasses; ++cls) {
+    const CfreeCgTypeId save_type =
+        (cls == (u32)NATIVE_REG_FP) ? fp_type : int_type;
+    u32 pending = used[cls];
+    Reg r = 0;
+
+    /* Walk bits low to high; stop once every set bit has been consumed. */
+    while (r < 32u && pending != 0) {
+      const u32 bit = 1u << r;
+      if ((pending & bit) != 0) {
+        NativeFrameSlotDesc desc;
+        AACalleeSave* save;
+
+        pending &= ~bit;
+        if (a->ncallee_saves >= AA_MAX_CALLEE_SAVES)
+          aa_panic(a, "too many callee-saved registers");
+        memset(&desc, 0, sizeof desc);
+        desc.type = save_type;
+        desc.size = 8;
+        desc.align = 8;
+        desc.kind = NATIVE_FRAME_SLOT_SAVE;
+        save = &a->callee_saves[a->ncallee_saves++];
+        save->cls = (u8)cls;
+        save->reg = r;
+        save->type = save_type;
+        save->slot = t->frame_slot(t, &desc);
+      }
+      ++r;
+    }
+  }
+}
+
+static MemAccess aa_mem_for_type(NativeTarget* t, CfreeCgTypeId type, u32 size);
+
+static void aa_emit_callee_restores(AANativeTarget* a) {
+  /* Emits one frame-slot memory access per recorded callee-saved register,
+   * walking the list in the reverse of the order aa_reserve_callee_saves
+   * recorded it. Called from the function epilogue (aa_func_end). */
+  MemAccess access;
+  u32 remaining = a->ncallee_saves;
+
+  while (remaining > 0) {
+    const AACalleeSave* save = &a->callee_saves[--remaining];
+    NativeLoc dst =
+        aa_reg_loc(save->type, (NativeAllocClass)save->cls, save->reg);
+    NativeAddr src;
+
+    memset(&src, 0, sizeof src);
+    src.base_kind = NATIVE_ADDR_BASE_FRAME;
+    src.base.frame = save->slot;
+    src.base_type = save->type;
+    access = aa_mem_for_type(&a->base, save->type, 8);
+    /* NOTE(review): the literal 1 presumably selects the load direction --
+     * confirm against aa_emit_mem. */
+    aa_emit_mem(a, 1, dst, src, access);
+  }
+}
+
static void aa_words_load_imm(AANativeTarget* a, u32* words, u32 cap, u32* n,
u32 rd, i64 imm) {
u32 tmp[4];
@@ -889,6 +953,18 @@ static void aa_patch_prologue(AANativeTarget* a, u32 frame_size) {
words[n++] = aa_stur_v(3, 0, AA_FP, AA_TMP1, 0);
words[n++] = aa_stur_v(3, 0, AA_LR, AA_TMP1, 8);
aa_words_frame_ptr_from_sp(a, words, AA_PROLOGUE_WORDS, &n, frame_size);
+ /* Save callee-saved registers the allocator used, FP-relative. Their slots
+ * were reserved first (aa_reserve_callee_saves), so offsets fit stur's
+ * signed-9-bit immediate. */
+ for (u32 i = 0; i < a->ncallee_saves; ++i) {
+ const AACalleeSave* cs = &a->callee_saves[i];
+ i32 off = -(i32)aa_slot(a, cs->slot)->off;
+ if (n >= AA_PROLOGUE_WORDS) aa_panic(a, "prologue too large");
+ if (off < -256 || off > 255)
+ aa_panic(a, "callee-save offset out of prologue range");
+ words[n++] = aa_stur_v(3, cs->cls == (u8)NATIVE_REG_FP ? 1u : 0u, cs->reg,
+ AA_FP, off);
+ }
}
while (n < AA_PROLOGUE_WORDS) words[n++] = 0xd503201fu;
for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i)
@@ -916,6 +992,21 @@ static void aa_patch_allocas(AANativeTarget* a) {
}
}
+/* Append to words[*n...] one FP-relative ldur per recorded callee-saved
+ * register, last-recorded first, advancing *n past each word written.
+ * Panics if the buffer (`cap` words) is already full or if a slot's offset
+ * falls outside -256..255. The tail-call patch uses this word-buffer form;
+ * the function epilogue goes through aa_emit_callee_restores instead. */
+static void aa_words_callee_restores(AANativeTarget* a, u32* words, u32 cap,
+                                     u32* n) {
+  u32 idx = a->ncallee_saves;
+
+  while (idx-- > 0) {
+    const AACalleeSave* save = &a->callee_saves[idx];
+    /* Slot offsets are stored positive; the access is at FP minus offset. */
+    const i32 disp = -(i32)aa_slot(a, save->slot)->off;
+    const u32 is_fp = (save->cls == (u8)NATIVE_REG_FP) ? 1u : 0u;
+
+    if (*n >= cap) aa_panic(a, "patch too small for callee restores");
+    if (disp < -256 || disp > 255)
+      aa_panic(a, "callee-save offset out of restore range");
+    words[*n] = aa_ldur_v(3, is_fp, save->reg, AA_FP, disp);
+    *n += 1;
+  }
+}
+
static void aa_patch_tail_sites(AANativeTarget* a, u32 frame_size) {
ObjSecId sec = a->func->text_section_id;
for (u32 i = 0; i < a->ntail_sites; ++i) {
@@ -923,6 +1014,7 @@ static void aa_patch_tail_sites(AANativeTarget* a, u32 frame_size) {
u32 words[AA_TAIL_WORDS];
u32 n = 0;
memset(words, 0, sizeof words);
+ aa_words_callee_restores(a, words, AA_TAIL_WORDS, &n);
aa_words_restore_frame(a, words, AA_TAIL_WORDS, &n, frame_size);
if (n >= AA_TAIL_WORDS) aa_panic(a, "tail patch too small");
if (site->callee.kind == NATIVE_LOC_REG) {
@@ -944,6 +1036,7 @@ static void aa_func_end(NativeTarget* t) {
MCEmitter* mc = t->mc;
u32 frame_size = align_up_u32(a->cum_off + a->max_outgoing, 16u);
mc->label_place(mc, a->epilogue_label);
+ aa_emit_callee_restores(a);
aa_emit_restore_frame(a, frame_size);
aa_emit32(mc, aa64_ret(AA_LR));
aa_patch_prologue(a, frame_size);
@@ -2424,9 +2517,14 @@ static void aa_finalize(NativeTarget* t) {
static void aa_bind_native_param(NativeTarget* t, const CGParamDesc* p,
NativeFrameSlot home);
-static const Reg aa_int_allocable[] = {8u, 11u, 12u, 13u, 14u, 15u};
+/* Caller-saved allocables come first so the allocator prefers them (lower
+ * spill_cost); callee-saved x19..x28 / v8..v15 are appended and only chosen
+ * under register pressure, in which case the prologue saves them and the
+ * epilogue (or a tail-call patch) restores them. */
+static const Reg aa_int_allocable[] = {8u, 11u, 12u, 13u, 14u, 15u, 19u, 20u,
+ 21u, 22u, 23u, 24u, 25u, 26u, 27u, 28u};
static const Reg aa_int_scratch[] = {9u, 10u};
-static const Reg aa_fp_allocable[] = {18u, 19u};
+static const Reg aa_fp_allocable[] = {18u, 19u, 8u, 9u, 10u,
+ 11u, 12u, 13u, 14u, 15u};
static const Reg aa_fp_scratch[] = {20u, 21u};
#define AA_PHYS_INT_ALLOC(r) \
@@ -2451,12 +2549,12 @@ static const Reg aa_fp_scratch[] = {20u, 21u};
((r) < 2u ? NATIVE_REG_RET : 0), \
.spill_cost = 1u, \
.copy_cost = 1u}
-#define AA_PHYS_INT_CALLEE(r) \
- {.reg = (r), \
- .cls = NATIVE_REG_INT, \
- .abi_index = 0xffu, \
- .flags = NATIVE_REG_CALLEE_SAVED, \
- .spill_cost = 4u, \
+#define AA_PHYS_INT_CALLEE(r) \
+ {.reg = (r), \
+ .cls = NATIVE_REG_INT, \
+ .abi_index = 0xffu, \
+ .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLEE_SAVED, \
+ .spill_cost = 4u, \
.copy_cost = 1u}
#define AA_PHYS_INT_RESERVED(r) \
{.reg = (r), \
@@ -2507,12 +2605,12 @@ static const NativePhysRegInfo aa_int_phys[] = {
((r) < 4u ? NATIVE_REG_RET : 0), \
.spill_cost = 1u, \
.copy_cost = 1u}
-#define AA_PHYS_FP_CALLEE(r) \
- {.reg = (r), \
- .cls = NATIVE_REG_FP, \
- .abi_index = 0xffu, \
- .flags = NATIVE_REG_CALLEE_SAVED, \
- .spill_cost = 4u, \
+#define AA_PHYS_FP_CALLEE(r) \
+ {.reg = (r), \
+ .cls = NATIVE_REG_FP, \
+ .abi_index = 0xffu, \
+ .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLEE_SAVED, \
+ .spill_cost = 4u, \
.copy_cost = 1u}
#define AA_PHYS_FP_RESERVED(r) \
{.reg = (r), \
@@ -2557,7 +2655,10 @@ static const NativeAllocClassInfo aa_classes[] = {
.nscratch = sizeof aa_fp_scratch / sizeof aa_fp_scratch[0],
.phys = aa_fp_phys,
.nphys = sizeof aa_fp_phys / sizeof aa_fp_phys[0],
- .caller_saved_mask = 0xffffffffu,
+ /* v8..v15 are callee-saved (low 64 bits per AAPCS64); the rest are
+ * caller-saved. */
+ .caller_saved_mask = 0xffff00ffu,
+ .callee_saved_mask = 0x0000ff00u,
.arg_mask = 0x000000ffu,
.ret_mask = 0x0000000fu},
};
@@ -2594,6 +2695,7 @@ NativeTarget* aa64_native_target_new(Compiler* c, ObjBuilder* obj,
t->func_begin = aa_func_begin;
t->func_begin_known_frame = aa_func_begin_known_frame;
t->note_frame_state = aa_note_frame_state;
+ t->reserve_callee_saves = aa_reserve_callee_saves;
t->func_end = aa_func_end;
t->frame_slot = aa_frame_slot;
t->bind_param = aa_bind_native_param;
diff --git a/src/arch/native_target.h b/src/arch/native_target.h
@@ -271,6 +271,14 @@ struct NativeTarget {
const NativeKnownFrameDesc*,
NativeFrameSlot* out_slots);
void (*note_frame_state)(NativeTarget*, const NativeFramePatchState*);
+ /* Optional. Called once after func_begin and before frame-slot mapping, with
+ * the set of callee-saved hard registers the allocator assigned (one bitmask
+ * per NativeAllocClass, indexed by class id). The target reserves save slots
+ * and emits the prologue save / epilogue restore for each. Register
+ * allocation is complete before emission, so the caller knows the full set
+ * up front. */
+ void (*reserve_callee_saves)(NativeTarget*, const u32* used_by_class,
+ u32 nclasses);
void (*func_end)(NativeTarget*);
NativeFrameSlot (*frame_slot)(NativeTarget*, const NativeFrameSlotDesc*);
diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c
@@ -1292,6 +1292,45 @@ static void map_frame_slots(NativeEmitCtx* e) {
}
}
+#define EMIT_MAX_REG_CLASSES 4u
+
+static void collect_used_reg(Func* f, Inst* in, OptOperand* op, int is_def,
+                             void* ctx) {
+  /* Operand-walk callback. `ctx` is a u32[EMIT_MAX_REG_CLASSES] array of
+   * per-class bitmasks; for a register operand, set the bit for its register
+   * number in the entry for its class. Operands with an out-of-range class or
+   * register number are ignored, and defs and uses are treated alike. */
+  u32* masks = (u32*)ctx;
+
+  (void)f;
+  (void)in;
+  (void)is_def;
+  if (!op || op->kind != OPT_OPK_REG) return;
+  if (op->cls >= EMIT_MAX_REG_CLASSES || op->v.reg >= 32u) return;
+  masks[op->cls] |= 1u << op->v.reg;
+}
+
+/* After register allocation the MIR names hard registers directly, so we can
+ * scan it for the callee-saved registers the allocator assigned and ask the
+ * target to save/restore them. Must run after func_begin and before frame-slot
+ * mapping so the target can place the save slots first.
+ *
+ * Does nothing when the target has no reserve_callee_saves hook or no register
+ * info. The hook receives one mask per class id: entries for ids that no
+ * register class describes are always zero, and the array length passed is one
+ * past the highest described class id (capped at EMIT_MAX_REG_CLASSES).
+ * NOTE(review): classes with id >= EMIT_MAX_REG_CLASSES and registers >= 32
+ * are not tracked at all; confirm no target can allocate such registers. */
+static void reserve_callee_saves(NativeEmitCtx* e) {
+  NativeTarget* t = e->target;
+  const NativeRegInfo* ri = t->regs;
+  u32 used[EMIT_MAX_REG_CLASSES];
+  u32 callee_saved[EMIT_MAX_REG_CLASSES];
+  u32 nclasses = 0;
+  if (!t->reserve_callee_saves || !ri) return;
+  memset(used, 0, sizeof used);
+  memset(callee_saved, 0, sizeof callee_saved);
+  for (u32 b = 0; b < e->f->nblocks; ++b) {
+    Block* bl = &e->f->blocks[b];
+    for (u32 i = 0; i < bl->ninsts; ++i)
+      opt_walk_inst_operands(e->f, &bl->insts[i], collect_used_reg, used);
+  }
+  /* `used` is indexed by class id, not by position in ri->classes, so gather
+   * the callee-saved masks by id too and size the result by the highest id
+   * rather than by the number of classes. */
+  for (u32 i = 0; i < ri->nclasses; ++i) {
+    const NativeAllocClassInfo* ci = &ri->classes[i];
+    u32 id = (u32)ci->cls;
+    if (id >= EMIT_MAX_REG_CLASSES) continue;
+    callee_saved[id] |= ci->callee_saved_mask;
+    if (id + 1u > nclasses) nclasses = id + 1u;
+  }
+  /* Mask every entry, including ids with no class info (mask 0), so that only
+   * registers declared callee-saved ever reach the target. */
+  for (u32 id = 0; id < EMIT_MAX_REG_CLASSES; ++id)
+    used[id] &= callee_saved[id];
+  t->reserve_callee_saves(t, used, nclasses);
+}
+
void opt_emit_native(Compiler* c, Func* f, NativeTarget* target) {
NativeEmitCtx e;
Func view;
@@ -1325,6 +1364,7 @@ void opt_emit_native(Compiler* c, Func* f, NativeTarget* target) {
metrics_scope_begin(c, "opt.native_emit.func_begin");
target->func_begin(target, &fd);
+ reserve_callee_saves(&e);
map_frame_slots(&e);
bind_params(&e);
metrics_scope_end(c, "opt.native_emit.func_begin");