kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 98c440ab42655f8883d95a4d127341a27c4b75da
parent 4e913e9c8527350b317b9e01d5272bd11e077474
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Fri, 15 May 2026 16:51:22 -0700

opt: reserve only replayed hard regs

Diffstat:
M doc/OPT1.md | 25 +++++++++++++------------
M src/opt/opt.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------
M test/opt/opt_test.c | 8 ++++----
3 files changed, 117 insertions(+), 31 deletions(-)

diff --git a/doc/OPT1.md b/doc/OPT1.md @@ -151,12 +151,12 @@ O1 relies on each target backend to provide: The current target pools are: - AArch64: integer `x19-x28`, FP `v8-v23`; -- x64: integer `RBX/R12/R13/R14/R15/R10`, FP `XMM6-XMM15`; -- RV64: integer `s2-s11`, FP `fs2-fs11`. +- x64: integer `R13/R14/R15/R10`, FP `XMM6-XMM15`; +- RV64: integer `s4-s11`, FP `fs4-fs11`. Backends still own final prologue/epilogue emission and callee-saved register -preservation. O1 calls `reserve_hard_regs` with the hard registers it assigned -so backend save/restore decisions match the rewritten IR. +preservation. O1 calls `reserve_hard_regs` with the hard registers still visible +in replay after cleanup so backend save/restore decisions match the emitted IR. Targets may also provide a known-frame entry path for O1. When `func_begin_known_frame` and `call_stack_size` are both available, O1 computes @@ -273,9 +273,10 @@ b ... Remaining O1 shape issues visible in the current dumps: -- O1 still saves/restores more callee-saved registers than the body appears to - need in small functions. The AArch64 while-loop probe saves `x19-x22`, and - the x64 direct-call probe saves `rbx/r12/r13/r14` in tiny functions. +- O1 still saves/restores more callee-saved registers than ideal in some small + functions under register pressure or values live across calls. The old + unconditional scratch-register saves have been removed, but wider + caller-saved allocation needs separate call-argument safety work. - Direct-call tiny functions are still heavy at O1. The x64 `callee(x) + 2` probe emitted 167 bytes and 47 instructions across two small functions, mostly frame setup, callee-save traffic, copies, and branch-to-epilogue @@ -287,8 +288,8 @@ Remaining O1 shape issues visible in the current dumps: MIR's O1 path suggests these high-value local cleanups that still fit cfree's fast tier: -1. Avoid unnecessary callee-save traffic. 
- Reserve and preserve only hard registers that survive final post-rewrite - cleanup, and consider caller-saved registers for values that are not live - across calls. This would make small leaf functions much closer to expected - O1 output without requiring global optimization. +1. Continue reducing callee-save traffic. + O1 now reserves/preserves only replay-visible hard registers after final + cleanup. Remaining work is mostly coalescing/argument-copy quality, + pressure-sensitive choices, and safely broadening caller-saved allocation for + values that are not live across calls. diff --git a/src/opt/opt.c b/src/opt/opt.c @@ -1538,17 +1538,102 @@ static void add_unique_reg(Reg* used, u32* nused, u32 cap, Reg r) { if (*nused < cap) used[(*nused)++] = r; } -static u32 collect_opt_hard_regs(Func* f, CGTarget* w, RegClass cls, - Reg* used, u32 cap) { - u32 nused = 0; - for (Val v = 1; v < f->nvals; ++v) { - if (f->val_info[v].alloc_kind != OPT_ALLOC_HARD) continue; - if (f->val_info[v].cls != cls) continue; - add_unique_reg(used, &nused, cap, f->val_info[v].hard_reg); +static void collect_replayed_operand_reg(const Operand* op, RegClass cls, + Reg* used, u32* nused, u32 cap) { + if (!op) return; + if (op->kind == OPK_REG) { + if (op->cls == cls) add_unique_reg(used, nused, cap, op->v.reg); + } else if (op->kind == OPK_INDIRECT) { + if (cls == RC_INT) add_unique_reg(used, nused, cap, op->v.ind.base); + } +} + +static void collect_replayed_abivalue_regs(const CGABIValue* v, RegClass cls, + Reg* used, u32* nused, u32 cap) { + if (!v) return; + collect_replayed_operand_reg(&v->storage, cls, used, nused, cap); + for (u32 i = 0; i < v->nparts; ++i) + collect_replayed_operand_reg(&v->parts[i].op, cls, used, nused, cap); +} + +static void collect_replayed_param_regs(Func* f, RegClass cls, Reg* used, + u32* nused, u32 cap) { + if (!f->opt_rewritten || !f->val_info) return; + for (u32 i = 0; i < f->nparams; ++i) { + IRParam* p = &f->params[i]; + if (p->storage.kind != 
CG_LOCAL_STORAGE_REG) continue; + Val v = (Val)p->storage.v.reg; + if (v == VAL_NONE || v >= f->nvals) continue; + OptValInfo* vi = &f->val_info[v]; + if (vi->alloc_kind != OPT_ALLOC_HARD || vi->cls != cls) continue; + add_unique_reg(used, nused, cap, vi->hard_reg); } - if ((u32)cls < OPT_REG_CLASSES) { - for (u32 i = 0; i < f->opt_scratch_reg_count[cls]; ++i) - add_unique_reg(used, &nused, cap, f->opt_scratch_regs[cls][i]); +} + +static u32 collect_replayed_hard_regs(Func* f, CGTarget* w, RegClass cls, + Reg* used, u32 cap) { + u32 nused = 0; + collect_replayed_param_regs(f, cls, used, &nused, cap); + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* in = &bl->insts[i]; + if ((IROp)in->op == IR_PARAM_DECL) continue; + for (u32 j = 0; j < in->nopnds; ++j) + collect_replayed_operand_reg(&in->opnds[j], cls, used, &nused, cap); + + switch ((IROp)in->op) { + case IR_CALL: { + IRCallAux* aux = (IRCallAux*)in->extra.aux; + if (!aux) break; + collect_replayed_operand_reg(&aux->desc.callee, cls, used, &nused, + cap); + for (u32 j = 0; j < aux->desc.nargs; ++j) + collect_replayed_abivalue_regs(&aux->desc.args[j], cls, used, + &nused, cap); + collect_replayed_abivalue_regs(&aux->desc.ret, cls, used, &nused, + cap); + break; + } + case IR_RET: { + IRRetAux* aux = (IRRetAux*)in->extra.aux; + if (aux && aux->present) + collect_replayed_abivalue_regs(&aux->val, cls, used, &nused, cap); + break; + } + case IR_SCOPE_BEGIN: { + IRScopeAux* aux = (IRScopeAux*)in->extra.aux; + if (aux) + collect_replayed_operand_reg(&aux->desc.cond, cls, used, &nused, + cap); + break; + } + case IR_ASM_BLOCK: { + IRAsmAux* aux = (IRAsmAux*)in->extra.aux; + if (!aux) break; + for (u32 j = 0; j < aux->nin; ++j) + collect_replayed_operand_reg(&aux->in_ops[j], cls, used, &nused, + cap); + for (u32 j = 0; j < aux->nout; ++j) + collect_replayed_operand_reg(&aux->out_ops[j], cls, used, &nused, + cap); + break; + } + case IR_INTRINSIC: { + 
IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; + if (!aux) break; + for (u32 j = 0; j < aux->narg; ++j) + collect_replayed_operand_reg(&aux->args[j], cls, used, &nused, + cap); + for (u32 j = 0; j < aux->ndst; ++j) + collect_replayed_operand_reg(&aux->dsts[j], cls, used, &nused, + cap); + break; + } + default: + break; + } + } } if (w->resolve_reg_name) { for (u32 b = 0; b < f->nblocks; ++b) { @@ -1629,8 +1714,8 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) { if (identity && w->plan_hard_regs) { for (u32 cidx = 0; cidx < OPT_REG_CLASSES; ++cidx) { Reg used[OPT_MAX_HARD_REGS]; - u32 nused = collect_opt_hard_regs(f, w, (RegClass)cidx, used, - OPT_MAX_HARD_REGS); + u32 nused = collect_replayed_hard_regs(f, w, (RegClass)cidx, used, + OPT_MAX_HARD_REGS); w->plan_hard_regs(w, (RegClass)cidx, used, nused); } } @@ -1703,7 +1788,7 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) { /* At -O1, opt managed allocation and emitted hard regs directly, * bypassing backend-local allocation. Tell the backend which hard - * regs were actually assigned so it can save the right callee-saved + * regs are still visible in replay so it can save the right callee-saved * subset in prologue/epilogue. 
* * The backend records only callee-saved members of this set for @@ -1711,8 +1796,8 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) { if (r.identity_regs && w->reserve_hard_regs) { for (u32 c = 0; c < OPT_REG_CLASSES; ++c) { Reg used[OPT_MAX_HARD_REGS]; - u32 nused = collect_opt_hard_regs(f, w, (RegClass)c, used, - OPT_MAX_HARD_REGS); + u32 nused = collect_replayed_hard_regs(f, w, (RegClass)c, used, + OPT_MAX_HARD_REGS); if (nused) w->reserve_hard_regs(w, (RegClass)c, used, nused); } } else if (!r.identity_regs && w->reserve_hard_regs) { diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c @@ -2425,11 +2425,11 @@ static void opt_emit_no_virtual_alloc(void) { EXPECT(mock.func_begin_plan_calls == (int)OPT_REG_CLASSES, "opt_emit should plan hard regs before backend func_begin"); - EXPECT(mock.plan_regs[RC_INT] == 3, - "opt_emit should plan the hard pool reg and 2 scratch regs, got %d", + EXPECT(mock.plan_regs[RC_INT] == 1, + "opt_emit should plan only the replayed hard reg, got %d", mock.plan_regs[RC_INT]); - EXPECT(mock.reserve_calls[RC_INT] == 3, - "opt_emit should reserve the hard pool reg and 2 scratch regs, got %d", + EXPECT(mock.reserve_calls[RC_INT] == 1, + "opt_emit should reserve only the replayed hard reg, got %d", mock.reserve_calls[RC_INT]); EXPECT(mock.load_imm_calls == 1, "expected one emitted load_imm"); EXPECT(mock.last_load_imm_dst == 19,