commit 98c440ab42655f8883d95a4d127341a27c4b75da
parent 4e913e9c8527350b317b9e01d5272bd11e077474
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 15 May 2026 16:51:22 -0700
opt: reserve only replayed hard regs
Diffstat:
3 files changed, 117 insertions(+), 31 deletions(-)
diff --git a/doc/OPT1.md b/doc/OPT1.md
@@ -151,12 +151,12 @@ O1 relies on each target backend to provide:
The current target pools are:
- AArch64: integer `x19-x28`, FP `v8-v23`;
-- x64: integer `RBX/R12/R13/R14/R15/R10`, FP `XMM6-XMM15`;
-- RV64: integer `s2-s11`, FP `fs2-fs11`.
+- x64: integer `R13/R14/R15/R10`, FP `XMM6-XMM15`;
+- RV64: integer `s4-s11`, FP `fs4-fs11`.
Backends still own final prologue/epilogue emission and callee-saved register
-preservation. O1 calls `reserve_hard_regs` with the hard registers it assigned
-so backend save/restore decisions match the rewritten IR.
+preservation. O1 calls `reserve_hard_regs` with the hard registers still visible
+in replay after cleanup so backend save/restore decisions match the emitted IR.
Targets may also provide a known-frame entry path for O1. When
`func_begin_known_frame` and `call_stack_size` are both available, O1 computes
@@ -273,9 +273,10 @@ b ...
Remaining O1 shape issues visible in the current dumps:
-- O1 still saves/restores more callee-saved registers than the body appears to
- need in small functions. The AArch64 while-loop probe saves `x19-x22`, and
- the x64 direct-call probe saves `rbx/r12/r13/r14` in tiny functions.
+- O1 still saves/restores more callee-saved registers than ideal in some small
+ functions under register pressure or with values live across calls. The old
+ unconditional scratch-register saves have been removed, but wider
+ caller-saved allocation needs separate call-argument safety work.
- Direct-call tiny functions are still heavy at O1. The x64 `callee(x) + 2`
probe emitted 167 bytes and 47 instructions across two small functions,
mostly frame setup, callee-save traffic, copies, and branch-to-epilogue
@@ -287,8 +288,8 @@ Remaining O1 shape issues visible in the current dumps:
MIR's O1 path suggests these high-value local cleanups that still fit cfree's
fast tier:
-1. Avoid unnecessary callee-save traffic.
- Reserve and preserve only hard registers that survive final post-rewrite
- cleanup, and consider caller-saved registers for values that are not live
- across calls. This would make small leaf functions much closer to expected
- O1 output without requiring global optimization.
+1. Continue reducing callee-save traffic.
+ O1 now reserves/preserves only replay-visible hard registers after final
+ cleanup. Remaining work is mostly coalescing/argument-copy quality,
+ pressure-sensitive choices, and safely broadening caller-saved allocation for
+ values that are not live across calls.
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -1538,17 +1538,102 @@ static void add_unique_reg(Reg* used, u32* nused, u32 cap, Reg r) {
if (*nused < cap) used[(*nused)++] = r;
}
-static u32 collect_opt_hard_regs(Func* f, CGTarget* w, RegClass cls,
- Reg* used, u32 cap) {
- u32 nused = 0;
- for (Val v = 1; v < f->nvals; ++v) {
- if (f->val_info[v].alloc_kind != OPT_ALLOC_HARD) continue;
- if (f->val_info[v].cls != cls) continue;
- add_unique_reg(used, &nused, cap, f->val_info[v].hard_reg);
+static void collect_replayed_operand_reg(const Operand* op, RegClass cls,
+ Reg* used, u32* nused, u32 cap) {
+ if (!op) return;
+ if (op->kind == OPK_REG) {
+ if (op->cls == cls) add_unique_reg(used, nused, cap, op->v.reg);
+ } else if (op->kind == OPK_INDIRECT) {
+ if (cls == RC_INT) add_unique_reg(used, nused, cap, op->v.ind.base);
+ }
+}
+
+static void collect_replayed_abivalue_regs(const CGABIValue* v, RegClass cls,
+ Reg* used, u32* nused, u32 cap) {
+ if (!v) return;
+ collect_replayed_operand_reg(&v->storage, cls, used, nused, cap);
+ for (u32 i = 0; i < v->nparts; ++i)
+ collect_replayed_operand_reg(&v->parts[i].op, cls, used, nused, cap);
+}
+
+static void collect_replayed_param_regs(Func* f, RegClass cls, Reg* used,
+ u32* nused, u32 cap) {
+ if (!f->opt_rewritten || !f->val_info) return;
+ for (u32 i = 0; i < f->nparams; ++i) {
+ IRParam* p = &f->params[i];
+ if (p->storage.kind != CG_LOCAL_STORAGE_REG) continue;
+ Val v = (Val)p->storage.v.reg;
+ if (v == VAL_NONE || v >= f->nvals) continue;
+ OptValInfo* vi = &f->val_info[v];
+ if (vi->alloc_kind != OPT_ALLOC_HARD || vi->cls != cls) continue;
+ add_unique_reg(used, nused, cap, vi->hard_reg);
}
- if ((u32)cls < OPT_REG_CLASSES) {
- for (u32 i = 0; i < f->opt_scratch_reg_count[cls]; ++i)
- add_unique_reg(used, &nused, cap, f->opt_scratch_regs[cls][i]);
+}
+
+static u32 collect_replayed_hard_regs(Func* f, CGTarget* w, RegClass cls,
+ Reg* used, u32 cap) {
+ u32 nused = 0;
+ collect_replayed_param_regs(f, cls, used, &nused, cap);
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* in = &bl->insts[i];
+ if ((IROp)in->op == IR_PARAM_DECL) continue;
+ for (u32 j = 0; j < in->nopnds; ++j)
+ collect_replayed_operand_reg(&in->opnds[j], cls, used, &nused, cap);
+
+ switch ((IROp)in->op) {
+ case IR_CALL: {
+ IRCallAux* aux = (IRCallAux*)in->extra.aux;
+ if (!aux) break;
+ collect_replayed_operand_reg(&aux->desc.callee, cls, used, &nused,
+ cap);
+ for (u32 j = 0; j < aux->desc.nargs; ++j)
+ collect_replayed_abivalue_regs(&aux->desc.args[j], cls, used,
+ &nused, cap);
+ collect_replayed_abivalue_regs(&aux->desc.ret, cls, used, &nused,
+ cap);
+ break;
+ }
+ case IR_RET: {
+ IRRetAux* aux = (IRRetAux*)in->extra.aux;
+ if (aux && aux->present)
+ collect_replayed_abivalue_regs(&aux->val, cls, used, &nused, cap);
+ break;
+ }
+ case IR_SCOPE_BEGIN: {
+ IRScopeAux* aux = (IRScopeAux*)in->extra.aux;
+ if (aux)
+ collect_replayed_operand_reg(&aux->desc.cond, cls, used, &nused,
+ cap);
+ break;
+ }
+ case IR_ASM_BLOCK: {
+ IRAsmAux* aux = (IRAsmAux*)in->extra.aux;
+ if (!aux) break;
+ for (u32 j = 0; j < aux->nin; ++j)
+ collect_replayed_operand_reg(&aux->in_ops[j], cls, used, &nused,
+ cap);
+ for (u32 j = 0; j < aux->nout; ++j)
+ collect_replayed_operand_reg(&aux->out_ops[j], cls, used, &nused,
+ cap);
+ break;
+ }
+ case IR_INTRINSIC: {
+ IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux;
+ if (!aux) break;
+ for (u32 j = 0; j < aux->narg; ++j)
+ collect_replayed_operand_reg(&aux->args[j], cls, used, &nused,
+ cap);
+ for (u32 j = 0; j < aux->ndst; ++j)
+ collect_replayed_operand_reg(&aux->dsts[j], cls, used, &nused,
+ cap);
+ break;
+ }
+ default:
+ break;
+ }
+ }
}
if (w->resolve_reg_name) {
for (u32 b = 0; b < f->nblocks; ++b) {
@@ -1629,8 +1714,8 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) {
if (identity && w->plan_hard_regs) {
for (u32 cidx = 0; cidx < OPT_REG_CLASSES; ++cidx) {
Reg used[OPT_MAX_HARD_REGS];
- u32 nused = collect_opt_hard_regs(f, w, (RegClass)cidx, used,
- OPT_MAX_HARD_REGS);
+ u32 nused = collect_replayed_hard_regs(f, w, (RegClass)cidx, used,
+ OPT_MAX_HARD_REGS);
w->plan_hard_regs(w, (RegClass)cidx, used, nused);
}
}
@@ -1703,7 +1788,7 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) {
/* At -O1, opt managed allocation and emitted hard regs directly,
* bypassing backend-local allocation. Tell the backend which hard
- * regs were actually assigned so it can save the right callee-saved
+ * regs are still visible in replay so it can save the right callee-saved
* subset in prologue/epilogue.
*
* The backend records only callee-saved members of this set for
@@ -1711,8 +1796,8 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) {
if (r.identity_regs && w->reserve_hard_regs) {
for (u32 c = 0; c < OPT_REG_CLASSES; ++c) {
Reg used[OPT_MAX_HARD_REGS];
- u32 nused = collect_opt_hard_regs(f, w, (RegClass)c, used,
- OPT_MAX_HARD_REGS);
+ u32 nused = collect_replayed_hard_regs(f, w, (RegClass)c, used,
+ OPT_MAX_HARD_REGS);
if (nused) w->reserve_hard_regs(w, (RegClass)c, used, nused);
}
} else if (!r.identity_regs && w->reserve_hard_regs) {
diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c
@@ -2425,11 +2425,11 @@ static void opt_emit_no_virtual_alloc(void) {
EXPECT(mock.func_begin_plan_calls == (int)OPT_REG_CLASSES,
"opt_emit should plan hard regs before backend func_begin");
- EXPECT(mock.plan_regs[RC_INT] == 3,
- "opt_emit should plan the hard pool reg and 2 scratch regs, got %d",
+ EXPECT(mock.plan_regs[RC_INT] == 1,
+ "opt_emit should plan only the replayed hard reg, got %d",
mock.plan_regs[RC_INT]);
- EXPECT(mock.reserve_calls[RC_INT] == 3,
- "opt_emit should reserve the hard pool reg and 2 scratch regs, got %d",
+ EXPECT(mock.reserve_calls[RC_INT] == 1,
+ "opt_emit should reserve only the replayed hard reg, got %d",
mock.reserve_calls[RC_INT]);
EXPECT(mock.load_imm_calls == 1, "expected one emitted load_imm");
EXPECT(mock.last_load_imm_dst == 19,