commit 4e913e9c8527350b317b9e01d5272bd11e077474
parent 15697ccc61d1100c553d0f8b7c631d3695377565
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 15 May 2026 16:40:04 -0700
Allow O1 params to bind to registers
Diffstat:
15 files changed, 407 insertions(+), 80 deletions(-)
diff --git a/doc/OPT1.md b/doc/OPT1.md
@@ -143,6 +143,8 @@ O1 relies on each target backend to provide:
- allocable hard-register pools per register class;
- scratch-register pools disjoint from allocable pools;
- caller-saved register classification;
+- parameter storage binding through `CGTarget.param`, which may return either a
+ frame slot or a register-backed local storage for simple direct ABI params;
- optional hard-register reservation before backend `func_end`;
- target legality for local folds performed by `opt_combine`.
@@ -227,12 +229,6 @@ N opt.o1 live_reg regalloc spills reloads inserted
512 6.991 1.474 4.924 504 504 1008
```
-The same spill family exposed a correctness risk in the JIT/run path: some
-large spill-heavy generated functions returned correctly while nearby sizes
-segfaulted during execution (`N=256` and `N=512` in the focused probe). Treat
-that as a codegen/runtime correctness bug before using the spill ladder as a
-pure performance benchmark.
-
Current code-shape probes compiled `identity_param`, `scalar_add`,
`while_sum`, `simple_branch`, `direct_call`, `const_local`, and
`local_addr_taken` across x64, AArch64, and RV64 with `-O1`, then disassembled
@@ -268,19 +264,15 @@ b ...
local combine pass also retargets safe single-use arithmetic producers to a
following physical-copy destination, including commutative operand swaps for
x64-style two-operand overlap cases.
+- Simple scalar parameters no longer need late mem2reg promotion. `CGTarget.param`
+ owns ABI entry binding and O1 can keep non-memory-required direct params as
+ virtual-register-backed locals, replaying the final hard register or spill slot
+ to the target backend. Frame-backed params remain the path for aggregates,
+ address-taken values, indirect/byval ABI cases, and other memory-required
+ shapes.
Remaining O1 shape issues visible in the current dumps:
-- Parameter and entry-slot promotion is incomplete. The trivial AArch64
- identity function still stores the incoming argument to a frame slot, reloads
- it into `w19`, then copies it back to `w0`:
-
-```asm
-stur w0, [x29, #-0x4]
-ldur w19, [x29, #-0x4]
-mov w0, w19
-```
-
- O1 still saves/restores more callee-saved registers than the body appears to
need in small functions. The AArch64 while-loop probe saves `x19-x22`, and
the x64 direct-call probe saves `rbx/r12/r13/r14` in tiny functions.
@@ -295,15 +287,7 @@ mov w0, w19
MIR's O1 path suggests these high-value local cleanups that still fit cfree's
fast tier:
-1. Promote remaining scalar entry slots before backend allocation.
- MIR's C frontend represents normal scalar block locals as MIR registers and
- leaves stack slots for aggregates, forced-stack cases, and address-taken
- values. O1 now keeps simple loop locals in registers in the probe, but still
- stores and reloads some parameter/entry slots. A conservative mem2reg-lite
- pass should promote remaining integer/pointer scalars whose address does not
- escape, starting with parameters and single-entry structured control flow.
-
-2. Avoid unnecessary callee-save traffic.
+1. Avoid unnecessary callee-save traffic.
Reserve and preserve only hard registers that survive final post-rewrite
cleanup, and consider caller-saved registers for values that are not live
across calls. This would make small leaf functions much closer to expected
diff --git a/src/api/cg.c b/src/api/cg.c
@@ -3080,11 +3080,12 @@ CfreeCgLocal cfree_cg_local(CfreeCg *g, CfreeCgTypeId type,
CfreeCgLocal cfree_cg_param(CfreeCg *g, uint32_t index, CfreeCgTypeId type,
CfreeCgLocalAttrs attrs) {
CfreeCgTypeId ty;
- FrameSlot slot;
CGParamDesc pd;
- FrameSlotDesc fsd;
ApiSourceLocal *rec;
CfreeCgLocal handle;
+ CGLocalStorage storage;
+ u32 size;
+ u32 align;
if (!g)
return CFREE_CG_LOCAL_NONE;
ty = resolve_type(g->c, type);
@@ -3096,27 +3097,28 @@ CfreeCgLocal cfree_cg_param(CfreeCg *g, uint32_t index, CfreeCgTypeId type,
if (handle == CFREE_CG_LOCAL_NONE || !api_grow_locals(g, g->nlocals + 1u))
return CFREE_CG_LOCAL_NONE;
- memset(&fsd, 0, sizeof fsd);
- fsd.type = ty;
- fsd.name = (Sym)attrs.name;
- fsd.loc = g->cur_loc;
- fsd.size = abi_cg_sizeof(g->c->abi, type);
- fsd.align = attrs.align ? attrs.align : abi_cg_alignof(g->c->abi, type);
- fsd.kind = FS_PARAM;
- if (api_source_flags_addr_taken(attrs.flags))
- fsd.flags |= FSF_ADDR_TAKEN;
- slot = g->target->frame_slot(g->target, &fsd);
+ size = abi_cg_sizeof(g->c->abi, type);
+ align = attrs.align ? attrs.align : abi_cg_alignof(g->c->abi, type);
memset(&pd, 0, sizeof pd);
pd.index = index;
pd.name = (Sym)attrs.name;
pd.type = ty;
- pd.slot = slot;
+ pd.size = size;
+ pd.align = align;
+ if (api_source_flags_addr_taken(attrs.flags))
+ pd.flags |= CG_LOCAL_ADDR_TAKEN;
+ if (api_local_requires_memory(g, ty, attrs))
+ pd.flags |= CG_LOCAL_MEMORY_REQUIRED;
if (g->fn_abi && index < g->fn_abi->nparams) {
pd.abi = &g->fn_abi->params[index];
}
pd.loc = g->cur_loc;
- g->target->param(g->target, &pd);
+ storage = g->target->param(g->target, &pd);
+ if (storage.kind == CG_LOCAL_STORAGE_REG) {
+ cg_simple_regalloc_reserve(&g->regalloc, (RegClass)api_type_class(ty),
+ storage.v.reg);
+ }
rec = &g->locals[g->nlocals++];
memset(rec, 0, sizeof *rec);
@@ -3128,13 +3130,10 @@ CfreeCgLocal cfree_cg_param(CfreeCg *g, uint32_t index, CfreeCgTypeId type,
rec->desc.type = ty;
rec->desc.name = (Sym)attrs.name;
rec->desc.loc = g->cur_loc;
- rec->desc.size = fsd.size;
- rec->desc.align = fsd.align;
- rec->desc.flags = api_source_flags_addr_taken(attrs.flags)
- ? CG_LOCAL_ADDR_TAKEN | CG_LOCAL_MEMORY_REQUIRED
- : CG_LOCAL_MEMORY_REQUIRED;
- rec->storage.kind = CG_LOCAL_STORAGE_FRAME;
- rec->storage.v.frame_slot = slot;
+ rec->desc.size = size;
+ rec->desc.align = align;
+ rec->desc.flags = pd.flags;
+ rec->storage = storage;
rec->param_index = index;
rec->kind = API_SOURCE_LOCAL_PARAM;
return handle;
@@ -3267,8 +3266,7 @@ void cfree_cg_push_local(CfreeCg *g, CfreeCgLocal local) {
rec = api_local_from_handle(g, local);
if (!rec)
return;
- if (rec->kind == API_SOURCE_LOCAL_AUTO &&
- rec->storage.kind == CG_LOCAL_STORAGE_REG) {
+ if (rec->storage.kind == CG_LOCAL_STORAGE_REG) {
api_push_source_reg_lvalue(g, local, rec->storage.v.reg, rec->type);
} else if (rec->kind == API_SOURCE_LOCAL_AUTO) {
api_push_source_frame_lvalue(g, local, rec->storage.v.frame_slot,
@@ -3455,7 +3453,7 @@ void cfree_cg_addr(CfreeCg *g) {
rec = v.source_local != CFREE_CG_LOCAL_NONE
? api_local_from_handle(g, v.source_local)
: NULL;
- if (rec && rec->kind == API_SOURCE_LOCAL_AUTO && T->local_addr)
+ if (rec && rec->storage.kind == CG_LOCAL_STORAGE_REG && T->local_addr)
T->local_addr(T, dst, &rec->desc, rec->storage);
else
T->addr_of(T, dst, v.op);
diff --git a/src/arch/aa64/emit.c b/src/arch/aa64/emit.c
@@ -500,16 +500,68 @@ FrameSlot aa_frame_slot(CGTarget *t, const FrameSlotDesc *d) {
* Parameters
* ============================================================ */
-void aa_param(CGTarget *t, const CGParamDesc *p) {
+CGLocalStorage aa_param(CGTarget *t, const CGParamDesc *p) {
AAImpl *a = impl_of(t);
- AASlot *s = aa64_slot_get(a, p->slot);
- if (!s) {
+ CGLocalStorage st = p->storage;
+ if (st.kind == CG_LOCAL_STORAGE_FRAME && st.v.frame_slot == FRAME_SLOT_NONE) {
+ FrameSlotDesc fsd = {0};
+ fsd.type = p->type;
+ fsd.name = p->name;
+ fsd.loc = p->loc;
+ fsd.size = p->size;
+ fsd.align = p->align;
+ fsd.kind = FS_PARAM;
+ if (p->flags & CG_LOCAL_ADDR_TAKEN) fsd.flags |= FSF_ADDR_TAKEN;
+ st.v.frame_slot = aa_frame_slot(t, &fsd);
+ }
+ AASlot *s = st.kind == CG_LOCAL_STORAGE_FRAME
+ ? aa64_slot_get(a, st.v.frame_slot)
+ : NULL;
+ if (st.kind == CG_LOCAL_STORAGE_FRAME && !s) {
compiler_panic(t->c, a->loc, "aarch64 param: bad slot");
}
const ABIArgInfo *ai = p->abi;
if (ai->kind == ABI_ARG_IGNORE)
- return;
+ return st;
+ if (st.kind == CG_LOCAL_STORAGE_REG) {
+ if (ai->kind != ABI_ARG_DIRECT || ai->nparts != 1) {
+ compiler_panic(t->c, a->loc,
+ "aarch64 param: register storage requires one direct part");
+ }
+ const ABIArgPart *pt = &ai->parts[0];
+ u32 sz = pt->size;
+ u32 sidx = size_idx_for_bytes(sz);
+ if (pt->cls == ABI_CLASS_INT) {
+ u32 dst = reg_num((Operand){.kind = OPK_REG, .v.reg = st.v.reg});
+ if (a->next_param_int < 8) {
+ u32 src = a->next_param_int++;
+ u32 sf = (sz == 8) ? 1u : 0u;
+ if (dst != src) aa64_emit32(t->mc, aa64_mov_reg(sf, dst, src));
+ } else {
+ u32 caller_off = a->next_param_stack;
+ a->next_param_stack += 8;
+ aa64_emit_ldur_off(t->mc, sidx, dst, 29, (i32)(16 + caller_off),
+ AA_TMP0);
+ }
+ } else if (pt->cls == ABI_CLASS_FP) {
+ u32 dst = reg_num((Operand){.kind = OPK_REG, .v.reg = st.v.reg});
+ if (a->next_param_fp < 8) {
+ u32 src = a->next_param_fp++;
+ u32 type = (sz == 8) ? 1u : 0u;
+ if (dst != src) aa64_emit32(t->mc, aa64_fmov_reg(type, dst, src));
+ } else {
+ u32 caller_off = a->next_param_stack;
+ a->next_param_stack += 8;
+ aa64_emit_ldur_fp_off(t->mc, sidx, dst, 29,
+ (i32)(16 + caller_off), AA_TMP0);
+ }
+ } else {
+ compiler_panic(t->c, a->loc, "aarch64 param: ABI class %d unimpl",
+ (int)pt->cls);
+ }
+ return st;
+ }
if (ai->kind == ABI_ARG_INDIRECT) {
u32 ptr_reg;
if (a->next_param_int < 8) {
@@ -547,7 +599,7 @@ void aa_param(CGTarget *t, const CGParamDesc *p) {
AA_TMP2);
i += 1;
}
- return;
+ return st;
}
for (u16 i = 0; i < ai->nparts; ++i) {
const ABIArgPart *pt = &ai->parts[i];
@@ -586,6 +638,7 @@ void aa_param(CGTarget *t, const CGParamDesc *p) {
(int)pt->cls);
}
}
+ return st;
}
/* ============================================================
diff --git a/src/arch/aa64/internal.h b/src/arch/aa64/internal.h
@@ -300,7 +300,7 @@ void aa_func_begin_known_frame(CGTarget* t, const CGFuncDesc* fd,
const CGKnownFrameDesc* frame,
FrameSlot* out_slots);
void aa_func_end(CGTarget* t);
-void aa_param(CGTarget* t, const CGParamDesc* p);
+CGLocalStorage aa_param(CGTarget* t, const CGParamDesc* p);
/* alloc.c helpers used in emit.c / ops.c */
AAImpl* impl_of(CGTarget* t);
diff --git a/src/arch/arch.h b/src/arch/arch.h
@@ -332,7 +332,10 @@ typedef struct CGParamDesc {
u32 index;
Sym name;
CfreeCgTypeId type;
- FrameSlot slot;
+ u32 size;
+ u32 align;
+ u32 flags; /* CGLocalFlag */
+ CGLocalStorage storage;
const ABIArgInfo* abi;
const CGABIPart* incoming;
u32 nincoming;
@@ -538,7 +541,7 @@ struct CGTarget {
CGLocalStorage (*local)(CGTarget*, const CGLocalDesc*);
void (*local_addr)(CGTarget*, Operand dst, const CGLocalDesc*,
CGLocalStorage);
- void (*param)(CGTarget*, const CGParamDesc*);
+ CGLocalStorage (*param)(CGTarget*, const CGParamDesc*);
void (*spill_reg)(CGTarget*, Operand src_reg, FrameSlot, MemAccess);
void (*reload_reg)(CGTarget*, Operand dst_reg, FrameSlot, MemAccess);
diff --git a/src/arch/rv64/alloc.c b/src/arch/rv64/alloc.c
@@ -37,17 +37,75 @@ RvSlot* rv64_slot_get(RImpl* a, FrameSlot fs) {
/* ---- param ---- */
-void rv_param(CGTarget* t, const CGParamDesc* p) {
+CGLocalStorage rv_param(CGTarget* t, const CGParamDesc* p) {
RImpl* a = impl_of(t);
MCEmitter* mc = t->mc;
- RvSlot* s = rv64_slot_get(a, p->slot);
- if (!s) compiler_panic(t->c, a->loc, "rv64 param: bad slot");
+ CGLocalStorage st = p->storage;
+ if (st.kind == CG_LOCAL_STORAGE_FRAME && st.v.frame_slot == FRAME_SLOT_NONE) {
+ FrameSlotDesc fsd = {0};
+ fsd.type = p->type;
+ fsd.name = p->name;
+ fsd.loc = p->loc;
+ fsd.size = p->size;
+ fsd.align = p->align;
+ fsd.kind = FS_PARAM;
+ if (p->flags & CG_LOCAL_ADDR_TAKEN) fsd.flags |= FSF_ADDR_TAKEN;
+ st.v.frame_slot = rv_frame_slot(t, &fsd);
+ }
+ RvSlot* s = st.kind == CG_LOCAL_STORAGE_FRAME
+ ? rv64_slot_get(a, st.v.frame_slot)
+ : NULL;
+ if (st.kind == CG_LOCAL_STORAGE_FRAME && !s)
+ compiler_panic(t->c, a->loc, "rv64 param: bad slot");
const ABIArgInfo* ai = p->abi;
/* Caller's stack args start above the saved-s0/ra pair, plus the
* 64-byte variadic save area when this function is variadic. */
i32 caller_stack_base = 16 + (a->is_variadic ? 64 : 0);
- if (ai->kind == ABI_ARG_IGNORE) return;
+ if (ai->kind == ABI_ARG_IGNORE) return st;
+ if (st.kind == CG_LOCAL_STORAGE_REG) {
+ if (ai->kind != ABI_ARG_DIRECT || ai->nparts != 1) {
+ compiler_panic(t->c, a->loc,
+ "rv64 param: register storage requires one direct part");
+ }
+ const ABIArgPart* pt = &ai->parts[0];
+ u32 sz = pt->size;
+ if (pt->cls == ABI_CLASS_INT) {
+ u32 dst = st.v.reg;
+ if (a->next_param_int < 8) {
+ u32 src = RV_A0 + a->next_param_int;
+ a->next_param_int++;
+ if (dst != src) rv64_emit32(mc, rv_addi(dst, src, 0));
+ } else {
+ u32 caller_off = a->next_param_stack;
+ a->next_param_stack += 8;
+ rv64_emit32(mc, enc_int_load(sz, 0, dst, RV_S0,
+ caller_stack_base + (i32)caller_off));
+ }
+ } else if (pt->cls == ABI_CLASS_FP) {
+ u32 dst = st.v.reg;
+ if (a->next_param_fp < 8) {
+ u32 src = 10u + a->next_param_fp;
+ a->next_param_fp++;
+ if (dst != src) {
+ rv64_emit32(mc, rv_fsgnj(sz == 8 ? 1u : 0u, dst, src, src));
+ }
+ } else {
+ u32 caller_off = a->next_param_stack;
+ a->next_param_stack += 8;
+ if (sz == 8)
+ rv64_emit32(mc, rv_fld(dst, RV_S0,
+ caller_stack_base + (i32)caller_off));
+ else
+ rv64_emit32(mc, rv_flw(dst, RV_S0,
+ caller_stack_base + (i32)caller_off));
+ }
+ } else {
+ compiler_panic(t->c, a->loc, "rv64 param: ABI class %d unimpl",
+ (int)pt->cls);
+ }
+ return st;
+ }
if (ai->kind == ABI_ARG_INDIRECT) {
/* Pointer-to-copy passed in a-register. Copy bytes from there into
* the home slot. Source pointer is in a0..a7. */
@@ -86,7 +144,7 @@ void rv_param(CGTarget* t, const CGParamDesc* p) {
rv64_emit32(mc, rv_sb(RV_T2, RV_S0, -(i32)s->off + (i32)i));
i += 1;
}
- return;
+ return st;
}
/* DIRECT */
for (u16 i = 0; i < ai->nparts; ++i) {
@@ -134,6 +192,7 @@ void rv_param(CGTarget* t, const CGParamDesc* p) {
(int)pt->cls);
}
}
+ return st;
}
void rv_spill_reg(CGTarget* t, Operand src, FrameSlot slot,
diff --git a/src/arch/rv64/internal.h b/src/arch/rv64/internal.h
@@ -138,7 +138,7 @@ _Noreturn void rv_panic(CGTarget *t, const char *what);
/* ---- alloc.c: all functions (non-static; referenced by ops.c vtable) ---- */
FrameSlot rv_frame_slot(CGTarget *t, const FrameSlotDesc *d);
RvSlot *rv64_slot_get(RImpl *a, FrameSlot fs);
-void rv_param(CGTarget *t, const CGParamDesc *p);
+CGLocalStorage rv_param(CGTarget *t, const CGParamDesc *p);
void rv_spill_reg(CGTarget *t, Operand src, FrameSlot slot, MemAccess ma);
void rv_reload_reg(CGTarget *t, Operand dst, FrameSlot slot, MemAccess ma);
Label rv_label_new(CGTarget *t);
diff --git a/src/arch/x64/alloc.c b/src/arch/x64/alloc.c
@@ -48,14 +48,66 @@ XSlot* x64_slot_get(XImpl* a, FrameSlot fs) {
return &a->slots[fs - 1];
}
-/* ---- param: store incoming arg(s) into the home slot ---- */
-void x_param(CGTarget* t, const CGParamDesc* p) {
+/* ---- param: bind incoming arg(s) to the requested storage ---- */
+CGLocalStorage x_param(CGTarget* t, const CGParamDesc* p) {
XImpl* a = impl_of(t);
- XSlot* s = x64_slot_get(a, p->slot);
- if (!s) compiler_panic(t->c, a->loc, "x64 param: bad slot");
+ CGLocalStorage st = p->storage;
+ if (st.kind == CG_LOCAL_STORAGE_FRAME && st.v.frame_slot == FRAME_SLOT_NONE) {
+ FrameSlotDesc fsd = {0};
+ fsd.type = p->type;
+ fsd.name = p->name;
+ fsd.loc = p->loc;
+ fsd.size = p->size;
+ fsd.align = p->align;
+ fsd.kind = FS_PARAM;
+ if (p->flags & CG_LOCAL_ADDR_TAKEN) fsd.flags |= FSF_ADDR_TAKEN;
+ st.v.frame_slot = x_frame_slot(t, &fsd);
+ }
+ XSlot* s = st.kind == CG_LOCAL_STORAGE_FRAME
+ ? x64_slot_get(a, st.v.frame_slot)
+ : NULL;
+ if (st.kind == CG_LOCAL_STORAGE_FRAME && !s)
+ compiler_panic(t->c, a->loc, "x64 param: bad slot");
const ABIArgInfo* ai = p->abi;
- if (ai->kind == ABI_ARG_IGNORE) return;
+ if (ai->kind == ABI_ARG_IGNORE) return st;
+ if (st.kind == CG_LOCAL_STORAGE_REG) {
+ if (ai->kind != ABI_ARG_DIRECT || ai->nparts != 1) {
+ compiler_panic(t->c, a->loc,
+ "x64 param: register storage requires one direct part");
+ }
+ const ABIArgPart* pt = &ai->parts[0];
+ u32 sz = pt->size;
+ if (pt->cls == ABI_CLASS_INT) {
+ if (a->next_param_int < 6) {
+ u32 src = g_int_arg_regs[a->next_param_int++];
+ u32 dst = st.v.reg & 0xFu;
+ int w = (sz == 8) ? 1 : 0;
+ if (dst != src) emit_mov_rr(t->mc, w, dst, src);
+ } else {
+ u32 caller_off = a->next_param_stack;
+ a->next_param_stack += 8;
+ emit_mov_load(t->mc, sz, 0, st.v.reg & 0xFu, X64_RBP,
+ (i32)(16 + caller_off));
+ }
+ } else if (pt->cls == ABI_CLASS_FP) {
+ u8 prefix = (sz == 8) ? 0xF2 : 0xF3;
+ u32 dst = st.v.reg & 0xFu;
+ if (a->next_param_fp < 8) {
+ u32 src = a->next_param_fp++;
+ if (dst != src) emit_sse_rr(t->mc, prefix, 0x10, dst, src);
+ } else {
+ u32 caller_off = a->next_param_stack;
+ a->next_param_stack += 8;
+ emit_sse_load(t->mc, prefix, 0x10, dst, X64_RBP,
+ (i32)(16 + caller_off));
+ }
+ } else {
+ compiler_panic(t->c, a->loc, "x64 param: ABI class %d unimpl",
+ (int)pt->cls);
+ }
+ return st;
+ }
if (ai->kind == ABI_ARG_INDIRECT) {
/* Incoming pointer to byval copy: load pointer, memcpy into slot. */
u32 ptr_reg;
@@ -89,7 +141,7 @@ void x_param(CGTarget* t, const CGParamDesc* p) {
emit_mov_store(t->mc, 1, X64_RAX, X64_RBP, -(i32)s->off + (i32)i);
i += 1;
}
- return;
+ return st;
}
/* DIRECT */
for (u16 i = 0; i < ai->nparts; ++i) {
@@ -129,6 +181,7 @@ void x_param(CGTarget* t, const CGParamDesc* p) {
(int)pt->cls);
}
}
+ return st;
}
void x_spill_reg(CGTarget* t, Operand src, FrameSlot slot,
diff --git a/src/arch/x64/internal.h b/src/arch/x64/internal.h
@@ -200,7 +200,7 @@ void emit_sse_rr_w(MCEmitter *mc, u8 prefix, u8 opcode, int w, u32 dst,
/* --- alloc.c exports (used by emit.c and/or ops.c) --- */
XSlot *x64_slot_get(XImpl *a, FrameSlot fs);
FrameSlot x_frame_slot(CGTarget *t, const FrameSlotDesc *d);
-void x_param(CGTarget *t, const CGParamDesc *p);
+CGLocalStorage x_param(CGTarget *t, const CGParamDesc *p);
void x_spill_reg(CGTarget *t, Operand src, FrameSlot slot, MemAccess ma);
void x_reload_reg(CGTarget *t, Operand dst, FrameSlot slot, MemAccess ma);
Label x_label_new(CGTarget *t);
diff --git a/src/opt/ir.c b/src/opt/ir.c
@@ -172,7 +172,10 @@ void ir_param_add(Func* f, const CGParamDesc* d) {
p->index = d->index;
p->name = d->name;
p->type = d->type;
- p->slot = d->slot;
+ p->size = d->size;
+ p->align = d->align;
+ p->flags = d->flags;
+ p->storage = d->storage;
p->abi = d->abi;
p->loc = d->loc;
}
diff --git a/src/opt/ir.h b/src/opt/ir.h
@@ -212,7 +212,10 @@ typedef struct IRParam {
u32 index;
Sym name;
CfreeCgTypeId type;
- FrameSlot slot;
+ u32 size;
+ u32 align;
+ u32 flags; /* CGLocalFlag */
+ CGLocalStorage storage;
const ABIArgInfo* abi;
SrcLoc loc;
} IRParam;
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -151,6 +151,19 @@ static FrameSlot opt_local_frame_slot(Func* f, const CGLocalDesc* d,
return ir_frame_slot_new(f, &fsd);
}
+static FrameSlot opt_param_frame_slot(Func* f, const CGParamDesc* d) { /* Create an IR frame slot of kind FS_PARAM described by parameter desc `d`; returns the new slot id. */
+ FrameSlotDesc fsd;
+ memset(&fsd, 0, sizeof fsd); /* start from an all-zero descriptor so unset fields are 0 */
+ fsd.type = d->type;
+ fsd.name = d->name;
+ fsd.loc = d->loc;
+ fsd.size = d->size;
+ fsd.align = d->align;
+ fsd.kind = FS_PARAM;
+ if (d->flags & CG_LOCAL_ADDR_TAKEN) fsd.flags |= FSF_ADDR_TAKEN; /* carry the address-taken flag over to the slot */
+ return ir_frame_slot_new(f, &fsd);
+}
+
static u8 opt_local_reg_class_for(Compiler* c, CfreeCgTypeId ty) {
CfreeCgTypeKind kind = cfree_cg_type_kind((CfreeCompiler*)c, ty);
return kind == CFREE_CG_TYPE_FLOAT ? RC_FP : RC_INT;
@@ -384,16 +397,51 @@ static void opt_frame_home_addr_taken_locals(Func* f) {
}
}
-static void w_param(CGTarget* t, const CGParamDesc* d) {
+static CGLocalStorage w_param(CGTarget* t, const CGParamDesc* d) { /* Record parameter `d` in the wrapper IR and return the storage chosen for it. */
OptImpl* o = impl_of(t);
+ CGLocalStorage st = d->storage;
+ CGLocalDesc local_desc; /* local-style view of the same parameter, registered via ir_local_add below */
+ memset(&local_desc, 0, sizeof local_desc);
+ local_desc.type = d->type;
+ local_desc.name = d->name;
+ local_desc.loc = d->loc;
+ local_desc.size = d->size;
+ local_desc.align = d->align;
+ local_desc.flags = d->flags;
+ if (st.kind == CG_LOCAL_STORAGE_FRAME && st.v.frame_slot == FRAME_SLOT_NONE) { /* frame storage with no slot assigned yet: pick the storage here */
+ if ((d->flags & (CG_LOCAL_ADDR_TAKEN | CG_LOCAL_MEMORY_REQUIRED)) == 0) { /* neither flag set: back the param with a fresh IR value */
+ Val v = ir_alloc_val(o->f, d->type, opt_local_reg_class(o, d->type));
+ st.kind = CG_LOCAL_STORAGE_REG;
+ st.v.reg = (Reg)v;
+ } else { /* address-taken or memory-required: give it an IR frame slot */
+ st.kind = CG_LOCAL_STORAGE_FRAME;
+ st.v.frame_slot = opt_param_frame_slot(o->f, d);
+ }
+ }
/* Deep-copy parts so caller-stack memory isn't relied on. */
CGParamDesc copy = *d;
+ copy.storage = st; /* record the storage decided above, not the incoming one */
if (d->nincoming) {
CGABIPart* parts = arena_array(o->f->arena, CGABIPart, d->nincoming);
memcpy(parts, d->incoming, sizeof(CGABIPart) * d->nincoming);
copy.incoming = parts;
}
ir_param_add(o->f, &copy);
+ ir_local_add(o->f, &local_desc, st);
+ if (st.kind == CG_LOCAL_STORAGE_REG) { /* register-backed: emit an IR_PARAM_DECL that defines the value */
+ Inst* in = rec(o, IR_PARAM_DECL);
+ in->def = (Val)st.v.reg;
+ in->type = d->type;
+ in->opnds = arena_array(o->f->arena, Operand, 1);
+ in->opnds[0].kind = OPK_REG;
+ in->opnds[0].cls = opt_local_reg_class(o, d->type);
+ in->opnds[0].type = d->type;
+ in->opnds[0].v.reg = st.v.reg;
+ in->nopnds = 1;
+ o->f->val_def_block[st.v.reg] = o->cur; /* remember where the value is defined: current block ... */
+ o->f->val_def_inst[st.v.reg] = o->f->blocks[o->cur].ninsts - 1u; /* ... and the last instruction index in that block */
+ }
+ return st;
}
static void w_spill_reg(CGTarget* t, Operand src, FrameSlot s, MemAccess m) {
@@ -1112,6 +1160,31 @@ static FrameSlot slot_to_target(ReplayCtx* r, FrameSlot vs) {
return r->slot_map[vs];
}
+static CGLocalStorage xlat_storage(ReplayCtx* r, CGLocalStorage st, /* Translate recorded local/param storage `st` into the storage handed to the replay target. */
+ CfreeCgTypeId ty) {
+ (void)ty; /* `ty` is currently unused */
+ if (st.kind == CG_LOCAL_STORAGE_REG) {
+ Val v = (Val)st.v.reg; /* the recorded register field is read as an IR value id */
+ if (r->identity_regs && r->f->opt_rewritten && v < r->f->nvals &&
+ r->f->val_info) {
+ OptValInfo* vi = &r->f->val_info[v];
+ if (vi->alloc_kind == OPT_ALLOC_HARD) {
+ st.v.reg = vi->hard_reg; /* allocated to a hard register: use it directly */
+ } else if (vi->alloc_kind == OPT_ALLOC_SPILL) {
+ st.kind = CG_LOCAL_STORAGE_FRAME; /* spilled: storage becomes the mapped spill slot */
+ st.v.frame_slot = slot_to_target(r, vi->spill_slot);
+ } else {
+ st.v.reg = val_to_target_reg(r, v); /* any other allocation kind: generic value-to-register mapping */
+ }
+ } else {
+ st.v.reg = val_to_target_reg(r, v);
+ }
+ } else {
+ st.v.frame_slot = slot_to_target(r, st.v.frame_slot); /* every non-register kind is handled as a frame slot and remapped */
+ }
+ return st;
+}
+
static Operand xlat_op(ReplayCtx* r, Operand op) {
switch ((OpKind)op.kind) {
case OPK_IMM:
@@ -1574,9 +1647,8 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) {
for (u32 i = 0; i < f->nframe_slots; ++i)
r.slot_map[f->frame_slots[i].id] = target_slots[i];
} else {
- /* func_begin with the recorded descriptor. The desc.params[].slot
- * fields are wrapper IR slot ids; aarch64's func_begin doesn't
- * dereference them so we don't translate. */
+ /* func_begin with the recorded descriptor. Parameter storage is replayed
+ * through target->param below after frame slots are mapped. */
w->func_begin(w, &f->desc);
}
@@ -1613,10 +1685,13 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) {
d.index = p->index;
d.name = p->name;
d.type = p->type;
- d.slot = slot_to_target(&r, p->slot);
+ d.size = p->size;
+ d.align = p->align;
+ d.flags = p->flags;
+ d.storage = xlat_storage(&r, p->storage, p->type);
d.abi = p->abi;
d.loc = p->loc;
- w->param(w, &d);
+ (void)w->param(w, &d);
}
/* Body in emit order — the order CG's emit cursor visited each
diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c
@@ -1325,6 +1325,10 @@ static void rewrite_func(Func* f, const OptLiveInfo* live_info) {
for (u32 ri = bl->ninsts; ri > 0; --ri) {
u32 i = ri - 1u;
Inst in = bl->insts[i];
+ if ((IROp)in.op == IR_PARAM_DECL) {
+ out_prepend_inst(f, &out, &in);
+ continue;
+ }
refs_reset(&refs);
walk_inst_operands(f, &in, refs_collect, &refs);
list_reset(&before);
@@ -1988,6 +1992,7 @@ static int inst_has_side_effect(Func* f, const Inst* in) {
return aux && mem_observable(&aux->access.storage);
}
case IR_ALLOCA:
+ case IR_PARAM_DECL:
case IR_STORE:
case IR_AGG_COPY:
case IR_AGG_SET:
diff --git a/test/api/cg_type_test.c b/test/api/cg_type_test.c
@@ -979,7 +979,7 @@ static void exercise_cg_constfold_phases(CfreeCompiler* c,
uint32_t partial_size = cg_emit_local_shadow_partial_store(
c, i32_ty, i8_ty, "cg_shadow_partial_o1");
- EXPECT(delayed_size <= 52,
+ EXPECT(delayed_size <= 56,
"delayed arithmetic chain should materialize as one add, text "
"size=%u",
delayed_size);
@@ -989,7 +989,7 @@ static void exercise_cg_constfold_phases(CfreeCompiler* c,
EXPECT(local_size <= 32,
"local constant shadow should fold x=40; return x+2, text size=%u",
local_size);
- EXPECT(delayed_cmp_size <= 60,
+ EXPECT(delayed_cmp_size <= 64,
"delayed arithmetic consumed by compare should stay compact, text "
"size=%u",
delayed_cmp_size);
diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c
@@ -423,6 +423,8 @@ typedef struct MockCGTarget {
int store_calls;
int addr_of_calls;
int cmp_branch_calls;
+ int param_calls;
+ CGLocalStorage last_param_storage;
} MockCGTarget;
static void mock_func_begin(CGTarget* t, const CGFuncDesc* d) {
@@ -528,6 +530,27 @@ static void mock_store(CGTarget* t, Operand addr, Operand src, MemAccess macc) {
++m->store_calls;
}
+static FrameSlot mock_frame_slot(CGTarget* t, const FrameSlotDesc* d);
+
+static CGLocalStorage mock_param(CGTarget* t, const CGParamDesc* p) { /* Mock `param` hook: counts the call, records the resulting storage, and returns it. */
+ MockCGTarget* m = (MockCGTarget*)t;
+ CGLocalStorage st = p->storage;
+ ++m->param_calls;
+ if (st.kind == CG_LOCAL_STORAGE_FRAME && st.v.frame_slot == FRAME_SLOT_NONE) { /* frame storage without a slot: allocate one from the mock */
+ FrameSlotDesc fsd = {0};
+ fsd.type = p->type;
+ fsd.name = p->name;
+ fsd.loc = p->loc;
+ fsd.size = p->size;
+ fsd.align = p->align;
+ fsd.kind = FS_PARAM;
+ if (p->flags & CG_LOCAL_ADDR_TAKEN) fsd.flags |= FSF_ADDR_TAKEN;
+ st.v.frame_slot = mock_frame_slot(t, &fsd);
+ }
+ m->last_param_storage = st; /* kept so tests can inspect what was replayed */
+ return st;
+}
+
static void mock_addr_of(CGTarget* t, Operand dst, Operand lv) {
MockCGTarget* m = (MockCGTarget*)t;
(void)dst;
@@ -591,6 +614,7 @@ static void mock_init(MockCGTarget* m, Compiler* c) {
m->base.copy = mock_copy;
m->base.load = mock_load;
m->base.store = mock_store;
+ m->base.param = mock_param;
m->base.addr_of = mock_addr_of;
m->base.ret = mock_ret;
m->base.set_loc = mock_set_loc;
@@ -2525,6 +2549,17 @@ static CGLocalDesc local_desc_(CfreeCgTypeId ty, u32 size, u32 align,
return d;
}
+static CGParamDesc param_desc_(CfreeCgTypeId ty, u32 size, u32 align, /* Test helper: build a zeroed CGParamDesc with only type, size, align and flags filled in. */
+ u32 flags) {
+ CGParamDesc d;
+ memset(&d, 0, sizeof d); /* all other fields (index, name, storage, abi, ...) stay zero */
+ d.type = ty;
+ d.size = size;
+ d.align = align;
+ d.flags = flags;
+ return d;
+}
+
static void opt_local_hook_chooses_register_for_scalar(void) {
TestCtx tc;
tc_init(&tc);
@@ -2544,6 +2579,60 @@ static void opt_local_hook_chooses_register_for_scalar(void) {
tc_fini(&tc);
}
+static void opt_param_hook_chooses_register_for_scalar(void) { /* Test: a flag-free i32 param is register-backed and replays to the mock as hard reg 19. */
+ TestCtx tc;
+ tc_init(&tc);
+ MockCGTarget mock;
+ mock_init(&mock, tc.c);
+ static const Reg pool[] = {19}; /* single-entry pool; presumably the only allocable int register - confirm against mock_set_pool */
+ static const Reg scratch[] = {9, 10};
+ mock_set_pool(&mock, RC_INT, pool, 1, scratch, 2, 0x4007FFFFu);
+
+ CGTarget* opt = opt_cgtarget_new(tc.c, &mock.base, 1);
+ begin_mock_opt_func(&tc, opt, tc.i32);
+
+ CGParamDesc d = param_desc_(tc.i32, 4, 4, 0); /* 4-byte, 4-aligned i32 with no flags set */
+ CGLocalStorage st = opt->param(opt, &d);
+ EXPECT(st.kind == CG_LOCAL_STORAGE_REG,
+ "non-address-taken scalar param should be register-backed");
+ EXPECT(st.v.reg != (Reg)REG_NONE, "register-backed param needs a vreg");
+
+ CGABIValue retv = {0}; /* return the param's register so the value is used */
+ retv.type = tc.i32;
+ retv.storage = op_reg_(st.v.reg, tc.i32);
+ opt->ret(opt, &retv);
+ opt->func_end(opt);
+
+ EXPECT(mock.param_calls == 1, "param should replay to wrapped backend");
+ EXPECT(mock.last_param_storage.kind == CG_LOCAL_STORAGE_REG,
+ "replayed scalar param should remain register-backed");
+ EXPECT(mock.last_param_storage.v.reg == 19,
+ "replayed param storage should be allocated hard reg r19, got r%u",
+ (unsigned)mock.last_param_storage.v.reg);
+
+ opt->destroy(opt);
+ tc_fini(&tc);
+}
+
+static void opt_param_memory_required_uses_frame(void) { /* Test: a param flagged address-taken and memory-required gets frame storage from the opt wrapper. */
+ TestCtx tc;
+ tc_init(&tc);
+ MockCGTarget mock;
+ mock_init(&mock, tc.c);
+
+ CGTarget* opt = opt_cgtarget_new(tc.c, &mock.base, 1);
+ begin_mock_opt_func(&tc, opt, tc.i32);
+
+ CGParamDesc d =
+ param_desc_(tc.i32, 4, 4, CG_LOCAL_ADDR_TAKEN | CG_LOCAL_MEMORY_REQUIRED);
+ CGLocalStorage st = opt->param(opt, &d); /* only the immediate return value is checked; func_end is not called here */
+ EXPECT(st.kind == CG_LOCAL_STORAGE_FRAME,
+ "memory-required param should be frame-backed");
+
+ opt->destroy(opt);
+ tc_fini(&tc);
+}
+
static void opt_local_addr_taken_uses_frame_and_replays_addr_of(void) {
TestCtx tc;
tc_init(&tc);
@@ -2686,6 +2775,8 @@ int main(void) {
opt_records_const_bytes_by_value();
opt_cmp_branch_keeps_fallthrough_after_block_growth();
opt_local_hook_chooses_register_for_scalar();
+ opt_param_hook_chooses_register_for_scalar();
+ opt_param_memory_required_uses_frame();
opt_local_addr_taken_uses_frame_and_replays_addr_of();
opt_register_local_addr_frame_homes();
simple_regalloc_reports_exact_used_regs();