kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 4e913e9c8527350b317b9e01d5272bd11e077474
parent 15697ccc61d1100c553d0f8b7c631d3695377565
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Fri, 15 May 2026 16:40:04 -0700

Allow O1 params to bind to registers

Diffstat:
Mdoc/OPT1.md | 34+++++++++-------------------------
Msrc/api/cg.c | 46++++++++++++++++++++++------------------------
Msrc/arch/aa64/emit.c | 63++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
Msrc/arch/aa64/internal.h | 2+-
Msrc/arch/arch.h | 7+++++--
Msrc/arch/rv64/alloc.c | 69++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
Msrc/arch/rv64/internal.h | 2+-
Msrc/arch/x64/alloc.c | 65+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
Msrc/arch/x64/internal.h | 2+-
Msrc/opt/ir.c | 5++++-
Msrc/opt/ir.h | 5++++-
Msrc/opt/opt.c | 87+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
Msrc/opt/pass_lower.c | 5+++++
Mtest/api/cg_type_test.c | 4++--
Mtest/opt/opt_test.c | 91+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
15 files changed, 407 insertions(+), 80 deletions(-)

diff --git a/doc/OPT1.md b/doc/OPT1.md @@ -143,6 +143,8 @@ O1 relies on each target backend to provide: - allocable hard-register pools per register class; - scratch-register pools disjoint from allocable pools; - caller-saved register classification; +- parameter storage binding through `CGTarget.param`, which may return either a + frame slot or a register-backed local storage for simple direct ABI params; - optional hard-register reservation before backend `func_end`; - target legality for local folds performed by `opt_combine`. @@ -227,12 +229,6 @@ N opt.o1 live_reg regalloc spills reloads inserted 512 6.991 1.474 4.924 504 504 1008 ``` -The same spill family exposed a correctness risk in the JIT/run path: some -large spill-heavy generated functions returned correctly while nearby sizes -segfaulted during execution (`N=256` and `N=512` in the focused probe). Treat -that as a codegen/runtime correctness bug before using the spill ladder as a -pure performance benchmark. - Current code-shape probes compiled `identity_param`, `scalar_add`, `while_sum`, `simple_branch`, `direct_call`, `const_local`, and `local_addr_taken` across x64, AArch64, and RV64 with `-O1`, then disassembled @@ -268,19 +264,15 @@ b ... local combine pass also retargets safe single-use arithmetic producers to a following physical-copy destination, including commutative operand swaps for x64-style two-operand overlap cases. +- Simple scalar parameters no longer need late mem2reg promotion. `CGTarget.param` + owns ABI entry binding and O1 can keep non-memory-required direct params as + virtual-register-backed locals, replaying the final hard register or spill slot + to the target backend. Frame-backed params remain the path for aggregates, + address-taken values, indirect/byval ABI cases, and other memory-required + shapes. Remaining O1 shape issues visible in the current dumps: -- Parameter and entry-slot promotion is incomplete. The trivial AArch64 - identity function still stores the incoming argument to a frame slot, reloads - it into `w19`, then copies it back to `w0`: - -```asm -stur w0, [x29, #-0x4] -ldur w19, [x29, #-0x4] -mov w0, w19 -``` - - O1 still saves/restores more callee-saved registers than the body appears to need in small functions. The AArch64 while-loop probe saves `x19-x22`, and the x64 direct-call probe saves `rbx/r12/r13/r14` in tiny functions. @@ -295,15 +287,7 @@ mov w0, w19 MIR's O1 path suggests these high-value local cleanups that still fit cfree's fast tier: -1. Promote remaining scalar entry slots before backend allocation. - MIR's C frontend represents normal scalar block locals as MIR registers and - leaves stack slots for aggregates, forced-stack cases, and address-taken - values. O1 now keeps simple loop locals in registers in the probe, but still - stores and reloads some parameter/entry slots. A conservative mem2reg-lite - pass should promote remaining integer/pointer scalars whose address does not - escape, starting with parameters and single-entry structured control flow. - -2. Avoid unnecessary callee-save traffic. +1. Avoid unnecessary callee-save traffic. Reserve and preserve only hard registers that survive final post-rewrite cleanup, and consider caller-saved registers for values that are not live across calls. This would make small leaf functions much closer to expected diff --git a/src/api/cg.c b/src/api/cg.c @@ -3080,11 +3080,12 @@ CfreeCgLocal cfree_cg_local(CfreeCg *g, CfreeCgTypeId type, CfreeCgLocal cfree_cg_param(CfreeCg *g, uint32_t index, CfreeCgTypeId type, CfreeCgLocalAttrs attrs) { CfreeCgTypeId ty; - FrameSlot slot; CGParamDesc pd; - FrameSlotDesc fsd; ApiSourceLocal *rec; CfreeCgLocal handle; + CGLocalStorage storage; + u32 size; + u32 align; if (!g) return CFREE_CG_LOCAL_NONE; ty = resolve_type(g->c, type); @@ -3096,27 +3097,28 @@ CfreeCgLocal cfree_cg_param(CfreeCg *g, uint32_t index, CfreeCgTypeId type, if (handle == CFREE_CG_LOCAL_NONE || !api_grow_locals(g, g->nlocals + 1u)) return CFREE_CG_LOCAL_NONE; - memset(&fsd, 0, sizeof fsd); - fsd.type = ty; - fsd.name = (Sym)attrs.name; - fsd.loc = g->cur_loc; - fsd.size = abi_cg_sizeof(g->c->abi, type); - fsd.align = attrs.align ? attrs.align : abi_cg_alignof(g->c->abi, type); - fsd.kind = FS_PARAM; - if (api_source_flags_addr_taken(attrs.flags)) - fsd.flags |= FSF_ADDR_TAKEN; - slot = g->target->frame_slot(g->target, &fsd); + size = abi_cg_sizeof(g->c->abi, type); + align = attrs.align ? attrs.align : abi_cg_alignof(g->c->abi, type); memset(&pd, 0, sizeof pd); pd.index = index; pd.name = (Sym)attrs.name; pd.type = ty; - pd.slot = slot; + pd.size = size; + pd.align = align; + if (api_source_flags_addr_taken(attrs.flags)) + pd.flags |= CG_LOCAL_ADDR_TAKEN; + if (api_local_requires_memory(g, ty, attrs)) + pd.flags |= CG_LOCAL_MEMORY_REQUIRED; if (g->fn_abi && index < g->fn_abi->nparams) { pd.abi = &g->fn_abi->params[index]; } pd.loc = g->cur_loc; - g->target->param(g->target, &pd); + storage = g->target->param(g->target, &pd); + if (storage.kind == CG_LOCAL_STORAGE_REG) { + cg_simple_regalloc_reserve(&g->regalloc, (RegClass)api_type_class(ty), + storage.v.reg); + } rec = &g->locals[g->nlocals++]; memset(rec, 0, sizeof *rec); @@ -3128,13 +3130,10 @@ CfreeCgLocal cfree_cg_param(CfreeCg *g, uint32_t index, CfreeCgTypeId type, rec->desc.type = ty; rec->desc.name = (Sym)attrs.name; rec->desc.loc = g->cur_loc; - rec->desc.size = fsd.size; - rec->desc.align = fsd.align; - rec->desc.flags = api_source_flags_addr_taken(attrs.flags) - ? CG_LOCAL_ADDR_TAKEN | CG_LOCAL_MEMORY_REQUIRED - : CG_LOCAL_MEMORY_REQUIRED; - rec->storage.kind = CG_LOCAL_STORAGE_FRAME; - rec->storage.v.frame_slot = slot; + rec->desc.size = size; + rec->desc.align = align; + rec->desc.flags = pd.flags; + rec->storage = storage; rec->param_index = index; rec->kind = API_SOURCE_LOCAL_PARAM; return handle; @@ -3267,8 +3266,7 @@ void cfree_cg_push_local(CfreeCg *g, CfreeCgLocal local) { rec = api_local_from_handle(g, local); if (!rec) return; - if (rec->kind == API_SOURCE_LOCAL_AUTO && - rec->storage.kind == CG_LOCAL_STORAGE_REG) { + if (rec->storage.kind == CG_LOCAL_STORAGE_REG) { api_push_source_reg_lvalue(g, local, rec->storage.v.reg, rec->type); } else if (rec->kind == API_SOURCE_LOCAL_AUTO) { api_push_source_frame_lvalue(g, local, rec->storage.v.frame_slot, @@ -3455,7 +3453,7 @@ void cfree_cg_addr(CfreeCg *g) { rec = v.source_local != CFREE_CG_LOCAL_NONE ? api_local_from_handle(g, v.source_local) : NULL; - if (rec && rec->kind == API_SOURCE_LOCAL_AUTO && T->local_addr) + if (rec && rec->storage.kind == CG_LOCAL_STORAGE_REG && T->local_addr) T->local_addr(T, dst, &rec->desc, rec->storage); else T->addr_of(T, dst, v.op); diff --git a/src/arch/aa64/emit.c b/src/arch/aa64/emit.c @@ -500,16 +500,68 @@ FrameSlot aa_frame_slot(CGTarget *t, const FrameSlotDesc *d) { * Parameters * ============================================================ */ -void aa_param(CGTarget *t, const CGParamDesc *p) { +CGLocalStorage aa_param(CGTarget *t, const CGParamDesc *p) { AAImpl *a = impl_of(t); - AASlot *s = aa64_slot_get(a, p->slot); - if (!s) { + CGLocalStorage st = p->storage; + if (st.kind == CG_LOCAL_STORAGE_FRAME && st.v.frame_slot == FRAME_SLOT_NONE) { + FrameSlotDesc fsd = {0}; + fsd.type = p->type; + fsd.name = p->name; + fsd.loc = p->loc; + fsd.size = p->size; + fsd.align = p->align; + fsd.kind = FS_PARAM; + if (p->flags & CG_LOCAL_ADDR_TAKEN) fsd.flags |= FSF_ADDR_TAKEN; + st.v.frame_slot = aa_frame_slot(t, &fsd); + } + AASlot *s = st.kind == CG_LOCAL_STORAGE_FRAME + ? aa64_slot_get(a, st.v.frame_slot) + : NULL; + if (st.kind == CG_LOCAL_STORAGE_FRAME && !s) { compiler_panic(t->c, a->loc, "aarch64 param: bad slot"); } const ABIArgInfo *ai = p->abi; if (ai->kind == ABI_ARG_IGNORE) - return; + return st; + if (st.kind == CG_LOCAL_STORAGE_REG) { + if (ai->kind != ABI_ARG_DIRECT || ai->nparts != 1) { + compiler_panic(t->c, a->loc, + "aarch64 param: register storage requires one direct part"); + } + const ABIArgPart *pt = &ai->parts[0]; + u32 sz = pt->size; + u32 sidx = size_idx_for_bytes(sz); + if (pt->cls == ABI_CLASS_INT) { + u32 dst = reg_num((Operand){.kind = OPK_REG, .v.reg = st.v.reg}); + if (a->next_param_int < 8) { + u32 src = a->next_param_int++; + u32 sf = (sz == 8) ? 1u : 0u; + if (dst != src) aa64_emit32(t->mc, aa64_mov_reg(sf, dst, src)); + } else { + u32 caller_off = a->next_param_stack; + a->next_param_stack += 8; + aa64_emit_ldur_off(t->mc, sidx, dst, 29, (i32)(16 + caller_off), + AA_TMP0); + } + } else if (pt->cls == ABI_CLASS_FP) { + u32 dst = reg_num((Operand){.kind = OPK_REG, .v.reg = st.v.reg}); + if (a->next_param_fp < 8) { + u32 src = a->next_param_fp++; + u32 type = (sz == 8) ? 1u : 0u; + if (dst != src) aa64_emit32(t->mc, aa64_fmov_reg(type, dst, src)); + } else { + u32 caller_off = a->next_param_stack; + a->next_param_stack += 8; + aa64_emit_ldur_fp_off(t->mc, sidx, dst, 29, + (i32)(16 + caller_off), AA_TMP0); + } + } else { + compiler_panic(t->c, a->loc, "aarch64 param: ABI class %d unimpl", + (int)pt->cls); + } + return st; + } if (ai->kind == ABI_ARG_INDIRECT) { u32 ptr_reg; if (a->next_param_int < 8) { @@ -547,7 +599,7 @@ void aa_param(CGTarget *t, const CGParamDesc *p) { AA_TMP2); i += 1; } - return; + return st; } for (u16 i = 0; i < ai->nparts; ++i) { const ABIArgPart *pt = &ai->parts[i]; @@ -586,6 +638,7 @@ void aa_param(CGTarget *t, const CGParamDesc *p) { (int)pt->cls); } } + return st; } /* ============================================================ diff --git a/src/arch/aa64/internal.h b/src/arch/aa64/internal.h @@ -300,7 +300,7 @@ void aa_func_begin_known_frame(CGTarget* t, const CGFuncDesc* fd, const CGKnownFrameDesc* frame, FrameSlot* out_slots); void aa_func_end(CGTarget* t); -void aa_param(CGTarget* t, const CGParamDesc* p); +CGLocalStorage aa_param(CGTarget* t, const CGParamDesc* p); /* alloc.c helpers used in emit.c / ops.c */ AAImpl* impl_of(CGTarget* t); diff --git a/src/arch/arch.h b/src/arch/arch.h @@ -332,7 +332,10 @@ typedef struct CGParamDesc { u32 index; Sym name; CfreeCgTypeId type; - FrameSlot slot; + u32 size; + u32 align; + u32 flags; /* CGLocalFlag */ + CGLocalStorage storage; const ABIArgInfo* abi; const CGABIPart* incoming; u32 nincoming; @@ -538,7 +541,7 @@ struct CGTarget { CGLocalStorage (*local)(CGTarget*, const CGLocalDesc*); void (*local_addr)(CGTarget*, Operand dst, const CGLocalDesc*, CGLocalStorage); - void (*param)(CGTarget*, const CGParamDesc*); + CGLocalStorage (*param)(CGTarget*, const CGParamDesc*); void (*spill_reg)(CGTarget*, Operand src_reg, FrameSlot, MemAccess); void (*reload_reg)(CGTarget*, Operand dst_reg, FrameSlot, MemAccess); diff --git a/src/arch/rv64/alloc.c b/src/arch/rv64/alloc.c @@ -37,17 +37,75 @@ RvSlot* rv64_slot_get(RImpl* a, FrameSlot fs) { /* ---- param ---- */ -void rv_param(CGTarget* t, const CGParamDesc* p) { +CGLocalStorage rv_param(CGTarget* t, const CGParamDesc* p) { RImpl* a = impl_of(t); MCEmitter* mc = t->mc; - RvSlot* s = rv64_slot_get(a, p->slot); - if (!s) compiler_panic(t->c, a->loc, "rv64 param: bad slot"); + CGLocalStorage st = p->storage; + if (st.kind == CG_LOCAL_STORAGE_FRAME && st.v.frame_slot == FRAME_SLOT_NONE) { + FrameSlotDesc fsd = {0}; + fsd.type = p->type; + fsd.name = p->name; + fsd.loc = p->loc; + fsd.size = p->size; + fsd.align = p->align; + fsd.kind = FS_PARAM; + if (p->flags & CG_LOCAL_ADDR_TAKEN) fsd.flags |= FSF_ADDR_TAKEN; + st.v.frame_slot = rv_frame_slot(t, &fsd); + } + RvSlot* s = st.kind == CG_LOCAL_STORAGE_FRAME + ? rv64_slot_get(a, st.v.frame_slot) + : NULL; + if (st.kind == CG_LOCAL_STORAGE_FRAME && !s) + compiler_panic(t->c, a->loc, "rv64 param: bad slot"); const ABIArgInfo* ai = p->abi; /* Caller's stack args start above the saved-s0/ra pair, plus the * 64-byte variadic save area when this function is variadic. */ i32 caller_stack_base = 16 + (a->is_variadic ? 64 : 0); - if (ai->kind == ABI_ARG_IGNORE) return; + if (ai->kind == ABI_ARG_IGNORE) return st; + if (st.kind == CG_LOCAL_STORAGE_REG) { + if (ai->kind != ABI_ARG_DIRECT || ai->nparts != 1) { + compiler_panic(t->c, a->loc, + "rv64 param: register storage requires one direct part"); + } + const ABIArgPart* pt = &ai->parts[0]; + u32 sz = pt->size; + if (pt->cls == ABI_CLASS_INT) { + u32 dst = st.v.reg; + if (a->next_param_int < 8) { + u32 src = RV_A0 + a->next_param_int; + a->next_param_int++; + if (dst != src) rv64_emit32(mc, rv_addi(dst, src, 0)); + } else { + u32 caller_off = a->next_param_stack; + a->next_param_stack += 8; + rv64_emit32(mc, enc_int_load(sz, 0, dst, RV_S0, + caller_stack_base + (i32)caller_off)); + } + } else if (pt->cls == ABI_CLASS_FP) { + u32 dst = st.v.reg; + if (a->next_param_fp < 8) { + u32 src = 10u + a->next_param_fp; + a->next_param_fp++; + if (dst != src) { + rv64_emit32(mc, rv_fsgnj(sz == 8 ? 1u : 0u, dst, src, src)); + } + } else { + u32 caller_off = a->next_param_stack; + a->next_param_stack += 8; + if (sz == 8) + rv64_emit32(mc, rv_fld(dst, RV_S0, + caller_stack_base + (i32)caller_off)); + else + rv64_emit32(mc, rv_flw(dst, RV_S0, + caller_stack_base + (i32)caller_off)); + } + } else { + compiler_panic(t->c, a->loc, "rv64 param: ABI class %d unimpl", + (int)pt->cls); + } + return st; + } if (ai->kind == ABI_ARG_INDIRECT) { /* Pointer-to-copy passed in a-register. Copy bytes from there into * the home slot. Source pointer is in a0..a7. */ @@ -86,7 +144,7 @@ void rv_param(CGTarget* t, const CGParamDesc* p) { rv64_emit32(mc, rv_sb(RV_T2, RV_S0, -(i32)s->off + (i32)i)); i += 1; } - return; + return st; } /* DIRECT */ for (u16 i = 0; i < ai->nparts; ++i) { @@ -134,6 +192,7 @@ void rv_param(CGTarget* t, const CGParamDesc* p) { (int)pt->cls); } } + return st; } void rv_spill_reg(CGTarget* t, Operand src, FrameSlot slot, diff --git a/src/arch/rv64/internal.h b/src/arch/rv64/internal.h @@ -138,7 +138,7 @@ _Noreturn void rv_panic(CGTarget *t, const char *what); /* ---- alloc.c: all functions (non-static; referenced by ops.c vtable) ---- */ FrameSlot rv_frame_slot(CGTarget *t, const FrameSlotDesc *d); RvSlot *rv64_slot_get(RImpl *a, FrameSlot fs); -void rv_param(CGTarget *t, const CGParamDesc *p); +CGLocalStorage rv_param(CGTarget *t, const CGParamDesc *p); void rv_spill_reg(CGTarget *t, Operand src, FrameSlot slot, MemAccess ma); void rv_reload_reg(CGTarget *t, Operand dst, FrameSlot slot, MemAccess ma); Label rv_label_new(CGTarget *t); diff --git a/src/arch/x64/alloc.c b/src/arch/x64/alloc.c @@ -48,14 +48,66 @@ XSlot* x64_slot_get(XImpl* a, FrameSlot fs) { return &a->slots[fs - 1]; } -/* ---- param: store incoming arg(s) into the home slot ---- */ -void x_param(CGTarget* t, const CGParamDesc* p) { +/* ---- param: bind incoming arg(s) to the requested storage ---- */ +CGLocalStorage x_param(CGTarget* t, const CGParamDesc* p) { XImpl* a = impl_of(t); - XSlot* s = x64_slot_get(a, p->slot); - if (!s) compiler_panic(t->c, a->loc, "x64 param: bad slot"); + CGLocalStorage st = p->storage; + if (st.kind == CG_LOCAL_STORAGE_FRAME && st.v.frame_slot == FRAME_SLOT_NONE) { + FrameSlotDesc fsd = {0}; + fsd.type = p->type; + fsd.name = p->name; + fsd.loc = p->loc; + fsd.size = p->size; + fsd.align = p->align; + fsd.kind = FS_PARAM; + if (p->flags & CG_LOCAL_ADDR_TAKEN) fsd.flags |= FSF_ADDR_TAKEN; + st.v.frame_slot = x_frame_slot(t, &fsd); + } + XSlot* s = st.kind == CG_LOCAL_STORAGE_FRAME + ? x64_slot_get(a, st.v.frame_slot) + : NULL; + if (st.kind == CG_LOCAL_STORAGE_FRAME && !s) + compiler_panic(t->c, a->loc, "x64 param: bad slot"); const ABIArgInfo* ai = p->abi; - if (ai->kind == ABI_ARG_IGNORE) return; + if (ai->kind == ABI_ARG_IGNORE) return st; + if (st.kind == CG_LOCAL_STORAGE_REG) { + if (ai->kind != ABI_ARG_DIRECT || ai->nparts != 1) { + compiler_panic(t->c, a->loc, + "x64 param: register storage requires one direct part"); + } + const ABIArgPart* pt = &ai->parts[0]; + u32 sz = pt->size; + if (pt->cls == ABI_CLASS_INT) { + if (a->next_param_int < 6) { + u32 src = g_int_arg_regs[a->next_param_int++]; + u32 dst = st.v.reg & 0xFu; + int w = (sz == 8) ? 1 : 0; + if (dst != src) emit_mov_rr(t->mc, w, dst, src); + } else { + u32 caller_off = a->next_param_stack; + a->next_param_stack += 8; + emit_mov_load(t->mc, sz, 0, st.v.reg & 0xFu, X64_RBP, + (i32)(16 + caller_off)); + } + } else if (pt->cls == ABI_CLASS_FP) { + u8 prefix = (sz == 8) ? 0xF2 : 0xF3; + u32 dst = st.v.reg & 0xFu; + if (a->next_param_fp < 8) { + u32 src = a->next_param_fp++; + if (dst != src) emit_sse_rr(t->mc, prefix, 0x10, dst, src); + } else { + u32 caller_off = a->next_param_stack; + a->next_param_stack += 8; + emit_sse_load(t->mc, prefix, 0x10, dst, X64_RBP, + (i32)(16 + caller_off)); + } + } else { + compiler_panic(t->c, a->loc, "x64 param: ABI class %d unimpl", + (int)pt->cls); + } + return st; + } if (ai->kind == ABI_ARG_INDIRECT) { /* Incoming pointer to byval copy: load pointer, memcpy into slot. */ u32 ptr_reg; @@ -89,7 +141,7 @@ void x_param(CGTarget* t, const CGParamDesc* p) { emit_mov_store(t->mc, 1, X64_RAX, X64_RBP, -(i32)s->off + (i32)i); i += 1; } - return; + return st; } /* DIRECT */ for (u16 i = 0; i < ai->nparts; ++i) { @@ -129,6 +181,7 @@ void x_param(CGTarget* t, const CGParamDesc* p) { (int)pt->cls); } } + return st; } void x_spill_reg(CGTarget* t, Operand src, FrameSlot slot, diff --git a/src/arch/x64/internal.h b/src/arch/x64/internal.h @@ -200,7 +200,7 @@ void emit_sse_rr_w(MCEmitter *mc, u8 prefix, u8 opcode, int w, u32 dst, /* --- alloc.c exports (used by emit.c and/or ops.c) --- */ XSlot *x64_slot_get(XImpl *a, FrameSlot fs); FrameSlot x_frame_slot(CGTarget *t, const FrameSlotDesc *d); -void x_param(CGTarget *t, const CGParamDesc *p); +CGLocalStorage x_param(CGTarget *t, const CGParamDesc *p); void x_spill_reg(CGTarget *t, Operand src, FrameSlot slot, MemAccess ma); void x_reload_reg(CGTarget *t, Operand dst, FrameSlot slot, MemAccess ma); Label x_label_new(CGTarget *t); diff --git a/src/opt/ir.c b/src/opt/ir.c @@ -172,7 +172,10 @@ void ir_param_add(Func* f, const CGParamDesc* d) { p->index = d->index; p->name = d->name; p->type = d->type; - p->slot = d->slot; + p->size = d->size; + p->align = d->align; + p->flags = d->flags; + p->storage = d->storage; p->abi = d->abi; p->loc = d->loc; } diff --git a/src/opt/ir.h b/src/opt/ir.h @@ -212,7 +212,10 @@ typedef struct IRParam { u32 index; Sym name; CfreeCgTypeId type; - FrameSlot slot; + u32 size; + u32 align; + u32 flags; /* CGLocalFlag */ + CGLocalStorage storage; const ABIArgInfo* abi; SrcLoc loc; } IRParam; diff --git a/src/opt/opt.c b/src/opt/opt.c @@ -151,6 +151,19 @@ static FrameSlot opt_local_frame_slot(Func* f, const CGLocalDesc* d, return ir_frame_slot_new(f, &fsd); } +static FrameSlot opt_param_frame_slot(Func* f, const CGParamDesc* d) { + FrameSlotDesc fsd; + memset(&fsd, 0, sizeof fsd); + fsd.type = d->type; + fsd.name = d->name; + fsd.loc = d->loc; + fsd.size = d->size; + fsd.align = d->align; + fsd.kind = FS_PARAM; + if (d->flags & CG_LOCAL_ADDR_TAKEN) fsd.flags |= FSF_ADDR_TAKEN; + return ir_frame_slot_new(f, &fsd); +} + static u8 opt_local_reg_class_for(Compiler* c, CfreeCgTypeId ty) { CfreeCgTypeKind kind = cfree_cg_type_kind((CfreeCompiler*)c, ty); return kind == CFREE_CG_TYPE_FLOAT ? RC_FP : RC_INT; @@ -384,16 +397,51 @@ static void opt_frame_home_addr_taken_locals(Func* f) { } } -static void w_param(CGTarget* t, const CGParamDesc* d) { +static CGLocalStorage w_param(CGTarget* t, const CGParamDesc* d) { OptImpl* o = impl_of(t); + CGLocalStorage st = d->storage; + CGLocalDesc local_desc; + memset(&local_desc, 0, sizeof local_desc); + local_desc.type = d->type; + local_desc.name = d->name; + local_desc.loc = d->loc; + local_desc.size = d->size; + local_desc.align = d->align; + local_desc.flags = d->flags; + if (st.kind == CG_LOCAL_STORAGE_FRAME && st.v.frame_slot == FRAME_SLOT_NONE) { + if ((d->flags & (CG_LOCAL_ADDR_TAKEN | CG_LOCAL_MEMORY_REQUIRED)) == 0) { + Val v = ir_alloc_val(o->f, d->type, opt_local_reg_class(o, d->type)); + st.kind = CG_LOCAL_STORAGE_REG; + st.v.reg = (Reg)v; + } else { + st.kind = CG_LOCAL_STORAGE_FRAME; + st.v.frame_slot = opt_param_frame_slot(o->f, d); + } + } /* Deep-copy parts so caller-stack memory isn't relied on. */ CGParamDesc copy = *d; + copy.storage = st; if (d->nincoming) { CGABIPart* parts = arena_array(o->f->arena, CGABIPart, d->nincoming); memcpy(parts, d->incoming, sizeof(CGABIPart) * d->nincoming); copy.incoming = parts; } ir_param_add(o->f, &copy); + ir_local_add(o->f, &local_desc, st); + if (st.kind == CG_LOCAL_STORAGE_REG) { + Inst* in = rec(o, IR_PARAM_DECL); + in->def = (Val)st.v.reg; + in->type = d->type; + in->opnds = arena_array(o->f->arena, Operand, 1); + in->opnds[0].kind = OPK_REG; + in->opnds[0].cls = opt_local_reg_class(o, d->type); + in->opnds[0].type = d->type; + in->opnds[0].v.reg = st.v.reg; + in->nopnds = 1; + o->f->val_def_block[st.v.reg] = o->cur; + o->f->val_def_inst[st.v.reg] = o->f->blocks[o->cur].ninsts - 1u; + } + return st; } static void w_spill_reg(CGTarget* t, Operand src, FrameSlot s, MemAccess m) { @@ -1112,6 +1160,31 @@ static FrameSlot slot_to_target(ReplayCtx* r, FrameSlot vs) { return r->slot_map[vs]; } +static CGLocalStorage xlat_storage(ReplayCtx* r, CGLocalStorage st, + CfreeCgTypeId ty) { + (void)ty; + if (st.kind == CG_LOCAL_STORAGE_REG) { + Val v = (Val)st.v.reg; + if (r->identity_regs && r->f->opt_rewritten && v < r->f->nvals && + r->f->val_info) { + OptValInfo* vi = &r->f->val_info[v]; + if (vi->alloc_kind == OPT_ALLOC_HARD) { + st.v.reg = vi->hard_reg; + } else if (vi->alloc_kind == OPT_ALLOC_SPILL) { + st.kind = CG_LOCAL_STORAGE_FRAME; + st.v.frame_slot = slot_to_target(r, vi->spill_slot); + } else { + st.v.reg = val_to_target_reg(r, v); + } + } else { + st.v.reg = val_to_target_reg(r, v); + } + } else { + st.v.frame_slot = slot_to_target(r, st.v.frame_slot); + } + return st; +} + static Operand xlat_op(ReplayCtx* r, Operand op) { switch ((OpKind)op.kind) { case OPK_IMM: @@ -1574,9 +1647,8 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) { for (u32 i = 0; i < f->nframe_slots; ++i) r.slot_map[f->frame_slots[i].id] = target_slots[i]; } else { - /* func_begin with the recorded descriptor. The desc.params[].slot - * fields are wrapper IR slot ids; aarch64's func_begin doesn't - * dereference them so we don't translate. */ + /* func_begin with the recorded descriptor. Parameter storage is replayed + * through target->param below after frame slots are mapped. */ w->func_begin(w, &f->desc); } @@ -1613,10 +1685,13 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) { d.index = p->index; d.name = p->name; d.type = p->type; - d.slot = slot_to_target(&r, p->slot); + d.size = p->size; + d.align = p->align; + d.flags = p->flags; + d.storage = xlat_storage(&r, p->storage, p->type); d.abi = p->abi; d.loc = p->loc; - w->param(w, &d); + (void)w->param(w, &d); } /* Body in emit order — the order CG's emit cursor visited each diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c @@ -1325,6 +1325,10 @@ static void rewrite_func(Func* f, const OptLiveInfo* live_info) { for (u32 ri = bl->ninsts; ri > 0; --ri) { u32 i = ri - 1u; Inst in = bl->insts[i]; + if ((IROp)in.op == IR_PARAM_DECL) { + out_prepend_inst(f, &out, &in); + continue; + } refs_reset(&refs); walk_inst_operands(f, &in, refs_collect, &refs); list_reset(&before); @@ -1988,6 +1992,7 @@ static int inst_has_side_effect(Func* f, const Inst* in) { return aux && mem_observable(&aux->access.storage); } case IR_ALLOCA: + case IR_PARAM_DECL: case IR_STORE: case IR_AGG_COPY: case IR_AGG_SET: diff --git a/test/api/cg_type_test.c b/test/api/cg_type_test.c @@ -979,7 +979,7 @@ static void exercise_cg_constfold_phases(CfreeCompiler* c, uint32_t partial_size = cg_emit_local_shadow_partial_store( c, i32_ty, i8_ty, "cg_shadow_partial_o1"); - EXPECT(delayed_size <= 52, + EXPECT(delayed_size <= 56, "delayed arithmetic chain should materialize as one add, text " "size=%u", delayed_size); @@ -989,7 +989,7 @@ static void exercise_cg_constfold_phases(CfreeCompiler* c, EXPECT(local_size <= 32, "local constant shadow should fold x=40; return x+2, text size=%u", local_size); - EXPECT(delayed_cmp_size <= 60, + EXPECT(delayed_cmp_size <= 64, "delayed arithmetic consumed by compare should stay compact, text " "size=%u", delayed_cmp_size); diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c @@ -423,6 +423,8 @@ typedef struct MockCGTarget { int store_calls; int addr_of_calls; int cmp_branch_calls; + int param_calls; + CGLocalStorage last_param_storage; } MockCGTarget; static void mock_func_begin(CGTarget* t, const CGFuncDesc* d) { @@ -528,6 +530,27 @@ static void mock_store(CGTarget* t, Operand addr, Operand src, MemAccess macc) { ++m->store_calls; } +static FrameSlot mock_frame_slot(CGTarget* t, const FrameSlotDesc* d); + +static CGLocalStorage mock_param(CGTarget* t, const CGParamDesc* p) { + MockCGTarget* m = (MockCGTarget*)t; + CGLocalStorage st = p->storage; + ++m->param_calls; + if (st.kind == CG_LOCAL_STORAGE_FRAME && st.v.frame_slot == FRAME_SLOT_NONE) { + FrameSlotDesc fsd = {0}; + fsd.type = p->type; + fsd.name = p->name; + fsd.loc = p->loc; + fsd.size = p->size; + fsd.align = p->align; + fsd.kind = FS_PARAM; + if (p->flags & CG_LOCAL_ADDR_TAKEN) fsd.flags |= FSF_ADDR_TAKEN; + st.v.frame_slot = mock_frame_slot(t, &fsd); + } + m->last_param_storage = st; + return st; +} + static void mock_addr_of(CGTarget* t, Operand dst, Operand lv) { MockCGTarget* m = (MockCGTarget*)t; (void)dst; @@ -591,6 +614,7 @@ static void mock_init(MockCGTarget* m, Compiler* c) { m->base.copy = mock_copy; m->base.load = mock_load; m->base.store = mock_store; + m->base.param = mock_param; m->base.addr_of = mock_addr_of; m->base.ret = mock_ret; m->base.set_loc = mock_set_loc; @@ -2525,6 +2549,17 @@ static CGLocalDesc local_desc_(CfreeCgTypeId ty, u32 size, u32 align, return d; } +static CGParamDesc param_desc_(CfreeCgTypeId ty, u32 size, u32 align, + u32 flags) { + CGParamDesc d; + memset(&d, 0, sizeof d); + d.type = ty; + d.size = size; + d.align = align; + d.flags = flags; + return d; +} + static void opt_local_hook_chooses_register_for_scalar(void) { TestCtx tc; tc_init(&tc); @@ -2544,6 +2579,60 @@ static void opt_local_hook_chooses_register_for_scalar(void) { tc_fini(&tc); } +static void opt_param_hook_chooses_register_for_scalar(void) { + TestCtx tc; + tc_init(&tc); + MockCGTarget mock; + mock_init(&mock, tc.c); + static const Reg pool[] = {19}; + static const Reg scratch[] = {9, 10}; + mock_set_pool(&mock, RC_INT, pool, 1, scratch, 2, 0x4007FFFFu); + + CGTarget* opt = opt_cgtarget_new(tc.c, &mock.base, 1); + begin_mock_opt_func(&tc, opt, tc.i32); + + CGParamDesc d = param_desc_(tc.i32, 4, 4, 0); + CGLocalStorage st = opt->param(opt, &d); + EXPECT(st.kind == CG_LOCAL_STORAGE_REG, + "non-address-taken scalar param should be register-backed"); + EXPECT(st.v.reg != (Reg)REG_NONE, "register-backed param needs a vreg"); + + CGABIValue retv = {0}; + retv.type = tc.i32; + retv.storage = op_reg_(st.v.reg, tc.i32); + opt->ret(opt, &retv); + opt->func_end(opt); + + EXPECT(mock.param_calls == 1, "param should replay to wrapped backend"); + EXPECT(mock.last_param_storage.kind == CG_LOCAL_STORAGE_REG, + "replayed scalar param should remain register-backed"); + EXPECT(mock.last_param_storage.v.reg == 19, + "replayed param storage should be allocated hard reg r19, got r%u", + (unsigned)mock.last_param_storage.v.reg); + + opt->destroy(opt); + tc_fini(&tc); +} + +static void opt_param_memory_required_uses_frame(void) { + TestCtx tc; + tc_init(&tc); + MockCGTarget mock; + mock_init(&mock, tc.c); + + CGTarget* opt = opt_cgtarget_new(tc.c, &mock.base, 1); + begin_mock_opt_func(&tc, opt, tc.i32); + + CGParamDesc d = + param_desc_(tc.i32, 4, 4, CG_LOCAL_ADDR_TAKEN | CG_LOCAL_MEMORY_REQUIRED); + CGLocalStorage st = opt->param(opt, &d); + EXPECT(st.kind == CG_LOCAL_STORAGE_FRAME, + "memory-required param should be frame-backed"); + + opt->destroy(opt); + tc_fini(&tc); +} + static void opt_local_addr_taken_uses_frame_and_replays_addr_of(void) { TestCtx tc; tc_init(&tc); @@ -2686,6 +2775,8 @@ int main(void) { opt_records_const_bytes_by_value(); opt_cmp_branch_keeps_fallthrough_after_block_growth(); opt_local_hook_chooses_register_for_scalar(); + opt_param_hook_chooses_register_for_scalar(); + opt_param_memory_required_uses_frame(); opt_local_addr_taken_uses_frame_and_replays_addr_of(); opt_register_local_addr_frame_homes(); simple_regalloc_reports_exact_used_regs();