commit 64a16dbb17499ff7c984aa6980a5c9cf3a5487a3
parent da2192d8a98985012e6f3532c233f02a4efbaa55
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Sat, 16 May 2026 00:39:37 -0700
Make call setup visible to opt
Diffstat:
11 files changed, 608 insertions(+), 82 deletions(-)
diff --git a/doc/OPT_REGS_CALL_PLAN.md b/doc/OPT_REGS_CALL_PLAN.md
@@ -23,11 +23,13 @@ allocation scoring, and preserves hard-assigned live-across-call values by
intersecting the assigned register with the planned call's clobber mask.
Post-RA hard-register liveness uses the same call-specific clobber mask.
-For supported non-tail, non-sret, register-argument call plans, O1 now replays
-calls by materializing arguments with a local parallel-copy resolver, invoking
-the backend's `emit_call_plan` hook, and extracting returns from fixed return
-registers. The x64, AArch64, and RV64 backends implement `emit_call_plan` as a
-branch-only call emission hook.
+For supported non-tail call plans, O1 now replays calls by materializing
+arguments with a local parallel-copy resolver, invoking backend stack-argument
+and branch-only call-plan hooks, and extracting returns from fixed return
+registers. Address-valued call moves cover byval/indirect arguments and hidden
+sret destination pointers. The x64, AArch64, and RV64 backends implement
+`store_call_arg` for outgoing stack slots and `emit_call_plan` for the call
+branch.
What this closes:
@@ -44,19 +46,18 @@ What remains open:
- call setup/return extraction are represented by call-plan aux data rather
than separate first-class IR ops;
-- stack-argument call plans, sret calls, and tail calls still fall back to the
- legacy backend `call` hook;
+- tail calls still fall back to the legacy backend `call` hook;
- target `get_phys_regs` tables expose broader O1 pools, but ABI argument and
return registers are suppressed for functions with incoming parameters or
- legacy call fallback until those setup paths are also opt-visible;
+ legacy call fallback until incoming parameter setup is also opt-visible;
- direct CG still uses legacy allocation/call hooks;
- broader real-architecture call-plan layout tests and code-shape probes remain
to be added.
In phase terms: Phase 1 and Phase 2 are done, Phase 3 is implemented through
call-plan aux visibility plus planned replay for supported call shapes, Phase 4
-is implemented for register argument/return moves with stack/sret/tail fallback,
-Phase 5 has an initial guarded implementation, and Phase 6 remains open.
+is implemented for register, stack, sret, and return moves with tail fallback,
+Phase 5 is implemented for call setup/replay, and Phase 6 remains open.
## Planned Call Replay Boundary
@@ -69,15 +70,18 @@ Planned replay is used only when all of the following are true:
- the call has a valid `CGCallPlan`;
- the backend provides `emit_call_plan`;
- the call is not a tail call;
-- the call is not an sret call;
-- every argument destination is `CG_CALL_PLAN_REG`;
+- every stack argument destination has backend `store_call_arg` support;
+- every offset/address-valued argument source has backend `load_call_arg`
+ support;
+- every offset aggregate return store has backend `store_call_ret` support;
- every return destination is a register, local, or indirect operand.
For those calls, O1 owns the setup and extraction sequence:
- source operands are rewritten to hard registers or spill slots;
- live-across-call hard registers are saved before argument setup;
-- argument moves into ABI registers are resolved as a local parallel copy;
+- argument moves into ABI registers and outgoing stack slots are resolved as a
+ local parallel copy;
- indirect callees that would be overwritten by argument setup are copied to a
target-provided scratch register first;
- the backend emits only required call metadata and the branch through
@@ -86,21 +90,15 @@ For those calls, O1 owns the setup and extraction sequence:
The fallback path is still required for:
-- **outgoing stack arguments**: `CG_CALL_PLAN_STACK` records the ABI stack
- offset, but opt replay does not yet have a backend-neutral operation for
- writing into the target-owned outgoing area while preserving known-frame and
- `max_outgoing` invariants;
-- **sret calls**: backends still synthesize the hidden destination pointer from
- frame-slot state inside `call`;
- **tail calls**: the legacy hook owns epilogue emission, legality checks, and
branch-without-continuation behavior;
- **direct CG**: direct codegen still uses the old backend allocation and call
hooks while O1 migrates first.
This boundary lets Phase 3/4 tests exercise register argument permutation,
-indirect-callee clobber hazards, call-specific clobber preservation, and return
-extraction without broadening the register file before stack/sret/tail lowering
-is explicit enough to be target-independent.
+outgoing stack arguments, sret hidden pointers, indirect-callee clobber hazards,
+call-specific clobber preservation, and return extraction without broadening
+the register file across still-legacy tail-call lowering.
## Current Problem
@@ -258,11 +256,18 @@ typedef enum CGCallPlanLocKind {
CG_CALL_PLAN_IGNORE,
} CGCallPlanLocKind;
+typedef enum CGCallPlanSrcKind {
+ CG_CALL_PLAN_SRC_VALUE,
+ CG_CALL_PLAN_SRC_ADDR,
+} CGCallPlanSrcKind;
+
typedef struct CGCallPlanMove {
Operand src; /* virtual value, local, indirect, imm, or global */
u8 dst_kind; /* CGCallPlanLocKind */
+ u8 src_kind; /* CGCallPlanSrcKind: value vs address materialization */
u8 cls; /* RegClass for register destinations */
Reg dst_reg; /* valid for CG_CALL_PLAN_REG */
+ u32 src_offset; /* byte offset within aggregate source */
u32 stack_offset; /* valid for CG_CALL_PLAN_STACK */
MemAccess mem; /* width/sign for loads/stores */
} CGCallPlanMove;
@@ -271,6 +276,7 @@ typedef struct CGCallPlanRet {
Operand dst; /* virtual destination in current IR */
u8 cls;
Reg src_reg;
+ u32 dst_offset; /* byte offset within aggregate destination */
MemAccess mem;
} CGCallPlanRet;
@@ -358,12 +364,17 @@ come from per-call clobber masks in rewrite.
Backends should gain emission hooks for an already-planned call:
```c
+void (*load_call_arg)(CGTarget*, Operand dst, const CGCallPlanMove*);
+void (*store_call_arg)(CGTarget*, const CGCallPlanMove*);
+void (*store_call_ret)(CGTarget*, const CGCallPlanRet*, Operand src);
void (*emit_call_plan)(CGTarget*, const CGCallPlan*);
```
-For the current transition, this hook assumes register arguments have already
-been materialized by opt. Stack-argument plans still use the legacy `call`
-fallback. The hook only emits:
+For the current transition, these hooks assume register arguments have already
+been materialized by opt and stack arguments are written one planned move at a
+time through `store_call_arg`. `load_call_arg` and `store_call_ret` are the
+offset-aware load/store hooks for aggregate parts and address-valued moves.
+`emit_call_plan` only emits:
- required varargs metadata such as x64 `AL`;
- direct or indirect call branch;
@@ -431,23 +442,24 @@ allocator starts using those registers widely.
### Phase 4 - Parallel Copy Resolver
-Status: implemented for register argument/return plans. O1 replay uses a local
+Status: implemented for non-tail call plans. O1 replay uses a local
parallel-copy resolver for planned call setup and return extraction, including
-register-register cycles, local/indirect/immediate/global sources, register
-destinations, local/indirect return destinations, and indirect callees that
-occupy a destination argument register. Stack-argument, sret, and tail-call
-plans continue to use the legacy backend `call` fallback until outgoing
-stack-slot materialization is represented in the target contract.
+register-register cycles, local/indirect loads, address-valued moves,
+immediates, globals, register and outgoing stack destinations, local/indirect
+return destinations, and indirect callees that occupy a destination argument
+register. Tail-call plans continue to use the legacy backend `call` fallback
+until epilogue transfer is represented in the target contract.
- done: implement local parallel move resolution for register call setup and
return extraction;
-- done: support register-register cycles, local/indirect loads, immediates,
- globals, and local/indirect return stores;
+- done: support register-register cycles, local/indirect loads,
+ address-valued moves, immediates, globals, outgoing stack stores, and
+ local/indirect return stores;
- done: use target-provided scratch registers to break cycles and preserve
indirect callees;
- done: add red-green tests for argument permutation cycles, indirect callees in
- argument registers, and stack-argument fallback;
-- still open: support `CG_CALL_PLAN_STACK` materialization directly in opt;
+ argument registers, stack-argument replay, and address-valued args;
+- done: support `CG_CALL_PLAN_STACK` materialization directly in opt;
- still open: add return-register collision and stack-source hazard tests now
 that stack materialization is explicit.
@@ -455,11 +467,12 @@ Expected result: ABI arg and return registers can be made allocable safely.
### Phase 5 - Broaden Register Exposure
-Status: partially implemented. O1 has target-informed scoring and per-call
-preservation, and the native target phys-reg tables now expose broader O1 pools.
-Known backend helper scratch registers remain hidden. ABI arg/return registers
-are available only when O1 can avoid the still-sequential setup paths: functions
-with incoming parameters or legacy call fallback suppress those ABI registers.
+Status: implemented for call setup. O1 has target-informed scoring and
+per-call preservation, and the native target phys-reg tables now expose broader
+O1 pools. Known backend helper scratch registers remain hidden. ABI arg/return
+registers are available when all calls in a function use planned replay; only
+functions with incoming parameters or legacy tail-call fallback suppress those
+ABI registers.
- done: expand target `get_phys_regs` tables with guarded caller-saved and ABI
registers for x64, AArch64, and RV64;
@@ -467,8 +480,9 @@ with incoming parameters or legacy call fallback suppress those ABI registers.
values and callee-saved regs for call-crossing values;
- done: keep known backend helper scratch registers reserved until their
clobbers are expressed;
+- done: remove call-driven ABI-reg suppression for stack and sret call plans;
- still open: remove the ABI-reg suppression after incoming parameter setup and
- stack/sret/tail call setup are opt-visible;
+ tail call setup are opt-visible;
- still open: add code-shape tests for direct-call tiny functions and
 unused-param functions across x64, AArch64, and RV64.
@@ -497,8 +511,8 @@ Focused unit tests:
- done: opt-side target register metadata consumption;
- done: caller-saved live-across-call preservation using per-call masks;
- done: planned-call replay through `emit_call_plan` for register-argument
- cycles and indirect-callee/argument-register hazards;
-- done: legacy fallback for planned calls requiring outgoing stack arguments;
+ cycles, stack arguments, address-valued args, sret-shaped plans, and
+ indirect-callee/argument-register hazards;
- still needed: target register metadata tests per real architecture;
- still needed: broader real-architecture call-plan layout for scalar, FP,
mixed, sret, variadic, and stack-arg calls;
@@ -552,18 +566,21 @@ Completed:
2. Teach `opt_machinize` to consume the new metadata.
3. Add `CGCallPlan` and plan calls without using it for emission.
4. Use call-plan clobber masks for rewrite and post-RA hard-register liveness.
+5. Replay non-tail call plans in opt, including ABI register setup, outgoing
+ stack arguments, address-valued byval/indirect/sret moves, and return
+ extraction.
+6. Remove call-driven ABI-reg suppression for stack-argument and sret-shaped
+ calls.
Next patch stack:
1. Add call-plan layout/dump tests for real x64/AArch64/RV64 scalar, FP, mixed,
sret, variadic, and stack-arg cases.
-2. Extend the target contract with an opt-visible outgoing stack-slot
- materialization path, then remove the stack-argument fallback.
-3. Add red-green hazard tests for return-register collisions and stack-argument
- sources once stack materialization is explicit.
-4. Continue broadening register exposure by removing the current ABI-reg guards
- as incoming-parameter and stack/sret/tail-call setup become opt-visible.
-5. Migrate direct CG or wrap it with internal call planning, then remove legacy
+2. Add red-green hazard tests for return-register collisions and stack-argument
+ sources.
+3. Continue broadening register exposure by removing the remaining ABI-reg
+ guards as incoming-parameter and tail-call setup become opt-visible.
+4. Migrate direct CG or wrap it with internal call planning, then remove legacy
pool semantics.
This order keeps each step testable and avoids mixing API migration, allocation
diff --git a/src/arch/aa64/ops.c b/src/arch/aa64/ops.c
@@ -1096,6 +1096,82 @@ static void aa_emit_call_plan(CGTarget* t, const CGCallPlan* p) {
}
}
+/* Returns `op` adjusted to address `offset` bytes past its original location.
+ * A zero offset, or an operand that is neither indirect nor a local frame
+ * slot, is returned unchanged. Indirect operands get their displacement
+ * bumped; local slots are rewritten as an indirect access based on register
+ * 29 using the slot's recorded frame offset. Panics on an unknown slot. */
+static Operand aa_call_plan_offset_operand(CGTarget* t, Operand op,
+                                           u32 offset) {
+  if (offset == 0) return op;
+  if (op.kind == OPK_INDIRECT) {
+    op.v.ind.ofs += (i32)offset;
+    return op;
+  }
+  if (op.kind != OPK_LOCAL) return op;
+
+  AAImpl* impl = impl_of(t);
+  AASlot* slot = aa64_slot_get(impl, op.v.frame_slot);
+  if (!slot) compiler_panic(t->c, impl->loc, "aarch64 call plan: bad slot");
+  op.kind = OPK_INDIRECT;
+  op.v.ind.base = 29;
+  op.v.ind.ofs = -(i32)slot->off + (i32)offset;
+  return op;
+}
+
+/* Materializes the source of planned move `m` (offset by `m->src_offset`)
+ * into `dst`. Address-valued moves and global sources produce an address via
+ * aa_addr_of; every other source is loaded with the move's memory access. */
+static void aa_load_call_arg(CGTarget* t, Operand dst,
+                             const CGCallPlanMove* m) {
+  Operand src = aa_call_plan_offset_operand(t, m->src, m->src_offset);
+  int wants_address =
+      m->src_kind == CG_CALL_PLAN_SRC_ADDR || src.kind == OPK_GLOBAL;
+  if (wants_address) {
+    aa_addr_of(t, dst, src);
+  } else {
+    aa_load(t, dst, src, m->mem);
+  }
+}
+
+/* Stores return value `src` to the planned destination `r->dst`, displaced by
+ * `r->dst_offset` bytes, using the access described by `r->mem`. */
+static void aa_store_call_ret(CGTarget* t, const CGCallPlanRet* r,
+                              Operand src) {
+  aa_store(t, aa_call_plan_offset_operand(t, r->dst, r->dst_offset), src,
+           r->mem);
+}
+
+/* Writes the source of planned move `m` to the outgoing stack slot at
+ * `m->stack_offset` from register 31.
+ *
+ * Value-kind register and immediate sources are stored directly. Everything
+ * else is first materialized into a scratch register through
+ * aa_load_call_arg: AA_TMP0 for address-valued and global sources, and
+ * AA_TMP0 / AA_FP_TMP0 (chosen by `m->cls`) for local and indirect sources.
+ * Any other source kind panics. */
+static void aa_store_call_arg(CGTarget* t, const CGCallPlanMove* m) {
+  Operand slot;
+  memset(&slot, 0, sizeof slot);
+  slot.kind = OPK_INDIRECT;
+  slot.cls = RC_INT;
+  slot.type = m->mem.type;
+  slot.v.ind.base = 31;
+  slot.v.ind.ofs = (i32)m->stack_offset;
+
+  const int is_addr = m->src_kind == CG_CALL_PLAN_SRC_ADDR;
+  if (!is_addr && (m->src.kind == OPK_REG || m->src.kind == OPK_IMM)) {
+    aa_store(t, slot, m->src, m->mem);
+    return;
+  }
+
+  /* Default scratch: integer temp, as used for addresses and globals. */
+  Operand tmp = {.kind = OPK_REG, .cls = RC_INT, .type = m->mem.type};
+  tmp.v.reg = AA_TMP0;
+  if (!is_addr && m->src.kind != OPK_GLOBAL) {
+    if (m->src.kind != OPK_LOCAL && m->src.kind != OPK_INDIRECT) {
+      compiler_panic(t->c, impl_of(t)->loc,
+                     "aarch64 store_call_arg: source kind %d unsupported",
+                     (int)m->src.kind);
+      return;
+    }
+    /* Memory sources travel through a scratch of the move's own class. */
+    tmp.cls = m->cls;
+    if (m->cls == RC_FP) tmp.v.reg = AA_FP_TMP0;
+  }
+  aa_load_call_arg(t, tmp, m);
+  aa_store(t, slot, tmp, m->mem);
+}
+
static void aa_ret(CGTarget* t, const CGABIValue* val) {
AAImpl* a = impl_of(t);
MCEmitter* mc = t->mc;
@@ -1979,7 +2055,10 @@ CGTarget* aa64_cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) {
t->convert = aa_convert;
t->call = aa_call;
+ t->load_call_arg = aa_load_call_arg;
t->emit_call_plan = aa_emit_call_plan;
+ t->store_call_arg = aa_store_call_arg;
+ t->store_call_ret = aa_store_call_ret;
t->call_stack_size = aa_call_stack_size;
t->ret = aa_ret;
diff --git a/src/arch/aa64/opt_coord.c b/src/arch/aa64/opt_coord.c
@@ -216,6 +216,17 @@ static void aa_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) {
out->args = arena_zarray(t->c->tu, CGCallPlanMove, cap ? cap : 1u);
out->rets = arena_zarray(t->c->tu, CGCallPlanRet, 4);
u32 next_int = 0, next_fp = 0, stack = 0;
+ if (d->abi && d->abi->has_sret) {
+ CGCallPlanMove* m = &out->args[out->nargs++];
+ m->src = d->ret.storage;
+ m->src_kind = CG_CALL_PLAN_SRC_ADDR;
+ m->dst_kind = CG_CALL_PLAN_REG;
+ m->cls = RC_INT;
+ m->dst_reg = 8;
+ m->mem.type = d->ret.type;
+ m->mem.size = 8;
+ m->mem.align = 8;
+ }
for (u32 a = 0; a < d->nargs; ++a) {
const CGABIValue* av = &d->args[a];
const ABIArgInfo* ai = av->abi;
@@ -236,6 +247,7 @@ static void aa_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) {
if (ai->kind == ABI_ARG_INDIRECT) {
CGCallPlanMove* m = &out->args[out->nargs++];
m->src = av->storage;
+ m->src_kind = CG_CALL_PLAN_SRC_ADDR;
m->cls = RC_INT;
if (next_int < 8) {
m->dst_kind = CG_CALL_PLAN_REG;
@@ -254,6 +266,7 @@ static void aa_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) {
const ABIArgPart* p = &ai->parts[i];
CGCallPlanMove* m = &out->args[out->nargs++];
m->src = av->nparts ? av->parts[i].op : av->storage;
+ m->src_offset = av->nparts ? av->parts[i].src_offset : p->src_offset;
m->mem.type = av->type;
m->mem.size = p->size;
m->mem.align = p->align ? p->align : p->size;
@@ -287,6 +300,7 @@ static void aa_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) {
const ABIArgPart* p = &d->abi->ret.parts[i];
CGCallPlanRet* r = &out->rets[out->nrets++];
r->dst = d->ret.storage;
+ r->dst_offset = p->src_offset;
r->mem.type = d->ret.type;
r->mem.size = p->size;
r->mem.align = p->align ? p->align : p->size;
diff --git a/src/arch/arch.h b/src/arch/arch.h
@@ -423,11 +423,19 @@ typedef enum CGCallPlanLocKind {
CG_CALL_PLAN_IGNORE,
} CGCallPlanLocKind;
+/* How the source operand of a planned call move is materialized.
+ * CG_CALL_PLAN_SRC_VALUE is 0, so a zero-initialized move defaults to it. */
+typedef enum CGCallPlanSrcKind {
+ CG_CALL_PLAN_SRC_VALUE, /* move the operand's value */
+ CG_CALL_PLAN_SRC_ADDR, /* move the operand's address (byval/indirect args, sret pointer) */
+} CGCallPlanSrcKind;
+
typedef struct CGCallPlanMove {
 Operand src;
 u8 dst_kind; /* CGCallPlanLocKind */
+ u8 src_kind; /* CGCallPlanSrcKind */
 u8 cls; /* RegClass for register destinations */
+ u8 pad; /* presumably explicit padding; no reader is visible in this change */
 Reg dst_reg;
+ u32 src_offset; /* byte offset applied to `src` before it is loaded or addressed */
 u32 stack_offset;
 MemAccess mem;
} CGCallPlanMove;
@@ -436,6 +444,7 @@ typedef struct CGCallPlanRet {
Operand dst;
u8 cls;
Reg src_reg;
+ u32 dst_offset;
MemAccess mem;
} CGCallPlanRet;
@@ -727,6 +736,9 @@ struct CGTarget {
* `callee.kind == OPK_GLOBAL` is direct; any other kind is indirect. */
void (*call)(CGTarget*, const CGCallDesc*);
void (*plan_call)(CGTarget*, const CGCallDesc*, CGCallPlan* out);
+ void (*load_call_arg)(CGTarget*, Operand dst, const CGCallPlanMove*);
+ void (*store_call_arg)(CGTarget*, const CGCallPlanMove*);
+ void (*store_call_ret)(CGTarget*, const CGCallPlanRet*, Operand src);
void (*emit_call_plan)(CGTarget*, const CGCallPlan*);
void (*ret)(CGTarget*, const CGABIValue* val_or_null);
diff --git a/src/arch/rv64/ops.c b/src/arch/rv64/ops.c
@@ -1103,6 +1103,82 @@ static void rv_emit_call_plan(CGTarget* t, const CGCallPlan* p) {
}
}
+/* Returns `op` adjusted to address `offset` bytes past its original location.
+ * A zero offset, or an operand that is neither indirect nor a local frame
+ * slot, is returned unchanged. Indirect operands get their displacement
+ * bumped; local slots are rewritten as an indirect access based on RV_S0
+ * using the slot's recorded frame offset. Panics on an unknown slot. */
+static Operand rv_call_plan_offset_operand(CGTarget* t, Operand op,
+                                           u32 offset) {
+  if (offset == 0) return op;
+  if (op.kind == OPK_INDIRECT) {
+    op.v.ind.ofs += (i32)offset;
+    return op;
+  }
+  if (op.kind != OPK_LOCAL) return op;
+
+  RImpl* impl = impl_of(t);
+  RvSlot* slot = rv64_slot_get(impl, op.v.frame_slot);
+  if (!slot) compiler_panic(t->c, impl->loc, "rv64 call plan: bad slot");
+  op.kind = OPK_INDIRECT;
+  op.v.ind.base = RV_S0;
+  op.v.ind.ofs = -(i32)slot->off + (i32)offset;
+  return op;
+}
+
+/* Materializes the source of planned move `m` (offset by `m->src_offset`)
+ * into `dst`. Address-valued moves and global sources produce an address via
+ * rv_addr_of; every other source is loaded with the move's memory access. */
+static void rv_load_call_arg(CGTarget* t, Operand dst,
+                             const CGCallPlanMove* m) {
+  Operand src = rv_call_plan_offset_operand(t, m->src, m->src_offset);
+  int wants_address =
+      m->src_kind == CG_CALL_PLAN_SRC_ADDR || src.kind == OPK_GLOBAL;
+  if (wants_address) {
+    rv_addr_of(t, dst, src);
+  } else {
+    rv_load(t, dst, src, m->mem);
+  }
+}
+
+/* Stores return value `src` to the planned destination `r->dst`, displaced by
+ * `r->dst_offset` bytes, using the access described by `r->mem`. */
+static void rv_store_call_ret(CGTarget* t, const CGCallPlanRet* r,
+                              Operand src) {
+  rv_store(t, rv_call_plan_offset_operand(t, r->dst, r->dst_offset), src,
+           r->mem);
+}
+
+/* Writes the source of planned move `m` to the outgoing stack slot at
+ * `m->stack_offset` from RV_SP.
+ *
+ * Value-kind register and immediate sources are stored directly. Everything
+ * else is first materialized into a scratch register through
+ * rv_load_call_arg: RV_T0 for address-valued and global sources, and RV_T0 or
+ * FP register number 0 (chosen by `m->cls`) for local and indirect sources.
+ * Any other source kind panics. */
+static void rv_store_call_arg(CGTarget* t, const CGCallPlanMove* m) {
+  Operand slot;
+  memset(&slot, 0, sizeof slot);
+  slot.kind = OPK_INDIRECT;
+  slot.cls = RC_INT;
+  slot.type = m->mem.type;
+  slot.v.ind.base = RV_SP;
+  slot.v.ind.ofs = (i32)m->stack_offset;
+
+  const int is_addr = m->src_kind == CG_CALL_PLAN_SRC_ADDR;
+  if (!is_addr && (m->src.kind == OPK_REG || m->src.kind == OPK_IMM)) {
+    rv_store(t, slot, m->src, m->mem);
+    return;
+  }
+
+  /* Default scratch: integer temp, as used for addresses and globals. */
+  Operand tmp = {.kind = OPK_REG, .cls = RC_INT, .type = m->mem.type};
+  tmp.v.reg = RV_T0;
+  if (!is_addr && m->src.kind != OPK_GLOBAL) {
+    if (m->src.kind != OPK_LOCAL && m->src.kind != OPK_INDIRECT) {
+      compiler_panic(t->c, impl_of(t)->loc,
+                     "rv64 store_call_arg: source kind %d unsupported",
+                     (int)m->src.kind);
+      return;
+    }
+    /* Memory sources travel through a scratch of the move's own class.
+     * NOTE(review): FP scratch is the bare register number 0 (presumably
+     * ft0); a named constant would match the other backends. */
+    tmp.cls = m->cls;
+    if (m->cls == RC_FP) tmp.v.reg = 0u;
+  }
+  rv_load_call_arg(t, tmp, m);
+  rv_store(t, slot, tmp, m->mem);
+}
+
static void rv_ret(CGTarget* t, const CGABIValue* val) {
RImpl* a = impl_of(t);
MCEmitter* mc = t->mc;
@@ -1905,7 +1981,10 @@ CGTarget* rv64_cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) {
t->convert = rv_convert;
t->call = rv_call;
+ t->load_call_arg = rv_load_call_arg;
t->emit_call_plan = rv_emit_call_plan;
+ t->store_call_arg = rv_store_call_arg;
+ t->store_call_ret = rv_store_call_ret;
t->call_stack_size = rv_call_stack_size;
t->ret = rv_ret;
diff --git a/src/arch/rv64/opt_coord.c b/src/arch/rv64/opt_coord.c
@@ -200,6 +200,17 @@ static void rv_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) {
out->args = arena_zarray(t->c->tu, CGCallPlanMove, cap ? cap : 1u);
out->rets = arena_zarray(t->c->tu, CGCallPlanRet, 4);
u32 next_int = d->abi && d->abi->has_sret ? 1u : 0u, next_fp = 0, stack = 0;
+ if (d->abi && d->abi->has_sret) {
+ CGCallPlanMove* m = &out->args[out->nargs++];
+ m->src = d->ret.storage;
+ m->src_kind = CG_CALL_PLAN_SRC_ADDR;
+ m->dst_kind = CG_CALL_PLAN_REG;
+ m->cls = RC_INT;
+ m->dst_reg = RV_A0;
+ m->mem.type = d->ret.type;
+ m->mem.size = 8;
+ m->mem.align = 8;
+ }
for (u32 a = 0; a < d->nargs; ++a) {
const CGABIValue* av = &d->args[a];
const ABIArgInfo* ai = av->abi;
@@ -219,6 +230,7 @@ static void rv_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) {
if (ai->kind == ABI_ARG_INDIRECT) {
CGCallPlanMove* m = &out->args[out->nargs++];
m->src = av->storage;
+ m->src_kind = CG_CALL_PLAN_SRC_ADDR;
m->cls = RC_INT;
if (next_int < 8) {
m->dst_kind = CG_CALL_PLAN_REG;
@@ -237,6 +249,7 @@ static void rv_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) {
const ABIArgPart* p = &ai->parts[i];
CGCallPlanMove* m = &out->args[out->nargs++];
m->src = av->nparts ? av->parts[i].op : av->storage;
+ m->src_offset = av->nparts ? av->parts[i].src_offset : p->src_offset;
m->mem.type = av->type;
m->mem.size = p->size;
m->mem.align = p->align ? p->align : p->size;
@@ -270,6 +283,7 @@ static void rv_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) {
const ABIArgPart* p = &d->abi->ret.parts[i];
CGCallPlanRet* r = &out->rets[out->nrets++];
r->dst = d->ret.storage;
+ r->dst_offset = p->src_offset;
r->mem.type = d->ret.type;
r->mem.size = p->size;
r->mem.align = p->align ? p->align : p->size;
diff --git a/src/arch/x64/ops.c b/src/arch/x64/ops.c
@@ -1034,6 +1034,102 @@ static void x_emit_call_plan(CGTarget* t, const CGCallPlan* p) {
}
}
+/* Returns `op` with `offset` added to its displacement when it is an indirect
+ * operand. Every other operand kind, including local frame slots, is returned
+ * unchanged; callers apply the offset to locals themselves. */
+static Operand x_call_plan_offset_operand(Operand op, u32 offset) {
+  if (offset != 0 && op.kind == OPK_INDIRECT) {
+    op.v.ind.ofs += (i32)offset;
+  }
+  return op;
+}
+
+/* Materializes the source of planned move `m` (offset by `m->src_offset`)
+ * into register operand `dst`.
+ *
+ *  - Address-valued moves produce the source's address via x_addr_of.
+ *  - Local frame slots are loaded directly relative to X64_RBP, using an SSE
+ *    load for FP destinations and a plain move otherwise.
+ *  - Global sources produce their address.
+ *  - Anything else goes through x_load with the move's memory access.
+ *
+ * Panics when a local source names an unknown frame slot. */
+static void x_load_call_arg(CGTarget* t, Operand dst, const CGCallPlanMove* m) {
+  Operand src = x_call_plan_offset_operand(m->src, m->src_offset);
+  if (m->src_kind == CG_CALL_PLAN_SRC_ADDR) {
+    if (src.kind == OPK_LOCAL && m->src_offset) {
+      /* x_call_plan_offset_operand only displaces indirect operands, so the
+       * address of a local at a non-zero offset would otherwise lose the
+       * offset. Rewrite the slot as an RBP-relative operand, matching the
+       * AArch64 and RV64 backends. */
+      XImpl* a = impl_of(t);
+      XSlot* s = x64_slot_get(a, src.v.frame_slot);
+      if (!s) compiler_panic(t->c, a->loc, "x64 load_call_arg: bad slot");
+      src.kind = OPK_INDIRECT;
+      src.v.ind.base = X64_RBP;
+      src.v.ind.ofs = -(i32)s->off + (i32)m->src_offset;
+    }
+    x_addr_of(t, dst, src);
+    return;
+  }
+  if (src.kind == OPK_LOCAL) {
+    XImpl* a = impl_of(t);
+    XSlot* s = x64_slot_get(a, src.v.frame_slot);
+    if (!s) compiler_panic(t->c, a->loc, "x64 load_call_arg: bad slot");
+    i32 off = -(i32)s->off + (i32)m->src_offset;
+    if (dst.cls == RC_FP) {
+      /* 0xF2 selects the 8-byte scalar form, 0xF3 the 4-byte one. */
+      u8 prefix2 = (m->mem.size == 8) ? 0xF2 : 0xF3;
+      emit_sse_load(t->mc, prefix2, 0x10, dst.v.reg & 0xFu, X64_RBP, off);
+    } else {
+      emit_mov_load(t->mc, m->mem.size, 0, dst.v.reg & 0xFu, X64_RBP, off);
+    }
+    return;
+  }
+  if (src.kind == OPK_GLOBAL) {
+    x_addr_of(t, dst, src);
+    return;
+  }
+  /* Indirect sources already carry their offset in `src`. */
+  x_load(t, dst, src, m->mem);
+}
+
+/* Stores return value `src` to the planned destination `r->dst`, displaced by
+ * `r->dst_offset` bytes. Local frame slots are written directly relative to
+ * X64_RBP (SSE store for FP sources, plain move otherwise); all other
+ * destinations go through x_store, with indirect ones displaced first.
+ * Panics when a local destination names an unknown frame slot. */
+static void x_store_call_ret(CGTarget* t, const CGCallPlanRet* r,
+                             Operand src) {
+  Operand dst = r->dst;
+  if (dst.kind != OPK_LOCAL) {
+    if (dst.kind == OPK_INDIRECT) dst.v.ind.ofs += (i32)r->dst_offset;
+    x_store(t, dst, src, r->mem);
+    return;
+  }
+
+  XImpl* impl = impl_of(t);
+  XSlot* slot = x64_slot_get(impl, dst.v.frame_slot);
+  if (!slot) compiler_panic(t->c, impl->loc, "x64 store_call_ret: bad slot");
+  i32 disp = -(i32)slot->off + (i32)r->dst_offset;
+  if (src.cls != RC_FP) {
+    emit_mov_store(t->mc, r->mem.size, src.v.reg & 0xFu, X64_RBP, disp);
+    return;
+  }
+  u8 prefix2 = (r->mem.size == 8) ? 0xF2 : 0xF3;
+  emit_sse_store(t->mc, prefix2, 0x11, src.v.reg & 0xFu, X64_RBP, disp);
+}
+
+/* Writes the source of planned move `m` to the outgoing stack slot at
+ * `m->stack_offset` from X64_RSP.
+ *
+ * Value-kind register and immediate sources are stored directly. Everything
+ * else is first materialized into a scratch register through
+ * x_load_call_arg: X64_RAX for address-valued and global sources, and
+ * X64_RAX / X64_XMM15 (chosen by `m->cls`) for local and indirect sources.
+ * Any other source kind panics. */
+static void x_store_call_arg(CGTarget* t, const CGCallPlanMove* m) {
+  Operand slot;
+  memset(&slot, 0, sizeof slot);
+  slot.kind = OPK_INDIRECT;
+  slot.cls = RC_INT;
+  slot.type = m->mem.type;
+  slot.v.ind.base = X64_RSP;
+  slot.v.ind.ofs = (i32)m->stack_offset;
+
+  const int is_addr = m->src_kind == CG_CALL_PLAN_SRC_ADDR;
+  if (!is_addr && (m->src.kind == OPK_REG || m->src.kind == OPK_IMM)) {
+    x_store(t, slot, m->src, m->mem);
+    return;
+  }
+
+  /* Default scratch: integer temp, as used for addresses and globals. */
+  Operand tmp = {.kind = OPK_REG, .cls = RC_INT, .type = m->mem.type};
+  tmp.v.reg = X64_RAX;
+  if (!is_addr && m->src.kind != OPK_GLOBAL) {
+    if (m->src.kind != OPK_LOCAL && m->src.kind != OPK_INDIRECT) {
+      compiler_panic(t->c, impl_of(t)->loc,
+                     "x64 store_call_arg: source kind %d unsupported",
+                     (int)m->src.kind);
+      return;
+    }
+    /* Memory sources travel through a scratch of the move's own class. */
+    tmp.cls = m->cls;
+    if (m->cls == RC_FP) tmp.v.reg = X64_XMM15;
+  }
+  x_load_call_arg(t, tmp, m);
+  x_store(t, slot, tmp, m->mem);
+}
+
static void x_ret(CGTarget* t, const CGABIValue* val) {
XImpl* a = impl_of(t);
MCEmitter* mc = t->mc;
@@ -1978,7 +2074,10 @@ CGTarget* x64_cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) {
t->convert = x_convert;
t->call = x_call;
+ t->load_call_arg = x_load_call_arg;
t->emit_call_plan = x_emit_call_plan;
+ t->store_call_arg = x_store_call_arg;
+ t->store_call_ret = x_store_call_ret;
t->call_stack_size = x_call_stack_size;
t->ret = x_ret;
diff --git a/src/arch/x64/opt_coord.c b/src/arch/x64/opt_coord.c
@@ -189,6 +189,17 @@ static void x_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) {
out->rets = arena_zarray(t->c->tu, CGCallPlanRet, 4);
u32 next_int = d->abi && d->abi->has_sret ? 1u : 0u, next_fp = 0, stack = 0;
static const u32 iregs[6] = {X64_RDI, X64_RSI, X64_RDX, X64_RCX, X64_R8, X64_R9};
+ if (d->abi && d->abi->has_sret) {
+ CGCallPlanMove* m = &out->args[out->nargs++];
+ m->src = d->ret.storage;
+ m->src_kind = CG_CALL_PLAN_SRC_ADDR;
+ m->dst_kind = CG_CALL_PLAN_REG;
+ m->cls = RC_INT;
+ m->dst_reg = X64_RDI;
+ m->mem.type = d->ret.type;
+ m->mem.size = 8;
+ m->mem.align = 8;
+ }
for (u32 a = 0; a < d->nargs; ++a) {
const CGABIValue* av = &d->args[a];
const ABIArgInfo* ai = av->abi;
@@ -208,6 +219,7 @@ static void x_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) {
if (ai->kind == ABI_ARG_INDIRECT) {
CGCallPlanMove* m = &out->args[out->nargs++];
m->src = av->storage;
+ m->src_kind = CG_CALL_PLAN_SRC_ADDR;
m->cls = RC_INT;
if (next_int < 6) {
m->dst_kind = CG_CALL_PLAN_REG;
@@ -226,6 +238,7 @@ static void x_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) {
const ABIArgPart* p = &ai->parts[i];
CGCallPlanMove* m = &out->args[out->nargs++];
m->src = av->nparts ? av->parts[i].op : av->storage;
+ m->src_offset = av->nparts ? av->parts[i].src_offset : p->src_offset;
m->mem.type = av->type;
m->mem.size = p->size;
m->mem.align = p->align ? p->align : p->size;
@@ -261,6 +274,7 @@ static void x_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) {
const ABIArgPart* p = &d->abi->ret.parts[i];
CGCallPlanRet* r = &out->rets[out->nrets++];
r->dst = d->ret.storage;
+ r->dst_offset = p->src_offset;
r->mem.type = d->ret.type;
r->mem.size = p->size;
r->mem.align = p->align ? p->align : p->size;
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -918,6 +918,23 @@ static void w_emit_call_plan(CGTarget* t, const CGCallPlan* p) {
if (wr->emit_call_plan) wr->emit_call_plan(wr, p);
}
+/* Forwards load_call_arg to the wrapped target; does nothing when the wrapped
+ * target does not provide the hook. */
+static void w_load_call_arg(CGTarget* t, Operand dst,
+                            const CGCallPlanMove* m) {
+  CGTarget* inner = impl_of(t)->target;
+  if (!inner->load_call_arg) return;
+  inner->load_call_arg(inner, dst, m);
+}
+
+/* Forwards store_call_arg to the wrapped target; does nothing when the wrapped
+ * target does not provide the hook. */
+static void w_store_call_arg(CGTarget* t, const CGCallPlanMove* m) {
+  CGTarget* inner = impl_of(t)->target;
+  if (!inner->store_call_arg) return;
+  inner->store_call_arg(inner, m);
+}
+
+/* Forwards store_call_ret to the wrapped target; does nothing when the wrapped
+ * target does not provide the hook. */
+static void w_store_call_ret(CGTarget* t, const CGCallPlanRet* ret,
+                             Operand src) {
+  CGTarget* inner = impl_of(t)->target;
+  if (!inner->store_call_ret) return;
+  inner->store_call_ret(inner, ret, src);
+}
+
static void w_ret(CGTarget* t, const CGABIValue* v) {
OptImpl* o = impl_of(t);
Inst* in = rec(o, IR_RET);
@@ -1269,8 +1286,14 @@ typedef struct ReplayParallelMove {
Operand dst;
Operand src;
MemAccess mem;
+ const CGCallPlanRet* ret;
+ u32 src_offset;
+ u32 dst_offset;
+ u32 stack_offset;
+ u8 dst_kind;
+ u8 src_kind;
+ u8 is_ret;
u8 done;
- u8 pad[3];
} ReplayParallelMove;
static Operand phys_reg_operand(Reg r, RegClass cls, CfreeCgTypeId ty) {
@@ -1323,9 +1346,35 @@ static Reg replay_scratch_reg(ReplayCtx* r, RegClass cls, Reg avoid) {
return REG_NONE;
}
-static void replay_emit_move(CGTarget* w, Operand dst, Operand src,
- MemAccess mem) {
- if (dst.kind == OPK_REG) {
+static void replay_emit_move(CGTarget* w, const ReplayParallelMove* move) {
+ Operand dst = move->dst;
+ Operand src = move->src;
+ MemAccess mem = move->mem;
+ if (move->dst_kind == CG_CALL_PLAN_STACK) {
+ CGCallPlanMove m;
+ memset(&m, 0, sizeof m);
+ m.src = src;
+ m.src_kind = move->src_kind;
+ m.dst_kind = CG_CALL_PLAN_STACK;
+ m.cls = dst.cls;
+ m.src_offset = move->src_offset;
+ m.stack_offset = move->stack_offset;
+ m.mem = mem;
+ w->store_call_arg(w, &m);
+ } else if (dst.kind == OPK_REG) {
+ if (move->src_kind == CG_CALL_PLAN_SRC_ADDR || move->src_offset) {
+ CGCallPlanMove m;
+ memset(&m, 0, sizeof m);
+ m.src = src;
+ m.src_kind = move->src_kind;
+ m.dst_kind = CG_CALL_PLAN_REG;
+ m.cls = dst.cls;
+ m.dst_reg = dst.v.reg;
+ m.src_offset = move->src_offset;
+ m.mem = mem;
+ w->load_call_arg(w, dst, &m);
+ return;
+ }
if (src.kind == OPK_REG) {
if (!operand_reg_eq(&dst, &src)) w->copy(w, dst, src);
} else if (src.kind == OPK_IMM) {
@@ -1336,6 +1385,14 @@ static void replay_emit_move(CGTarget* w, Operand dst, Operand src,
w->addr_of(w, dst, src);
}
} else if (dst.kind == OPK_LOCAL || dst.kind == OPK_INDIRECT) {
+ if (move->is_ret && move->dst_offset) {
+ CGCallPlanRet ret = move->ret ? *move->ret : (CGCallPlanRet){0};
+ ret.dst = dst;
+ ret.dst_offset = move->dst_offset;
+ ret.mem = mem;
+ w->store_call_ret(w, &ret, src);
+ return;
+ }
w->store(w, dst, src, mem);
}
}
@@ -1356,7 +1413,7 @@ static void replay_parallel_moves(ReplayCtx* r, ReplayParallelMove* moves,
int progressed = 0;
for (u32 i = 0; i < n; ++i) {
if (moves[i].done || !replay_move_src_ready(moves, n, i)) continue;
- replay_emit_move(w, moves[i].dst, moves[i].src, moves[i].mem);
+ replay_emit_move(w, &moves[i]);
moves[i].done = 1;
--remaining;
progressed = 1;
@@ -1369,8 +1426,13 @@ static void replay_parallel_moves(ReplayCtx* r, ReplayParallelMove* moves,
if (sr == (Reg)REG_NONE) continue;
Operand tmp = phys_reg_operand(sr, (RegClass)moves[i].dst.cls,
moves[i].dst.type);
- replay_emit_move(w, tmp, moves[i].src, moves[i].mem);
+ ReplayParallelMove tmp_move = moves[i];
+ tmp_move.dst = tmp;
+ tmp_move.dst_kind = CG_CALL_PLAN_REG;
+ replay_emit_move(w, &tmp_move);
moves[i].src = tmp;
+ moves[i].src_kind = CG_CALL_PLAN_SRC_VALUE;
+ moves[i].src_offset = 0;
progressed = 1;
break;
}
@@ -1393,25 +1455,41 @@ static void replay_parallel_moves(ReplayCtx* r, ReplayParallelMove* moves,
for (;;) {
int idx = replay_find_move_dst(moves, n, &hole);
if (idx < 0 || (u32)idx == first) break;
- replay_emit_move(w, moves[idx].dst, moves[idx].src, moves[idx].mem);
+ replay_emit_move(w, &moves[idx]);
hole = moves[idx].src;
moves[idx].done = 1;
--remaining;
}
- replay_emit_move(w, moves[first].dst, tmp, moves[first].mem);
+ moves[first].src = tmp;
+ moves[first].src_kind = CG_CALL_PLAN_SRC_VALUE;
+ moves[first].src_offset = 0;
+ replay_emit_move(w, &moves[first]);
moves[first].done = 1;
--remaining;
}
}
-static int replay_plan_supported(const CGCallPlan* p) {
- if (!p || p->has_sret) return 0;
- for (u32 i = 0; i < p->nargs; ++i)
- if (p->args[i].dst_kind != CG_CALL_PLAN_REG) return 0;
+static int replay_plan_supported(CGTarget* w, const CGCallPlan* p) {
+ if (!p) return 0;
+ for (u32 i = 0; i < p->nargs; ++i) {
+ if (p->args[i].dst_kind == CG_CALL_PLAN_STACK && !w->store_call_arg)
+ return 0;
+ if (p->args[i].dst_kind == CG_CALL_PLAN_REG &&
+ (p->args[i].src_kind == CG_CALL_PLAN_SRC_ADDR ||
+ p->args[i].src_offset) &&
+ !w->load_call_arg)
+ return 0;
+ }
for (u32 i = 0; i < p->nrets; ++i)
if (p->rets[i].dst.kind != OPK_REG && p->rets[i].dst.kind != OPK_LOCAL &&
p->rets[i].dst.kind != OPK_INDIRECT)
return 0;
+ for (u32 i = 0; i < p->nrets; ++i)
+ if (p->rets[i].dst_offset &&
+ (p->rets[i].dst.kind == OPK_LOCAL ||
+ p->rets[i].dst.kind == OPK_INDIRECT) &&
+ !w->store_call_ret)
+ return 0;
return 1;
}
@@ -1434,19 +1512,33 @@ static void replay_planned_call(ReplayCtx* r, const IRCallAux* aux) {
for (u32 i = 0; i < src_plan->nargs; ++i) {
plan.args[i] = src_plan->args[i];
plan.args[i].src = xlat_op(r, src_plan->args[i].src);
- Operand dst = phys_reg_operand(plan.args[i].dst_reg,
- (RegClass)plan.args[i].cls,
- plan.args[i].mem.type);
+ Operand dst;
+ if (plan.args[i].dst_kind == CG_CALL_PLAN_REG) {
+ dst = phys_reg_operand(plan.args[i].dst_reg,
+ (RegClass)plan.args[i].cls,
+ plan.args[i].mem.type);
+ } else {
+ memset(&dst, 0, sizeof dst);
+ dst.kind = OPK_LOCAL;
+ dst.cls = plan.args[i].cls;
+ dst.type = plan.args[i].mem.type;
+ }
arg_moves[nargs].dst = dst;
arg_moves[nargs].src = plan.args[i].src;
arg_moves[nargs].mem = plan.args[i].mem;
+ arg_moves[nargs].src_offset = plan.args[i].src_offset;
+ arg_moves[nargs].stack_offset = plan.args[i].stack_offset;
+ arg_moves[nargs].dst_kind = plan.args[i].dst_kind;
+ arg_moves[nargs].src_kind = plan.args[i].src_kind;
++nargs;
}
Reg callee_scratch = REG_NONE;
if (plan.callee.kind == OPK_REG) {
for (u32 i = 0; i < nargs; ++i) {
- if (!operand_reg_eq(&arg_moves[i].dst, &plan.callee)) continue;
+ if (arg_moves[i].dst_kind != CG_CALL_PLAN_REG ||
+ !operand_reg_eq(&arg_moves[i].dst, &plan.callee))
+ continue;
callee_scratch = replay_scratch_reg(r, RC_INT, REG_NONE);
if (callee_scratch == (Reg)REG_NONE) {
SrcLoc loc = {0, 0, 0};
@@ -1477,6 +1569,11 @@ static void replay_planned_call(ReplayCtx* r, const IRCallAux* aux) {
ret_moves[nrets].dst = plan.rets[i].dst;
ret_moves[nrets].src = src;
ret_moves[nrets].mem = plan.rets[i].mem;
+ ret_moves[nrets].ret = &plan.rets[i];
+ ret_moves[nrets].dst_offset = plan.rets[i].dst_offset;
+ ret_moves[nrets].dst_kind = CG_CALL_PLAN_REG;
+ ret_moves[nrets].src_kind = CG_CALL_PLAN_SRC_VALUE;
+ ret_moves[nrets].is_ret = 1;
++nrets;
}
replay_parallel_moves(r, ret_moves, nrets);
@@ -1628,7 +1725,7 @@ static void replay_inst(ReplayCtx* r, u32 b, Inst* in) {
IRCallAux* aux = (IRCallAux*)in->extra.aux;
if (aux && aux->use_plan_replay && (aux->desc.flags & CG_CALL_TAIL) == 0 &&
w->emit_call_plan &&
- replay_plan_supported(&aux->plan)) {
+ replay_plan_supported(w, &aux->plan)) {
replay_planned_call(r, aux);
break;
}
@@ -2353,6 +2450,9 @@ CGTarget* opt_cgtarget_new(Compiler* c, CGTarget* target, int level) {
t->call = w_call;
t->plan_call = w_plan_call;
+ t->load_call_arg = w_load_call_arg;
+ t->store_call_arg = w_store_call_arg;
+ t->store_call_ret = w_store_call_ret;
t->emit_call_plan = w_emit_call_plan;
t->ret = w_ret;
diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c
@@ -498,14 +498,27 @@ static int call_plan_replay_supported(const IRCallAux* aux,
const CGTarget* target) {
if (!aux || !aux->plan_valid || !target || !target->emit_call_plan) return 0;
if (aux->desc.flags & CG_CALL_TAIL) return 0;
- if (aux->plan.has_sret) return 0;
- for (u32 i = 0; i < aux->plan.nargs; ++i)
- if (aux->plan.args[i].dst_kind != CG_CALL_PLAN_REG) return 0;
+ for (u32 i = 0; i < aux->plan.nargs; ++i) {
+ if (aux->plan.args[i].dst_kind == CG_CALL_PLAN_STACK &&
+ !target->store_call_arg)
+ return 0;
+ if (aux->plan.args[i].dst_kind == CG_CALL_PLAN_REG &&
+ (aux->plan.args[i].src_kind == CG_CALL_PLAN_SRC_ADDR ||
+ aux->plan.args[i].src_offset) &&
+ !target->load_call_arg)
+ return 0;
+ }
for (u32 i = 0; i < aux->plan.nrets; ++i)
if (aux->plan.rets[i].dst.kind != OPK_REG &&
aux->plan.rets[i].dst.kind != OPK_LOCAL &&
aux->plan.rets[i].dst.kind != OPK_INDIRECT)
return 0;
+ for (u32 i = 0; i < aux->plan.nrets; ++i)
+ if (aux->plan.rets[i].dst_offset &&
+ (aux->plan.rets[i].dst.kind == OPK_LOCAL ||
+ aux->plan.rets[i].dst.kind == OPK_INDIRECT) &&
+ !target->store_call_ret)
+ return 0;
return 1;
}
diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c
@@ -428,6 +428,12 @@ typedef struct MockCGTarget {
Operand copy_src[16];
int load_calls;
int store_calls;
+ int store_call_arg_calls;
+ CGCallPlanMove last_stack_arg;
+ int load_call_arg_calls;
+ CGCallPlanMove last_load_arg;
+ int store_call_ret_calls;
+ CGCallPlanRet last_store_ret;
int addr_of_calls;
int cmp_branch_calls;
int call_calls;
@@ -621,6 +627,30 @@ static void mock_store(CGTarget* t, Operand addr, Operand src, MemAccess macc) {
++m->store_calls;
}
+static void mock_store_call_arg(CGTarget* t, const CGCallPlanMove* move) {
+ MockCGTarget* m = (MockCGTarget*)t;
+ ++m->store_call_arg_calls;
+ m->last_stack_arg = *move;
+}
+
+static void mock_load_call_arg(CGTarget* t, Operand dst,
+ const CGCallPlanMove* move) {
+ MockCGTarget* m = (MockCGTarget*)t;
+ (void)dst;
+ ++m->load_call_arg_calls;
+ m->last_load_arg = *move;
+ if (move->src_kind == CG_CALL_PLAN_SRC_ADDR) ++m->addr_of_calls;
+ else ++m->load_calls;
+}
+
+static void mock_store_call_ret(CGTarget* t, const CGCallPlanRet* ret,
+ Operand src) {
+ MockCGTarget* m = (MockCGTarget*)t;
+ (void)src;
+ ++m->store_call_ret_calls;
+ m->last_store_ret = *ret;
+}
+
static FrameSlot mock_frame_slot(CGTarget* t, const FrameSlotDesc* d);
static CGLocalStorage mock_param(CGTarget* t, const CGParamDesc* p) {
@@ -705,6 +735,9 @@ static void mock_init(MockCGTarget* m, Compiler* c) {
m->base.copy = mock_copy;
m->base.load = mock_load;
m->base.store = mock_store;
+ m->base.load_call_arg = mock_load_call_arg;
+ m->base.store_call_arg = mock_store_call_arg;
+ m->base.store_call_ret = mock_store_call_ret;
m->base.param = mock_param;
m->base.addr_of = mock_addr_of;
m->base.ret = mock_ret;
@@ -826,13 +859,15 @@ static void opt_machinize_filters_abi_regs_for_legacy_call_fallback(void) {
opt_machinize(f, &mock.base);
EXPECT(mock.plan_call_count == 1, "call should be planned before filtering");
- EXPECT(!aux->use_plan_replay,
- "stack-arg call should stay on legacy fallback");
- EXPECT(f->opt_hard_reg_count[RC_INT] == 2,
- "legacy fallback should filter ABI arg/ret regs from hard pool");
- EXPECT(f->opt_hard_regs[RC_INT][0] == 12 &&
- f->opt_hard_regs[RC_INT][1] == 19,
- "only non-ABI regs should remain allocable under fallback");
+ EXPECT(aux->use_plan_replay,
+ "stack-arg call should use planned replay");
+ EXPECT(f->opt_hard_reg_count[RC_INT] == 4,
+ "planned stack calls should keep ABI arg/ret regs allocable");
+ EXPECT(f->opt_hard_regs[RC_INT][0] == 2 &&
+ f->opt_hard_regs[RC_INT][1] == 3 &&
+ f->opt_hard_regs[RC_INT][2] == 12 &&
+ f->opt_hard_regs[RC_INT][3] == 19,
+ "all non-reserved regs should remain allocable under planned calls");
EXPECT((f->opt_arg_regs[RC_INT] & (1u << 2)) != 0,
"arg metadata should still be recorded");
EXPECT((f->opt_ret_regs[RC_INT] & (1u << 3)) != 0,
@@ -2746,7 +2781,7 @@ static void opt_planned_call_replay_preserves_indirect_callee_arg_reg(void) {
tc_fini(&tc);
}
-static void opt_planned_call_replay_falls_back_for_stack_args(void) {
+static void opt_planned_call_replay_stores_stack_args(void) {
TestCtx tc;
tc_init(&tc);
MockCGTarget mock;
@@ -2757,6 +2792,7 @@ static void opt_planned_call_replay_falls_back_for_stack_args(void) {
IRCallAux* aux = arena_znew(f->arena, IRCallAux);
in->extra.aux = aux;
aux->plan_valid = 1;
+ aux->use_plan_replay = 1;
aux->plan.callee = op_reg_(8, tc.i64);
aux->plan.args = arena_zarray(f->arena, CGCallPlanMove, 1);
aux->plan.nargs = 1;
@@ -2767,9 +2803,57 @@ static void opt_planned_call_replay_falls_back_for_stack_args(void) {
opt_emit(tc.c, f, &mock.base);
- EXPECT(mock.emit_call_plan_calls == 0,
- "stack-arg plans should stay on legacy fallback for now");
- EXPECT(mock.call_calls == 1, "legacy call fallback should be used");
+ EXPECT(mock.emit_call_plan_calls == 1,
+ "stack-arg plans should use emit_call_plan");
+ EXPECT(mock.call_calls == 0, "legacy call fallback should not be used");
+ EXPECT(mock.store_call_arg_calls == 1,
+ "planned stack arg should be materialized by opt replay");
+ EXPECT(mock.last_stack_arg.stack_offset == 0,
+ "planned stack arg offset should be preserved");
+ tc_fini(&tc);
+}
+
+static void opt_planned_call_replay_materializes_address_args(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ MockCGTarget mock;
+ mock_init(&mock, tc.c);
+
+ Func* f = new_func(&tc);
+ FrameSlot fs = add_frame_slot(f, tc.i64, FS_LOCAL, 8, 0);
+ Inst* in = ir_emit(f, f->entry, IR_CALL);
+ IRCallAux* aux = arena_znew(f->arena, IRCallAux);
+ in->extra.aux = aux;
+ aux->plan_valid = 1;
+ aux->use_plan_replay = 1;
+ aux->plan.has_sret = 1;
+ aux->plan.callee = op_reg_(8, tc.i64);
+ aux->plan.args = arena_zarray(f->arena, CGCallPlanMove, 2);
+ aux->plan.nargs = 2;
+ aux->plan.args[0].src = op_local_(fs, tc.i64);
+ aux->plan.args[0].src_kind = CG_CALL_PLAN_SRC_ADDR;
+ aux->plan.args[0].dst_kind = CG_CALL_PLAN_REG;
+ aux->plan.args[0].cls = RC_INT;
+ aux->plan.args[0].dst_reg = 1;
+ aux->plan.args[0].mem = mem_unknown_(tc.i64, 8);
+ aux->plan.args[1].src = op_local_(fs, tc.i64);
+ aux->plan.args[1].src_kind = CG_CALL_PLAN_SRC_ADDR;
+ aux->plan.args[1].dst_kind = CG_CALL_PLAN_STACK;
+ aux->plan.args[1].cls = RC_INT;
+ aux->plan.args[1].stack_offset = 8;
+ aux->plan.args[1].mem = mem_unknown_(tc.i64, 8);
+
+ opt_emit(tc.c, f, &mock.base);
+
+ EXPECT(mock.emit_call_plan_calls == 1,
+ "sret-shaped plans should use planned replay");
+ EXPECT(mock.addr_of_calls == 1,
+ "register address arg should materialize with addr_of");
+ EXPECT(mock.load_calls == 0,
+ "address args should not be loaded as values");
+ EXPECT(mock.store_call_arg_calls == 1 &&
+ mock.last_stack_arg.src_kind == CG_CALL_PLAN_SRC_ADDR,
+ "stack address arg should stay marked as an address");
tc_fini(&tc);
}
@@ -3184,7 +3268,8 @@ int main(void) {
opt_dead_def_elim_test();
opt_planned_call_replay_resolves_arg_cycle();
opt_planned_call_replay_preserves_indirect_callee_arg_reg();
- opt_planned_call_replay_falls_back_for_stack_args();
+ opt_planned_call_replay_stores_stack_args();
+ opt_planned_call_replay_materializes_address_args();
opt_emit_preserves_physical_reg_zero();
opt_emit_no_virtual_alloc();
opt_records_const_bytes_by_value();