commit d328ef30125bf7b82667e0196b0bcbb90135aaaf
parent 98d5b64ce59ab7cddb5ffc28aeece8ca689f4ca1
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 15 May 2026 18:03:09 -0700
Implement planned call replay for O1
Diffstat:
13 files changed, 711 insertions(+), 89 deletions(-)
diff --git a/doc/OPT_REGS_CALL_PLAN.md b/doc/OPT_REGS_CALL_PLAN.md
@@ -14,14 +14,20 @@ backend emission intact enough to migrate one architecture at a time.
## Current Status
-The correctness foundation for register preservation is implemented. Targets now
-expose descriptive physical-register metadata, per-call clobber masks,
-return-register masks, callee-save masks, and call plans. O1 records each call
-plan during `machinize`, builds its current hard-register tables from
-`CGPhysRegInfo`, uses target save/use costs in allocation scoring, and preserves
-hard-assigned live-across-call values by intersecting the assigned register with
-the planned call's clobber mask. Post-RA hard-register liveness uses the same
-call-specific clobber mask.
+The correctness foundation for register preservation and the first planned-call
+replay path are implemented. Targets now expose descriptive physical-register
+metadata, per-call clobber masks, return-register masks, callee-save masks, and
+call plans. O1 records each call plan during `machinize`, builds its current
+hard-register tables from `CGPhysRegInfo`, uses target save/use costs in
+allocation scoring, and preserves hard-assigned live-across-call values by
+intersecting the assigned register with the planned call's clobber mask.
+Post-RA hard-register liveness uses the same call-specific clobber mask.
+
+For supported non-tail, non-sret, register-argument call plans, O1 now replays
+calls by materializing arguments with a local parallel-copy resolver, invoking
+the backend's `emit_call_plan` hook, and extracting returns from fixed return
+registers. The x64, AArch64, and RV64 backends implement `emit_call_plan` as a
+hook that emits only the call branch plus required call metadata (such as the
+x64 `AL` setup for variadic calls).
What this closes:
@@ -36,19 +42,64 @@ What this closes:
What remains open:
-- calls are not yet lowered into explicit opt-visible setup/call/return-copy IR;
-- call argument and return moves are not yet resolved by an opt parallel-copy
- resolver;
-- backend emission still uses the legacy sequential `call` hook rather than an
- `emit_call_plan` path with pre-materialized arguments and returns;
+- call setup and return extraction are represented by call-plan aux data
+  rather than separate first-class IR ops;
+- stack-argument call plans, sret calls, and tail calls still fall back to the
+ legacy backend `call` hook;
- target `get_phys_regs` tables still expose mostly the old conservative pools,
so ABI argument/return registers are not generally allocable yet;
- direct CG still uses legacy allocation/call hooks;
-- broader call-plan layout tests, parallel-copy hazard tests, and code-shape
- probes remain to be added.
-
-In phase terms: Phase 1 is done, Phase 2 is mostly done, Phase 3 is partially
-done for clobber/preservation visibility, and Phases 4-6 remain open.
+- broader real-architecture call-plan layout tests and code-shape probes remain
+ to be added.
+
+In phase terms: Phase 1 and Phase 2 are done, Phase 3 is implemented through
+call-plan aux visibility plus planned replay for supported call shapes, Phase 4
+is implemented for register argument/return moves with stack/sret/tail fallback,
+and Phases 5-6 remain open.
+
+## Planned Call Replay Boundary
+
+The legacy backend `call` hook remains intentionally active as a correctness
+fallback. The planned replay path currently covers the call shapes needed to
+prove ABI register hazards without moving every ABI corner case at once.
+
+Planned replay is used only when all of the following are true:
+
+- the call has a valid `CGCallPlan`;
+- the backend provides `emit_call_plan`;
+- the call is not a tail call;
+- the call is not an sret call;
+- every argument destination is `CG_CALL_PLAN_REG`;
+- every return destination is a register, local, or indirect operand.
+
+For those calls, O1 owns the setup and extraction sequence:
+
+- source operands are rewritten to hard registers or spill slots;
+- live-across-call hard registers are saved before argument setup;
+- argument moves into ABI registers are resolved as a local parallel copy;
+- indirect callees that would be overwritten by argument setup are copied to a
+ target-provided scratch register first;
+- the backend emits only required call metadata and the branch through
+ `emit_call_plan`;
+- return registers are copied or stored into their planned destinations.
+
+The fallback path is still required for:
+
+- **outgoing stack arguments**: `CG_CALL_PLAN_STACK` records the ABI stack
+ offset, but opt replay does not yet have a backend-neutral operation for
+ writing into the target-owned outgoing area while preserving known-frame and
+ `max_outgoing` invariants;
+- **sret calls**: backends still synthesize the hidden destination pointer from
+ frame-slot state inside `call`;
+- **tail calls**: the legacy hook owns epilogue emission, legality checks, and
+ branch-without-continuation behavior;
+- **direct CG**: direct codegen still uses the old backend allocation and call
+ hooks while O1 migrates first.
+
+This boundary lets Phase 3/4 tests exercise register argument permutation,
+indirect-callee clobber hazards, call-specific clobber preservation, and return
+extraction without broadening the register file before stack/sret/tail lowering
+is explicit enough to be target-independent.
## Current Problem
@@ -232,6 +283,7 @@ typedef struct CGCallPlan {
u32 return_mask[OPT_REG_CLASSES];
u32 stack_arg_size;
u8 variadic_fp_count;
+ u8 is_variadic;
u8 has_sret;
} CGCallPlan;
```
@@ -308,8 +360,9 @@ Backends should gain emission hooks for an already-planned call:
void (*emit_call_plan)(CGTarget*, const CGCallPlan*);
```
-For the transition, this hook should assume arg registers and outgoing stack
-slots have already been materialized by opt. It only emits:
+For the current transition, this hook assumes register arguments have already
+been materialized by opt. Stack-argument plans still use the legacy `call`
+fallback. The hook only emits:
- required varargs metadata such as x64 `AL`;
- direct or indirect call branch;
@@ -339,10 +392,10 @@ Expected result: no codegen behavior change.
### Phase 2 - Call Plan Construction
-Status: mostly done. `CGCallPlan`, `plan_call`, call clobber masks, return
-masks, and callee-save masks exist for the three native backends. O1 attaches
-plans during `machinize`. Remaining work is fuller layout/dump coverage for
-mixed, variadic, stack-arg, and aggregate cases.
+Status: done. `CGCallPlan`, `plan_call`, call clobber masks, return masks, and
+callee-save masks exist for the three native backends. O1 attaches plans during
+`machinize`, and the opt tests cover plan attachment plus downstream planned
+replay/fallback behavior.
- Add `CGCallPlan` and `plan_call`.
- Implement call planning for simple direct scalar integer and FP args/returns on
@@ -356,36 +409,46 @@ yet.
### Phase 3 - Opt IR Call Constraints
-Status: partial. Calls carry plan aux data before liveness/allocation, and
-rewrite plus hard-register liveness use the planned clobber masks. Calls are not
-yet lowered into explicit setup/call/return-copy IR, and implicit arg/return
-register uses/defs are not yet modeled as first-class constrained operations.
-
-- Lower `IR_CALL` into opt-visible call setup, constrained call, and return-copy
- representation during `machinize`.
-- Teach liveness/range building that call ops have implicit register uses,
- implicit return defs, and clobber masks.
-- Keep the old path behind a fallback for unsupported call-plan shapes.
-- Add tests for values occupying ABI arg registers before call setup.
+Status: implemented for the current aux-data representation. Calls carry plan
+aux data before liveness/allocation. Liveness, rewrite, hard-register DCE, and
+hard-register liveness inspect plan operands for supported planned calls, while
+rewrite uses the call-specific clobber mask to save live-across-call hard values
+before argument setup. The implementation keeps explicit setup/call/return-copy
+IR ops as a possible later cleanup rather than a prerequisite.
+
+- done: attach plan aux data to `IR_CALL` during `machinize`;
+- done: teach liveness/range building to use planned source and destination
+ operands when planned replay is enabled;
+- done: model call clobbers through the call-specific plan mask;
+- done: keep the legacy `call` path behind a fallback for unsupported call-plan
+ shapes;
+- still optional: split setup/call/return extraction into separate IR ops if the
+ aux-data representation becomes too opaque for later passes.
Expected result: correctness coverage for arg-register hazards before the
allocator starts using those registers widely.
### Phase 4 - Parallel Copy Resolver
-Status: open. Argument/return setup still goes through backend sequential call
-emitters, so ABI argument and return registers cannot be broadly exposed yet.
-
-- Implement local parallel move resolution for call setup and return extraction.
-- Support register-register cycles, register-stack moves, local/indirect loads,
- immediates, and stack arguments.
-- Use target-provided temporary policy first; later this can use per-instruction
- temp allocation.
-- Add red-green tests for argument permutation hazards:
- - `f(b, a)` where `a` and `b` are already in the opposite ABI registers;
- - indirect callee held in an argument register;
- - return register also used by a live pre-call value;
- - stack arguments sourced from registers that are also call destinations.
+Status: implemented for register argument/return plans. O1 replay uses a local
+parallel-copy resolver for planned call setup and return extraction, including
+register-register cycles, local/indirect/immediate/global sources, register
+destinations, local/indirect return destinations, and indirect callees that
+occupy a destination argument register. Stack-argument, sret, and tail-call
+plans continue to use the legacy backend `call` fallback until outgoing
+stack-slot materialization is represented in the target contract.
+
+- done: implement local parallel move resolution for register call setup and
+ return extraction;
+- done: support register-register cycles, local/indirect loads, immediates,
+ globals, and local/indirect return stores;
+- done: use target-provided scratch registers to break cycles and preserve
+ indirect callees;
+- done: add red-green tests for argument permutation cycles, indirect callees in
+ argument registers, and stack-argument fallback;
+- still open: support `CG_CALL_PLAN_STACK` materialization directly in opt;
+- still open: add return-register collision and stack-source hazard tests once
+ stack materialization is explicit.
Expected result: ABI arg and return registers can be made allocable safely.
@@ -429,11 +492,14 @@ Focused unit tests:
- done: opt-side target register metadata consumption;
- done: caller-saved live-across-call preservation using per-call masks;
+- done: planned-call replay through `emit_call_plan` for register-argument
+ cycles and indirect-callee/argument-register hazards;
+- done: legacy fallback for planned calls requiring outgoing stack arguments;
- still needed: target register metadata tests per real architecture;
-- still needed: call-plan layout for scalar, FP, mixed, sret, variadic, and
- stack-arg calls;
+- still needed: broader real-architecture call-plan layout for scalar, FP,
+ mixed, sret, variadic, and stack-arg calls;
- still needed: direct call-clobber mask tests per real architecture;
-- still needed: parallel-copy cycles and memory routing;
+- still needed: code-shape probes after ABI registers are exposed broadly;
- still needed: callee-save reservation/code-shape tests after broadened
allocation.
@@ -487,15 +553,13 @@ Next patch stack:
1. Add call-plan layout/dump tests for real x64/AArch64/RV64 scalar, FP, mixed,
sret, variadic, and stack-arg cases.
-2. Lower one simple call shape through explicit opt-visible setup/call/return
- constraints behind a narrow capability check.
-3. Implement local parallel copies for that shape.
-4. Add red-green hazard tests for argument permutations, indirect callees in
- argument registers, return-register collisions, and stack-argument sources.
-5. Add `emit_call_plan` for one backend and switch O1 replay for supported plans.
-6. Broaden register exposure incrementally, keeping helper scratch registers
+2. Extend the target contract with an opt-visible outgoing stack-slot
+ materialization path, then remove the stack-argument fallback.
+3. Add red-green hazard tests for return-register collisions and stack-argument
+ sources once stack materialization is explicit.
+4. Broaden register exposure incrementally, keeping helper scratch registers
reserved until their clobbers are explicit.
-7. Migrate direct CG or wrap it with internal call planning, then remove legacy
+5. Migrate direct CG or wrap it with internal call planning, then remove legacy
pool semantics.
This order keeps each step testable and avoids mixing API migration, allocation
diff --git a/src/arch/aa64/ops.c b/src/arch/aa64/ops.c
@@ -1078,6 +1078,24 @@ static void aa_call(CGTarget* t, const CGCallDesc* d) {
}
}
+/* AArch64 hook for an already-planned call: emits only the call instruction.
+ * A global callee gets the BL base encoding plus an R_AARCH64_CALL26
+ * relocation recorded at that instruction's position; a register callee gets
+ * a BLR through that register. Any other callee kind is a compiler panic. */
+static void aa_emit_call_plan(CGTarget* t, const CGCallPlan* p) {
+  AAImpl* impl = impl_of(t);
+  MCEmitter* em = t->mc;
+
+  switch (p->callee.kind) {
+    case OPK_GLOBAL: {
+      /* Capture the position first so the relocation targets the BL itself. */
+      u32 insn_pos = em->pos(em);
+      aa64_emit32(em, aa64_bl_base());
+      em->emit_reloc_at(em, em->section_id, insn_pos, R_AARCH64_CALL26,
+                        p->callee.v.global.sym, p->callee.v.global.addend, 0,
+                        0);
+      break;
+    }
+    case OPK_REG:
+      aa64_emit32(em, aa64_blr(reg_num(p->callee)));
+      break;
+    default:
+      compiler_panic(t->c, impl->loc,
+                     "aarch64 emit_call_plan: callee kind %d unsupported",
+                     (int)p->callee.kind);
+      break;
+  }
+}
+
static void aa_ret(CGTarget* t, const CGABIValue* val) {
AAImpl* a = impl_of(t);
MCEmitter* mc = t->mc;
@@ -1961,6 +1979,7 @@ CGTarget* aa64_cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) {
t->convert = aa_convert;
t->call = aa_call;
+ t->emit_call_plan = aa_emit_call_plan;
t->call_stack_size = aa_call_stack_size;
t->ret = aa_ret;
diff --git a/src/arch/aa64/opt_coord.c b/src/arch/aa64/opt_coord.c
@@ -160,6 +160,7 @@ static void aa_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) {
out->callee = d->callee;
out->stack_arg_size = t->call_stack_size ? t->call_stack_size(t, d) : 0;
out->has_sret = d->abi && d->abi->has_sret;
+ out->is_variadic = d->abi && d->abi->variadic;
for (u32 c = 0; c < CG_CALL_PLAN_REG_CLASSES; ++c) {
out->clobber_mask[c] = aa_call_clobber_mask(t, d, (RegClass)c);
out->return_mask[c] = aa_return_reg_mask(t, d->abi, (RegClass)c);
diff --git a/src/arch/arch.h b/src/arch/arch.h
@@ -451,8 +451,9 @@ typedef struct CGCallPlan {
u32 return_mask[CG_CALL_PLAN_REG_CLASSES];
u32 stack_arg_size;
u8 variadic_fp_count;
+ u8 is_variadic;
u8 has_sret;
- u8 pad[2];
+ u8 pad;
} CGCallPlan;
typedef u32 Label;
diff --git a/src/arch/rv64/ops.c b/src/arch/rv64/ops.c
@@ -1083,6 +1083,26 @@ static void rv_call(CGTarget* t, const CGCallDesc* d) {
}
}
+/* RV64 hook for an already-planned call: emits only the call sequence.
+ * A global callee gets an AUIPC/JALR pair through RV_RA with an R_RV_CALL
+ * relocation recorded at the start of the pair; a register callee gets a
+ * single JALR that links into RV_RA. Any other callee kind is a compiler
+ * panic. */
+static void rv_emit_call_plan(CGTarget* t, const CGCallPlan* p) {
+  RImpl* impl = impl_of(t);
+  MCEmitter* em = t->mc;
+
+  switch (p->callee.kind) {
+    case OPK_GLOBAL: {
+      /* Section and position are sampled before emitting so the relocation
+       * points at the AUIPC. */
+      u32 section = em->section_id;
+      u32 pair_pos = em->pos(em);
+      rv64_emit32(em, rv_auipc(RV_RA, 0));
+      rv64_emit32(em, rv_jalr(RV_RA, RV_RA, 0));
+      em->emit_reloc_at(em, section, pair_pos, R_RV_CALL,
+                        p->callee.v.global.sym, p->callee.v.global.addend, 0,
+                        0);
+      break;
+    }
+    case OPK_REG:
+      rv64_emit32(em, rv_jalr(RV_RA, reg_num(p->callee), 0));
+      break;
+    default:
+      compiler_panic(t->c, impl->loc,
+                     "rv64 emit_call_plan: callee kind %d unsupported",
+                     (int)p->callee.kind);
+      break;
+  }
+}
+
static void rv_ret(CGTarget* t, const CGABIValue* val) {
RImpl* a = impl_of(t);
MCEmitter* mc = t->mc;
@@ -1885,6 +1905,7 @@ CGTarget* rv64_cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) {
t->convert = rv_convert;
t->call = rv_call;
+ t->emit_call_plan = rv_emit_call_plan;
t->call_stack_size = rv_call_stack_size;
t->ret = rv_ret;
diff --git a/src/arch/rv64/opt_coord.c b/src/arch/rv64/opt_coord.c
@@ -149,6 +149,7 @@ static void rv_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) {
out->callee = d->callee;
out->stack_arg_size = t->call_stack_size ? t->call_stack_size(t, d) : 0;
out->has_sret = d->abi && d->abi->has_sret;
+ out->is_variadic = d->abi && d->abi->variadic;
for (u32 c = 0; c < CG_CALL_PLAN_REG_CLASSES; ++c) {
out->clobber_mask[c] = rv_call_clobber_mask(t, d, (RegClass)c);
out->return_mask[c] = rv_return_reg_mask(t, d->abi, (RegClass)c);
diff --git a/src/arch/x64/ops.c b/src/arch/x64/ops.c
@@ -1008,6 +1008,32 @@ static void x_call(CGTarget* t, const CGCallDesc* d) {
}
}
+/* x64 hook for an already-planned call. Emits, in order:
+ *   - for variadic plans, a load of `variadic_fp_count` into X64_RAX;
+ *   - the call itself: E8 + rel32 with an R_X64_PLT32 relocation for a global
+ *     callee, or FF /2 through the register for a register callee.
+ * Any other callee kind is a compiler panic. */
+static void x_emit_call_plan(CGTarget* t, const CGCallPlan* p) {
+  MCEmitter* mc = t->mc;
+
+  /* NOTE(review): this write to X64_RAX happens before the branch; if a
+   * register callee could ever be RAX it would be overwritten here. Confirm
+   * that replay never leaves the callee in RAX for variadic calls. */
+  if (p->is_variadic)
+    x64_emit_load_imm(mc, 0, X64_RAX, (i64)p->variadic_fp_count);
+
+  if (p->callee.kind == OPK_GLOBAL) {
+    u8 op = 0xE8; /* CALL rel32 opcode */
+    mc->emit_bytes(mc, &op, 1);
+    /* The relocation is anchored at the 4-byte displacement, not the opcode. */
+    u32 disp_pos = mc->pos(mc);
+    emit_u32le(mc, 0);
+    /* Addend is biased by -4; presumably because the displacement is relative
+     * to the end of the 4-byte field -- confirm against the reloc writer. */
+    mc->emit_reloc_at(mc, mc->section_id, disp_pos, R_X64_PLT32,
+                      p->callee.v.global.sym,
+                      p->callee.v.global.addend - 4, 1, 0);
+  } else if (p->callee.kind == OPK_REG) {
+    u32 r = p->callee.v.reg & 0xFu;
+    emit_rex(mc, 0, 0, 0, r);
+    u8 buf[2] = {0xFF, modrm(3u, 2u, r)}; /* FF /2: CALL r/m64, mod=3 */
+    mc->emit_bytes(mc, buf, 2);
+  } else {
+    compiler_panic(t->c, impl_of(t)->loc,
+                   "x64 emit_call_plan: callee kind %d unsupported",
+                   (int)p->callee.kind);
+  }
+}
+
static void x_ret(CGTarget* t, const CGABIValue* val) {
XImpl* a = impl_of(t);
MCEmitter* mc = t->mc;
@@ -1952,6 +1978,7 @@ CGTarget* x64_cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) {
t->convert = x_convert;
t->call = x_call;
+ t->emit_call_plan = x_emit_call_plan;
t->call_stack_size = x_call_stack_size;
t->ret = x_ret;
diff --git a/src/arch/x64/opt_coord.c b/src/arch/x64/opt_coord.c
@@ -157,6 +157,7 @@ static void x_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) {
out->callee = d->callee;
out->stack_arg_size = t->call_stack_size ? t->call_stack_size(t, d) : 0;
out->has_sret = d->abi && d->abi->has_sret;
+ out->is_variadic = d->abi && d->abi->variadic;
for (u32 c = 0; c < CG_CALL_PLAN_REG_CLASSES; ++c) {
out->clobber_mask[c] = x_call_clobber_mask(t, d, (RegClass)c);
out->return_mask[c] = x_return_reg_mask(t, d->abi, (RegClass)c);
diff --git a/src/opt/ir.h b/src/opt/ir.h
@@ -128,7 +128,8 @@ typedef struct IRCallAux {
CGCallDesc desc;
CGCallPlan plan;
u8 plan_valid;
- u8 pad[3];
+ u8 use_plan_replay;
+ u8 pad[2];
/* Result Vals (one per ABI-decomposed return part). 0 for void. */
u32 nresults;
Val* results;
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -1263,6 +1263,223 @@ static CGABIValue xlat_abivalue(ReplayCtx* r, const CGABIValue* in,
return out;
}
+/* One pending move in a parallel copy resolved by replay_parallel_moves(). */
+typedef struct ReplayParallelMove {
+  Operand dst;   /* where the value must end up */
+  Operand src;   /* where the value currently lives */
+  MemAccess mem; /* forwarded to the target's load/store hooks */
+  u8 done;       /* nonzero once the move is emitted or known to be a no-op */
+  u8 pad[3];
+} ReplayParallelMove;
+
+/* Builds a register operand for physical register `r` with register class
+ * `cls` and value type `ty`. All other bytes of the operand are zeroed. */
+static Operand phys_reg_operand(Reg r, RegClass cls, CfreeCgTypeId ty) {
+  Operand reg_op;
+  memset(&reg_op, 0, sizeof reg_op);
+  reg_op.v.reg = r;
+  reg_op.type = ty;
+  reg_op.cls = (u8)cls;
+  reg_op.kind = OPK_REG;
+  return reg_op;
+}
+
+/* Returns 1 when both operands are non-NULL register operands naming the same
+ * register of the same class, 0 otherwise. */
+static int operand_reg_eq(const Operand* a, const Operand* b) {
+  if (!a || !b) return 0;
+  if (a->kind != OPK_REG || b->kind != OPK_REG) return 0;
+  return a->cls == b->cls && a->v.reg == b->v.reg;
+}
+
+/* Returns nonzero when reading `op` depends on register operand `r`: either
+ * `op` is that very register, or `op` is an indirect operand whose base is
+ * `r` and `r` is an integer-class register. Non-register `r` never matches. */
+static int operand_uses_reg_for_replay(const Operand* op, const Operand* r) {
+  if (!op || !r || r->kind != OPK_REG) return 0;
+  switch (op->kind) {
+    case OPK_REG:
+      return operand_reg_eq(op, r);
+    case OPK_INDIRECT:
+      return r->cls == RC_INT && op->v.ind.base == r->v.reg;
+    default:
+      return 0;
+  }
+}
+
+/* Returns 1 when move `idx` may be emitted now, i.e. no other pending move
+ * still reads the register that move `idx` is about to overwrite. */
+static int replay_move_src_ready(const ReplayParallelMove* moves, u32 n,
+                                 u32 idx) {
+  const Operand* target = &moves[idx].dst;
+  u32 k = 0;
+  while (k < n) {
+    const ReplayParallelMove* other = &moves[k];
+    int still_pending = k != idx && !other->done;
+    if (still_pending && operand_uses_reg_for_replay(&other->src, target))
+      return 0;
+    ++k;
+  }
+  return 1;
+}
+
+/* Returns the index of the first pending move whose destination is the
+ * register `dst`, or -1 when there is none. */
+static int replay_find_move_dst(const ReplayParallelMove* moves, u32 n,
+                                const Operand* dst) {
+  int found = -1;
+  for (u32 k = 0; k < n && found < 0; ++k) {
+    if (moves[k].done) continue;
+    if (operand_reg_eq(&moves[k].dst, dst)) found = (int)k;
+  }
+  return found;
+}
+
+/* Returns the first scratch register recorded on the function for class
+ * `cls` that differs from `avoid`. Returns REG_NONE when the class is out of
+ * range or no such register exists. */
+static Reg replay_scratch_reg(ReplayCtx* r, RegClass cls, Reg avoid) {
+  u32 cls_idx = (u32)cls;
+  if (cls_idx >= OPT_REG_CLASSES) return REG_NONE;
+
+  u32 count = r->f->opt_scratch_reg_count[cls_idx];
+  for (u32 k = 0; k < count; ++k) {
+    Reg candidate = r->f->opt_scratch_regs[cls_idx][k];
+    if (candidate == avoid) continue;
+    return candidate;
+  }
+  return REG_NONE;
+}
+
+/* Emits a single move through the target hooks, chosen by operand kinds:
+ *   - local/indirect destination: store;
+ *   - register destination: copy (skipped when source is the same register),
+ *     load_imm, load (local/indirect source), or addr_of (global source).
+ * Any other combination emits nothing. */
+static void replay_emit_move(CGTarget* w, Operand dst, Operand src,
+                             MemAccess mem) {
+  if (dst.kind == OPK_LOCAL || dst.kind == OPK_INDIRECT) {
+    w->store(w, dst, src, mem);
+    return;
+  }
+  if (dst.kind != OPK_REG) return;
+
+  switch (src.kind) {
+    case OPK_REG:
+      if (!operand_reg_eq(&dst, &src)) w->copy(w, dst, src);
+      break;
+    case OPK_IMM:
+      w->load_imm(w, dst, src.v.imm);
+      break;
+    case OPK_LOCAL:
+    case OPK_INDIRECT:
+      w->load(w, dst, src, mem);
+      break;
+    case OPK_GLOBAL:
+      w->addr_of(w, dst, src);
+      break;
+    default:
+      break;
+  }
+}
+
+/* Returns a scratch register of class `cls` for the parallel-move resolver.
+ * When `reserved` is a register operand of the same class, that register is
+ * skipped so a value parked there (the indirect callee) is not overwritten.
+ * Returns REG_NONE when no usable scratch register remains. */
+static Reg replay_move_scratch(ReplayCtx* r, RegClass cls,
+                               const Operand* reserved) {
+  Reg avoid = REG_NONE;
+  if (reserved && reserved->kind == OPK_REG && reserved->cls == (u8)cls)
+    avoid = reserved->v.reg;
+  return replay_scratch_reg(r, cls, avoid);
+}
+
+/* Emits `n` moves as one parallel copy: each source is read as it was before
+ * any destination register in the set is written. `reserved`, when non-NULL,
+ * names a register that must never be picked as a temporary.
+ *
+ * Each round tries, in order:
+ *   1. emit every move whose destination register no pending source reads;
+ *   2. otherwise load one pending non-register source into a scratch register
+ *      and retarget that move to read from the scratch copy;
+ *   3. otherwise break a register cycle: park the first pending source in a
+ *      scratch register, emit the moves that refill each vacated register in
+ *      turn, then finish the first move from the scratch copy.
+ * Panics when step 3 cannot obtain a scratch register. */
+static void replay_parallel_moves_reserving(ReplayCtx* r,
+                                            ReplayParallelMove* moves, u32 n,
+                                            const Operand* reserved) {
+  CGTarget* w = r->tgt;
+  u32 remaining = 0;
+  /* A move from a register to itself needs no code. */
+  for (u32 i = 0; i < n; ++i) {
+    if (operand_reg_eq(&moves[i].dst, &moves[i].src)) {
+      moves[i].done = 1;
+    } else {
+      ++remaining;
+    }
+  }
+
+  while (remaining) {
+    int progressed = 0;
+
+    /* Step 1: everything that is safe to emit right now. */
+    for (u32 i = 0; i < n; ++i) {
+      if (moves[i].done || !replay_move_src_ready(moves, n, i)) continue;
+      replay_emit_move(w, moves[i].dst, moves[i].src, moves[i].mem);
+      moves[i].done = 1;
+      --remaining;
+      progressed = 1;
+    }
+    if (progressed) continue;
+
+    /* Step 2: stage one non-register source in a scratch register. Only one
+     * move is staged per round so the scratch is consumed before reuse. */
+    for (u32 i = 0; i < n; ++i) {
+      if (moves[i].done || moves[i].src.kind == OPK_REG) continue;
+      RegClass cls = (RegClass)moves[i].dst.cls;
+      Reg staged = replay_move_scratch(r, cls, reserved);
+      if (staged == (Reg)REG_NONE) continue;
+      Operand tmp = phys_reg_operand(staged, cls, moves[i].dst.type);
+      replay_emit_move(w, tmp, moves[i].src, moves[i].mem);
+      moves[i].src = tmp;
+      progressed = 1;
+      break;
+    }
+    if (progressed) continue;
+
+    /* Step 3: only register cycles remain; rotate one through a scratch. */
+    u32 first = 0;
+    while (first < n && moves[first].done) ++first;
+    if (first == n) break;
+    Operand save = moves[first].src;
+    Reg parked = replay_move_scratch(r, (RegClass)save.cls, reserved);
+    if (parked == (Reg)REG_NONE) {
+      SrcLoc loc = {0, 0, 0};
+      compiler_panic(r->c, loc,
+                     "opt replay: no scratch register for parallel call move");
+    }
+    Operand tmp = phys_reg_operand(parked, (RegClass)save.cls, save.type);
+    w->copy(w, tmp, save);
+
+    /* `hole` is the register whose old value has already been saved or
+     * consumed, so the move that writes it may now run. */
+    Operand hole = save;
+    for (;;) {
+      int idx = replay_find_move_dst(moves, n, &hole);
+      if (idx < 0 || (u32)idx == first) break;
+      replay_emit_move(w, moves[idx].dst, moves[idx].src, moves[idx].mem);
+      hole = moves[idx].src;
+      moves[idx].done = 1;
+      --remaining;
+    }
+    replay_emit_move(w, moves[first].dst, tmp, moves[first].mem);
+    moves[first].done = 1;
+    --remaining;
+  }
+}
+
+/* Resolves `n` moves as a parallel copy with no reserved register. */
+static void replay_parallel_moves(ReplayCtx* r, ReplayParallelMove* moves,
+                                  u32 n) {
+  replay_parallel_moves_reserving(r, moves, n, NULL);
+}
+
+/* Returns 1 when plan `p` has a shape planned replay can emit: no sret, every
+ * argument destined for a register, and every return destination a register,
+ * local, or indirect operand. Returns 0 otherwise, including for NULL. */
+static int replay_plan_supported(const CGCallPlan* p) {
+  if (!p || p->has_sret) return 0;
+  for (u32 i = 0; i < p->nargs; ++i)
+    if (p->args[i].dst_kind != CG_CALL_PLAN_REG) return 0;
+  for (u32 i = 0; i < p->nrets; ++i)
+    if (p->rets[i].dst.kind != OPK_REG && p->rets[i].dst.kind != OPK_LOCAL &&
+        p->rets[i].dst.kind != OPK_INDIRECT)
+      return 0;
+  return 1;
+}
+
+/* Replays one planned call: translates the plan's operands, moves arguments
+ * into their planned registers as a parallel copy, invokes the backend's
+ * emit_call_plan hook, then moves the planned return registers into their
+ * destinations as a second parallel copy.
+ *
+ * A register callee that is also an argument destination is first copied to
+ * an integer scratch register. The register holding the callee is then
+ * reserved during argument setup so the resolver cannot pick it as a
+ * temporary and overwrite the callee before the branch. */
+static void replay_planned_call(ReplayCtx* r, const IRCallAux* aux) {
+  const CGCallPlan* src_plan = &aux->plan;
+  CGCallPlan plan = *src_plan;
+  plan.callee = xlat_op(r, src_plan->callee);
+  plan.args = src_plan->nargs
+                  ? arena_array(r->f->arena, CGCallPlanMove, src_plan->nargs)
+                  : NULL;
+  plan.rets = src_plan->nrets
+                  ? arena_array(r->f->arena, CGCallPlanRet, src_plan->nrets)
+                  : NULL;
+  /* Seed the return entries now so the hook never sees unwritten storage;
+   * their destinations are translated after the call, as before. */
+  for (u32 i = 0; i < src_plan->nrets; ++i) plan.rets[i] = src_plan->rets[i];
+
+  ReplayParallelMove* arg_moves =
+      src_plan->nargs ? arena_zarray(r->f->arena, ReplayParallelMove,
+                                     src_plan->nargs)
+                      : NULL;
+  u32 nargs = 0;
+  for (u32 i = 0; i < src_plan->nargs; ++i) {
+    plan.args[i] = src_plan->args[i];
+    plan.args[i].src = xlat_op(r, src_plan->args[i].src);
+    Operand dst = phys_reg_operand(plan.args[i].dst_reg,
+                                   (RegClass)plan.args[i].cls,
+                                   plan.args[i].mem.type);
+    arg_moves[nargs].dst = dst;
+    arg_moves[nargs].src = plan.args[i].src;
+    arg_moves[nargs].mem = plan.args[i].mem;
+    ++nargs;
+  }
+
+  if (plan.callee.kind == OPK_REG) {
+    for (u32 i = 0; i < nargs; ++i) {
+      if (!operand_reg_eq(&arg_moves[i].dst, &plan.callee)) continue;
+      /* Argument setup would overwrite the callee; park it elsewhere. */
+      Reg callee_scratch = replay_scratch_reg(r, RC_INT, REG_NONE);
+      if (callee_scratch == (Reg)REG_NONE) {
+        SrcLoc loc = {0, 0, 0};
+        compiler_panic(r->c, loc,
+                       "opt replay: no scratch register for indirect call");
+      }
+      Operand tmp = phys_reg_operand(callee_scratch, RC_INT, plan.callee.type);
+      r->tgt->copy(r->tgt, tmp, plan.callee);
+      plan.callee = tmp;
+      break;
+    }
+  }
+
+  /* Keep the resolver's temporaries off the register the branch will use. */
+  const Operand* reserved = plan.callee.kind == OPK_REG ? &plan.callee : NULL;
+  replay_parallel_moves_reserving(r, arg_moves, nargs, reserved);
+  r->tgt->emit_call_plan(r->tgt, &plan);
+
+  ReplayParallelMove* ret_moves =
+      src_plan->nrets ? arena_zarray(r->f->arena, ReplayParallelMove,
+                                     src_plan->nrets)
+                      : NULL;
+  u32 nrets = 0;
+  for (u32 i = 0; i < src_plan->nrets; ++i) {
+    plan.rets[i].dst = xlat_op(r, src_plan->rets[i].dst);
+    Operand src = phys_reg_operand(plan.rets[i].src_reg,
+                                   (RegClass)plan.rets[i].cls,
+                                   plan.rets[i].mem.type);
+    ret_moves[nrets].dst = plan.rets[i].dst;
+    ret_moves[nrets].src = src;
+    ret_moves[nrets].mem = plan.rets[i].mem;
+    ++nrets;
+  }
+  /* The callee is dead after the call, so nothing is reserved here. */
+  replay_parallel_moves(r, ret_moves, nrets);
+}
+
static Label ensure_label(ReplayCtx* r, u32 b) {
if (b >= r->f->nblocks) return LABEL_NONE;
if (r->label_map[b] == LABEL_NONE) {
@@ -1407,6 +1624,12 @@ static void replay_inst(ReplayCtx* r, u32 b, Inst* in) {
}
case IR_CALL: {
IRCallAux* aux = (IRCallAux*)in->extra.aux;
+ if (aux && aux->use_plan_replay && (aux->desc.flags & CG_CALL_TAIL) == 0 &&
+ w->emit_call_plan &&
+ replay_plan_supported(&aux->plan)) {
+ replay_planned_call(r, aux);
+ break;
+ }
CGCallDesc cd = aux->desc;
cd.callee = xlat_op(r, cd.callee);
CGABIValue* args = NULL;
@@ -1630,13 +1853,31 @@ static u32 collect_replayed_hard_regs(Func* f, CGTarget* w, RegClass cls,
case IR_CALL: {
IRCallAux* aux = (IRCallAux*)in->extra.aux;
if (!aux) break;
- collect_replayed_operand_reg(&aux->desc.callee, cls, used, &nused,
- cap);
- for (u32 j = 0; j < aux->desc.nargs; ++j)
- collect_replayed_abivalue_regs(&aux->desc.args[j], cls, used,
+ if (aux->use_plan_replay) {
+ collect_replayed_operand_reg(&aux->plan.callee, cls, used, &nused,
+ cap);
+ for (u32 j = 0; j < aux->plan.nargs; ++j) {
+ collect_replayed_operand_reg(&aux->plan.args[j].src, cls, used,
+ &nused, cap);
+ if (aux->plan.args[j].dst_kind == CG_CALL_PLAN_REG &&
+ aux->plan.args[j].cls == (u8)cls)
+ add_unique_reg(used, &nused, cap, aux->plan.args[j].dst_reg);
+ }
+ for (u32 j = 0; j < aux->plan.nrets; ++j) {
+ collect_replayed_operand_reg(&aux->plan.rets[j].dst, cls, used,
&nused, cap);
- collect_replayed_abivalue_regs(&aux->desc.ret, cls, used, &nused,
+ if (aux->plan.rets[j].cls == (u8)cls)
+ add_unique_reg(used, &nused, cap, aux->plan.rets[j].src_reg);
+ }
+ } else {
+ collect_replayed_operand_reg(&aux->desc.callee, cls, used, &nused,
cap);
+ for (u32 j = 0; j < aux->desc.nargs; ++j)
+ collect_replayed_abivalue_regs(&aux->desc.args[j], cls, used,
+ &nused, cap);
+ collect_replayed_abivalue_regs(&aux->desc.ret, cls, used, &nused,
+ cap);
+ }
break;
}
case IR_RET: {
diff --git a/src/opt/pass_live.c b/src/opt/pass_live.c
@@ -162,10 +162,19 @@ static void live_walk_inst_operands(Func* f, Inst* in, LiveOperandWalkFn fn,
case IR_CALL: {
IRCallAux* aux = (IRCallAux*)in->extra.aux;
if (!aux) break;
- live_walk_operand(f, in, &aux->desc.callee, 0, fn, ctx);
- for (u32 i = 0; i < aux->desc.nargs; ++i)
- live_walk_abivalue(f, in, (CGABIValue*)&aux->desc.args[i], 0, fn, ctx);
- live_walk_abivalue(f, in, &aux->desc.ret, 1, fn, ctx);
+ if (aux->use_plan_replay) {
+ live_walk_operand(f, in, &aux->plan.callee, 0, fn, ctx);
+ for (u32 i = 0; i < aux->plan.nargs; ++i)
+ live_walk_operand(f, in, &aux->plan.args[i].src, 0, fn, ctx);
+ for (u32 i = 0; i < aux->plan.nrets; ++i)
+ live_walk_operand(f, in, &aux->plan.rets[i].dst, 1, fn, ctx);
+ } else {
+ live_walk_operand(f, in, &aux->desc.callee, 0, fn, ctx);
+ for (u32 i = 0; i < aux->desc.nargs; ++i)
+ live_walk_abivalue(f, in, (CGABIValue*)&aux->desc.args[i], 0, fn,
+ ctx);
+ live_walk_abivalue(f, in, &aux->desc.ret, 1, fn, ctx);
+ }
break;
}
case IR_RET: {
diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c
@@ -104,10 +104,18 @@ static void walk_inst_operands(Func* f, Inst* in, OperandWalkFn fn, void* ctx) {
case IR_CALL: {
IRCallAux* aux = (IRCallAux*)in->extra.aux;
if (!aux) break;
- walk_operand(f, in, &aux->desc.callee, 0, fn, ctx);
- for (u32 i = 0; i < aux->desc.nargs; ++i)
- walk_abivalue(f, in, (CGABIValue*)&aux->desc.args[i], 0, fn, ctx);
- walk_abivalue(f, in, &aux->desc.ret, 1, fn, ctx);
+ if (aux->use_plan_replay) {
+ walk_operand(f, in, &aux->plan.callee, 0, fn, ctx);
+ for (u32 i = 0; i < aux->plan.nargs; ++i)
+ walk_operand(f, in, &aux->plan.args[i].src, 0, fn, ctx);
+ for (u32 i = 0; i < aux->plan.nrets; ++i)
+ walk_operand(f, in, &aux->plan.rets[i].dst, 1, fn, ctx);
+ } else {
+ walk_operand(f, in, &aux->desc.callee, 0, fn, ctx);
+ for (u32 i = 0; i < aux->desc.nargs; ++i)
+ walk_abivalue(f, in, (CGABIValue*)&aux->desc.args[i], 0, fn, ctx);
+ walk_abivalue(f, in, &aux->desc.ret, 1, fn, ctx);
+ }
break;
}
case IR_RET: {
@@ -353,6 +361,9 @@ static void asm_prepare_constraints(Func* f, CGTarget* target, IRAsmAux* aux) {
}
}
+static int call_plan_replay_supported(const IRCallAux* aux,
+ const CGTarget* target);
+
void opt_machinize(Func* f, CGTarget* target) {
f->opt_target = target->c->target;
f->opt_has_target = 1;
@@ -442,6 +453,7 @@ void opt_machinize(Func* f, CGTarget* target) {
if (aux) {
target->plan_call(target, &aux->desc, &aux->plan);
aux->plan_valid = 1;
+ aux->use_plan_replay = call_plan_replay_supported(aux, target);
}
}
}
@@ -465,6 +477,21 @@ static u32 call_clobber_mask_for(Func* f, const Inst* in, u8 cls) {
return f->opt_caller_saved[cls];
}
+/* Decides at machinize time whether a call will use planned replay.
+ * Requires a valid plan, a target that provides emit_call_plan, a non-tail
+ * and non-sret call, register destinations for every argument, and a
+ * register, local, or indirect destination for every return part. */
+static int call_plan_replay_supported(const IRCallAux* aux,
+                                      const CGTarget* target) {
+  if (!aux || !aux->plan_valid) return 0;
+  if (!target || !target->emit_call_plan) return 0;
+  if ((aux->desc.flags & CG_CALL_TAIL) != 0) return 0;
+
+  const CGCallPlan* plan = &aux->plan;
+  if (plan->has_sret) return 0;
+
+  for (u32 k = 0; k < plan->nargs; ++k) {
+    if (plan->args[k].dst_kind != CG_CALL_PLAN_REG) return 0;
+  }
+  for (u32 k = 0; k < plan->nrets; ++k) {
+    switch (plan->rets[k].dst.kind) {
+      case OPK_REG:
+      case OPK_LOCAL:
+      case OPK_INDIRECT:
+        break;
+      default:
+        return 0;
+    }
+  }
+  return 1;
+}
+
#define OPT_BLK_NONE 0xffffffffu
typedef struct LoopPostorderCtx {
@@ -1426,11 +1453,23 @@ static void rewrite_func(Func* f, const OptLiveInfo* live_info) {
if ((IROp)in.op == IR_CALL) {
IRCallAux* aux = (IRCallAux*)in.extra.aux;
if (aux) {
- rewrite_one_operand(f, &in, &aux->desc.callee, 0, &ctx);
- for (u32 k = 0; k < aux->desc.nargs; ++k)
- rewrite_call_arg_value(f, &in, (CGABIValue*)&aux->desc.args[k],
- &ctx);
- walk_abivalue(f, &in, &aux->desc.ret, 1, rewrite_one_operand, &ctx);
+ if (aux->use_plan_replay) {
+ rewrite_one_operand(f, &in, &aux->plan.callee, 0, &ctx);
+ for (u32 k = 0; k < aux->plan.nargs; ++k) {
+ rewrite_call_arg_indirect_base(f, &in, &aux->plan.args[k].src,
+ &ctx);
+ rewrite_call_arg_operand(f, &aux->plan.args[k].src);
+ }
+ for (u32 k = 0; k < aux->plan.nrets; ++k)
+ rewrite_one_operand(f, &in, &aux->plan.rets[k].dst, 1, &ctx);
+ } else {
+ rewrite_one_operand(f, &in, &aux->desc.callee, 0, &ctx);
+ for (u32 k = 0; k < aux->desc.nargs; ++k)
+ rewrite_call_arg_value(f, &in, (CGABIValue*)&aux->desc.args[k],
+ &ctx);
+ walk_abivalue(f, &in, &aux->desc.ret, 1, rewrite_one_operand,
+ &ctx);
+ }
}
} else {
walk_inst_operands(f, &in, rewrite_one_operand, &ctx);
@@ -1694,9 +1733,15 @@ static int inst_uses_phys_reg(const Inst* in, const Operand* r) {
case IR_CALL: {
IRCallAux* aux = (IRCallAux*)in->extra.aux;
if (!aux) break;
- n += count_operand_phys_uses(&aux->desc.callee, r);
- for (u32 i = 0; i < aux->desc.nargs; ++i)
- n += abi_uses_phys_reg(&aux->desc.args[i], r);
+ if (aux->use_plan_replay) {
+ n += count_operand_phys_uses(&aux->plan.callee, r);
+ for (u32 i = 0; i < aux->plan.nargs; ++i)
+ n += count_operand_phys_uses(&aux->plan.args[i].src, r);
+ } else {
+ n += count_operand_phys_uses(&aux->desc.callee, r);
+ for (u32 i = 0; i < aux->desc.nargs; ++i)
+ n += abi_uses_phys_reg(&aux->desc.args[i], r);
+ }
break;
}
case IR_CMP_BRANCH:
@@ -1784,7 +1829,21 @@ static int inst_defines_phys_reg(const Inst* in, const Operand* r) {
return in->nopnds >= 1 && same_phys_reg(&in->opnds[0], r);
case IR_CALL: {
IRCallAux* aux = (IRCallAux*)in->extra.aux;
- return aux && abi_defines_phys_reg(&aux->desc.ret, r);
+ if (!aux) return 0;
+ if (aux->use_plan_replay) {
+ for (u32 i = 0; i < aux->plan.nargs; ++i)
+ if (aux->plan.args[i].dst_kind == CG_CALL_PLAN_REG &&
+ r->cls == aux->plan.args[i].cls &&
+ r->v.reg == aux->plan.args[i].dst_reg)
+ return 1;
+ for (u32 i = 0; i < aux->plan.nrets; ++i)
+ if ((r->cls == aux->plan.rets[i].cls &&
+ r->v.reg == aux->plan.rets[i].src_reg) ||
+ same_phys_reg(&aux->plan.rets[i].dst, r))
+ return 1;
+ return 0;
+ }
+ return abi_defines_phys_reg(&aux->desc.ret, r);
}
case IR_ATOMIC_CAS:
return (in->nopnds >= 1 && same_phys_reg(&in->opnds[0], r)) ||
@@ -2225,10 +2284,23 @@ static void hard_inst_use_def(Func* f, const Inst* in, HardRegSet* use,
case IR_CALL: {
IRCallAux* aux = (IRCallAux*)in->extra.aux;
if (!aux) break;
- hard_use_operand(use, &aux->desc.callee);
- for (u32 i = 0; i < aux->desc.nargs; ++i)
- hard_use_abivalue(use, &aux->desc.args[i]);
- hard_def_abivalue(def, &aux->desc.ret);
+ if (aux->use_plan_replay) {
+ hard_use_operand(use, &aux->plan.callee);
+ for (u32 i = 0; i < aux->plan.nargs; ++i) {
+ hard_use_operand(use, &aux->plan.args[i].src);
+ if (aux->plan.args[i].dst_kind == CG_CALL_PLAN_REG)
+ hard_add(def, aux->plan.args[i].cls, aux->plan.args[i].dst_reg);
+ }
+ for (u32 i = 0; i < aux->plan.nrets; ++i) {
+ hard_add(def, aux->plan.rets[i].cls, aux->plan.rets[i].src_reg);
+ hard_def_operand(def, &aux->plan.rets[i].dst);
+ }
+ } else {
+ hard_use_operand(use, &aux->desc.callee);
+ for (u32 i = 0; i < aux->desc.nargs; ++i)
+ hard_use_abivalue(use, &aux->desc.args[i]);
+ hard_def_abivalue(def, &aux->desc.ret);
+ }
for (u32 c = 0; c < OPT_REG_CLASSES; ++c)
def->cls[c] |= call_clobber_mask_for(f, in, (u8)c);
break;
diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c
@@ -424,10 +424,20 @@ typedef struct MockCGTarget {
u8 last_const_bytes[16];
u32 last_const_size;
int copy_calls;
+ Operand copy_dst[16];
+ Operand copy_src[16];
int load_calls;
int store_calls;
int addr_of_calls;
int cmp_branch_calls;
+ int call_calls;
+ int emit_call_plan_calls;
+ Operand last_plan_callee;
+ Reg planned_arg_regs[8];
+ u32 planned_nargs;
+ Reg planned_ret_regs[4];
+ u32 planned_nrets;
+ int planned_stack_arg;
int param_calls;
CGLocalStorage last_param_storage;
} MockCGTarget;
@@ -495,9 +505,46 @@ static void mock_plan_call(CGTarget* t, const CGCallDesc* d,
out->callee = d->callee;
for (u32 c = 0; c < OPT_REG_CLASSES; ++c)
out->clobber_mask[c] = m->call_clobber_mask[c];
+ u32 nargs = m->planned_nargs ? m->planned_nargs : d->nargs;
+ if (nargs) out->args = arena_zarray(t->c->tu, CGCallPlanMove, nargs);
+ for (u32 i = 0; i < nargs && i < d->nargs; ++i) {
+ CGCallPlanMove* pm = &out->args[out->nargs++];
+ pm->src = d->args[i].storage;
+ pm->dst_kind = m->planned_stack_arg ? CG_CALL_PLAN_STACK : CG_CALL_PLAN_REG;
+ pm->cls = RC_INT;
+ pm->dst_reg = m->planned_arg_regs[i] ? m->planned_arg_regs[i] : (Reg)(i + 1u);
+ pm->stack_offset = i * 8u;
+ pm->mem.type = d->args[i].type;
+ pm->mem.size = 8;
+ pm->mem.align = 8;
+ }
+ if (m->planned_nrets) {
+ out->rets = arena_zarray(t->c->tu, CGCallPlanRet, m->planned_nrets);
+ for (u32 i = 0; i < m->planned_nrets; ++i) {
+ CGCallPlanRet* pr = &out->rets[out->nrets++];
+ pr->dst = d->ret.storage;
+ pr->cls = RC_INT;
+ pr->src_reg = m->planned_ret_regs[i];
+ pr->mem.type = d->ret.type;
+ pr->mem.size = 8;
+ pr->mem.align = 8;
+ }
+ }
++m->plan_call_count;
}
+static void mock_call(CGTarget* t, const CGCallDesc* d) { /* Mock for the target's `call` hook: counts invocations and ignores the descriptor. */
+ MockCGTarget* m = (MockCGTarget*)t; /* assumes t points at the `base` of a MockCGTarget -- TODO confirm layout */
+ (void)d;
+ ++m->call_calls;
+}
+
+static void mock_emit_call_plan(CGTarget* t, const CGCallPlan* p) { /* Mock for the `emit_call_plan` hook: counts invocations and remembers the plan's callee. */
+ MockCGTarget* m = (MockCGTarget*)t; /* assumes t points at the `base` of a MockCGTarget -- TODO confirm layout */
+ ++m->emit_call_plan_calls;
+ m->last_plan_callee = p->callee; /* stored by value so a test can inspect the callee operand afterwards */
+}
+
static void mock_reserve_hard_regs(CGTarget* t, RegClass cls, const Reg* regs,
u32 n) {
MockCGTarget* m = (MockCGTarget*)t;
@@ -551,8 +598,10 @@ static void mock_load_const(CGTarget* t, Operand dst, ConstBytes cb) {
static void mock_copy(CGTarget* t, Operand dst, Operand src) {
MockCGTarget* m = (MockCGTarget*)t;
- (void)dst;
- (void)src;
+ if (m->copy_calls < (int)(sizeof m->copy_dst / sizeof m->copy_dst[0])) { /* log the operands only while copy_calls is a valid index into copy_dst */
+ m->copy_dst[m->copy_calls] = dst;
+ m->copy_src[m->copy_calls] = src; /* indexed by the same call number as copy_dst */
+ } /* copies past the log capacity are still counted by the increment that follows */
++m->copy_calls;
}
@@ -667,7 +716,9 @@ static void mock_init(MockCGTarget* m, Compiler* c) {
m->base.call_clobber_mask = mock_call_clobber_mask;
m->base.return_reg_mask = mock_return_reg_mask;
m->base.callee_save_mask = mock_callee_save_mask;
+ m->base.call = mock_call;
m->base.plan_call = mock_plan_call;
+ m->base.emit_call_plan = mock_emit_call_plan;
m->base.plan_hard_regs = mock_plan_hard_regs;
m->base.reserve_hard_regs = mock_reserve_hard_regs;
m->base.resolve_reg_name = mock_resolve_reg_name;
@@ -2525,6 +2576,116 @@ static void opt_dead_def_elim_test(void) {
tc_fini(&tc);
}
+static void opt_planned_call_replay_resolves_arg_cycle(void) { /* Expects a replayed plan whose two register args swap regs 1 and 2 to emit three copies via scratch reg 9. */
+ TestCtx tc;
+ tc_init(&tc);
+ MockCGTarget mock;
+ mock_init(&mock, tc.c);
+
+ Func* f = new_func(&tc);
+ f->opt_scratch_regs[RC_INT][0] = 9; /* the only RC_INT scratch register offered is reg 9 */
+ f->opt_scratch_reg_count[RC_INT] = 1;
+
+ Inst* in = ir_emit(f, f->entry, IR_CALL);
+ IRCallAux* aux = arena_znew(f->arena, IRCallAux);
+ in->extra.aux = aux;
+ aux->plan_valid = 1;
+ aux->use_plan_replay = 1; /* replay flag is set by hand; the plan below is filled in directly by this test */
+ aux->plan.callee = op_reg_(8, tc.i64); /* callee lives in reg 8, which is not one of the argument registers used below */
+ aux->plan.args = arena_zarray(f->arena, CGCallPlanMove, 2);
+ aux->plan.nargs = 2;
+ aux->plan.args[0].src = op_reg_(1, tc.i64);
+ aux->plan.args[0].dst_kind = CG_CALL_PLAN_REG;
+ aux->plan.args[0].cls = RC_INT;
+ aux->plan.args[0].dst_reg = 2; /* arg 0 moves reg 1 -> reg 2 */
+ aux->plan.args[0].mem = mem_unknown_(tc.i64, 8);
+ aux->plan.args[1].src = op_reg_(2, tc.i64);
+ aux->plan.args[1].dst_kind = CG_CALL_PLAN_REG;
+ aux->plan.args[1].cls = RC_INT;
+ aux->plan.args[1].dst_reg = 1; /* arg 1 moves reg 2 -> reg 1, so the two moves form a cycle */
+ aux->plan.args[1].mem = mem_unknown_(tc.i64, 8);
+
+ opt_emit(tc.c, f, &mock.base);
+
+ EXPECT(mock.emit_call_plan_calls == 1,
+ "planned call should use emit_call_plan");
+ EXPECT(mock.copy_calls == 3,
+ "two-register cycle should need three copies, got %d",
+ mock.copy_calls);
+ EXPECT(mock.copy_dst[0].v.reg == 9 && mock.copy_src[0].v.reg == 1, /* copy 0: reg 1 -> scratch reg 9 */
+ "cycle should save first source to scratch");
+ EXPECT(mock.copy_dst[1].v.reg == 1 && mock.copy_src[1].v.reg == 2, /* copy 1: reg 2 -> reg 1 */
+ "cycle should rotate second arg into first ABI reg");
+ EXPECT(mock.copy_dst[2].v.reg == 2 && mock.copy_src[2].v.reg == 9, /* copy 2: scratch reg 9 -> reg 2 */
+ "cycle should restore scratch into second ABI reg");
+ tc_fini(&tc);
+}
+
+static void opt_planned_call_replay_preserves_indirect_callee_arg_reg(void) { /* Expects the callee in reg 1 to be moved to scratch reg 9 before an argument is written into reg 1. */
+ TestCtx tc;
+ tc_init(&tc);
+ MockCGTarget mock;
+ mock_init(&mock, tc.c);
+
+ Func* f = new_func(&tc);
+ f->opt_scratch_regs[RC_INT][0] = 9; /* the only RC_INT scratch register offered is reg 9 */
+ f->opt_scratch_reg_count[RC_INT] = 1;
+
+ Inst* in = ir_emit(f, f->entry, IR_CALL);
+ IRCallAux* aux = arena_znew(f->arena, IRCallAux);
+ in->extra.aux = aux;
+ aux->plan_valid = 1;
+ aux->use_plan_replay = 1; /* replay flag is set by hand; the plan below is filled in directly by this test */
+ aux->plan.callee = op_reg_(1, tc.i64); /* callee operand occupies reg 1 ... */
+ aux->plan.args = arena_zarray(f->arena, CGCallPlanMove, 1);
+ aux->plan.nargs = 1;
+ aux->plan.args[0].src = op_reg_(2, tc.i64);
+ aux->plan.args[0].dst_kind = CG_CALL_PLAN_REG;
+ aux->plan.args[0].cls = RC_INT;
+ aux->plan.args[0].dst_reg = 1; /* ... and the single argument (from reg 2) is destined for that same reg 1 */
+ aux->plan.args[0].mem = mem_unknown_(tc.i64, 8);
+
+ opt_emit(tc.c, f, &mock.base);
+
+ EXPECT(mock.emit_call_plan_calls == 1,
+ "planned indirect call should use emit_call_plan");
+ EXPECT(mock.copy_calls >= 2, /* at least one copy for the callee and one for the argument */
+ "callee-in-arg-register hazard should copy callee plus arg");
+ EXPECT(mock.copy_dst[0].v.reg == 9 && mock.copy_src[0].v.reg == 1, /* first recorded copy: reg 1 -> scratch reg 9 */
+ "callee should be saved before arg setup overwrites its register");
+ EXPECT(mock.last_plan_callee.kind == OPK_REG &&
+ mock.last_plan_callee.v.reg == 9, /* the plan passed to the hook must name reg 9 as callee */
+ "emit_call_plan should receive scratch callee register");
+ tc_fini(&tc);
+}
+
+static void opt_planned_call_replay_falls_back_for_stack_args(void) { /* Expects a call whose only planned arg is CG_CALL_PLAN_STACK to go through the `call` hook, not emit_call_plan. */
+ TestCtx tc;
+ tc_init(&tc);
+ MockCGTarget mock;
+ mock_init(&mock, tc.c);
+
+ Func* f = new_func(&tc);
+ Inst* in = ir_emit(f, f->entry, IR_CALL);
+ IRCallAux* aux = arena_znew(f->arena, IRCallAux);
+ in->extra.aux = aux;
+ aux->plan_valid = 1; /* NOTE(review): use_plan_replay is never assigned in this test (presumably zero from arena_znew), so the fallback may be driven by that flag rather than by dst_kind -- confirm */
+ aux->plan.callee = op_reg_(8, tc.i64);
+ aux->plan.args = arena_zarray(f->arena, CGCallPlanMove, 1);
+ aux->plan.nargs = 1;
+ aux->plan.args[0].src = op_reg_(1, tc.i64);
+ aux->plan.args[0].dst_kind = CG_CALL_PLAN_STACK; /* the one argument has a stack destination instead of a register */
+ aux->plan.args[0].cls = RC_INT;
+ aux->plan.args[0].mem = mem_unknown_(tc.i64, 8);
+
+ opt_emit(tc.c, f, &mock.base);
+
+ EXPECT(mock.emit_call_plan_calls == 0, /* counter incremented only by mock_emit_call_plan */
+ "stack-arg plans should stay on legacy fallback for now");
+ EXPECT(mock.call_calls == 1, "legacy call fallback should be used"); /* counter incremented only by mock_call */
+ tc_fini(&tc);
+}
+
/* ============================================================
* End-to-end test — drive the opt-wrapped CGTarget through the
* public CGTarget interface, let func_end run the full pipeline,
@@ -2908,6 +3069,9 @@ int main(void) {
opt_dce_physical_dead_defs();
opt_dead_def_keeps_observable_loads();
opt_dead_def_elim_test();
+ opt_planned_call_replay_resolves_arg_cycle();
+ opt_planned_call_replay_preserves_indirect_callee_arg_reg();
+ opt_planned_call_replay_falls_back_for_stack_args();
opt_emit_no_virtual_alloc();
opt_records_const_bytes_by_value();
opt_cmp_branch_keeps_fallthrough_after_block_growth();