kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit d328ef30125bf7b82667e0196b0bcbb90135aaaf
parent 98d5b64ce59ab7cddb5ffc28aeece8ca689f4ca1
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Fri, 15 May 2026 18:03:09 -0700

Implement planned call replay for O1

Diffstat:
M doc/OPT_REGS_CALL_PLAN.md | 182 +++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------
M src/arch/aa64/ops.c | 19 +++++++++++++++++++
M src/arch/aa64/opt_coord.c | 1 +
M src/arch/arch.h | 3 ++-
M src/arch/rv64/ops.c | 21 +++++++++++++++++++++
M src/arch/rv64/opt_coord.c | 1 +
M src/arch/x64/ops.c | 27 +++++++++++++++++++++++++++
M src/arch/x64/opt_coord.c | 1 +
M src/opt/ir.h | 3 ++-
M src/opt/opt.c | 251 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M src/opt/pass_live.c | 17 +++++++++++++----
M src/opt/pass_lower.c | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
M test/opt/opt_test.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
13 files changed, 711 insertions(+), 89 deletions(-)

diff --git a/doc/OPT_REGS_CALL_PLAN.md b/doc/OPT_REGS_CALL_PLAN.md @@ -14,14 +14,20 @@ backend emission intact enough to migrate one architecture at a time. ## Current Status -The correctness foundation for register preservation is implemented. Targets now -expose descriptive physical-register metadata, per-call clobber masks, -return-register masks, callee-save masks, and call plans. O1 records each call -plan during `machinize`, builds its current hard-register tables from -`CGPhysRegInfo`, uses target save/use costs in allocation scoring, and preserves -hard-assigned live-across-call values by intersecting the assigned register with -the planned call's clobber mask. Post-RA hard-register liveness uses the same -call-specific clobber mask. +The correctness foundation for register preservation and the first planned-call +replay path are implemented. Targets now expose descriptive physical-register +metadata, per-call clobber masks, return-register masks, callee-save masks, and +call plans. O1 records each call plan during `machinize`, builds its current +hard-register tables from `CGPhysRegInfo`, uses target save/use costs in +allocation scoring, and preserves hard-assigned live-across-call values by +intersecting the assigned register with the planned call's clobber mask. +Post-RA hard-register liveness uses the same call-specific clobber mask. + +For supported non-tail, non-sret, register-argument call plans, O1 now replays +calls by materializing arguments with a local parallel-copy resolver, invoking +the backend's `emit_call_plan` hook, and extracting returns from fixed return +registers. The x64, AArch64, and RV64 backends implement `emit_call_plan` as a +branch-only call emission hook. 
What this closes: @@ -36,19 +42,64 @@ What this closes: What remains open: -- calls are not yet lowered into explicit opt-visible setup/call/return-copy IR; -- call argument and return moves are not yet resolved by an opt parallel-copy - resolver; -- backend emission still uses the legacy sequential `call` hook rather than an - `emit_call_plan` path with pre-materialized arguments and returns; +- call setup/return extraction are represented by call-plan aux data rather + than separate first-class IR ops; +- stack-argument call plans, sret calls, and tail calls still fall back to the + legacy backend `call` hook; - target `get_phys_regs` tables still expose mostly the old conservative pools, so ABI argument/return registers are not generally allocable yet; - direct CG still uses legacy allocation/call hooks; -- broader call-plan layout tests, parallel-copy hazard tests, and code-shape - probes remain to be added. - -In phase terms: Phase 1 is done, Phase 2 is mostly done, Phase 3 is partially -done for clobber/preservation visibility, and Phases 4-6 remain open. +- broader real-architecture call-plan layout tests and code-shape probes remain + to be added. + +In phase terms: Phase 1 and Phase 2 are done, Phase 3 is implemented through +call-plan aux visibility plus planned replay for supported call shapes, Phase 4 +is implemented for register argument/return moves with stack/sret/tail fallback, +and Phases 5-6 remain open. + +## Planned Call Replay Boundary + +The legacy backend `call` hook remains intentionally active as a correctness +fallback. The planned replay path currently covers the call shapes needed to +prove ABI register hazards without moving every ABI corner case at once. 
+ +Planned replay is used only when all of the following are true: + +- the call has a valid `CGCallPlan`; +- the backend provides `emit_call_plan`; +- the call is not a tail call; +- the call is not an sret call; +- every argument destination is `CG_CALL_PLAN_REG`; +- every return destination is a register, local, or indirect operand. + +For those calls, O1 owns the setup and extraction sequence: + +- source operands are rewritten to hard registers or spill slots; +- live-across-call hard registers are saved before argument setup; +- argument moves into ABI registers are resolved as a local parallel copy; +- indirect callees that would be overwritten by argument setup are copied to a + target-provided scratch register first; +- the backend emits only required call metadata and the branch through + `emit_call_plan`; +- return registers are copied or stored into their planned destinations. + +The fallback path is still required for: + +- **outgoing stack arguments**: `CG_CALL_PLAN_STACK` records the ABI stack + offset, but opt replay does not yet have a backend-neutral operation for + writing into the target-owned outgoing area while preserving known-frame and + `max_outgoing` invariants; +- **sret calls**: backends still synthesize the hidden destination pointer from + frame-slot state inside `call`; +- **tail calls**: the legacy hook owns epilogue emission, legality checks, and + branch-without-continuation behavior; +- **direct CG**: direct codegen still uses the old backend allocation and call + hooks while O1 migrates first. + +This boundary lets Phase 3/4 tests exercise register argument permutation, +indirect-callee clobber hazards, call-specific clobber preservation, and return +extraction without broadening the register file before stack/sret/tail lowering +is explicit enough to be target-independent. 
## Current Problem @@ -232,6 +283,7 @@ typedef struct CGCallPlan { u32 return_mask[OPT_REG_CLASSES]; u32 stack_arg_size; u8 variadic_fp_count; + u8 is_variadic; u8 has_sret; } CGCallPlan; ``` @@ -308,8 +360,9 @@ Backends should gain emission hooks for an already-planned call: void (*emit_call_plan)(CGTarget*, const CGCallPlan*); ``` -For the transition, this hook should assume arg registers and outgoing stack -slots have already been materialized by opt. It only emits: +For the current transition, this hook assumes register arguments have already +been materialized by opt. Stack-argument plans still use the legacy `call` +fallback. The hook only emits: - required varargs metadata such as x64 `AL`; - direct or indirect call branch; @@ -339,10 +392,10 @@ Expected result: no codegen behavior change. ### Phase 2 - Call Plan Construction -Status: mostly done. `CGCallPlan`, `plan_call`, call clobber masks, return -masks, and callee-save masks exist for the three native backends. O1 attaches -plans during `machinize`. Remaining work is fuller layout/dump coverage for -mixed, variadic, stack-arg, and aggregate cases. +Status: done. `CGCallPlan`, `plan_call`, call clobber masks, return masks, and +callee-save masks exist for the three native backends. O1 attaches plans during +`machinize`, and the opt tests cover plan attachment plus downstream planned +replay/fallback behavior. - Add `CGCallPlan` and `plan_call`. - Implement call planning for simple direct scalar integer and FP args/returns on @@ -356,36 +409,46 @@ yet. ### Phase 3 - Opt IR Call Constraints -Status: partial. Calls carry plan aux data before liveness/allocation, and -rewrite plus hard-register liveness use the planned clobber masks. Calls are not -yet lowered into explicit setup/call/return-copy IR, and implicit arg/return -register uses/defs are not yet modeled as first-class constrained operations. 
- -- Lower `IR_CALL` into opt-visible call setup, constrained call, and return-copy - representation during `machinize`. -- Teach liveness/range building that call ops have implicit register uses, - implicit return defs, and clobber masks. -- Keep the old path behind a fallback for unsupported call-plan shapes. -- Add tests for values occupying ABI arg registers before call setup. +Status: implemented for the current aux-data representation. Calls carry plan +aux data before liveness/allocation. Liveness, rewrite, hard-register DCE, and +hard-register liveness inspect plan operands for supported planned calls, while +rewrite uses the call-specific clobber mask to save live-across-call hard values +before argument setup. The implementation keeps explicit setup/call/return-copy +IR ops as a possible later cleanup rather than a prerequisite. + +- done: attach plan aux data to `IR_CALL` during `machinize`; +- done: teach liveness/range building to use planned source and destination + operands when planned replay is enabled; +- done: model call clobbers through the call-specific plan mask; +- done: keep the legacy `call` path behind a fallback for unsupported call-plan + shapes; +- still optional: split setup/call/return extraction into separate IR ops if the + aux-data representation becomes too opaque for later passes. Expected result: correctness coverage for arg-register hazards before the allocator starts using those registers widely. ### Phase 4 - Parallel Copy Resolver -Status: open. Argument/return setup still goes through backend sequential call -emitters, so ABI argument and return registers cannot be broadly exposed yet. - -- Implement local parallel move resolution for call setup and return extraction. -- Support register-register cycles, register-stack moves, local/indirect loads, - immediates, and stack arguments. -- Use target-provided temporary policy first; later this can use per-instruction - temp allocation. 
-- Add red-green tests for argument permutation hazards: - - `f(b, a)` where `a` and `b` are already in the opposite ABI registers; - - indirect callee held in an argument register; - - return register also used by a live pre-call value; - - stack arguments sourced from registers that are also call destinations. +Status: implemented for register argument/return plans. O1 replay uses a local +parallel-copy resolver for planned call setup and return extraction, including +register-register cycles, local/indirect/immediate/global sources, register +destinations, local/indirect return destinations, and indirect callees that +occupy a destination argument register. Stack-argument, sret, and tail-call +plans continue to use the legacy backend `call` fallback until outgoing +stack-slot materialization is represented in the target contract. + +- done: implement local parallel move resolution for register call setup and + return extraction; +- done: support register-register cycles, local/indirect loads, immediates, + globals, and local/indirect return stores; +- done: use target-provided scratch registers to break cycles and preserve + indirect callees; +- done: add red-green tests for argument permutation cycles, indirect callees in + argument registers, and stack-argument fallback; +- still open: support `CG_CALL_PLAN_STACK` materialization directly in opt; +- still open: add return-register collision and stack-source hazard tests once + stack materialization is explicit. Expected result: ABI arg and return registers can be made allocable safely. 
@@ -429,11 +492,14 @@ Focused unit tests: - done: opt-side target register metadata consumption; - done: caller-saved live-across-call preservation using per-call masks; +- done: planned-call replay through `emit_call_plan` for register-argument + cycles and indirect-callee/argument-register hazards; +- done: legacy fallback for planned calls requiring outgoing stack arguments; - still needed: target register metadata tests per real architecture; -- still needed: call-plan layout for scalar, FP, mixed, sret, variadic, and - stack-arg calls; +- still needed: broader real-architecture call-plan layout for scalar, FP, + mixed, sret, variadic, and stack-arg calls; - still needed: direct call-clobber mask tests per real architecture; -- still needed: parallel-copy cycles and memory routing; +- still needed: code-shape probes after ABI registers are exposed broadly; - still needed: callee-save reservation/code-shape tests after broadened allocation. @@ -487,15 +553,13 @@ Next patch stack: 1. Add call-plan layout/dump tests for real x64/AArch64/RV64 scalar, FP, mixed, sret, variadic, and stack-arg cases. -2. Lower one simple call shape through explicit opt-visible setup/call/return - constraints behind a narrow capability check. -3. Implement local parallel copies for that shape. -4. Add red-green hazard tests for argument permutations, indirect callees in - argument registers, return-register collisions, and stack-argument sources. -5. Add `emit_call_plan` for one backend and switch O1 replay for supported plans. -6. Broaden register exposure incrementally, keeping helper scratch registers +2. Extend the target contract with an opt-visible outgoing stack-slot + materialization path, then remove the stack-argument fallback. +3. Add red-green hazard tests for return-register collisions and stack-argument + sources once stack materialization is explicit. +4. Broaden register exposure incrementally, keeping helper scratch registers reserved until their clobbers are explicit. 
-7. Migrate direct CG or wrap it with internal call planning, then remove legacy +5. Migrate direct CG or wrap it with internal call planning, then remove legacy pool semantics. This order keeps each step testable and avoids mixing API migration, allocation diff --git a/src/arch/aa64/ops.c b/src/arch/aa64/ops.c @@ -1078,6 +1078,24 @@ static void aa_call(CGTarget* t, const CGCallDesc* d) { } } +static void aa_emit_call_plan(CGTarget* t, const CGCallPlan* p) { + AAImpl* a = impl_of(t); + MCEmitter* mc = t->mc; + + if (p->callee.kind == OPK_GLOBAL) { + u32 bl_pos = mc->pos(mc); + aa64_emit32(mc, aa64_bl_base()); + mc->emit_reloc_at(mc, mc->section_id, bl_pos, R_AARCH64_CALL26, + p->callee.v.global.sym, p->callee.v.global.addend, 0, 0); + } else if (p->callee.kind == OPK_REG) { + aa64_emit32(mc, aa64_blr(reg_num(p->callee))); + } else { + compiler_panic(t->c, a->loc, + "aarch64 emit_call_plan: callee kind %d unsupported", + (int)p->callee.kind); + } +} + static void aa_ret(CGTarget* t, const CGABIValue* val) { AAImpl* a = impl_of(t); MCEmitter* mc = t->mc; @@ -1961,6 +1979,7 @@ CGTarget* aa64_cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) { t->convert = aa_convert; t->call = aa_call; + t->emit_call_plan = aa_emit_call_plan; t->call_stack_size = aa_call_stack_size; t->ret = aa_ret; diff --git a/src/arch/aa64/opt_coord.c b/src/arch/aa64/opt_coord.c @@ -160,6 +160,7 @@ static void aa_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) { out->callee = d->callee; out->stack_arg_size = t->call_stack_size ? 
t->call_stack_size(t, d) : 0; out->has_sret = d->abi && d->abi->has_sret; + out->is_variadic = d->abi && d->abi->variadic; for (u32 c = 0; c < CG_CALL_PLAN_REG_CLASSES; ++c) { out->clobber_mask[c] = aa_call_clobber_mask(t, d, (RegClass)c); out->return_mask[c] = aa_return_reg_mask(t, d->abi, (RegClass)c); diff --git a/src/arch/arch.h b/src/arch/arch.h @@ -451,8 +451,9 @@ typedef struct CGCallPlan { u32 return_mask[CG_CALL_PLAN_REG_CLASSES]; u32 stack_arg_size; u8 variadic_fp_count; + u8 is_variadic; u8 has_sret; - u8 pad[2]; + u8 pad; } CGCallPlan; typedef u32 Label; diff --git a/src/arch/rv64/ops.c b/src/arch/rv64/ops.c @@ -1083,6 +1083,26 @@ static void rv_call(CGTarget* t, const CGCallDesc* d) { } } +static void rv_emit_call_plan(CGTarget* t, const CGCallPlan* p) { + RImpl* a = impl_of(t); + MCEmitter* mc = t->mc; + + if (p->callee.kind == OPK_GLOBAL) { + u32 sec = mc->section_id; + u32 pos = mc->pos(mc); + rv64_emit32(mc, rv_auipc(RV_RA, 0)); + rv64_emit32(mc, rv_jalr(RV_RA, RV_RA, 0)); + mc->emit_reloc_at(mc, sec, pos, R_RV_CALL, + p->callee.v.global.sym, p->callee.v.global.addend, 0, 0); + } else if (p->callee.kind == OPK_REG) { + rv64_emit32(mc, rv_jalr(RV_RA, reg_num(p->callee), 0)); + } else { + compiler_panic(t->c, a->loc, + "rv64 emit_call_plan: callee kind %d unsupported", + (int)p->callee.kind); + } +} + static void rv_ret(CGTarget* t, const CGABIValue* val) { RImpl* a = impl_of(t); MCEmitter* mc = t->mc; @@ -1885,6 +1905,7 @@ CGTarget* rv64_cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) { t->convert = rv_convert; t->call = rv_call; + t->emit_call_plan = rv_emit_call_plan; t->call_stack_size = rv_call_stack_size; t->ret = rv_ret; diff --git a/src/arch/rv64/opt_coord.c b/src/arch/rv64/opt_coord.c @@ -149,6 +149,7 @@ static void rv_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) { out->callee = d->callee; out->stack_arg_size = t->call_stack_size ? 
t->call_stack_size(t, d) : 0; out->has_sret = d->abi && d->abi->has_sret; + out->is_variadic = d->abi && d->abi->variadic; for (u32 c = 0; c < CG_CALL_PLAN_REG_CLASSES; ++c) { out->clobber_mask[c] = rv_call_clobber_mask(t, d, (RegClass)c); out->return_mask[c] = rv_return_reg_mask(t, d->abi, (RegClass)c); diff --git a/src/arch/x64/ops.c b/src/arch/x64/ops.c @@ -1008,6 +1008,32 @@ static void x_call(CGTarget* t, const CGCallDesc* d) { } } +static void x_emit_call_plan(CGTarget* t, const CGCallPlan* p) { + MCEmitter* mc = t->mc; + + if (p->is_variadic) + x64_emit_load_imm(mc, 0, X64_RAX, (i64)p->variadic_fp_count); + + if (p->callee.kind == OPK_GLOBAL) { + u8 op = 0xE8; + mc->emit_bytes(mc, &op, 1); + u32 disp_pos = mc->pos(mc); + emit_u32le(mc, 0); + mc->emit_reloc_at(mc, mc->section_id, disp_pos, R_X64_PLT32, + p->callee.v.global.sym, + p->callee.v.global.addend - 4, 1, 0); + } else if (p->callee.kind == OPK_REG) { + u32 r = p->callee.v.reg & 0xFu; + emit_rex(mc, 0, 0, 0, r); + u8 buf[2] = {0xFF, modrm(3u, 2u, r)}; + mc->emit_bytes(mc, buf, 2); + } else { + compiler_panic(t->c, impl_of(t)->loc, + "x64 emit_call_plan: callee kind %d unsupported", + (int)p->callee.kind); + } +} + static void x_ret(CGTarget* t, const CGABIValue* val) { XImpl* a = impl_of(t); MCEmitter* mc = t->mc; @@ -1952,6 +1978,7 @@ CGTarget* x64_cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) { t->convert = x_convert; t->call = x_call; + t->emit_call_plan = x_emit_call_plan; t->call_stack_size = x_call_stack_size; t->ret = x_ret; diff --git a/src/arch/x64/opt_coord.c b/src/arch/x64/opt_coord.c @@ -157,6 +157,7 @@ static void x_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) { out->callee = d->callee; out->stack_arg_size = t->call_stack_size ? 
t->call_stack_size(t, d) : 0; out->has_sret = d->abi && d->abi->has_sret; + out->is_variadic = d->abi && d->abi->variadic; for (u32 c = 0; c < CG_CALL_PLAN_REG_CLASSES; ++c) { out->clobber_mask[c] = x_call_clobber_mask(t, d, (RegClass)c); out->return_mask[c] = x_return_reg_mask(t, d->abi, (RegClass)c); diff --git a/src/opt/ir.h b/src/opt/ir.h @@ -128,7 +128,8 @@ typedef struct IRCallAux { CGCallDesc desc; CGCallPlan plan; u8 plan_valid; - u8 pad[3]; + u8 use_plan_replay; + u8 pad[2]; /* Result Vals (one per ABI-decomposed return part). 0 for void. */ u32 nresults; Val* results; diff --git a/src/opt/opt.c b/src/opt/opt.c @@ -1263,6 +1263,223 @@ static CGABIValue xlat_abivalue(ReplayCtx* r, const CGABIValue* in, return out; } +typedef struct ReplayParallelMove { + Operand dst; + Operand src; + MemAccess mem; + u8 done; + u8 pad[3]; +} ReplayParallelMove; + +static Operand phys_reg_operand(Reg r, RegClass cls, CfreeCgTypeId ty) { + Operand op; + memset(&op, 0, sizeof op); + op.kind = OPK_REG; + op.cls = (u8)cls; + op.type = ty; + op.v.reg = r; + return op; +} + +static int operand_reg_eq(const Operand* a, const Operand* b) { + return a && b && a->kind == OPK_REG && b->kind == OPK_REG && + a->cls == b->cls && a->v.reg == b->v.reg; +} + +static int operand_uses_reg_for_replay(const Operand* op, const Operand* r) { + if (!op || !r || r->kind != OPK_REG) return 0; + if (op->kind == OPK_REG) return operand_reg_eq(op, r); + if (op->kind == OPK_INDIRECT) + return r->cls == RC_INT && op->v.ind.base == r->v.reg; + return 0; +} + +static int replay_move_src_ready(const ReplayParallelMove* moves, u32 n, + u32 idx) { + const Operand* dst = &moves[idx].dst; + for (u32 i = 0; i < n; ++i) { + if (i == idx || moves[i].done) continue; + if (operand_uses_reg_for_replay(&moves[i].src, dst)) return 0; + } + return 1; +} + +static int replay_find_move_dst(const ReplayParallelMove* moves, u32 n, + const Operand* dst) { + for (u32 i = 0; i < n; ++i) { + if (!moves[i].done && 
operand_reg_eq(&moves[i].dst, dst)) return (int)i; + } + return -1; +} + +static Reg replay_scratch_reg(ReplayCtx* r, RegClass cls, Reg avoid) { + if ((u32)cls >= OPT_REG_CLASSES) return REG_NONE; + for (u32 i = 0; i < r->f->opt_scratch_reg_count[cls]; ++i) { + Reg sr = r->f->opt_scratch_regs[cls][i]; + if (sr != avoid) return sr; + } + return REG_NONE; +} + +static void replay_emit_move(CGTarget* w, Operand dst, Operand src, + MemAccess mem) { + if (dst.kind == OPK_REG) { + if (src.kind == OPK_REG) { + if (!operand_reg_eq(&dst, &src)) w->copy(w, dst, src); + } else if (src.kind == OPK_IMM) { + w->load_imm(w, dst, src.v.imm); + } else if (src.kind == OPK_LOCAL || src.kind == OPK_INDIRECT) { + w->load(w, dst, src, mem); + } else if (src.kind == OPK_GLOBAL) { + w->addr_of(w, dst, src); + } + } else if (dst.kind == OPK_LOCAL || dst.kind == OPK_INDIRECT) { + w->store(w, dst, src, mem); + } +} + +static void replay_parallel_moves(ReplayCtx* r, ReplayParallelMove* moves, + u32 n) { + CGTarget* w = r->tgt; + u32 remaining = 0; + for (u32 i = 0; i < n; ++i) { + if (operand_reg_eq(&moves[i].dst, &moves[i].src)) { + moves[i].done = 1; + } else { + ++remaining; + } + } + + while (remaining) { + int progressed = 0; + for (u32 i = 0; i < n; ++i) { + if (moves[i].done || !replay_move_src_ready(moves, n, i)) continue; + replay_emit_move(w, moves[i].dst, moves[i].src, moves[i].mem); + moves[i].done = 1; + --remaining; + progressed = 1; + } + if (progressed) continue; + + for (u32 i = 0; i < n; ++i) { + if (moves[i].done || moves[i].src.kind == OPK_REG) continue; + Reg sr = replay_scratch_reg(r, (RegClass)moves[i].dst.cls, REG_NONE); + if (sr == (Reg)REG_NONE) continue; + Operand tmp = phys_reg_operand(sr, (RegClass)moves[i].dst.cls, + moves[i].dst.type); + replay_emit_move(w, tmp, moves[i].src, moves[i].mem); + moves[i].src = tmp; + progressed = 1; + break; + } + if (progressed) continue; + + u32 first = 0; + while (first < n && moves[first].done) ++first; + if (first == n) break; 
+ Operand save = moves[first].src; + Reg sr = replay_scratch_reg(r, (RegClass)save.cls, REG_NONE); + if (sr == (Reg)REG_NONE) { + SrcLoc loc = {0, 0, 0}; + compiler_panic(r->c, loc, + "opt replay: no scratch register for parallel call move"); + } + Operand tmp = phys_reg_operand(sr, (RegClass)save.cls, save.type); + w->copy(w, tmp, save); + + Operand hole = save; + for (;;) { + int idx = replay_find_move_dst(moves, n, &hole); + if (idx < 0 || (u32)idx == first) break; + replay_emit_move(w, moves[idx].dst, moves[idx].src, moves[idx].mem); + hole = moves[idx].src; + moves[idx].done = 1; + --remaining; + } + replay_emit_move(w, moves[first].dst, tmp, moves[first].mem); + moves[first].done = 1; + --remaining; + } +} + +static int replay_plan_supported(const CGCallPlan* p) { + if (!p || p->has_sret) return 0; + for (u32 i = 0; i < p->nargs; ++i) + if (p->args[i].dst_kind != CG_CALL_PLAN_REG) return 0; + for (u32 i = 0; i < p->nrets; ++i) + if (p->rets[i].dst.kind != OPK_REG && p->rets[i].dst.kind != OPK_LOCAL && + p->rets[i].dst.kind != OPK_INDIRECT) + return 0; + return 1; +} + +static void replay_planned_call(ReplayCtx* r, const IRCallAux* aux) { + const CGCallPlan* src_plan = &aux->plan; + CGCallPlan plan = *src_plan; + plan.callee = xlat_op(r, src_plan->callee); + plan.args = src_plan->nargs + ? arena_array(r->f->arena, CGCallPlanMove, src_plan->nargs) + : NULL; + plan.rets = src_plan->nrets + ? arena_array(r->f->arena, CGCallPlanRet, src_plan->nrets) + : NULL; + + ReplayParallelMove* arg_moves = + src_plan->nargs ? 
arena_zarray(r->f->arena, ReplayParallelMove, + src_plan->nargs) + : NULL; + u32 nargs = 0; + for (u32 i = 0; i < src_plan->nargs; ++i) { + plan.args[i] = src_plan->args[i]; + plan.args[i].src = xlat_op(r, src_plan->args[i].src); + Operand dst = phys_reg_operand(plan.args[i].dst_reg, + (RegClass)plan.args[i].cls, + plan.args[i].mem.type); + arg_moves[nargs].dst = dst; + arg_moves[nargs].src = plan.args[i].src; + arg_moves[nargs].mem = plan.args[i].mem; + ++nargs; + } + + Reg callee_scratch = REG_NONE; + if (plan.callee.kind == OPK_REG) { + for (u32 i = 0; i < nargs; ++i) { + if (!operand_reg_eq(&arg_moves[i].dst, &plan.callee)) continue; + callee_scratch = replay_scratch_reg(r, RC_INT, REG_NONE); + if (callee_scratch == (Reg)REG_NONE) { + SrcLoc loc = {0, 0, 0}; + compiler_panic(r->c, loc, + "opt replay: no scratch register for indirect call"); + } + Operand tmp = phys_reg_operand(callee_scratch, RC_INT, plan.callee.type); + r->tgt->copy(r->tgt, tmp, plan.callee); + plan.callee = tmp; + break; + } + } + + replay_parallel_moves(r, arg_moves, nargs); + r->tgt->emit_call_plan(r->tgt, &plan); + + ReplayParallelMove* ret_moves = + src_plan->nrets ? 
arena_zarray(r->f->arena, ReplayParallelMove, + src_plan->nrets) + : NULL; + u32 nrets = 0; + for (u32 i = 0; i < src_plan->nrets; ++i) { + plan.rets[i] = src_plan->rets[i]; + plan.rets[i].dst = xlat_op(r, src_plan->rets[i].dst); + Operand src = phys_reg_operand(plan.rets[i].src_reg, + (RegClass)plan.rets[i].cls, + plan.rets[i].mem.type); + ret_moves[nrets].dst = plan.rets[i].dst; + ret_moves[nrets].src = src; + ret_moves[nrets].mem = plan.rets[i].mem; + ++nrets; + } + replay_parallel_moves(r, ret_moves, nrets); +} + static Label ensure_label(ReplayCtx* r, u32 b) { if (b >= r->f->nblocks) return LABEL_NONE; if (r->label_map[b] == LABEL_NONE) { @@ -1407,6 +1624,12 @@ static void replay_inst(ReplayCtx* r, u32 b, Inst* in) { } case IR_CALL: { IRCallAux* aux = (IRCallAux*)in->extra.aux; + if (aux && aux->use_plan_replay && (aux->desc.flags & CG_CALL_TAIL) == 0 && + w->emit_call_plan && + replay_plan_supported(&aux->plan)) { + replay_planned_call(r, aux); + break; + } CGCallDesc cd = aux->desc; cd.callee = xlat_op(r, cd.callee); CGABIValue* args = NULL; @@ -1630,13 +1853,31 @@ static u32 collect_replayed_hard_regs(Func* f, CGTarget* w, RegClass cls, case IR_CALL: { IRCallAux* aux = (IRCallAux*)in->extra.aux; if (!aux) break; - collect_replayed_operand_reg(&aux->desc.callee, cls, used, &nused, - cap); - for (u32 j = 0; j < aux->desc.nargs; ++j) - collect_replayed_abivalue_regs(&aux->desc.args[j], cls, used, + if (aux->use_plan_replay) { + collect_replayed_operand_reg(&aux->plan.callee, cls, used, &nused, + cap); + for (u32 j = 0; j < aux->plan.nargs; ++j) { + collect_replayed_operand_reg(&aux->plan.args[j].src, cls, used, + &nused, cap); + if (aux->plan.args[j].dst_kind == CG_CALL_PLAN_REG && + aux->plan.args[j].cls == (u8)cls) + add_unique_reg(used, &nused, cap, aux->plan.args[j].dst_reg); + } + for (u32 j = 0; j < aux->plan.nrets; ++j) { + collect_replayed_operand_reg(&aux->plan.rets[j].dst, cls, used, &nused, cap); - collect_replayed_abivalue_regs(&aux->desc.ret, cls, 
used, &nused, + if (aux->plan.rets[j].cls == (u8)cls) + add_unique_reg(used, &nused, cap, aux->plan.rets[j].src_reg); + } + } else { + collect_replayed_operand_reg(&aux->desc.callee, cls, used, &nused, cap); + for (u32 j = 0; j < aux->desc.nargs; ++j) + collect_replayed_abivalue_regs(&aux->desc.args[j], cls, used, + &nused, cap); + collect_replayed_abivalue_regs(&aux->desc.ret, cls, used, &nused, + cap); + } break; } case IR_RET: { diff --git a/src/opt/pass_live.c b/src/opt/pass_live.c @@ -162,10 +162,19 @@ static void live_walk_inst_operands(Func* f, Inst* in, LiveOperandWalkFn fn, case IR_CALL: { IRCallAux* aux = (IRCallAux*)in->extra.aux; if (!aux) break; - live_walk_operand(f, in, &aux->desc.callee, 0, fn, ctx); - for (u32 i = 0; i < aux->desc.nargs; ++i) - live_walk_abivalue(f, in, (CGABIValue*)&aux->desc.args[i], 0, fn, ctx); - live_walk_abivalue(f, in, &aux->desc.ret, 1, fn, ctx); + if (aux->use_plan_replay) { + live_walk_operand(f, in, &aux->plan.callee, 0, fn, ctx); + for (u32 i = 0; i < aux->plan.nargs; ++i) + live_walk_operand(f, in, &aux->plan.args[i].src, 0, fn, ctx); + for (u32 i = 0; i < aux->plan.nrets; ++i) + live_walk_operand(f, in, &aux->plan.rets[i].dst, 1, fn, ctx); + } else { + live_walk_operand(f, in, &aux->desc.callee, 0, fn, ctx); + for (u32 i = 0; i < aux->desc.nargs; ++i) + live_walk_abivalue(f, in, (CGABIValue*)&aux->desc.args[i], 0, fn, + ctx); + live_walk_abivalue(f, in, &aux->desc.ret, 1, fn, ctx); + } break; } case IR_RET: { diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c @@ -104,10 +104,18 @@ static void walk_inst_operands(Func* f, Inst* in, OperandWalkFn fn, void* ctx) { case IR_CALL: { IRCallAux* aux = (IRCallAux*)in->extra.aux; if (!aux) break; - walk_operand(f, in, &aux->desc.callee, 0, fn, ctx); - for (u32 i = 0; i < aux->desc.nargs; ++i) - walk_abivalue(f, in, (CGABIValue*)&aux->desc.args[i], 0, fn, ctx); - walk_abivalue(f, in, &aux->desc.ret, 1, fn, ctx); + if (aux->use_plan_replay) { + walk_operand(f, in, 
&aux->plan.callee, 0, fn, ctx); + for (u32 i = 0; i < aux->plan.nargs; ++i) + walk_operand(f, in, &aux->plan.args[i].src, 0, fn, ctx); + for (u32 i = 0; i < aux->plan.nrets; ++i) + walk_operand(f, in, &aux->plan.rets[i].dst, 1, fn, ctx); + } else { + walk_operand(f, in, &aux->desc.callee, 0, fn, ctx); + for (u32 i = 0; i < aux->desc.nargs; ++i) + walk_abivalue(f, in, (CGABIValue*)&aux->desc.args[i], 0, fn, ctx); + walk_abivalue(f, in, &aux->desc.ret, 1, fn, ctx); + } break; } case IR_RET: { @@ -353,6 +361,9 @@ static void asm_prepare_constraints(Func* f, CGTarget* target, IRAsmAux* aux) { } } +static int call_plan_replay_supported(const IRCallAux* aux, + const CGTarget* target); + void opt_machinize(Func* f, CGTarget* target) { f->opt_target = target->c->target; f->opt_has_target = 1; @@ -442,6 +453,7 @@ void opt_machinize(Func* f, CGTarget* target) { if (aux) { target->plan_call(target, &aux->desc, &aux->plan); aux->plan_valid = 1; + aux->use_plan_replay = call_plan_replay_supported(aux, target); } } } @@ -465,6 +477,21 @@ static u32 call_clobber_mask_for(Func* f, const Inst* in, u8 cls) { return f->opt_caller_saved[cls]; } +static int call_plan_replay_supported(const IRCallAux* aux, + const CGTarget* target) { + if (!aux || !aux->plan_valid || !target || !target->emit_call_plan) return 0; + if (aux->desc.flags & CG_CALL_TAIL) return 0; + if (aux->plan.has_sret) return 0; + for (u32 i = 0; i < aux->plan.nargs; ++i) + if (aux->plan.args[i].dst_kind != CG_CALL_PLAN_REG) return 0; + for (u32 i = 0; i < aux->plan.nrets; ++i) + if (aux->plan.rets[i].dst.kind != OPK_REG && + aux->plan.rets[i].dst.kind != OPK_LOCAL && + aux->plan.rets[i].dst.kind != OPK_INDIRECT) + return 0; + return 1; +} + #define OPT_BLK_NONE 0xffffffffu typedef struct LoopPostorderCtx { @@ -1426,11 +1453,23 @@ static void rewrite_func(Func* f, const OptLiveInfo* live_info) { if ((IROp)in.op == IR_CALL) { IRCallAux* aux = (IRCallAux*)in.extra.aux; if (aux) { - rewrite_one_operand(f, &in, 
&aux->desc.callee, 0, &ctx); - for (u32 k = 0; k < aux->desc.nargs; ++k) - rewrite_call_arg_value(f, &in, (CGABIValue*)&aux->desc.args[k], - &ctx); - walk_abivalue(f, &in, &aux->desc.ret, 1, rewrite_one_operand, &ctx); + if (aux->use_plan_replay) { + rewrite_one_operand(f, &in, &aux->plan.callee, 0, &ctx); + for (u32 k = 0; k < aux->plan.nargs; ++k) { + rewrite_call_arg_indirect_base(f, &in, &aux->plan.args[k].src, + &ctx); + rewrite_call_arg_operand(f, &aux->plan.args[k].src); + } + for (u32 k = 0; k < aux->plan.nrets; ++k) + rewrite_one_operand(f, &in, &aux->plan.rets[k].dst, 1, &ctx); + } else { + rewrite_one_operand(f, &in, &aux->desc.callee, 0, &ctx); + for (u32 k = 0; k < aux->desc.nargs; ++k) + rewrite_call_arg_value(f, &in, (CGABIValue*)&aux->desc.args[k], + &ctx); + walk_abivalue(f, &in, &aux->desc.ret, 1, rewrite_one_operand, + &ctx); + } } } else { walk_inst_operands(f, &in, rewrite_one_operand, &ctx); @@ -1694,9 +1733,15 @@ static int inst_uses_phys_reg(const Inst* in, const Operand* r) { case IR_CALL: { IRCallAux* aux = (IRCallAux*)in->extra.aux; if (!aux) break; - n += count_operand_phys_uses(&aux->desc.callee, r); - for (u32 i = 0; i < aux->desc.nargs; ++i) - n += abi_uses_phys_reg(&aux->desc.args[i], r); + if (aux->use_plan_replay) { + n += count_operand_phys_uses(&aux->plan.callee, r); + for (u32 i = 0; i < aux->plan.nargs; ++i) + n += count_operand_phys_uses(&aux->plan.args[i].src, r); + } else { + n += count_operand_phys_uses(&aux->desc.callee, r); + for (u32 i = 0; i < aux->desc.nargs; ++i) + n += abi_uses_phys_reg(&aux->desc.args[i], r); + } break; } case IR_CMP_BRANCH: @@ -1784,7 +1829,21 @@ static int inst_defines_phys_reg(const Inst* in, const Operand* r) { return in->nopnds >= 1 && same_phys_reg(&in->opnds[0], r); case IR_CALL: { IRCallAux* aux = (IRCallAux*)in->extra.aux; - return aux && abi_defines_phys_reg(&aux->desc.ret, r); + if (!aux) return 0; + if (aux->use_plan_replay) { + for (u32 i = 0; i < aux->plan.nargs; ++i) + if 
(aux->plan.args[i].dst_kind == CG_CALL_PLAN_REG && + r->cls == aux->plan.args[i].cls && + r->v.reg == aux->plan.args[i].dst_reg) + return 1; + for (u32 i = 0; i < aux->plan.nrets; ++i) + if ((r->cls == aux->plan.rets[i].cls && + r->v.reg == aux->plan.rets[i].src_reg) || + same_phys_reg(&aux->plan.rets[i].dst, r)) + return 1; + return 0; + } + return abi_defines_phys_reg(&aux->desc.ret, r); } case IR_ATOMIC_CAS: return (in->nopnds >= 1 && same_phys_reg(&in->opnds[0], r)) || @@ -2225,10 +2284,23 @@ static void hard_inst_use_def(Func* f, const Inst* in, HardRegSet* use, case IR_CALL: { IRCallAux* aux = (IRCallAux*)in->extra.aux; if (!aux) break; - hard_use_operand(use, &aux->desc.callee); - for (u32 i = 0; i < aux->desc.nargs; ++i) - hard_use_abivalue(use, &aux->desc.args[i]); - hard_def_abivalue(def, &aux->desc.ret); + if (aux->use_plan_replay) { + hard_use_operand(use, &aux->plan.callee); + for (u32 i = 0; i < aux->plan.nargs; ++i) { + hard_use_operand(use, &aux->plan.args[i].src); + if (aux->plan.args[i].dst_kind == CG_CALL_PLAN_REG) + hard_add(def, aux->plan.args[i].cls, aux->plan.args[i].dst_reg); + } + for (u32 i = 0; i < aux->plan.nrets; ++i) { + hard_add(def, aux->plan.rets[i].cls, aux->plan.rets[i].src_reg); + hard_def_operand(def, &aux->plan.rets[i].dst); + } + } else { + hard_use_operand(use, &aux->desc.callee); + for (u32 i = 0; i < aux->desc.nargs; ++i) + hard_use_abivalue(use, &aux->desc.args[i]); + hard_def_abivalue(def, &aux->desc.ret); + } for (u32 c = 0; c < OPT_REG_CLASSES; ++c) def->cls[c] |= call_clobber_mask_for(f, in, (u8)c); break; diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c @@ -424,10 +424,20 @@ typedef struct MockCGTarget { u8 last_const_bytes[16]; u32 last_const_size; int copy_calls; + Operand copy_dst[16]; + Operand copy_src[16]; int load_calls; int store_calls; int addr_of_calls; int cmp_branch_calls; + int call_calls; + int emit_call_plan_calls; + Operand last_plan_callee; + Reg planned_arg_regs[8]; + u32 planned_nargs; + Reg 
planned_ret_regs[4]; + u32 planned_nrets; + int planned_stack_arg; int param_calls; CGLocalStorage last_param_storage; } MockCGTarget; @@ -495,9 +505,46 @@ static void mock_plan_call(CGTarget* t, const CGCallDesc* d, out->callee = d->callee; for (u32 c = 0; c < OPT_REG_CLASSES; ++c) out->clobber_mask[c] = m->call_clobber_mask[c]; + u32 nargs = m->planned_nargs ? m->planned_nargs : d->nargs; + if (nargs) out->args = arena_zarray(t->c->tu, CGCallPlanMove, nargs); + for (u32 i = 0; i < nargs && i < d->nargs; ++i) { + CGCallPlanMove* pm = &out->args[out->nargs++]; + pm->src = d->args[i].storage; + pm->dst_kind = m->planned_stack_arg ? CG_CALL_PLAN_STACK : CG_CALL_PLAN_REG; + pm->cls = RC_INT; + pm->dst_reg = m->planned_arg_regs[i] ? m->planned_arg_regs[i] : (Reg)(i + 1u); + pm->stack_offset = i * 8u; + pm->mem.type = d->args[i].type; + pm->mem.size = 8; + pm->mem.align = 8; + } + if (m->planned_nrets) { + out->rets = arena_zarray(t->c->tu, CGCallPlanRet, m->planned_nrets); + for (u32 i = 0; i < m->planned_nrets; ++i) { + CGCallPlanRet* pr = &out->rets[out->nrets++]; + pr->dst = d->ret.storage; + pr->cls = RC_INT; + pr->src_reg = m->planned_ret_regs[i]; + pr->mem.type = d->ret.type; + pr->mem.size = 8; + pr->mem.align = 8; + } + } ++m->plan_call_count; } +static void mock_call(CGTarget* t, const CGCallDesc* d) { + MockCGTarget* m = (MockCGTarget*)t; + (void)d; + ++m->call_calls; +} + +static void mock_emit_call_plan(CGTarget* t, const CGCallPlan* p) { + MockCGTarget* m = (MockCGTarget*)t; + ++m->emit_call_plan_calls; + m->last_plan_callee = p->callee; +} + static void mock_reserve_hard_regs(CGTarget* t, RegClass cls, const Reg* regs, u32 n) { MockCGTarget* m = (MockCGTarget*)t; @@ -551,8 +598,10 @@ static void mock_load_const(CGTarget* t, Operand dst, ConstBytes cb) { static void mock_copy(CGTarget* t, Operand dst, Operand src) { MockCGTarget* m = (MockCGTarget*)t; - (void)dst; - (void)src; + if (m->copy_calls < (int)(sizeof m->copy_dst / sizeof m->copy_dst[0])) { + 
m->copy_dst[m->copy_calls] = dst; + m->copy_src[m->copy_calls] = src; + } ++m->copy_calls; } @@ -667,7 +716,9 @@ static void mock_init(MockCGTarget* m, Compiler* c) { m->base.call_clobber_mask = mock_call_clobber_mask; m->base.return_reg_mask = mock_return_reg_mask; m->base.callee_save_mask = mock_callee_save_mask; + m->base.call = mock_call; m->base.plan_call = mock_plan_call; + m->base.emit_call_plan = mock_emit_call_plan; m->base.plan_hard_regs = mock_plan_hard_regs; m->base.reserve_hard_regs = mock_reserve_hard_regs; m->base.resolve_reg_name = mock_resolve_reg_name; @@ -2525,6 +2576,116 @@ static void opt_dead_def_elim_test(void) { tc_fini(&tc); } +static void opt_planned_call_replay_resolves_arg_cycle(void) { + TestCtx tc; + tc_init(&tc); + MockCGTarget mock; + mock_init(&mock, tc.c); + + Func* f = new_func(&tc); + f->opt_scratch_regs[RC_INT][0] = 9; + f->opt_scratch_reg_count[RC_INT] = 1; + + Inst* in = ir_emit(f, f->entry, IR_CALL); + IRCallAux* aux = arena_znew(f->arena, IRCallAux); + in->extra.aux = aux; + aux->plan_valid = 1; + aux->use_plan_replay = 1; + aux->plan.callee = op_reg_(8, tc.i64); + aux->plan.args = arena_zarray(f->arena, CGCallPlanMove, 2); + aux->plan.nargs = 2; + aux->plan.args[0].src = op_reg_(1, tc.i64); + aux->plan.args[0].dst_kind = CG_CALL_PLAN_REG; + aux->plan.args[0].cls = RC_INT; + aux->plan.args[0].dst_reg = 2; + aux->plan.args[0].mem = mem_unknown_(tc.i64, 8); + aux->plan.args[1].src = op_reg_(2, tc.i64); + aux->plan.args[1].dst_kind = CG_CALL_PLAN_REG; + aux->plan.args[1].cls = RC_INT; + aux->plan.args[1].dst_reg = 1; + aux->plan.args[1].mem = mem_unknown_(tc.i64, 8); + + opt_emit(tc.c, f, &mock.base); + + EXPECT(mock.emit_call_plan_calls == 1, + "planned call should use emit_call_plan"); + EXPECT(mock.copy_calls == 3, + "two-register cycle should need three copies, got %d", + mock.copy_calls); + EXPECT(mock.copy_dst[0].v.reg == 9 && mock.copy_src[0].v.reg == 1, + "cycle should save first source to scratch"); + 
EXPECT(mock.copy_dst[1].v.reg == 1 && mock.copy_src[1].v.reg == 2, + "cycle should rotate second arg into first ABI reg"); + EXPECT(mock.copy_dst[2].v.reg == 2 && mock.copy_src[2].v.reg == 9, + "cycle should restore scratch into second ABI reg"); + tc_fini(&tc); +} + +static void opt_planned_call_replay_preserves_indirect_callee_arg_reg(void) { + TestCtx tc; + tc_init(&tc); + MockCGTarget mock; + mock_init(&mock, tc.c); + + Func* f = new_func(&tc); + f->opt_scratch_regs[RC_INT][0] = 9; + f->opt_scratch_reg_count[RC_INT] = 1; + + Inst* in = ir_emit(f, f->entry, IR_CALL); + IRCallAux* aux = arena_znew(f->arena, IRCallAux); + in->extra.aux = aux; + aux->plan_valid = 1; + aux->use_plan_replay = 1; + aux->plan.callee = op_reg_(1, tc.i64); + aux->plan.args = arena_zarray(f->arena, CGCallPlanMove, 1); + aux->plan.nargs = 1; + aux->plan.args[0].src = op_reg_(2, tc.i64); + aux->plan.args[0].dst_kind = CG_CALL_PLAN_REG; + aux->plan.args[0].cls = RC_INT; + aux->plan.args[0].dst_reg = 1; + aux->plan.args[0].mem = mem_unknown_(tc.i64, 8); + + opt_emit(tc.c, f, &mock.base); + + EXPECT(mock.emit_call_plan_calls == 1, + "planned indirect call should use emit_call_plan"); + EXPECT(mock.copy_calls >= 2, + "callee-in-arg-register hazard should copy callee plus arg"); + EXPECT(mock.copy_dst[0].v.reg == 9 && mock.copy_src[0].v.reg == 1, + "callee should be saved before arg setup overwrites its register"); + EXPECT(mock.last_plan_callee.kind == OPK_REG && + mock.last_plan_callee.v.reg == 9, + "emit_call_plan should receive scratch callee register"); + tc_fini(&tc); +} + +static void opt_planned_call_replay_falls_back_for_stack_args(void) { + TestCtx tc; + tc_init(&tc); + MockCGTarget mock; + mock_init(&mock, tc.c); + + Func* f = new_func(&tc); + Inst* in = ir_emit(f, f->entry, IR_CALL); + IRCallAux* aux = arena_znew(f->arena, IRCallAux); + in->extra.aux = aux; + aux->plan_valid = 1; + aux->plan.callee = op_reg_(8, tc.i64); + aux->plan.args = arena_zarray(f->arena, CGCallPlanMove, 1); + 
aux->plan.nargs = 1; + aux->plan.args[0].src = op_reg_(1, tc.i64); + aux->plan.args[0].dst_kind = CG_CALL_PLAN_STACK; + aux->plan.args[0].cls = RC_INT; + aux->plan.args[0].mem = mem_unknown_(tc.i64, 8); + + opt_emit(tc.c, f, &mock.base); + + EXPECT(mock.emit_call_plan_calls == 0, + "stack-arg plans should stay on legacy fallback for now"); + EXPECT(mock.call_calls == 1, "legacy call fallback should be used"); + tc_fini(&tc); +} + /* ============================================================ * End-to-end test — drive the opt-wrapped CGTarget through the * public CGTarget interface, let func_end run the full pipeline, @@ -2908,6 +3069,9 @@ int main(void) { opt_dce_physical_dead_defs(); opt_dead_def_keeps_observable_loads(); opt_dead_def_elim_test(); + opt_planned_call_replay_resolves_arg_cycle(); + opt_planned_call_replay_preserves_indirect_callee_arg_reg(); + opt_planned_call_replay_falls_back_for_stack_args(); opt_emit_no_virtual_alloc(); opt_records_const_bytes_by_value(); opt_cmp_branch_keeps_fallthrough_after_block_growth();