commit ceffaef359a99d6d2695adc06e78f6acde56921e
parent e4365d4502bd267ddd485d100170394e466a2d9a
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 21 May 2026 22:54:31 -0700
opt: add first Phase E inliner slice
Diffstat:
5 files changed, 671 insertions(+), 17 deletions(-)
diff --git a/doc/OPT.md b/doc/OPT.md
@@ -972,28 +972,168 @@ Remaining Phase D exit criteria:
default O2 schedule.
- [ ] Full O2 jump optimization has focused red-green tests before it enters
the default O2 schedule.
+- [ ] Post-inline cleanup quality: after Phase E inlines a simple Toy wrapper,
+ the existing Phase D value passes should collapse the exposed constant/copy
+ chain and remove frame traffic. Current example:
+
+ ```
+ fn add1(x: i32): i32 {
+ return x + (1 as i32);
+ }
+
+ pub fn main(): i32 {
+ let y: i32 = add1(41 as i32);
+ if y == (42 as i32) {
+ return 0 as i32;
+ }
+ return 1 as i32;
+ }
+ ```
+
+ `-O2` now removes the `bl _add1`, but AArch64 still shows materialized
+ parameter/local frame traffic and a runtime compare/branch in `main`. The
+ desired Phase D follow-up is for inlining plus cleanup to fold the call
+ result and branch to the constant return path.
### Phase E - Inlining and Cleanup
Goal: add MIR-style small direct-call inlining for `-O2`.
-Implement:
-
-- Bottom-up call graph over retained `Func` IR.
-- SCC skip for v1.
-- Size budgets modeled on MIR: small budget for normal calls, larger budget
- for explicit inline candidates once cfree tracks that source property.
-- Caller growth cap.
-- Register/value remapping, block/label remapping, parameter materialization,
- return rewrite, and debug location preservation.
-- Conservative refusal for varargs, setjmp/longjmp, inline asm with hard
- constraints, and functions whose frame/alloca behavior is not yet modeled.
-
-After each inline iteration, run `opt_cleanup` on dirty callers.
+Design decision: inline before machine lowering, on retained raw `Func` IR, at
+`CGTarget.finalize`. Do not inline the post-SSA-destruction or post-machinize
+form. The current implementation emits at `func_end`; Phase E first has to
+make the `opt.h` contract real for `-O2`: `func_end` records and stores the
+function, while `finalize` performs interprocedural work, then runs the normal
+per-function O2 pipeline and emission in original function order.
+
+Current implementation slice: retained `-O2` finalize scheduling is in place,
+and `pass_inline.c` inlines small direct calls to retained, acyclic,
+straight-line callees with void/scalar register returns. It intentionally
+refuses multi-block CFG cloning, aggregate/sret/byval returns, tail calls,
+varargs, alloca, inline asm, computed goto, and other hard cases until the
+broader rewrite rules below are implemented with focused tests.
+
+This matches MIR's useful property: inline while calls, parameters, frame
+slots, returns, labels, and source locations are still target-independent.
+Cloning target-lowered calls would couple the inliner to ABI move plans,
+hard-register constraints, and late spill/rewrite state. Cloning SSA would
+also require phi and def-use surgery across two functions. Raw IR cloning keeps
+Phase E mechanical and lets the existing O2 passes clean up the exposed
+constants, copies, dead stores, and dead blocks.
+
+Implementation shape:
+
+- Add real `FuncSet` ownership to `OptImpl`: an arena-backed ordered `Func**`
+ plus a symbol map from `ObjSymId` to the retained definition. No global
+ state. Record order is emission order at finalize.
+- Split scheduling in `opt.c`:
+ - `func_end`: `opt_frame_home_addr_taken_locals`, classify eligibility, append
+ to `FuncSet`, and clear the current recorder state. For `-O1`, keeping the
+ current immediate emit path is acceptable; `-O2` must retain functions.
+ - `finalize`: run `opt_inline(FuncSet*, max_iters)`, then lower/emit every
+ retained function. Functions that still require the defensive non-SSA O2
+ route use the shared O1 lowering path at finalize.
+ - Refactor the current O2 scheduler into "O2 cleanup/pre-lowering" and
+ "lowering/emission" pieces so an inlined dirty function can reuse the
+ existing verifier-bracketed pass order.
+- Build the call graph from direct `IR_CALL` sites whose callee is
+ `OPK_GLOBAL` with addend zero and whose symbol has a retained `Func`.
+ Indirect calls and unresolved/external symbols are ordinary non-inlineable
+ calls.
+- Compute SCCs on the retained graph. Inline only calls to callees in already
+ processed acyclic components. Skip self recursion and mutually recursive SCCs
+ for v1 instead of trying partial recursive budgets.
+- Rebuild the call graph at each inline iteration. Default `max_iters = 1`;
+ keep a small internal cap even if a later driver knob requests more.
+
+Eligibility and budgets:
+
+- Candidate callee must be retained, same target/context, non-variadic, not
+ naked/interrupt/ifunc once those attributes reach `CGFuncDesc`, and not marked
+ `noinline` once C declaration flags are plumbed through CG.
+- Candidate call site must be a normal direct call. Skip `CG_CALL_TAIL` and any
+ future musttail representation.
+- V1 skips callees containing `IR_ALLOCA`, `IR_VA_*`, `IR_LOAD_LABEL_ADDR`,
+ `IR_INDIRECT_BRANCH`, `IR_ASM_BLOCK`, `INTRIN_SETJMP`, `INTRIN_LONGJMP`,
+ coroutine switch, or any op whose cloning semantics are not explicit.
+- V1 supports void and scalar register returns first. Aggregate/sret/byval
+ returns can be enabled later after return materialization has focused tests.
+- Normal-call budget should be deliberately small: start around 12 to 20
+ non-debug, non-label instructions, with a caller growth cap around 25 percent
+ and an absolute per-function growth cap. Once `inline` / `always_inline` /
+ `noinline` are carried into CG, use a larger explicit-inline budget and let
+ `always_inline` exceed the normal budget but still obey semantic refusal
+ rules.
+- Penalize callees with calls, memory barriers, atomics, fences, and multiple
+ returns; discount simple `IR_LOAD_IMM`, `IR_COPY`, and single-use arithmetic.
+ Do not use target-specific instruction costs in Phase E.
+
+Inline rewrite:
+
+- Split the caller block at the call site into `pre`, cloned callee body, and
+ `cont`. The original call instruction is removed.
+- Allocate fresh caller-local ids for every cloned block, frame slot, scope,
+ pseudo-register, and SSA value id present in the callee. Preserve callee
+ instruction ids only as source material; cloned instructions get new stable
+ ids from the caller.
+- Clone `emit_order` for cloned blocks in callee layout order, inserted between
+ `pre` and `cont`. Update CFG successors/predecessors through `opt_build_cfg`
+ after the rewrite rather than hand-maintaining all derived metadata.
+- Materialize arguments at the clone entry. For register-like callee parameter
+ storage, emit `IR_COPY` from the call argument value to the remapped callee
+ parameter pseudo. For frame-backed parameters, emit stores to the remapped
+ parameter frame slots using the recorded `IRParam` storage and `MemAccess`
+ shape. Constants become `IR_LOAD_IMM` or direct stores as appropriate.
+- Rewrite each cloned `IR_RET`:
+ - void return: replace with `IR_BR cont`.
+ - scalar register return: copy the returned value to the call site's result
+ operand, then branch to `cont`.
+ - multiple returns: all returns branch to the same continuation; if later SSA
+ needs a phi, the normal O2 mem2reg/reg-SSA construction will create it from
+ the inserted copies/stores.
+- For noreturn callees, omit the continuation edge when every cloned exit is
+ terminating. Keep ordinary call semantics until this is tested; do not infer
+ noreturn only from body shape in v1.
+- Preserve `Inst.loc` from cloned callee instructions. Argument materialization
+ uses the call-site location. This is enough for line-accurate emission now;
+ `DW_TAG_inlined_subroutine` can remain a later DWARF phase.
+
+Cleanup after inlining:
+
+- Mark modified callers dirty and invalidate CFG, def-use, dominator, and loop
+ analyses.
+- For each retained `-O2` function, run the existing O2 pre-lowering schedule
+ after all inline iterations. This is `opt_cleanup` in Phase E terms:
+ `build_cfg`, jump cleanup, reg-SSA, mem2reg SSA, simplify, GVN, copy
+ propagation, DSE, SSA DCE, LICM, pressure relief, conventional SSA,
+ SSA combine, undo SSA, and jump optimization, with the same verifier
+ checkpoints as the current O2 path.
+- Then run the shared lowering pipeline once and emit. No function should be
+ emitted before all possible callers have had a chance to inline it.
+
+Testing plan:
+
+- Unit/pass tests in `test/opt` for direct wrapper inline, two-return scalar
+ inline, refusal for recursion/SCCs, refusal for varargs/alloca/setjmp/asm,
+ and caller growth cap.
+- End-to-end toy/C cases where `static int add1(int x) { return x + 1; }`
+ disappears at `-O2` but remains a call at `-O1`.
+- Keep the Toy `add1(41) == 42` example from the Phase D checklist as a
+ disassembly sanity case: Phase E is responsible for removing the call; Phase
+ D cleanup is responsible for removing the exposed constant/copy/branch
+ leftovers.
+- A chained-call case proves bottom-up order: `main -> f -> g` inlines `g`
+ into `f` before deciding whether `f` fits into `main`.
+- A negative recursive case proves codegen is unchanged and correct.
+- A focused disassembly sanity check on AArch64 or x64 verifies wrapper-call
+ removal after inline+cleanup without broad corpus churn.
Exit criteria:
-- Small wrapper-call cases optimize after inline+cleanup.
+- `make test-opt`
+- Focused wrapper/chained-call toy fixtures:
+ `CFREE_TEST_FILTER=<inline-case> CFREE_TOY_OPT_LEVELS="0 1 2" make test-toy`
+- Full `CFREE_TOY_OPT_LEVELS="0 1 2" make test-toy`
- Recursive and mutually recursive cases are unchanged and correct.
### Phase F - Benchmark Closure
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -35,6 +35,7 @@ typedef struct OptImpl {
Func* f;
u32 cur; /* current block id */
SrcLoc pending_loc; /* most recent set_loc; stamped on each Inst */
+ FuncSet funcs;
Writer* dump_writer;
} OptImpl;
@@ -1461,6 +1462,18 @@ static void opt_run_o2_pipeline(OptImpl* o) {
opt_run_lowering_pipeline(o, "opt.o2.total", 1);
}
+static void opt_funcset_add(OptImpl* o, Func* f) {
+ FuncSet* fs = &o->funcs;
+ if (fs->nfuncs == fs->cap) {
+ u32 ncap = fs->cap ? fs->cap * 2u : 8u;
+ Func** nf = arena_array(o->c->tu, Func*, ncap);
+ if (fs->funcs) memcpy(nf, fs->funcs, sizeof(Func*) * fs->nfuncs);
+ fs->funcs = nf;
+ fs->cap = ncap;
+ }
+ fs->funcs[fs->nfuncs++] = f;
+}
+
static int func_requires_non_ssa_o2(Func* f) {
if (!f) return 0;
for (u32 b = 0; b < f->nblocks; ++b) {
@@ -1486,8 +1499,8 @@ static void w_func_end(CGTarget* t) {
if (!o->f) return;
opt_frame_home_addr_taken_locals(o->f);
- if (o->level >= 2 && !func_requires_non_ssa_o2(o->f)) {
- opt_run_o2_pipeline(o);
+ if (o->level >= 2) {
+ opt_funcset_add(o, o->f);
} else if (o->level >= 1) {
opt_run_o1_pipeline(o);
} else {
@@ -1500,7 +1513,20 @@ static void w_func_end(CGTarget* t) {
/* ---- finalize / destroy ---- */
static void w_finalize(CGTarget* t) {
- CGTarget* wr = impl_of(t)->target;
+ OptImpl* o = impl_of(t);
+ CGTarget* wr = o->target;
+ if (o->level >= 2 && o->funcs.nfuncs) {
+ opt_inline(&o->funcs, 1);
+ for (u32 i = 0; i < o->funcs.nfuncs; ++i) {
+ o->f = o->funcs.funcs[i];
+ if (!o->f) continue;
+ if (!func_requires_non_ssa_o2(o->f))
+ opt_run_o2_pipeline(o);
+ else
+ opt_run_o1_pipeline(o);
+ }
+ o->f = NULL;
+ }
if (wr->finalize) wr->finalize(wr);
}
@@ -1534,6 +1560,8 @@ CGTarget* opt_cgtarget_new(Compiler* c, CGTarget* target, int level) {
o->c = c;
o->target = target;
o->level = level;
+ o->funcs.c = c;
+ o->funcs.arena = c->tu;
CGTarget* t = &o->base;
t->c = c;
diff --git a/src/opt/opt_internal.h b/src/opt/opt_internal.h
@@ -23,6 +23,14 @@ typedef struct OptPassCtx {
const char* stage;
} OptPassCtx;
+struct FuncSet {
+ Compiler* c;
+ Arena* arena;
+ Func** funcs;
+ u32 nfuncs;
+ u32 cap;
+};
+
typedef struct OptBlockList {
u32* items;
u32 n;
diff --git a/src/opt/pass_inline.c b/src/opt/pass_inline.c
@@ -0,0 +1,456 @@
+#include <string.h>
+
+#include "core/arena.h"
+#include "core/core.h"
+#include "core/metrics.h"
+#include "opt/opt_internal.h"
+
+typedef struct InlineMap {
+ Func* caller;
+ Func* callee;
+ PReg* preg;
+ u32 npreg;
+ FrameSlot* slot;
+ u32 nslot;
+} InlineMap;
+
+typedef struct InstVec {
+ Func* f;
+ Inst* v;
+ u32 n;
+ u32 cap;
+} InstVec;
+
+static void instvec_push(InstVec* iv, const Inst* in) {
+ if (iv->n == iv->cap) {
+ u32 ncap = iv->cap ? iv->cap * 2u : 16u;
+ Inst* nv = arena_array(iv->f->arena, Inst, ncap);
+ if (iv->v) memcpy(nv, iv->v, sizeof(Inst) * iv->n);
+ iv->v = nv;
+ iv->cap = ncap;
+ }
+ iv->v[iv->n++] = *in;
+}
+
+static Func* funcset_find(FuncSet* fs, ObjSymId sym) {
+ if (!fs || sym == OBJ_SYM_NONE) return NULL;
+ for (u32 i = 0; i < fs->nfuncs; ++i)
+ if (fs->funcs[i] && fs->funcs[i]->name == sym) return fs->funcs[i];
+ return NULL;
+}
+
+static int funcset_index(FuncSet* fs, Func* f) {
+ if (!fs || !f) return -1;
+ for (u32 i = 0; i < fs->nfuncs; ++i)
+ if (fs->funcs[i] == f) return (int)i;
+ return -1;
+}
+
+static Func* direct_callee(FuncSet* fs, const Inst* in) {
+ if (!in || (IROp)in->op != IR_CALL) return NULL;
+ IRCallAux* aux = (IRCallAux*)in->extra.aux;
+ if (!aux || aux->use_plan_replay) return NULL;
+ if (aux->desc.flags & CG_CALL_TAIL) return NULL;
+ if (aux->desc.callee.kind != OPK_GLOBAL) return NULL;
+ if (aux->desc.callee.v.global.addend != 0) return NULL;
+ return funcset_find(fs, aux->desc.callee.v.global.sym);
+}
+
+static int func_reaches(FuncSet* fs, Func* from, Func* target, u8* seen) {
+ int idx = funcset_index(fs, from);
+ if (idx < 0) return 0;
+ if (seen[idx]) return 0;
+ seen[idx] = 1;
+ for (u32 b = 0; b < from->nblocks; ++b) {
+ Block* bl = &from->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Func* callee = direct_callee(fs, &bl->insts[i]);
+ if (!callee) continue;
+ if (callee == target) return 1;
+ if (func_reaches(fs, callee, target, seen)) return 1;
+ }
+ }
+ return 0;
+}
+
+static int recursive_or_scc(FuncSet* fs, Func* caller, Func* callee) {
+ if (caller == callee) return 1;
+ u8* seen = arena_zarray(fs->arena, u8, fs->nfuncs ? fs->nfuncs : 1u);
+ return func_reaches(fs, callee, caller, seen);
+}
+
+static int op_supported_in_straightline_inline(IROp op) {
+ switch (op) {
+ case IR_NOP:
+ case IR_PARAM_DECL:
+ case IR_LOAD_IMM:
+ case IR_LOAD_CONST:
+ case IR_COPY:
+ case IR_LOAD:
+ case IR_STORE:
+ case IR_BINOP:
+ case IR_UNOP:
+ case IR_CMP:
+ case IR_CONVERT:
+ case IR_RET:
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+static int callee_inline_shape(Func* callee, u32* cost_out) {
+ if (!callee || callee->opt_reg_ssa || callee->opt_rewritten) return 0;
+ if (cfree_cg_type_func_is_variadic((CfreeCompiler*)callee->c, callee->type)) {
+ metrics_count(callee->c, "opt.inline.refuse_shape_variadic", 1);
+ return 0;
+ }
+ if (callee->entry >= callee->nblocks) {
+ metrics_count(callee->c, "opt.inline.refuse_shape_entry", 1);
+ return 0;
+ }
+
+ Block* entry = &callee->blocks[callee->entry];
+ u32 nret = 0;
+ u32 cost = 0;
+ for (u32 i = 0; i < entry->ninsts; ++i) {
+ Inst* in = &entry->insts[i];
+ IROp op = (IROp)in->op;
+ if (!op_supported_in_straightline_inline(op)) {
+ metrics_count(callee->c, "opt.inline.refuse_shape_op", 1);
+ return 0;
+ }
+ if (op == IR_RET) {
+ ++nret;
+ if (i + 1u != entry->ninsts) {
+ metrics_count(callee->c, "opt.inline.refuse_shape_ret_pos", 1);
+ return 0;
+ }
+ continue;
+ }
+ if (op != IR_NOP && op != IR_PARAM_DECL) ++cost;
+ }
+ if (nret != 1) {
+ metrics_count(callee->c, "opt.inline.refuse_shape_ret_count", 1);
+ return 0;
+ }
+ if (cost > 20u) {
+ metrics_count(callee->c, "opt.inline.refuse_shape_budget", 1);
+ return 0;
+ }
+ if (cost_out) *cost_out = cost;
+ return 1;
+}
+
+static PReg map_preg(InlineMap* m, PReg r) {
+ if (r == PREG_NONE || r == 0 || r >= m->npreg) return r;
+ return m->preg[r] ? m->preg[r] : r;
+}
+
+static FrameSlot map_slot(InlineMap* m, FrameSlot s) {
+ if (s == FRAME_SLOT_NONE || s >= m->nslot) return s;
+ return m->slot[s] ? m->slot[s] : s;
+}
+
+static void map_mem(InlineMap* m, MemAccess* mem) {
+ if (!mem) return;
+ if (mem->alias.kind == ALIAS_LOCAL && mem->alias.v.local_id > 0) {
+ FrameSlot old = (FrameSlot)mem->alias.v.local_id;
+ mem->alias.v.local_id = (i32)map_slot(m, old);
+ }
+}
+
+static Operand map_operand(InlineMap* m, Operand op) {
+ switch ((OpKind)op.kind) {
+ case OPK_REG:
+ op.v.reg = map_preg(m, (PReg)op.v.reg);
+ break;
+ case OPK_LOCAL:
+ op.v.frame_slot = map_slot(m, op.v.frame_slot);
+ break;
+ case OPK_INDIRECT:
+ op.v.ind.base = map_preg(m, (PReg)op.v.ind.base);
+ break;
+ default:
+ break;
+ }
+ return op;
+}
+
+static int build_inline_map(InlineMap* m, Func* caller, Func* callee) {
+ memset(m, 0, sizeof *m);
+ m->caller = caller;
+ m->callee = callee;
+ m->npreg = callee->npregs;
+ m->nslot = callee->nframe_slots + 1u;
+ m->preg = arena_zarray(caller->arena, PReg, m->npreg ? m->npreg : 1u);
+ m->slot = arena_zarray(caller->arena, FrameSlot, m->nslot ? m->nslot : 1u);
+
+ for (FrameSlot s = 1; s <= callee->nframe_slots; ++s) {
+ IRFrameSlot* old = &callee->frame_slots[s - 1u];
+ FrameSlotDesc d;
+ memset(&d, 0, sizeof d);
+ d.type = old->type;
+ d.name = old->name;
+ d.loc = old->loc;
+ d.size = old->size;
+ d.align = old->align;
+ d.kind = old->kind;
+ d.flags = old->flags;
+ m->slot[s] = ir_frame_slot_new(caller, &d);
+ }
+
+ for (PReg r = 1; r < callee->npregs; ++r) {
+ m->preg[r] =
+ ir_alloc_preg(caller, callee->preg_type[r], callee->preg_cls[r]);
+ }
+ return 1;
+}
+
+static Inst make_inst(Func* f, IROp op, SrcLoc loc) {
+ Inst in;
+ memset(&in, 0, sizeof in);
+ in.op = (u16)op;
+ in.id = ir_inst_id_new(f);
+ in.loc = loc;
+ return in;
+}
+
+static Operand local_operand(FrameSlot s, CfreeCgTypeId ty) {
+ Operand op;
+ memset(&op, 0, sizeof op);
+ op.kind = OPK_LOCAL;
+ op.cls = RC_INT;
+ op.type = ty;
+ op.v.frame_slot = s;
+ return op;
+}
+
+static MemAccess param_mem(const IRParam* p, FrameSlot s) {
+ MemAccess m;
+ memset(&m, 0, sizeof m);
+ m.type = p->type;
+ m.size = p->size;
+ m.align = p->align;
+ m.alias.kind = ALIAS_LOCAL;
+ m.alias.v.local_id = (i32)s;
+ return m;
+}
+
+static int append_param_materialization(InstVec* out, InlineMap* m,
+ const Inst* call) {
+ IRCallAux* aux = (IRCallAux*)call->extra.aux;
+ Func* caller = m->caller;
+ Func* callee = m->callee;
+ if (!aux || aux->desc.nargs != callee->nparams) return 0;
+
+ for (u32 i = 0; i < callee->nparams; ++i) {
+ IRParam* p = &callee->params[i];
+ const CGABIValue* av = &aux->desc.args[i];
+ if (av->nparts != 0) return 0;
+ Operand src = av->storage;
+ if (src.kind != OPK_REG && src.kind != OPK_IMM) return 0;
+ if (p->storage.kind == CG_LOCAL_STORAGE_REG) {
+ PReg dst_r = map_preg(m, (PReg)p->storage.v.reg);
+ Operand dst;
+ memset(&dst, 0, sizeof dst);
+ dst.kind = OPK_REG;
+ dst.cls = opt_reg_cls(caller, dst_r);
+ dst.type = p->type;
+ dst.v.reg = dst_r;
+ Inst in = make_inst(caller, src.kind == OPK_IMM ? IR_LOAD_IMM : IR_COPY,
+ call->loc);
+ in.type = p->type;
+ in.def = (Val)dst_r;
+ in.opnds =
+ arena_array(caller->arena, Operand, src.kind == OPK_IMM ? 1u : 2u);
+ in.opnds[0] = dst;
+ in.nopnds = src.kind == OPK_IMM ? 1u : 2u;
+ if (src.kind == OPK_IMM) {
+ in.extra.imm = src.v.imm;
+ } else {
+ in.opnds[1] = src;
+ }
+ instvec_push(out, &in);
+ } else {
+ FrameSlot dst_s = map_slot(m, p->storage.v.frame_slot);
+ Inst in = make_inst(caller, IR_STORE, call->loc);
+ in.opnds = arena_array(caller->arena, Operand, 2);
+ in.opnds[0] = local_operand(dst_s, p->type);
+ in.opnds[1] = src;
+ in.nopnds = 2;
+ in.extra.mem = param_mem(p, dst_s);
+ instvec_push(out, &in);
+ }
+ }
+ return 1;
+}
+
+static Inst clone_inst(InlineMap* m, const Inst* src) {
+ Func* caller = m->caller;
+ Inst dst = *src;
+ dst.id = ir_inst_id_new(caller);
+ dst.def = src->def != VAL_NONE ? (Val)map_preg(m, (PReg)src->def) : VAL_NONE;
+ if (src->ndefs) {
+ dst.defs = arena_array(caller->arena, Val, src->ndefs);
+ for (u32 i = 0; i < src->ndefs; ++i) {
+ dst.defs[i] = src->defs[i] != VAL_NONE
+ ? (Val)map_preg(m, (PReg)src->defs[i])
+ : VAL_NONE;
+ }
+ }
+ if (src->nopnds) {
+ dst.opnds = arena_array(caller->arena, Operand, src->nopnds);
+ for (u32 i = 0; i < src->nopnds; ++i)
+ dst.opnds[i] = map_operand(m, src->opnds[i]);
+ }
+ if ((IROp)dst.op == IR_LOAD || (IROp)dst.op == IR_STORE) {
+ map_mem(m, &dst.extra.mem);
+ }
+ return dst;
+}
+
+static int append_return_materialization(InstVec* out, InlineMap* m,
+ const Inst* call, const Inst* ret) {
+ IRCallAux* call_aux = (IRCallAux*)call->extra.aux;
+ IRRetAux* ret_aux = (IRRetAux*)ret->extra.aux;
+ Func* caller = m->caller;
+ if (!call_aux) return 0;
+ if (!ret_aux || !ret_aux->present) return 1;
+ if (ret_aux->val.nparts != 0 || call_aux->desc.ret.nparts != 0) return 0;
+ Operand dst = call_aux->desc.ret.storage;
+ Operand src = ret_aux->val.storage;
+ if (dst.kind == OPK_IMM) return 1;
+ if (dst.kind != OPK_REG) return 0;
+ src = map_operand(m, src);
+ if (src.kind != OPK_REG && src.kind != OPK_IMM) return 0;
+
+ Inst in =
+ make_inst(caller, src.kind == OPK_IMM ? IR_LOAD_IMM : IR_COPY, call->loc);
+ in.type = dst.type;
+ in.def = (Val)dst.v.reg;
+ in.opnds = arena_array(caller->arena, Operand, src.kind == OPK_IMM ? 1u : 2u);
+ in.opnds[0] = dst;
+ in.nopnds = src.kind == OPK_IMM ? 1u : 2u;
+ if (src.kind == OPK_IMM) {
+ in.extra.imm = src.v.imm;
+ } else {
+ in.opnds[1] = src;
+ }
+ instvec_push(out, &in);
+ return 1;
+}
+
+static int inline_rewrite_supported(Func* callee, const Inst* call) {
+ IRCallAux* call_aux = (IRCallAux*)call->extra.aux;
+ if (!call_aux || call_aux->desc.nargs != callee->nparams) return 0;
+ for (u32 i = 0; i < callee->nparams; ++i) {
+ const CGABIValue* av = &call_aux->desc.args[i];
+ if (av->nparts != 0) return 0;
+ if (av->storage.kind != OPK_REG && av->storage.kind != OPK_IMM) return 0;
+ }
+
+ Block* cbl = &callee->blocks[callee->entry];
+ const Inst* ret = cbl->ninsts ? &cbl->insts[cbl->ninsts - 1u] : NULL;
+ if (!ret || (IROp)ret->op != IR_RET) return 0;
+ IRRetAux* ret_aux = (IRRetAux*)ret->extra.aux;
+ if (!ret_aux || !ret_aux->present) return 1;
+ if (ret_aux->val.nparts != 0 || call_aux->desc.ret.nparts != 0) return 0;
+ if (call_aux->desc.ret.storage.kind == OPK_IMM) return 1;
+ if (call_aux->desc.ret.storage.kind != OPK_REG) return 0;
+ return ret_aux->val.storage.kind == OPK_REG ||
+ ret_aux->val.storage.kind == OPK_IMM;
+}
+
+static int inline_call_site(Func* caller, u32 block_idx, u32 inst_idx,
+ Func* callee) {
+ Block* cbl = &callee->blocks[callee->entry];
+ Block* bl = &caller->blocks[block_idx];
+ Inst call = bl->insts[inst_idx];
+ InlineMap map;
+ InstVec out;
+ memset(&out, 0, sizeof out);
+ out.f = caller;
+ if (!build_inline_map(&map, caller, callee)) return 0;
+
+ for (u32 i = 0; i < inst_idx; ++i) instvec_push(&out, &bl->insts[i]);
+ if (!append_param_materialization(&out, &map, &call)) return 0;
+
+ const Inst* ret = NULL;
+ for (u32 i = 0; i < cbl->ninsts; ++i) {
+ Inst* src = &cbl->insts[i];
+ switch ((IROp)src->op) {
+ case IR_NOP:
+ case IR_PARAM_DECL:
+ break;
+ case IR_RET:
+ ret = src;
+ break;
+ default: {
+ Inst cloned = clone_inst(&map, src);
+ instvec_push(&out, &cloned);
+ break;
+ }
+ }
+ }
+ if (!ret || !append_return_materialization(&out, &map, &call, ret)) return 0;
+ for (u32 i = inst_idx + 1u; i < bl->ninsts; ++i)
+ instvec_push(&out, &bl->insts[i]);
+
+ bl->insts = out.v;
+ bl->ninsts = out.n;
+ bl->cap = out.cap;
+ opt_analysis_invalidate(caller, OPT_ANALYSIS_CFG | OPT_ANALYSIS_DEF_USE |
+ OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP);
+ return 1;
+}
+
+static int try_inline_call(FuncSet* fs, Func* caller, u32 b, u32 i) {
+ Inst* in = &caller->blocks[b].insts[i];
+ Func* callee = direct_callee(fs, in);
+ u32 cost = 0;
+ if (!callee) return 0;
+ metrics_count(caller->c, "opt.inline.candidates", 1);
+ if (recursive_or_scc(fs, caller, callee)) {
+ metrics_count(caller->c, "opt.inline.refuse_scc", 1);
+ return 0;
+ }
+ if (!callee_inline_shape(callee, &cost)) {
+ metrics_count(caller->c, "opt.inline.refuse_shape", 1);
+ return 0;
+ }
+ if (!inline_rewrite_supported(callee, in)) {
+ metrics_count(caller->c, "opt.inline.refuse_rewrite_shape", 1);
+ return 0;
+ }
+ (void)cost;
+ if (!inline_call_site(caller, b, i, callee)) {
+ metrics_count(caller->c, "opt.inline.refuse_rewrite", 1);
+ return 0;
+ }
+ metrics_count(caller->c, "opt.inline.inlined", 1);
+ return 1;
+}
+
+void opt_inline(FuncSet* fs, int max_iters) {
+ if (!fs || fs->nfuncs == 0 || max_iters <= 0) return;
+ if (max_iters > 4) max_iters = 4;
+ for (int iter = 0; iter < max_iters; ++iter) {
+ int changed = 0;
+ for (u32 fidx = 0; fidx < fs->nfuncs; ++fidx) {
+ Func* caller = fs->funcs[fidx];
+ if (!caller || caller->opt_reg_ssa || caller->opt_rewritten) continue;
+ for (u32 b = 0; b < caller->nblocks; ++b) {
+ Block* bl = &caller->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ if ((IROp)bl->insts[i].op != IR_CALL) continue;
+ if (try_inline_call(fs, caller, b, i)) {
+ changed = 1;
+ bl = &caller->blocks[b];
+ }
+ }
+ }
+ }
+ if (!changed) break;
+ }
+}
diff --git a/test/opt/phase0_guardrails.sh b/test/opt/phase0_guardrails.sh
@@ -207,6 +207,13 @@ int main() {
SRC
}
+write_inline_wrapper_o2() {
+ cat >"$TMP/inline_wrapper_o2.c" <<'SRC'
+static int add1(int x) { return x + 1; }
+int main() { return add1(41) == 42 ? 0 : 1; }
+SRC
+}
+
run_case() {
local name="$1"
local src="$2"
@@ -230,6 +237,19 @@ run_o2_case() {
printf 'phase0 %-24s O2 OK\n' "$name"
}
+run_inline_o2_case() {
+ local name="$1"
+ local src="$2"
+ local err="$TMP/${name}.metrics.err"
+ "$BIN" run --time -O2 "$src" >/dev/null 2>"$err"
+ if ! grep -q "opt.inline.inlined" "$err"; then
+ echo "phase0 $name: expected O2 inline metric" >&2
+ sed -n '1,160p' "$err" >&2
+ exit 1
+ fi
+ printf 'phase0 %-24s O2 inline OK\n' "$name"
+}
+
check_metrics() {
local src="$TMP/branch_liveness.c"
local err="$TMP/metrics.err"
@@ -287,6 +307,7 @@ write_loop_phi_o2
write_call_across_o2
write_dense_switch_o2
write_switch_join_add_o2
+write_inline_wrapper_o2
run_case branch_liveness "$TMP/branch_liveness.c"
run_case call_clobber "$TMP/call_clobber.c"
@@ -301,6 +322,7 @@ run_o2_case loop_phi "$TMP/loop_phi_o2.c"
run_o2_case call_across "$TMP/call_across_o2.c"
run_o2_case dense_switch "$TMP/dense_switch_o2.c"
run_o2_case switch_join_add "$TMP/switch_join_add_o2.c"
+run_inline_o2_case inline_wrapper "$TMP/inline_wrapper_o2.c"
check_metrics
printf 'phase0 identified inline-asm stress: test/parse/cases/asm_01_grammar.c\n'