commit eff3401d60b7db0ab8de3af4874fd5600fdcba68
parent 0a7004d9ad91577a885b3bdc19f05871fc6756d1
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Wed, 27 May 2026 22:38:11 -0700
opt: O1 codegen wins + tiny-function inliner
Codegen (aa64):
- Minimal prologue on opt path: emit exact-size prologue once callee-saves
and frame slots are known (new emit_prologue hook gated by
emit_minimal_prologue); only frame-size immediates are patched at func_end.
NativeDirectTarget keeps the worst-case-reserve path, now branching over the
unused tail instead of NOP-filling it. Saves ~17 executed NOPs per call.
- Save/restore fp/lr + integer callee-saves with stp/ldp pairs
(signed-offset form added to isa.h).
- Eliminate the scalar call-result round trip: emit_call hands plan_call the
MIR's destination directly so the ABI result reg moves once to its dest,
not via a temp spill slot.
- Store integer 0 from the hardware zero register via new
has_store_zero_reg / store_zero_reg hooks (strb wzr, ... on aa64).
Opt passes:
- pass_combine: strengthen the post-RA substitute into full local
copy-propagation for reg-source IR_COPY (no single-use / live-out gate);
immediate / convert forms keep the stricter gate.
- pass_addr_fold: new opt_hoist_loop_consts hoists single-def, non-zero
LOAD_IMM that appears inside a loop to one entry-block def, mirroring
opt_addr_of_global_cse. Single-def guard avoids remapping mutable
promoted locals (loop counters initialised to a constant).
- pass_jump: new cleanup_rotate_loops moves a loop header below its latch
and inverts the test in place, eliminating the per-iteration
unconditional back-jump (pure emit-order reorder + branch invert, no
CFG edges move).
Tail calls:
- Implement opt_on_tail_call_unrealizable_reason for the opt path using a
new NativeTarget signature_stack_bytes hook. Sibling calls whose
outgoing stack args don't fit the caller's incoming area now correctly
fall back to call+ret (variadic caller, or variadic callee invoked with
extra args, also fall back conservatively).
Inliner:
- Wire opt_try_tiny_inline into the O1 pipeline (streaming inline of tiny
already-recorded callees), with test-opt-tiny-inline + test-opt-inline
test targets and the run.sh disasm check.
Docs: refresh OPT_O1_PASSES.md for the new passes / emit-path features;
add OPT_O1_PERF_TODO.md and O1_INLINE.md.
Diffstat:
18 files changed, 1660 insertions(+), 596 deletions(-)
diff --git a/Makefile b/Makefile
@@ -154,7 +154,7 @@ LIB_SRCS_OBJ_COFF := $(filter-out %/archive.c,$(LIB_SRCS_OBJ_COFF))
LIB_SRCS_NONARCH += src/obj/archive_stubs.c
endif
-LIB_SRCS_OPT = $(filter-out src/opt/pass_inline.c src/opt/pass_o2.c,$(shell find src/opt -name '*.c' 2>/dev/null))
+LIB_SRCS_OPT = $(filter-out src/opt/pass_o2.c,$(shell find src/opt -name '*.c' 2>/dev/null))
LIB_SRCS_WASM_CORE = $(shell find src/wasm -name '*.c' 2>/dev/null)
LIB_SRCS_API_AR = src/api/archive.c
LIB_SRCS_API_DISASM = src/api/disasm.c
diff --git a/doc/O1_INLINE.md b/doc/O1_INLINE.md
@@ -0,0 +1,188 @@
+# Plan: Inline tiny functions at O1
+
+## Context
+
+The cfree optimizer currently does **no function inlining in any live compile**. Both
+`-O1` and `-O2` are normalized to the single O1 native path (`src/opt/opt.h:11`,
+"O2 cutover window"); the SSA-based O2 pipeline (`opt_cleanup`) and the whole-program
+inliner (`opt_inline`) exist in the tree but have **no callers** outside the stale,
+no-longer-compiling `test/opt/opt_test.c`. Verified empirically: at `-O1`/`-O2` a call to a
+trivial helper like `add1(x) = x+1` still emits a `bl` — nothing is inlined.
+
+We want O1 to inline *tiny* functions — simple `static inline`-style helpers — so callers
+don't pay call/frame overhead for one- or two-instruction bodies. This is the single
+biggest easy win for the kind of small leaf helpers that dominate idiomatic C.
+
+The good news: a complete, correct inlining toolkit already exists in
+`src/opt/pass_inline.c` and operates on the **pre-machinize PReg form**, which is exactly
+the IR shape present in the O1 pipeline before `opt_machinize_native`. This change is
+therefore mostly *wiring* that machinery into the live O1 path, gated to tiny callees —
+not writing an inliner from scratch.
+
+### Decisions
+- **Policy:** inline `DEFAULT`/`HINT` callees only when cost ≤ `INLINE_TINY_COST_LIMIT`
+ (8 straightline ops); always refuse `NEVER`; always inline `ALWAYS`
+ (`always_inline`) regardless of the tiny cap.
+- **Tests:** behavioral disasm fixture (reuse toy-135 at `-O1`) **plus** a fresh small
+ unit test in the `cg_ir_lower_test.c` style. Do **not** revive `opt_test.c` (it no
+ longer compiles against the current API — references the removed `CGTarget` type).
+
+## Key facts (verified)
+
+- `opt_on_func` (`src/opt/opt.c:171`) is the per-function sink callback: lowers each
+ `CgIrFunc` to a pre-machinize `Func` via `opt_func_from_cg_ir`, then runs
+ `opt_run_o1_native`. `OptImpl` struct at `opt.c:20`.
+- `opt_run_o1_native` (`opt.c:43`): cfg build → jump-cleanup → cfg → `opt_simplify_local`
+ (line 66) → `opt_verify` (69) → `opt_machinize_native` (73). The window **after line 67
+ and before line 68** is pre-machinize PReg form, CFG valid — the inliner's required shape.
+- The recorder (`src/cg/ir_recorder.c`) calls `cg_ir_module_add_func` at `func_begin` and
+ fires `func_recorded` at `func_end`. So when a **caller** is processed, every function
+ defined **earlier** in the TU is already recorded and `complete`. Static inline helpers
+ are defined before use ⇒ callee body is available. Forward-defined callees are not
+ (acceptable limitation).
+- Lifetimes: `Func` is allocated on `c->tu` (`ir_func_new`, `src/opt/ir.c:269`) and
+ `CgIrFunc`s live on the module — nothing is freed per-function. `opt_func_from_cg_ir` is
+ cheap and re-runnable, producing a fresh pre-machinize `Func` from a persistent `CgIrFunc`.
+- Reusable machinery in `pass_inline.c` (all operate on pre-machinize PReg form):
+ `inline_call_site` (line 500), gates `callee_inline_shape` (164, already enforces a
+ policy-dependent cost cap and a straightline-op whitelist that **excludes `IR_CALL`**),
+ `inline_rewrite_supported` (473), `recursive_or_scc` (135), `direct_callee` (85),
+ `effective_inline_policy` (100). Cost limits at lines 34-37.
+
+## Implementation
+
+### 1. `src/opt/pass_inline.c` — add the tiny-inline driver (reuse existing gates)
+
+Keep all existing static gates as-is; add in the same TU so nothing else needs exposing.
+
+- Add constant near lines 34-37:
+ ```c
+ #define INLINE_TINY_COST_LIMIT 8u
+ ```
+- Add `try_tiny_inline_call(FuncSet* fs, Func* caller, u32 b, u32 i)` — mirrors
+ `try_inline_call` (line 607) but: no `base_cost`/`caller_growth_ok` (the streaming path
+ has no whole-program cost array, and tiny callees can't blow up the caller); add the tiny
+ gate after `callee_inline_shape`:
+ ```c
+ if (policy != CFREE_CG_INLINE_ALWAYS && cost > INLINE_TINY_COST_LIMIT) return 0;
+ ```
+ Emit `opt.tiny_inline.candidates` / `opt.tiny_inline.inlined` metrics.
+- Add public driver (callback-based so `opt.c` owns the registry/cache):
+ ```c
+ typedef Func* (*OptInlineCalleeLookup)(void* ctx, ObjSymId callee_sym);
+ int opt_try_tiny_inline(Func* caller, OptInlineCalleeLookup lookup, void* ctx);
+ ```
+ It walks caller blocks/insts; for each `IR_CALL` resolves the callee `ObjSymId` from
+ `aux->desc.callee.v.global.sym`, calls `lookup(ctx, sym)` to get a fresh/cached
+ pre-machinize callee `Func`, builds a 2-element ad-hoc `FuncSet {caller, callee}` on
+ `caller->arena` (so `direct_callee`/`recursive_or_scc` work unchanged), and calls
+ `try_tiny_inline_call`. Returns the number of inlines performed.
+
+ **Iteration:** fixed-point loop with cap `#define TINY_INLINE_MAX_PASSES 4`. After any
+ successful `inline_call_site` (which splits the block, invalidating indices), set
+ `changed` and restart the block scan. Note inlined bodies are straightline-only
+ (`op_supported_in_straightline_inline` excludes `IR_CALL`) ⇒ inlining never introduces
+ new calls, so this terminates quickly; the cap is a guard.
+
+ **Recursion safety:** the only real check needed is `caller == callee` (kept via
+ `recursive_or_scc`). Any callee containing a call is already rejected by
+ `callee_inline_shape`'s whitelist, so transitive cycles cannot arise.
+
+### 2. `src/opt/opt_internal.h` — declare the new surface
+
+Add near the other pass prototypes:
+```c
+typedef Func* (*OptInlineCalleeLookup)(void* ctx, ObjSymId callee_sym);
+int opt_try_tiny_inline(Func* caller, OptInlineCalleeLookup lookup, void* ctx);
+void opt_inline(FuncSet*, int max_iters); /* formalize existing symbol */
+```
+
+### 3. `src/opt/opt.c` — registry + wiring
+
+- Extend `OptImpl` (line 20) with a callee registry/cache, all on `c->tu`:
+ ```c
+ CgIrFunc** cg_by_sym; /* recorded functions, for callee lookup */
+ Func** lowered_cache; /* parallel: lazily re-lowered callee Func */
+ u32 ncg;
+ u32 cg_cap;
+ ```
+ (zero-initialized by the existing `arena_znew` in `opt_cgtarget_new`).
+- At the **top of `opt_on_func`** (before lowering the caller), append `cg_func` to
+ `cg_by_sym` (it is `complete`; earlier-defined funcs already present). Growing array on
+ `c->tu`.
+- Add a lookup function `static Func* opt_tiny_callee_lookup(void* ctx, ObjSymId sym)`:
+ linear-scan `cg_by_sym` for `desc.sym == sym`; if found and `lowered_cache[idx] == NULL`,
+ `opt_func_from_cg_ir(o->c, cg_by_sym[idx])` and cache it (one re-lower per helper, reused
+ across all call sites — `inline_call_site` does not mutate the callee). Return NULL if
+ absent (forward-defined) so the call is left as-is.
+- In `opt_run_o1_native`, between line 67 (`metrics_scope_end "opt.cfg.simplify_local"`)
+ and line 68:
+ ```c
+ metrics_scope_begin(o->c, "opt.o1.tiny_inline");
+ int inlined = opt_try_tiny_inline(f, opt_tiny_callee_lookup, o);
+ metrics_scope_end(o->c, "opt.o1.tiny_inline");
+ if (inlined) {
+ /* REQUIRED for correctness: inline_call_site invalidated CFG and left
+ * preds stale (it maintains succ + emit_order only). Verify + machinize
+ * + regalloc all consume preds/dominance. */
+ opt_build_cfg(f);
+ /* QUALITY (optional): merge the single-pred/single-succ BR-glue chain the
+ * inliner introduced (pre -> callee body -> cont) back into straight-line
+ * blocks so regalloc doesn't see artificial block boundaries. Mirrors the
+ * prologue's build_cfg -> jump_cleanup -> build_cfg idiom (opt.c:57-63).
+ * NOTE: use opt_jump_cleanup, NOT opt_simplify_local — the latter is a
+ * per-inst peephole and does not merge blocks. Without this, output is
+ * still correct (post-RA opt_mir_jump_cleanup removes the dead jumps) but
+ * allocation is worse. */
+ opt_jump_cleanup(f, OPT_JUMP_CLEANUP_CFG);
+ opt_build_cfg(f);
+ opt_verify(f, "o1-tiny-inline");
+ }
+ ```
+ The pass needs `OptImpl* o` for the lookup — `opt_run_o1_native` already has `o`.
+
+### 4. Tests
+
+**(a) New unit test** — `test/opt/tiny_inline_test.c`, modeled on
+`test/opt/cg_ir_lower_test.c` (same `EXPECT` harness, builds `Func`s via `opt_func_from_cg_ir`
+from hand-built `CgIrFunc` tapes, links `$(LIB_OBJS)` with `-Isrc`). Cases:
+ - Tiny callee (`add1(x)=x+1`, cost 1): drive `opt_try_tiny_inline` with a trivial lookup
+ closure returning the callee `Func`; assert caller has no `IR_CALL` and contains the
+ cloned `BINOP`.
+ - Threshold: a `DEFAULT` callee with cost > 8 is refused (call survives); the same body
+ marked `CFREE_CG_INLINE_ALWAYS` is inlined.
+ Add a `test-opt-tiny-inline` target + binary in `test/test.mk` next to `test-opt`
+ (~line 509), and add it to the aggregate list near line 66.
+
+**(b) Behavioral disasm** — reuse `test/toy/cases/135_inline_cleanup_quality.toy`
+(`add1(x)=x+1`; `__user_main` calls `add1(41)`; `main` calls `__user_main`). Compile at
+`-O1`, `cfree objdump -d`, and assert the inlined helper's `bl` is **gone** from its caller.
+Realistic O1 assertions (O1 has no constant folding, so do **not** assert "main == constant
+42"): `__user_main` contains no `bl ... # add1`, and `main` contains no `bl ... # __user_main`.
+`test/opt/run.sh` already contains a (too-strong, orphaned) version of this check — repurpose
+it to the no-`bl` assertions and wire it into a `test-opt-inline` make target (it is
+currently not invoked by any target).
+
+## Verification
+
+1. `make bin && make lib`
+2. Behavioral, aarch64 cross (host-arch-independent):
+ ```
+ build/cfree cc -target aarch64-linux-gnu -O1 -c \
+ test/toy/cases/135_inline_cleanup_quality.toy -o /tmp/ti.o
+ build/cfree objdump -d /tmp/ti.o
+ ```
+ Confirm `<__user_main>` has no `bl ... add1` and `<main>` has no `bl ... __user_main`
+ (compare against the pre-change disasm, which shows both `bl`s).
+3. `make test-opt-tiny-inline` (new unit test) and `make test-opt-inline` (disasm) pass.
+4. Regression sweep: `make test-opt test-cg-api test-isa test-aa64-inline test-smoke-x64
+ test-smoke-rv64` — confirm no behavioral changes elsewhere and that `opt_verify`
+ ("o1-tiny-inline") never trips.
+5. Sanity that NEVER is honored and forward-defined callees are left alone (a quick
+ hand-written C file with `__attribute__((noinline))` and a forward-declared helper).
+
+## Out of scope
+- O2 / SSA pipeline (entirely disabled).
+- Cross-TU / forward-defined-callee inlining.
+- Inlining callees with control flow richer than the existing straightline whitelist, or
+ with multi-part/aggregate ABI args/returns (rejected by `inline_rewrite_supported`).
diff --git a/doc/OPT_O1_PASSES.md b/doc/OPT_O1_PASSES.md
@@ -1,625 +1,412 @@
# O1 Optimizer Pass Pipeline
-This document describes the O1 lowering pipeline, what each pass does, how it
-should work, and what was lost or incorrectly ported in the refactor at commit
-`f60a16d` (opt: rewrite pipeline to consume CgIrFunc input; replace pass_emit
-with NativeTarget).
+A reference for cfree's `-O1` lowering pipeline as it currently exists: the
+entry path, what each pass does, the key data structures, and the
+`NativeTarget` backend contract the emit step relies on. Intended as a map for
+performance work (see `doc/OPT_O1_PERF_TODO.md` for the active runtime targets).
+
+Scope: **aa64 only**. The aarch64 backend is the only `NativeTarget`
+implementation; x64/rv64 `NativeTarget` ports do not exist. The O0 path uses a
+separate backend (`NativeDirectTarget`, §16) and is not covered here beyond the
+boundary.
+
+History note: this pipeline replaced an older `CGTarget`-replay design (commit
+`f60a16d`). The transitional "direct-replay bypass" — which sent functions with
+alloca/varargs/asm/aggregates straight to unoptimized emission — **has been
+deleted**. Every function now goes through the full pipeline below
+unconditionally; varargs, inline asm, and aggregate/sret/byval are all handled
+on the optimizer path.
---
-## Pipeline overview
+## Entry path
-The O1 pipeline runs when `opt_cgtarget_new` is called with `level >= 1`.
-`CgIrRecorder` records the full function as a `CgIrFunc`. When the function is
-complete, `opt_on_func` is called:
+`opt_cgtarget_new(level >= 1)` installs the optimizer as the CG sink. The C
+frontend drives `CgIrRecorder`, which records each function as a `CgIrFunc`
+tape. On function completion, `opt_on_func` (src/opt/opt.c) runs:
-1. `opt_func_needs_direct_replay` — bypass check (see §1 below)
-2. `opt_func_from_cg_ir` — convert `CgIrFunc` → `Func` (new in refactor)
-3. `opt_run_o1_native` — run the pass sequence on `Func`
+1. `opt_func_from_cg_ir` (src/opt/cg_ir_lower.c) — convert the recorded
+ `CgIrFunc` into the optimizer's `Func` CFG/IR, classifying each local as
+ register (`PReg`) or frame storage (§1).
+2. `opt_run_o1_native` (src/opt/opt.c) — run the pass sequence on `Func`.
-The pass sequence in `opt_run_o1_native` (src/opt/opt.c):
-
-```
-opt_build_cfg (×2, with opt_jump_cleanup(CFG) in between)
-opt_simplify_local
-opt_machinize_native ABI lowering
-opt_addr_xform_pregs fold IR_ADDR_OF(local) → OPK_LOCAL in loads/stores
-opt_promote_scalar_locals promote non-escaped frame slots → PRegs
-opt_addr_of_global_cse hoist duplicate ADDR_OF(global) to entry block
-opt_build_loop_tree
-opt_live_blocks liveness bitmaps
-opt_dead_def_elim_with_live pre-RA DCE
-opt_regalloc_locations point-bitmap register allocator
-opt_lower_to_mir insert spill/reload, lower PReg → hard reg
-opt_mir_combine MIR-level peephole
-opt_mir_dce
-opt_mir_jump_cleanup(CFG) + opt_mir_build_cfg
-opt_mir_jump_cleanup(LAYOUT) block reorder + branch stripping
-opt_emit_native emit to NativeTarget
-```
+There is no per-function or per-op bypass.
---
-## §1 — Direct-replay bypass
-
-**Source**: `opt_func_needs_direct_replay` in src/opt/opt.c
-**Route**: → `opt_replay_cg_ir_direct` → raw CgTarget replay (no optimization)
+## Pipeline overview
-A function bypasses the optimizer entirely if it contains:
-- Aggregate or >8-byte params/results
-- `CG_IR_ASM_BLOCK`, `CG_IR_ALLOCA`, `CG_IR_INTRINSIC`
-- Varargs ops (`CG_IR_VA_*`)
-- Aggregate call arguments or results
+`opt_run_o1_native`, in order (verify/dump steps interleaved):
-**Before the refactor**: These functions went through the optimizer. The old
-`CGTarget` wrapper recorded them and replayed with `opt_emit`. The optimizer
-handled them with frame slots from the start. Post-refactor, they are entirely
-unoptimized — every function with an `alloca`, inline asm, or vararg is now
-compiled at -O0 quality regardless of the requested optimization level.
+```
+opt_build_cfg ┐ CFG build + cleanup
+opt_jump_cleanup(CFG) │
+opt_build_cfg │
+opt_simplify_local ┘
+opt_try_tiny_inline streaming inline of tiny callees recorded earlier
+opt_machinize_native ABI lowering (call/ret/param constraints)
+opt_addr_xform_pregs fold ADDR_OF(local) → OPK_LOCAL in load/store
+opt_promote_scalar_locals non-escaped frame slot → PReg
+opt_addr_of_global_cse hoist duplicate ADDR_OF(global) to entry
+opt_build_loop_tree loop nesting (informs spill cost)
+opt_hoist_loop_consts hoist single-def LOAD_IMM (imm!=0) in loops to entry
+opt_live_blocks per-block PReg liveness
+opt_dead_def_elim_with_live pre-RA DCE
+opt_regalloc_locations PReg → hard reg / spill slot
+opt_lower_to_mir materialize hard regs; insert spill/reload
+opt_mir_combine MIR peephole (substitute/addr-synth/sink/ext)
+opt_mir_dce post-RA DCE
+opt_mir_jump_cleanup(CFG) + build_cfg unreachable/jump-chain cleanup
+opt_mir_jump_cleanup(LAYOUT) reorder for fallthrough + rotate loops + invert
+opt_emit_native emit to NativeTarget (minimal prologue on opt path)
+```
-**What to fix**: The bypass is too conservative. Alloca, varargs, and asm-block
-functions can still benefit from register allocation and DCE on the non-alloca
-portions. The check should be narrowed to only bypass the parts of the pipeline
-that genuinely cannot handle those ops (currently none — `cg_ir_lower.c`
-should be extended to handle these).
+An `opt_verify` / `opt_mir_verify` runs after most stages (tags: `lowering-cfg`,
+`lowering-machinize`, `o1-addr-xform`, `o1-promote-scalar`,
+`o1-addr-global-cse`, `post-regalloc`, `lower-mir`, `post-mir-combine`,
+`post-mir-dce`, `post-mir-jump-cfg`). `CFREE_DUMP=<tag>` dumps the IR at the
+matching stage (`entry` before any pass, `pre-emit` just before emit); see
+Debugging aids.
---
-## §2 — opt_func_from_cg_ir (new pass, introduced in refactor)
-
-**Source**: src/opt/cg_ir_lower.c
-**Purpose**: Convert a recorded `CgIrFunc` into the `Func` IR that the optimizer
-works on.
-
-### Local classification
+## §1 — opt_func_from_cg_ir (CgIrFunc → Func)
-The critical function is `lower_locals` (cg_ir_lower.c:147). For each local:
+**Source**: src/opt/cg_ir_lower.c
-```c
-m->address_taken =
- local_needs_home(in) || local_address_used_in_cg_ir(l->src, in->id);
+Converts the recorded tape into the optimizer `Func` and classifies locals in
+`lower_locals`:
-if (m->address_taken) {
- m->storage.kind = CG_LOCAL_STORAGE_FRAME; // → OPK_LOCAL
-} else {
- m->storage.kind = CG_LOCAL_STORAGE_REG; // → OPK_REG (PReg)
- m->storage.v.reg = (Reg)r;
-}
-```
+- **Address-taken or memory-required** locals (`local_needs_home`:
+ `in->address_taken`, `CG_LOCAL_ADDR_TAKEN`, `CG_LOCAL_MEMORY_REQUIRED`), and
+ **aggregate / >8-byte** locals, get `CG_LOCAL_STORAGE_FRAME` → `OPK_LOCAL`.
+- All other locals get `CG_LOCAL_STORAGE_REG` → a fresh `PReg` → `OPK_REG`.
-`local_needs_home` returns true if:
-- `in->address_taken` — set by the recorder when `addr_of` is called on the local
-- `CG_LOCAL_ADDR_TAKEN | CG_LOCAL_MEMORY_REQUIRED` flags
+Address-taken locals start in frame storage; the addr-fold + promotion passes
+(§5, §6) recover register storage for those whose address does not truly escape.
-Address-taken locals get FRAME storage; all others get a PReg. The `opt_addr_xform_pregs`
-and `opt_promote_scalar_locals` passes are then responsible for recovering
-register storage for locals whose addresses don't truly escape.
+va_list operands are lowered as opaque pointer *values* (a `NativeLoc`), not
+flagged address-taken — all va-layout knowledge lives behind the `NativeTarget`
+va hooks. Aggregate locals are forced to frame.
-### Old approach (before refactor)
+---
-The old `w_local` handler in the CGTarget wrapper assigned storage differently:
+## §2 — opt_build_cfg / opt_jump_cleanup(CFG) / opt_simplify_local
-```c
-if (o->level < 2 &&
- (d->flags & (CG_LOCAL_ADDR_TAKEN | CG_LOCAL_MEMORY_REQUIRED)) == 0) {
- // REG storage — even if addr_of will be called later
-} else {
- // FRAME storage
-}
-```
+**Sources**: src/opt/pass_cfg.c, src/opt/pass_jump.c, src/opt/pass_simplify.c
-Address-taken status was determined conservatively at declaration time. Locals
-declared without `CG_LOCAL_ADDR_TAKEN` got register storage initially. When
-`addr_of` was called on them later, a `home_slot` was allocated alongside.
-After recording, `opt_frame_home_addr_taken_locals` ran as part of `w_func_end`
-and inserted explicit `IR_LOAD` / `IR_STORE` pairs around every use/def of the
-local's register to keep the frame slot in sync.
-
-This gave the optimizer explicit IR_LOAD/IR_STORE pairs to work with before
-running `opt_addr_xform_pregs` and `opt_promote_scalar_locals`. In the new
-code, address-taken locals go directly to FRAME storage and the optimizer must
-reconstruct the register-friendly form via the addr-fold and promotion passes.
-The promotion chain (§3 + §4) should in theory produce equivalent output, but
-depends on `opt_addr_xform_pregs` successfully classifying every `IR_ADDR_OF`
-use as non-escaping.
+Build the CFG, drop unreachable blocks and trivially-foldable branches, and fold
+single-successor chains. Runs before `opt_machinize_native` so ABI lowering sees
+an accurate live/critical-edge picture. `opt_build_cfg` runs twice with a jump
+cleanup between to settle the shape.
---
-## §3 — opt_build_cfg / opt_jump_cleanup(CFG) / opt_simplify_local
+## §3 — opt_machinize_native
-**Sources**: src/opt/pass_cfg.c
-**Purpose**: Build the CFG, remove unreachable blocks and trivially-foldable
-branches, and fold single-successor blocks into their predecessor.
+**Source**: src/opt/pass_machinize.c
-These passes are unchanged by the refactor and work the same as before.
-They must run before `opt_machinize_native` so that the ABI lowering has an
-accurate picture of which blocks are live and which are critical edges.
+ABI lowering against the `NativeTarget`. Annotates call/ret/param instructions
+with calling-convention constraints (argument/result registers, the call
+clobber mask, callee-save markers) so the register allocator knows which hard
+registers are killed across each call. Collects the target's register classes
+(`machinize_collect_regs`): the allocable set, the reserved scratch set, and the
+caller/callee-saved masks (`collect_class`), and checks allocable and scratch
+sets do not overlap (`machinize_check_overlap`).
---
-## §4 — opt_machinize_native
+## §4 — opt_addr_xform_pregs
-**Source**: src/opt/pass_machinize.c
-**Purpose**: ABI lowering. Inserts calling-convention constraints (scratch regs,
-clobbers, callee-save markers) as annotations on call/ret/param instructions so
-the register allocator knows which registers are killed across calls.
+**Source**: src/opt/pass_addr_fold.c
-The function was renamed from `opt_machinize(Func*, CGTarget*)` to
-`opt_machinize_native(Func*, NativeTarget*)` in the refactor. The body
-(`machinize_reset`, `machinize_prepare_insts`, `machinize_collect_regs`,
-`machinize_check_overlap`) is unchanged in structure. The only difference is
-the target type — `NativeTarget*` replaces `CGTarget*` — which affects which
-register metadata APIs are called to gather clobber and callee-save info.
+Folds `IR_ADDR_OF(local)` whose result pointer is used only as the base of
+non-observable zero-offset, no-index `IR_LOAD`/`IR_STORE`. When all uses
+qualify, the `IR_ADDR_OF` is removed and each use is rewritten to
+`OPK_LOCAL(slot)`. Partial folding leaves the `IR_ADDR_OF` alive for any
+effective-address-shaped uses. When a slot's `IR_ADDR_OF` defs are all retired,
+`FSF_ADDR_TAKEN` is cleared so §6 can promote it.
---
-## §5 — opt_addr_xform_pregs
+## §5 — opt_promote_scalar_locals
-**Source**: src/opt/pass_addr_fold.c
-**Purpose**: Fold `IR_ADDR_OF(local)` defs whose result pointer is used only as
-the base of non-observable `IR_LOAD` / `IR_STORE` with zero offset and no index.
-When all uses qualify, the `IR_ADDR_OF` is NOP'd and each use is rewritten to
-`OPK_LOCAL(slot)`. If only some uses qualify, partial folding leaves the
-`IR_ADDR_OF` alive for the EA-shaped uses.
+**Source**: src/opt/pass_addr_fold.c
-After folding, `FSF_ADDR_TAKEN` is cleared on any frame slot whose
-`IR_ADDR_OF` defs were all retired. This enables `opt_promote_scalar_locals`
-to promote the slot to a PReg.
+For each `FS_LOCAL` slot without `FSF_ADDR_TAKEN`/`FSF_VOLATILE` whose every
+`OPK_LOCAL(slot)` appearance is a matching-type non-observable `IR_LOAD.opnds[1]`
+or `IR_STORE.opnds[0]`, replaces the slot with a fresh `PReg`: stores become
+`IR_COPY preg, src`, loads become `IR_COPY dst, preg`. This pulls scalar locals
+out of memory into the register namespace before allocation.
-**How to be fast**: One forward pass per `IR_ADDR_OF` candidate. The function
-is scanned once per candidate to classify all uses, then rewritten in a second
-scan. This is O(candidates × insts), but in practice the number of surviving
-non-folded `IR_ADDR_OF` defs is small.
+**Limitation (perf):** `promote_inst_classify` rejects a slot if `OPK_LOCAL`
+appears anywhere else (call aux, `IR_ADDR_OF` operand, any other instruction).
+An address-taken local whose address is stored into a pointer and later
+dereferenced is not promoted (would need ADDR_OF copy-propagation through
+PRegs); genuinely aliased locals correctly stay in memory.
---
-## §6 — opt_promote_scalar_locals
+## §6 — opt_addr_of_global_cse
-**Source**: src/opt/pass_addr_fold.c
-**Purpose**: For each `FS_LOCAL` frame slot without `FSF_ADDR_TAKEN` or
-`FSF_VOLATILE`, and whose every `OPK_LOCAL(slot)` appearance is either:
-- `IR_LOAD.opnds[1]` — a matching-type, non-observable load, or
-- `IR_STORE.opnds[0]` — a matching-type, non-observable store,
+**Source**: src/opt/pass_addr_fold.c
-replace the slot with a fresh PReg: each store becomes `IR_COPY preg, src`
-and each load becomes `IR_COPY dst, preg`. The slot becomes unreferenced.
+CSE for `IR_ADDR_OF(global{sym, addend})`: when the same (sym, addend) appears
+in multiple defs, materialize one in the entry block (after the
+`IR_PARAM_DECL` prologue) and reuse it, collapsing per-iteration `adrp`/`add`
+pairs in loops.
+
+---
-**Critical limitation**: `promote_inst_classify` rejects a slot if `OPK_LOCAL(slot)`
-appears anywhere other than `IR_LOAD.opnds[1]` or `IR_STORE.opnds[0]`. In
-particular, an `OPK_LOCAL` appearing in a call aux (as an argument or result
-storage), in an `IR_ADDR_OF` operand, or in any other instruction, blocks
-promotion entirely for that slot.
+## §7 — opt_build_loop_tree
-This creates a coverage gap with the new `opt_func_from_cg_ir`: when a local
-is address-taken, `lower_operand_value` returns `OPK_LOCAL` for every VALUE use
-of that local (not just load/store address uses). If `opt_addr_xform_pregs` did
-not fold away all `IR_ADDR_OF` defs for the local (because at least one use
-escaped), the slot keeps `FSF_ADDR_TAKEN` and `opt_promote_scalar_locals` skips
-it entirely.
+**Source**: src/opt/pass_loop.c
-In the old code, the load/store pairs around the local's register were always
-IR_LOAD / IR_STORE instructions — exactly the form that `opt_promote_scalar_locals`
-can handle — and the only frame-slot operands visible to the pass were those
-load/store addresses.
+Computes loop nesting depth, used as a spill-cost weight by the register
+allocator (values live across deep loops are preferred for hard registers).
---
-## §7 — opt_addr_of_global_cse
+## §7.5 — opt_hoist_loop_consts
-**Source**: src/opt/pass_addr_fold.c
-**Purpose**: CSE for `IR_ADDR_OF(global{sym, addend})`. When the same
-(sym, addend) pair appears in two or more `IR_ADDR_OF` defs, replace all with a
-single def materialized in block 0 (after any `IR_PARAM_DECL` prologue). This
-collapses the per-iteration `adrp`/`add` pair in tight loops.
+**Source**: src/opt/pass_addr_fold.c
-Unchanged by the refactor.
+LICM-lite for constant materialization, mirroring `opt_addr_of_global_cse`'s
+hoist-to-entry pattern for `LOAD_IMM`. For each distinct `(imm, type, cls)`
+that appears inside a loop block (`loop_depth > 0`), allocates a canonical
+PReg, emits one `LOAD_IMM` in the entry block (after `IR_PARAM_DECL`s), NOPs
+the in-loop and any other defs of that key, and remaps every use to the
+canonical PReg. Without this the backend re-materializes the constant
+(`movz`/`movk`) on every iteration.
+
+Safety: only hoists `LOAD_IMM` whose def PReg is **single-defined** (the
+LOAD_IMM is its only def). The PReg form here is non-SSA, so a mutable promoted
+local initialized to a constant (e.g. a loop counter `i = 1`) is multiply
+defined; remapping it would discard the increment defs and break the loop.
+Skips `imm == 0` so the backend's store-zero-register path (see §14) keeps the
+cheaper `strb wzr, …` form for stored zeros.
---
## §8 — opt_live_blocks + opt_dead_def_elim_with_live
-**Sources**: src/opt/pass_analysis.c, src/opt/pass_lower.c
-**Purpose**: Dataflow liveness over the PReg namespace; pre-RA dead definition
-elimination.
-
-`opt_live_blocks` computes per-block liveness bitmaps (one bit per PReg) using
-backward dataflow. `opt_dead_def_elim_with_live` sweeps each block backward:
-any instruction all of whose defined PRegs are dead at its output and which has
-no side effects is removed.
-
-Running DCE before register allocation reduces the number of live ranges and
-the number of allocable PRegs, which keeps the point-bitmap allocator fast.
+**Sources**: src/opt/pass_live.c, src/opt/pass_lower.c
-Unchanged by the refactor.
+`opt_live_blocks` computes per-block PReg liveness bitmaps via backward
+dataflow. `opt_dead_def_elim_with_live` sweeps each block backward and removes
+any side-effect-free instruction whose defined PRegs are all dead at its output.
+Running before allocation shrinks live ranges and the allocable set.
---
## §9 — opt_regalloc_locations
-**Source**: src/opt/pass_analysis.c
-**Purpose**: Point-bitmap linear-scan register allocator. Allocates each PReg
-to either a hard register or a spill slot.
+**Source**: src/opt/pass_lower.c
-Uses a `used_locs[p * loc_words + w]` bitmap (one row per PReg, one column-word
-per location) to track which (PReg, location) pairs conflict. For each PReg in
-program order, the allocator scans the point bitmaps to find the first
-non-conflicting hard register; if none is available, assigns a spill slot.
-
-Unchanged by the refactor.
+Point-bitmap linear-scan allocator. A `used_locs[p * loc_words + w]` bitmap
+tracks (PReg, location) conflicts; for each PReg in program order it picks the
+first non-conflicting hard register, else a spill slot. The allocable set is the
+target's `allocable` list (aa64: caller-saved first so they're preferred, then
+callee-saved under pressure). It never assigns a reserved scratch register.
---
## §10 — opt_lower_to_mir
-**Source**: src/opt/pass_lower.c
-**Purpose**: Rewrite the PReg-namespace Func into a hard-register MIR Func.
-Every PReg operand is replaced with the hard register or spill slot assigned by
-`opt_regalloc_locations`. Spilled PRegs get `IR_LOAD` (reload) instructions
-inserted before each use and `IR_STORE` (spill) instructions inserted after
-each def.
-
-The output `f->mir` is a `MFunc` containing the rewritten blocks. The original
-`f->blocks` is also updated to point to the rewritten instruction arrays.
+**Source**: src/opt/pass_lower.c
-Unchanged by the refactor.
+Rewrites the PReg `Func` into a hard-register MIR `MFunc` (`f->mir`): every PReg
+operand becomes its assigned hard register or spill slot. Spilled PRegs get an
+`IR_LOAD` (reload) before each use and an `IR_STORE` (spill) after each def.
+`f->blocks` is updated to the rewritten arrays. From here the IR is non-SSA
+(physical registers, multiply defined).
---
## §11 — opt_mir_combine
-**Source**: src/opt/pass_combine.c
-**Purpose**: MIR-level peephole. Four rewrites run in a per-block fixpoint:
-
-1. **Substitute**: propagate a trivial copy `r1 ← copy r2` through later uses
- of `r1`, removing redundant register-to-register moves.
-
-2. **Addr-mode synthesis**: fold `r ← addr_of global` + `load/store r` into a
- direct `load/store [global]` where the backend supports it.
-
-3. **Sink**: move a def instruction forward in the block to be adjacent to its
- sole use, reducing live range pressure.
-
-4. **Ext-chain**: fold sign/zero extensions that feed into an instruction that
- already performs the extension as part of its semantics.
-
-Also runs `opt_combine_compact_block` to merge adjacent spill pairs.
-
-Unchanged by the refactor.
+**Source**: src/opt/pass_combine.c
+
+MIR peephole, per-block fixpoint, four rewrites:
+
+1. **Substitute** — propagate a copy `r1 ← r2` (or `r1 ← imm` / convert) into
+ later uses of `r1`, removing redundant moves. The reg-source form is full
+ local copy propagation: it rewrites every safe use of `r1` to `r2` (gated
+ only on `r2` being unchanged between the copy and the consumer, which
+ `ctx_def_changed_since` already verifies with call clobbers folded in); the
+ stricter single-use + live-out gate is kept only for the immediate/convert
+ forms, which would otherwise duplicate a multi-instruction materialization.
+2. **Addr-mode synthesis** — fold `r ← addr_of`/`r ← base+index*scale` into a
+ load/store's `OPK_INDIRECT` base/index/scale/offset where the backend
+ accepts it.
+3. **Sink** — move a def adjacent to its sole use to shrink its live range.
+4. **Ext-chain** — fold ext-of-ext and ext feeding an instruction that already
+ extends.
+
+`opt_combine_compact_block` additionally merges adjacent spill/reload pairs.
+
+Because the MIR is non-SSA, each rewrite is gated on a **live-range safety
+check**: `count_uses_in_live_range` counts uses of the producer's register up to
+its next redefinition in the block, and a producer that is **live-out of the
+block** (per `opt_block_live_out_has_phys_reg`, backed by the `hard_live`
+analysis) is not folded away. Register kills are computed from each
+instruction's precise def/clobber set (`inst_kills_phys_reg` →
+`opt_hard_inst_use_def`), so a call kills only its caller-saved clobbers — a
+value in a callee-saved register survives it.
---
## §12 — opt_mir_dce
-**Source**: src/opt/pass_dce.c
-**Purpose**: Post-RA dead definition elimination over hard-reg IR. Removes
-instructions that define registers that are never used, provided they are
-side-effect-free. Runs after `opt_mir_combine` to clean up any copy chains
-that were fully substituted away.
+**Source**: src/opt/pass_dce.c
-Unchanged by the refactor.
+Post-RA dead-def elimination over hard-register IR: removes side-effect-free
+instructions whose defined registers are never used, cleaning up copy chains
+substitution left behind. Uses the same `hard_live` machinery as §11.
---
-## §13 — opt_mir_jump_cleanup(CFG) + opt_mir_build_cfg + opt_mir_jump_cleanup(LAYOUT)
+## §13 — opt_mir_jump_cleanup(CFG) + build_cfg + jump_cleanup(LAYOUT)
-**Source**: src/opt/pass_cfg.c
-**Purpose**:
-- `OPT_JUMP_CLEANUP_CFG`: Remove unreachable blocks and collapse chains of
- unconditional jumps.
-- `opt_mir_build_cfg`: Recompute the CFG successor lists.
-- `OPT_JUMP_CLEANUP_LAYOUT`: `cleanup_reorder_for_fallthrough` — greedy chain
- extension that orders blocks so that the most common taken-branch edge
- becomes a fallthrough (eliminating the branch instruction); then strips
- redundant unconditional jumps to the textual successor.
+**Source**: src/opt/pass_jump.c, src/opt/pass_cfg.c
-Unchanged by the refactor.
+- `OPT_JUMP_CLEANUP_CFG`: drop unreachable blocks, collapse unconditional-jump
+ chains.
+- `opt_mir_build_cfg`: recompute successor lists.
+- `OPT_JUMP_CLEANUP_LAYOUT`: `cleanup_reorder_for_fallthrough` greedily orders
+ blocks so the hot taken edge falls through, then `cleanup_rotate_loops`
+ detects simple single-latch loops (header with `IR_CMP_BRANCH`, fallthrough
+ into the body chain, one back-edge predecessor later in emit order) and
+ moves the header to just after its latch, inverting the test in place. The
+ back-edge becomes the conditional branch and the latch falls through to the
+ header — the per-iteration unconditional back-jump is gone. `cleanup_invert_
+ taken_fallthrough` + `cleanup_layout_fallthrough_branches` then strip
+ trailing `b`-to-next-block and invert any remaining mis-aligned conditional.
---
-## §14 — opt_emit_native (replaces old opt_emit)
-
-**Source**: src/opt/pass_native_emit.c
-**Old source (deleted)**: src/opt/pass_emit.c
-
-### Old implementation: `opt_emit` / `replay_func_to`
-
-`opt_emit` replayed the optimized MIR through the wrapped `CGTarget` using
-`replay_func_to(identity=1)`. The `identity_regs` flag meant that PReg ids
-from the optimized IR were used directly as `Reg` values when calling the
-backend. The key mechanism:
-
-- **`xlat_storage`**: Checked `opt_preg_alloc_kind(f, pr)` to map a PReg's
- storage to either `CG_LOCAL_STORAGE_REG` (hard register) or
- `CG_LOCAL_STORAGE_FRAME` (spill slot).
-- **`plan_hard_regs` / `reserve_hard_regs`**: Scanned the replayed IR once to
- collect all hard register used, then called the wrapped backend's
- `plan_hard_regs` / `reserve_hard_regs` hooks. This allowed the native
- backend's frame layout to know the full set of callee-saved registers before
- emitting the prologue.
-- **`func_begin_known_frame`**: If the target supported it, the frame layout
- (slot sizes, alignments, max outgoing call stack) was computed upfront and
- passed to the backend in a single `func_begin_known_frame` call. Otherwise
- fell back to the streaming `frame_slot` / `func_begin` protocol.
-
-### New implementation: `opt_emit_native`
-
-`opt_emit_native` replays the MIR into a `NativeTarget` using `NativeLoc` as
-the currency (not `Reg`). The key differences:
-
-- **`loc_from_operand`**: Converts an `OptOperand` to a `NativeLoc` —
- `OPT_OPK_REG` → `NATIVE_LOC_REG`, `OPT_OPK_LOCAL` (frame slot) →
- `NATIVE_LOC_FRAME`, `OPT_OPK_IMM` → `NATIVE_LOC_IMM`, etc.
-- **`map_frame_slots`**: Pre-allocates all frame slots by calling
- `target->frame_slot` for each `IRFrameSlot`. Frame slot mapping is done
- upfront via a single scan, similar to the old `known_frame` path.
-- **`materialize`**: Loads a non-register `NativeLoc` into a scratch register
- when a register operand is required (e.g., for call callee address).
-- **`legalize_addr`**: After constructing a `NativeAddr`, calls
- `target->addr_legal` to check if the address shape is supported, and if not,
- collapses it to a base register via a `load_addr`.
-
-The `plan_hard_regs` / `reserve_hard_regs` hooks are not called in the new
-implementation. This means the native backend must infer callee-saved register
-usage from the frame layout pass rather than receiving an explicit list. Whether
-this is correct depends on the backend.
+## §14 — opt_emit_native
+
+**Source**: src/opt/pass_native_emit.c
+
+Replays the MIR into a `NativeTarget` using `NativeLoc` as the operand currency
+(not raw `Reg`). Key helpers:
+
+- **`loc_from_operand`** — `OptOperand` → `NativeLoc`: `OPT_OPK_REG` →
+ `NATIVE_LOC_REG`, `OPT_OPK_LOCAL` → `NATIVE_LOC_FRAME`, `OPT_OPK_IMM` →
+ `NATIVE_LOC_IMM`, `OPT_OPK_INDIRECT` → `NATIVE_LOC_ADDR`, etc.
+- **`reserve_callee_saves`** — scans the lowered MIR for the callee-saved
+ registers the allocator actually used and hands them to the backend's
+ `reserve_callee_saves` hook *before* frame-slot mapping, so the prologue saves
+ exactly that set.
+- **`map_frame_slots`** — pre-allocates every `IRFrameSlot` via
+ `target->frame_slot` in one scan.
+- **`emit_prologue` (minimal prologue)** — once the callee-save set and frame
+ slots are known the opt path calls this hook on backends that support it
+ (aa64) and sets `target->emit_minimal_prologue = 1` so `func_begin` skips its
+ worst-case NOP reservation. The backend emits an exact-size contiguous
+ prologue in place (no reserved region); the frame-size immediates are still
+ patched in `func_end` because the final frame isn't known until body emission
+ allocates its temporaries. `NativeDirectTarget` (O0) leaves `emit_prologue`
+ unset and keeps the reserve-and-patch path, which now branches over the
+ unused tail rather than NOP-filling it.
+- **call result routing** — for scalar return values `emit_call` now passes the
+ MIR's destination location (a register or its spill slot) straight to
+ `plan_call`, so the backend moves the ABI result register directly to its
+ destination. The old code routed every scalar result through a fresh temp
+ spill slot (`bl; stur [slot]; ldur`), a round trip on every call; aggregates /
+ oversized results still use the temp slot path.
+- **store-zero register** — when the value operand of an `IR_STORE` is the
+ integer immediate 0 and the backend advertises a hardware zero register
+ (`has_store_zero_reg` + `store_zero_reg`), the store goes straight from that
+ register instead of materializing `movz w,0` into a scratch first
+ (`strb wzr, …` on aa64).
+- **tail-call realizability** — `opt_on_tail_call_unrealizable_reason` returns
+ a reason when the callee's outgoing stack-arg bytes exceed the area this
+ function received for its own incoming stack args (via the backend's
+ `signature_stack_bytes` hook). CG then falls back to an ordinary call +
+ return for an ALLOWED tail, or panics for `musttail`.
+- **`materialize`** — loads a non-register `NativeLoc` into a reserved scratch
+ register when an instruction needs a register operand.
+- **`legalize_addr` / `collapse_addr_to_reg`** — if `target->addr_legal`
+ rejects an address shape, materialize it into a **reserved scratch register**
+ via `load_addr` (never the base register — the base may hold a value the
+ allocator keeps live past this op).
---
-## §15 — O0 path
-
-At `opt_level == 0`, `opt_cgtarget_new` is not called. The CgTarget returned by
-`backend->make` is used directly. Before the refactor, the native backends were
-CGTarget implementations (`CGTarget`-style). After `edbd83e` (aa64: delete old
-CGTarget backend) and related commits, the native backends use `NativeDirectTarget`.
-
-The O0 regression (cfree O0 is 3.25x slower than gcc-15 -O0 as a geomean across
-9 benchmarks) is therefore entirely in `NativeDirectTarget`, which is documented
-as a work-in-progress single-pass lowering with many known limitations.
+## Backend contract (`NativeTarget`, aa64)
+
+The emit step depends on these invariants; performance work that touches the
+allocator or emit must preserve them.
+
+- **2-scratch discipline.** An emit hook may use at most the **2 reserved
+ scratch registers** as private temporaries (aa64 int `x9`/`x10`; the hand-
+ written-PLT temps `x16`/`x17` = TMP0/TMP1 are reserved separately and used by
+ address-materialization helpers). Every other register an op needs — operand
+ bases, results — is **provided by the caller**. The optimizer owns all
+ allocation; ops clobber nothing the allocator tracks live.
+- **Allocable set (aa64).** Int: caller-saved `x8, x11..x15` first (preferred,
+ low spill cost), then callee-saved `x19..x28` under pressure. FP: caller-saved
+ `v18, v19` then callee-saved `v8..v15`. Scratch: int `{x9, x10}`, FP
+ `{v20, v21}` — disjoint from allocable. **Argument registers `x0..x7`/`v0..v7`
+ are *not* allocable** (see limitations).
+- **No `plan_hard_regs`/`reserve_hard_regs`.** Because only caller-saved (plus
+ optionally-reserved callee-saved) registers are allocable and every emit hook
+ receives fully-resolved hard regs, the backend never needs an up-front
+ hard-reg plan. Callee-save planning happens through `reserve_callee_saves`.
+- **Shared cores.** aa64 ops are factored so the semantic-operand path
+ (`NativeDirectTarget`, O0) and the `NativeLoc` path (this pipeline, O1) call
+ one shared core; the wrappers only convert operands.
---
-## Summary of regressions introduced by f60a16d
-
-### Regression A — Bypassed functions (O1 quality loss)
-
-Functions with alloca, varargs, inline asm, or aggregate types now bypass all
-optimization via `opt_replay_cg_ir_direct`. Before the refactor, these went
-through the full optimizer. This is incorrect: alloca functions should still
-benefit from register allocation on their non-alloca code; the bypass is a
-conservative workaround for unimplemented cases in `cg_ir_lower.c`.
+## §15 — O0 path (boundary)
-### Regression B — Local classification round-trip (O1 quality loss)
-
-The new `cg_ir_lower.c` classifies address-taken locals as FRAME storage from
-the start, rather than starting them as register storage and inserting
-explicit load/store pairs (as `opt_frame_home_addr_taken_locals` did). The
-downstream passes `opt_addr_xform_pregs` + `opt_promote_scalar_locals` are
-supposed to recover register storage for non-escaping locals, but:
-
-1. `opt_promote_scalar_locals` rejects a slot if `OPK_LOCAL` appears anywhere
- outside `IR_LOAD.opnds[1]` or `IR_STORE.opnds[0]`. But `lower_operand_value`
- emits `OPK_LOCAL` for ALL value uses of an address-taken local — including
- call args and binop operands — not only through explicit load/store.
-
-2. Any local whose `IR_ADDR_OF` use is not fully foldable by `opt_addr_xform_pregs`
- (because the pointer truly escapes, or because of EA-shaped uses) retains
- `FSF_ADDR_TAKEN` and is never promoted.
-
-The net effect: locals that are address-taken but whose address doesn't truly
-escape (very common in C — temporaries, loop iteration variables, short-lived
-pointers) stay in FRAME storage at O1 and incur memory traffic that the old
-optimizer eliminated.
-
-### Regression C — Missing plan_hard_regs / reserve_hard_regs (potential correctness issue)
-
-The old `replay_func_to` called `target->plan_hard_regs` and `target->reserve_hard_regs`
-with the complete set of hard registers used by the function. These hooks let the
-native backend plan its callee-save region before emitting the prologue. The new
-`opt_emit_native` does not call these hooks. If any backend relies on them for
-correct callee-save handling, this is a correctness issue, not just a quality
-issue.
-
-### Regression D — O0 path replaced by NativeDirectTarget
-
-Not introduced by f60a16d itself, but by the series of commits ending at
-`edbd83e` (delete old CGTarget backend). The O0 regression is a NativeDirectTarget
-limitation, separate from the optimizer.
+At `opt_level == 0` the optimizer is not installed; the backend's
+`NativeDirectTarget` (src/cg/native_direct_target.c) emits directly in a single
+pass. It is a separate, simpler lowering with its own (documented) limitations
+and is not part of this pipeline.
---
-## Remediation priorities
-
-1. **Fix local classification** (§2 / Regression B): Either restore the
- `opt_frame_home_addr_taken_locals` approach (start as REG, insert sync
- pairs lazily), or extend `opt_promote_scalar_locals` to handle the
- `OPK_LOCAL`-in-value-position case. The simpler fix is to not use
- `OPK_LOCAL` as a value operand in `lower_operand_value` for address-taken
- locals; instead emit `IR_LOAD(preg, frame_slot)` inline, matching what
- `opt_frame_home_addr_taken_locals` used to produce.
-
-2. **Restore bypass functions to optimizer** (Regression A): Extend
- `cg_ir_lower.c` to handle the cases currently routed to
- `opt_replay_cg_ir_direct`. At minimum, alloca and non-aggregate vararg
- functions should go through the optimizer.
+## Debugging aids
-3. **Wire plan_hard_regs / reserve_hard_regs** (Regression C): Survey the
- aarch64 / x64 / rv64 native backends to check whether they implement these
- hooks and what happens when they are not called.
+- `CFREE_DUMP=<tag>` — dump the optimizer IR at a named stage and panic
+ (`entry`, `pre-emit`, or any `opt_verify` tag above). `CFREE_DUMP=1` dumps at
+ `entry`. Add an `opt_dbg_dump(o, f, "<tag>")` call next to an existing
+ `opt_verify` to expose another stage.
+- `CFREE_DUMPCG=1` — dump the semantic CG IR tape (pre-lowering). Note the CG-IR
+ dumper does not print `OPK_INDIRECT` offsets.
+- Both panic on the first recorded function; to dump all functions, temporarily
+ swap the `compiler_panic` in `opt_dbg_dump` / `opt_dbg_dump_cg` for
+ `cfree_debug_printf`.
+- Disassembly: `cfree objdump -d <obj>` or `lldb -b -o "disassemble -n <fn>"`.
---
-# Recovery Project: goal, decisions, and checklist
-
-This section tracks the active work to **fully recover O1 on the new
-`CgIrFunc` → `NativeTarget` path and delete the direct-replay path entirely**.
-It supersedes the speculative remediation notes above where they conflict.
-
-## Goal / success criteria
-
-1. **Completeness (priority 1):** every function compiles through the
- optimizer pipeline. No `opt_replay_cg_ir_direct`. No per-op bypass.
-2. **Correctness (priority 2):** `CFREE_OPT_LEVELS=1 CFREE_TEST_PATHS=R
- ./test/toy/run.sh` fully green, and the default test suites stay green.
-3. **Performance (priority 3):** O1 vs `gcc-15 -O0` (baseline in
- `doc/OPT_PERF.md`): compile-time ≈5x faster, runtime ≈2x faster (geomean).
- Measure only from a **clean `RELEASE=1` build** (the dev build is
- ASan/UBSan-instrumented and not representative).
-
-Scope: **aa64 only** (the only `NativeTarget` implementation; host arch here).
-x64/rv64 `NativeTarget` ports are out of scope.
-
-## Design decisions
-
-- **Arch/ABI neutrality at the optimizer boundary.** The optimizer (cg_ir_lower,
- pass_native_emit, machinize) must make no ABI/layout assumptions. For varargs
- it passes the va_list *pointer* opaquely (a `NativeLoc`) and the `va_arg`
- type; all layout knowledge (pointer ABI vs AAPCS64 register-save-area, field
- offsets, sizes) lives behind the `NativeTarget` va hooks, which query
- `abi_va_list_layout`. Same principle applies to asm and aggregates.
-- **Backend scratch discipline.** A `NativeTarget` op may use at most the **2
- reserved scratch registers** (aa64 `x16`/`x17` = TMP0/TMP1) as private
- temporaries. Every *other* register it needs (operand bases, results) must be
- **provided by the caller**, so the optimizer owns allocation/planning and the
- op clobbers nothing the register allocator tracks live. This removed any need
- for a va clobber-set; it is the contract new hooks must follow.
-- **plan/reserve_hard_regs are unnecessary (Regression C is closed).** The aa64
- `NativeTarget` exposes only caller-saved registers as allocable
- (`aa_int_allocable = {x8,x11..x15}`, scratch `{x9,x10}`), so the allocator
- never assigns a callee-saved register and the prologue correctly saves only
- FP/LR. Every emit hook already receives fully-resolved hard regs. The hooks
- are not wired and will not be. The *consequence* is a small allocable set
- (perf item below), not a correctness gap.
-- **Shared cores, two thin wrappers.** aa64 backend ops are factored so the
- semantic-operand path (`NativeDirectTarget`, -O0) and the `NativeLoc` path
- (optimizer, O1) call one shared core; the wrappers only convert operands.
-
-## Checklist
-
-Integration test (drives this work):
-`CFREE_OPT_LEVELS=1 CFREE_TEST_PATHS=R ./test/toy/run.sh`
-(The `CFREE_NO_DIRECT_REPLAY` gate and the direct-replay path are now deleted —
-every function goes through the optimizer unconditionally, so no gate is
-needed.)
-
-Completeness — route all ops through the optimizer:
-- [x] **Varargs** — `va_start_/va_arg_/va_end_/va_copy_` hooks on `NativeTarget`;
- aa64 shared cores (2-temp discipline); cg_ir_lower lowers va operands as
- pointer *values*; `local_address_used_in_cg_ir` no longer flags va
- operands as address-taken.
-- [x] **Latent `IR_ADDR_OF` spill-writeback bug** in `pass_native_emit.c`
- (ADDR_OF result computed into scratch was never stored back).
-- [x] **Inline asm** (`IR_ASM_BLOCK`) — DONE. `NativeTarget` `asm_block` binds
- the optimizer's pre-allocated operand registers to the template (no
- self-allocation; inputs are already live in their regs and outputs are
- consumed via use/def data flow). aa64 asm clobber-mask / callee-save /
- restore helpers refactored off `NativeDirectTarget` onto `AANativeTarget`.
- Toy cases 102,104,105,108,110,19,20 all pass.
-- [x] **Aggregates / sret / byval** — DONE. Aggregate locals forced to frame;
- per-part ABI typing in plan_call/plan_ret; aggregate results via
- copy_bytes; aggregate-typed IR_COPY/IR_LOAD/IR_STORE via copy_bytes; sret
- x8 set after argument loads (was clobbering an arg allocated to x8).
- 124/130/36/37 all pass.
-- [ ] **BREAK_TO / CONTINUE_TO + SCOPE cond** — currently unused by frontends
- (toy/c lower break/continue to `BR`+labels), but unwired in emit. Either
- lower them to CFG edges in cg_ir_lower or wire emit, for true
- completeness once a producer exists.
-
-Direct-path deletion:
-- [x] Delete `opt_func_needs_direct_replay`, `opt_replay_cg_ir_direct`, the
- `OptReplay` machinery, and the `replay_*` helpers in `opt.c`. DONE — every
- function now goes through the optimizer (opt.c 680 → 269 lines).
-- [x] Remove the `CFREE_NO_DIRECT_REPLAY` env gate. DONE.
-
-Performance (priority 3, after completeness + correctness):
-- [x] **Expand aa64 allocable set — callee-saved** — DONE. x19..x28 and d8..d15
- are allocable (caller-saved stay first so they're preferred; callee-saved
- chosen only under pressure). `pass_native_emit` scans the lowered MIR for
- the callee-saved regs the allocator used and passes them to a new
- `reserve_callee_saves` `NativeTarget` hook before frame-slot mapping; the
- aa64 backend reserves save slots first (small FP-relative offsets), saves
- them in the back-patched prologue and restores them in the epilogue and
- the tail-call patch. FP class gained a `callee_saved_mask` (d8..d15).
-- [ ] **Expand aa64 allocable set — arg registers** — make x0..x7 / v0..v7
- allocable so values (params, call args, temporaries) can live in them.
- No plan yet. Key finding from the attempt (reverted): there is no safe
- "entry-only" subset — making arg regs allocable simultaneously exposes a
- parallel-move / shuffle hazard in *four* ABI emission paths (call-argument
- setup, tail-call argument setup, function entry / param materialization,
- and multi-result return). A "forbid arg-reg sources at calls/returns +
- let each param keep only its own incoming arg reg" subset got the bypass
- probe from 14→11 toy R-O1 failures but newly broke 3 previously-passing
- tests, i.e. the subset is unsound. Correct approach: one general
- parallel-move-with-memory sequencer (cycle-break via reserved scratch;
- x0..x7/v0..v7 currently excluded so sources and destinations are disjoint)
- applied uniformly to all four paths, gated by permutation/swap stress
- tests. The across-call correctness safety net already exists
- (`rewrite_call_save_one` saves caller-saved live-across-call values keyed
- on the clobber mask, which includes arg regs).
-- [~] **Local classification (Regression B)** — param round-trip CLOSED:
- `bind_param` now receives the allocator-chosen destination `NativeLoc`, so
- a register-allocated scalar param moves straight from its incoming arg
- register into its hard register (no frame store+reload); the
- `IR_PARAM_DECL` marker emits nothing. Remaining gap: address-taken locals
- whose address is stored into a pointer variable then dereferenced are not
- promoted (needs ADDR_OF copy-propagation through PRegs); genuinely aliased
- locals correctly stay in memory. `opt_addr_xform_pregs` +
- `opt_promote_scalar_locals` already handle direct `ADDR_OF(local)` →
- load/store base folding.
-- [ ] **Unit tests** — new targeted tests for the `CgIrFunc`→`NativeTarget`
- path (local promotion, addr-fold, regalloc, lowered bypass ops);
- re-enable a `test-opt` make target (old `test/opt/opt_test.c` is disabled
- and uses the pre-refactor API).
-- [ ] **Benchmark** — clean `RELEASE=1` build, run `make bench-opt`, confirm the
- O1-vs-`gcc-15 -O0` targets; refresh `doc/OPT_PERF.md`.
-
-## Progress log
-
-- Varargs landed end-to-end on the optimizer path; `IR_ADDR_OF` writeback fixed.
- Bypass-disabled R-path failures: 14 → 11. Default R-path (O0+O1): 408/408.
- (commit "opt: route varargs through optimizer path; fix ADDR_OF spill writeback")
-- Aggregate ABI, partial (commit "opt: aggregate ABI lowering ... (partial)"):
- - Force aggregate / >8-byte locals to frame in `cg_ir_lower` `lower_locals`
- (a 16-byte struct result local was being allocated to a single PReg).
- - Type each ABI part by its own width in `aa_plan_call`/`aa_plan_ret` direct
- paths via `aa_part_scalar_type` (was using the aggregate type → truncating
- `mov w0,w9` for i64 fields).
- - `emit_call`/`emit_ret`: aggregate/oversized results use `copy_bytes` / hand
- `plan_ret` the value's memory location directly (no scalar temp copy).
- - Result: no more "scalar too large" panics; sret no longer truncates. But
- values still wrong (130→0, 124→40, 36/37→160); still 11 bypass-off failures.
-
-- Aggregate COPY/LOAD/STORE via `copy_bytes` (commit "opt: handle
- aggregate-typed COPY/LOAD/STORE via byte copy"): the root cause of the
- truncated/mis-offset aggregate moves was that `IR_COPY`/`IR_LOAD`/`IR_STORE`
- on aggregate-typed operands were emitted as scalar moves. 130 and 124 now
- pass. Bypass-off R-path failures: 11 → 9 (7 asm + 2 tail-sret). Default
- R-path: 408/408.
-
-- Inline asm + direct-path deletion (commit "opt: route inline asm through
- optimizer; delete direct-replay path"): `asm_block` `NativeTarget` hook binds
- pre-allocated operand registers; all 7 asm cases pass bypass-off. With asm
- done the bypass-off R-path reached 0 failures (tail-sret had already been
- fixed), so the entire direct-replay path + `CFREE_NO_DIRECT_REPLAY` gate were
- deleted (opt.c 680 → 269 lines). Every function now goes through the
- optimizer. Full toy suite (R/L/C/W × O0/O1/O2): 1333 pass, 0 fail, 8 skip.
-
-- Callee-saved registers allocable (commit "aa64: make callee-saved registers
- allocable at O1"): x19..x28 / d8..d15 added to the allocable set with
- prologue/epilogue + tail-call save/restore via the new `reserve_callee_saves`
- hook. Verified O0==O1 on register-pressure int and fp programs.
-
-- Parameter-home elimination (commit "opt: route incoming params straight into
- their allocated register"): `bind_param` takes a destination `NativeLoc`;
- params no longer round-trip through a frame home. Deleted
- `allocate_param_home` / `local_home_for_preg` / `param_home_by_preg`.
-
-- Arg-register allocability attempted and reverted (see the unchecked perf item
- above for the finding): the "entry-only" subset is unsound; the change is
- indivisible and needs a general parallel-move sequencer across all four ABI
- paths. Tree left at the param-home-elimination commit, suite green.
-
-Debugging aids: `CFREE_NO_DIRECT_REPLAY=1 cfree cc -O1 -c <case>.toy` +
-`cfree objdump -d`; `CFREE_DUMP=1` / `CFREE_DUMPCG=1` dump optimizer/CG IR
-(they `compiler_panic` on the first recorded function — temporarily swap the
-panic in `opt_dbg_dump_cg`/`opt_dbg_dump` for `cfree_debug_printf` to dump all
-functions). Note the CG-IR dumper does not print INDIRECT `ofs`.
+## Known limitations affecting performance
+
+Feeds `doc/OPT_O1_PERF_TODO.md`. Recovery of the new pipeline is functionally
+complete (every function optimized; varargs/asm/aggregates handled; callee-saved
+allocable; params no longer round-trip through a frame home). Remaining
+quality gaps:
+
+- **Argument registers not allocable.** `x0..x7`/`v0..v7` cannot hold
+ allocator-managed values, shrinking the allocable set and forcing extra spills
+ in register-pressured code. An "entry-only" subset was attempted and reverted
+ as unsound: making arg regs allocable simultaneously exposes a parallel-move /
+ shuffle hazard across four ABI paths (call-arg setup, tail-call arg setup,
+ function entry / param materialization, multi-result return). The correct fix
+ is one general parallel-move-with-memory sequencer (cycle-break via reserved
+ scratch) applied uniformly to all four, gated by permutation/swap stress
+ tests.
+- **Address-taken-local promotion gap.** A local whose address is stored into a
+ pointer variable and later dereferenced stays in frame storage even when it
+ does not truly escape (§5). Needs ADDR_OF copy-propagation through PRegs.
+- **`BREAK_TO` / `CONTINUE_TO` + SCOPE cond** are unwired in emit (no current
+ frontend produces them — toy/C lower break/continue to `BR` + labels). Wire
+ emit or lower to CFG edges once a producer exists.
+- **Unit-test coverage** for the `CgIrFunc`→`NativeTarget` path (local
+ promotion, addr-fold, regalloc, lowered bypass ops) is thin; the old
+ `test/opt/opt_test.c` uses the pre-refactor API and is disabled.
diff --git a/doc/OPT_O1_PERF_TODO.md b/doc/OPT_O1_PERF_TODO.md
@@ -0,0 +1,91 @@
+# cfree -O1 performance TODO
+
+Working list of benchmarks where cfree's `-O1` codegen leaves clear runtime on
+the table. These are the high-confidence targets pulled from the full
+15-benchmark `scripts/opt_bench.sh` sweep (aarch64, Apple).
+
+## The bar
+
+cfree `-O1` runtime should be **at least 10% faster** than *both* reference
+points — i.e. cfree runtime ≤ 0.90× theirs (speedup ≥ 1.10×):
+
+- **gcc-15 -O0** — an unoptimized native baseline; losing here means we trail
+ even a compiler doing no real optimization.
+- **mir-c2m -O1** — a fast lightweight JIT-class optimizer; a peer we should
+ match or beat.
+
+A benchmark stays on this list until it clears 1.10× against both (where the
+reference exists).
+
+## Current standings
+
+Numbers below are from a single-run sweep (`COMPILE_REPEATS=1`,
+`RUN_REPEATS=1`), runtime in ms; speedup = reference_time / cfree_time
+(>1 means cfree is faster). The four entries here were chosen because their
+gaps are large enough to be real signal rather than single-run noise.
+
+| bench | cfree -O1 | gcc -O0 | vs gcc-O0 | mir -O1 | vs mir-O1 | behind |
+| --- | ---: | ---: | ---: | ---: | ---: | --- |
+| binary-trees | 3238 | 2562 | **0.79×** (slower) | n/a¹ | — | gcc |
+| lists | 8417 | 8583 | 1.02× | 4907 | **0.58×** | gcc, mir |
+| hash2 | 5965 | 7110 | 1.19× ✓ | 3765 | **0.63×** | mir |
+| sieve | 5066 | 4977 | 0.98× | 4107 | **0.81×** | gcc, mir |
+
+¹ mir-c2m fails to compile `binary-trees` in our setup, so only the gcc
+comparison applies there.
+
+## Per-benchmark notes
+
+### binary-trees — slower than unoptimized gcc (highest priority)
+The only case where cfree `-O1` is *slower than gcc -O0* (0.79×). Workload is
+allocation- and pointer-chasing-heavy (recursive tree build/walk). Likely
+suspects: call-heavy inner loops, per-node field access codegen, and whatever
+we emit around the allocator calls. Start here.
+
+### lists — 0.58× vs mir
+Matches gcc-O0 (1.02×) but loses badly to mir. Doubly-linked-list traversal and
+splice: lots of `p->next`/`p->prev` loads/stores in tight loops. The recently
+fixed O1 store-address clobber bug lived in this same code shape; correctness is
+now fine, but the field-access/loop codegen is still far behind mir. Compare our
+loop bodies against mir's to find the gap.
+
+### hash2 — 0.63× vs mir
+Clears the gcc bar (1.19×) but is the second-worst against mir. Open-addressing
+hash probing — integer hashing arithmetic plus indexed loads in the probe loop.
+Look at index/address computation and whether the probe loop carries redundant
+materialization.
+
+### sieve — 0.81× vs mir, ~tied with gcc
+Behind both. Classic nested sieve loop over a byte/array — bounded counted loops
+with strided stores. Suggests loop/IV and array-addressing codegen headroom
+(strength reduction, address-mode folding in the inner store).
+
+## Reproducing
+
+```sh
+# Build the optimized compiler first (clean release):
+rm -rf build/release && make RELEASE=1 bin
+
+# Run just these four, single-run, O0+O1, gcc + cfree + mir:
+CFREE="$PWD/build/release/cfree" \
+CFREE_OPT_BENCHES="binary-trees lists hash2 sieve" \
+CFREE_OPT_BENCH_LEVELS="0 1" \
+CFREE_OPT_BENCH_COMPILE_REPEATS=1 CFREE_OPT_BENCH_RUN_REPEATS=1 \
+CFREE_OPT_BENCH_SKIP_GCC=0 CFREE_OPT_BENCH_SKIP_CLANG=1 CFREE_OPT_BENCH_SKIP_MIR=0 \
+bash scripts/opt_bench.sh
+```
+
+Per-iteration codegen for a single function is easiest to inspect via the
+optimizer's staged IR dump (`CFREE_DUMP=pre-emit cfree cc -O1 -c bench.c ...`,
+which panics after printing the pre-emit MIR) plus `objdump`/`lldb`
+disassembly of the hot function.
+
+## Notes / caveats
+
+- Single-run timings: the four here were picked for gap size, but re-run with
+ repeats before/after a change to confirm movement (especially anything that
+ lands near the 1.0× line, e.g. `sieve`).
+- Not on this list but also marginal (within single-run noise, 0.92–1.05×):
+ `hash`, `funnkuch-reduce`, `strcat`. Revisit once the four above improve.
+- `cfree-run` (JIT) shares this codegen, so its runtimes track `cfree cc -O1`;
+ fixing these helps both paths.
diff --git a/src/arch/aa64/isa.h b/src/arch/aa64/isa.h
@@ -839,6 +839,7 @@ static inline u32 aa64_ldp64_pre(u32 Rt, u32 Rt2, u32 Rn, i32 imm7_scaled) {
.Rt = Rt});
}
+
/* ====================================================================
* Hint instructions (NOP / YIELD / WFE / WFI / SEV / SEVL)
* 1101 0101 0000 0011 0010 CRm(4) op2(3) 11111
@@ -972,6 +973,28 @@ static inline AA64LdStPSOff aa64_ldstp_soff_unpack(u32 w) {
return f;
}
+/* 64-bit integer STP/LDP, signed offset (no writeback). imm7_scaled is
+ * byte_offset / 8. Used for the prologue/epilogue frame record and callee-save
+ * pairs, which address off a fixed base (x17 / fp). */
+static inline u32 aa64_stp64_soff(u32 Rt, u32 Rt2, u32 Rn, i32 imm7_scaled) {
+ return aa64_ldstp_soff_pack((AA64LdStPSOff){.opc = 2,
+ .V = 0,
+ .L = 0,
+ .imm7 = (u32)imm7_scaled & 0x7fu,
+ .Rt2 = Rt2,
+ .Rn = Rn,
+ .Rt = Rt});
+}
+static inline u32 aa64_ldp64_soff(u32 Rt, u32 Rt2, u32 Rn, i32 imm7_scaled) {
+ return aa64_ldstp_soff_pack((AA64LdStPSOff){.opc = 2,
+ .V = 0,
+ .L = 1,
+ .imm7 = (u32)imm7_scaled & 0x7fu,
+ .Rt2 = Rt2,
+ .Rn = Rn,
+ .Rt = Rt});
+}
+
/* ====================================================================
* Load/store, unscaled 9-bit signed offset (LDUR / STUR, V=0 and V=1).
* size(2) 111 V(1) 00 opc(2) 0 imm9(9) 00 Rn(5) Rt(5)
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -117,6 +117,7 @@ typedef struct AANativeTarget {
u32 func_start;
u32 prologue_pos;
+ u32 minimal_prologue_words; /* opt path: exact prologue length, else 0 */
MCLabel epilogue_label;
AACalleeSave callee_saves[AA_MAX_CALLEE_SAVES];
@@ -804,10 +805,11 @@ static void aa_emit_variadic_reg_saves(AANativeTarget* a) {
aa_emit_q_frame(a, 0, r, a->va_vr_slot, r * vai.fp_slot_size);
}
+static void aa_emit_entry_saves(AANativeTarget* a);
+
static void aa_func_begin(NativeTarget* t, const CGFuncDesc* fd) {
AANativeTarget* a = aa_of(t);
MCEmitter* mc = t->mc;
- const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fd->fn_type);
a->func = fd;
a->nslots = 0;
a->cum_off = AA_FRAME_SAVE_SIZE;
@@ -829,8 +831,24 @@ static void aa_func_begin(NativeTarget* t, const CGFuncDesc* fd) {
mc_begin_function(mc, fd->sym, fd->text_section_id, a->func_start);
if (mc->cfi_startproc) mc->cfi_startproc(mc);
a->prologue_pos = mc->pos(mc);
- for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) aa_emit32(mc, 0xd503201fu);
+ a->minimal_prologue_words = 0;
a->epilogue_label = mc->label_new(mc);
+ /* Optimizer path: emit nothing here. The exact-size prologue and the
+ * sret/variadic entry saves are emitted later by aa_emit_prologue, once the
+ * callee-save set and frame slots are known. The single-pass path reserves a
+ * worst-case region (patched in func_end) and emits the entry saves now. */
+ if (t->emit_minimal_prologue) return;
+ for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) aa_emit32(mc, 0xd503201fu);
+ aa_emit_entry_saves(a);
+}
+
+/* Emit the sret-pointer save (x8 → slot) and, for variadic functions, the
+ * argument register-save area. Run immediately after the prologue frame setup
+ * on both the single-pass path (from func_begin) and the optimizer path (from
+ * aa_emit_prologue). */
+static void aa_emit_entry_saves(AANativeTarget* a) {
+ NativeTarget* t = &a->base;
+ const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, a->func->fn_type);
if (abi && abi->has_sret) {
NativeFrameSlotDesc sd;
NativeAddr addr;
@@ -901,20 +919,14 @@ static void aa_reserve_callee_saves(NativeTarget* t, const u32* used,
}
static MemAccess aa_mem_for_type(NativeTarget* t, CfreeCgTypeId type, u32 size);
+static void aa_words_callee_saves(AANativeTarget* a, int save, u32* words,
+ u32 cap, u32* n);
static void aa_emit_callee_restores(AANativeTarget* a) {
- MemAccess mem;
- for (u32 i = a->ncallee_saves; i > 0; --i) {
- const AACalleeSave* cs = &a->callee_saves[i - 1u];
- NativeAddr addr;
- NativeLoc reg = aa_reg_loc(cs->type, (NativeAllocClass)cs->cls, cs->reg);
- memset(&addr, 0, sizeof addr);
- addr.base_kind = NATIVE_ADDR_BASE_FRAME;
- addr.base.frame = cs->slot;
- addr.base_type = cs->type;
- mem = aa_mem_for_type(&a->base, cs->type, 8);
- aa_emit_mem(a, 1, reg, addr, mem);
- }
+ u32 words[AA_PROLOGUE_WORDS];
+ u32 n = 0;
+ aa_words_callee_saves(a, 0, words, AA_PROLOGUE_WORDS, &n);
+ for (u32 i = 0; i < n; ++i) aa_emit32(a->base.mc, words[i]);
}
static void aa_words_load_imm(AANativeTarget* a, u32* words, u32 cap, u32* n,
@@ -972,49 +984,110 @@ static void aa_words_saved_pair_addr(AANativeTarget* a, u32* words, u32 cap,
static void aa_words_restore_frame(AANativeTarget* a, u32* words, u32 cap,
u32* n, u32 frame_size) {
if (!frame_size) return;
- if (*n + 4u > cap) aa_panic(a, "instruction patch too small");
+ if (*n + 3u > cap) aa_panic(a, "instruction patch too small");
words[(*n)++] = aa64_add_imm(1, AA_TMP0, AA_FP, 0, 0);
- words[(*n)++] = aa_ldur_v(3, 0, AA_FP, AA_TMP0, -16);
- words[(*n)++] = aa_ldur_v(3, 0, AA_LR, AA_TMP0, -8);
+ words[(*n)++] = aa64_ldp64_soff(AA_FP, AA_LR, AA_TMP0, -2); /* fp,lr @ -16 */
words[(*n)++] = aa64_add_imm(1, AA_SP, AA_TMP0, 0, 0);
}
-static void aa_patch_prologue(AANativeTarget* a, u32 frame_size) {
- u32 words[AA_PROLOGUE_WORDS];
+/* Emit callee-save store (save=1) or restore (save=0) words into `words`,
+ * pairing adjacent integer registers into a single stp/ldp. reserve_callee_saves
+ * allocates consecutive 8-byte slots in order, so callee_saves[i] sits 8 bytes
+ * above callee_saves[i+1]; for an int pair the lower-addressed reg[i+1] is the
+ * stp's Rt and reg[i] is Rt2. FP registers (and an unpaired trailing int) use
+ * the single-register stur/ldur form. */
+static void aa_words_callee_saves(AANativeTarget* a, int save, u32* words,
+ u32 cap, u32* n) {
+ for (u32 i = 0; i < a->ncallee_saves;) {
+ const AACalleeSave* cs = &a->callee_saves[i];
+ i32 off = -(i32)aa_slot(a, cs->slot)->off;
+ if (off < -256 || off > 255)
+ aa_panic(a, "callee-save offset out of prologue range");
+ if (i + 1u < a->ncallee_saves && cs->cls == (u8)NATIVE_REG_INT &&
+ a->callee_saves[i + 1u].cls == (u8)NATIVE_REG_INT) {
+ const AACalleeSave* cs2 = &a->callee_saves[i + 1u];
+ i32 off2 = -(i32)aa_slot(a, cs2->slot)->off; /* off - 8, lower address */
+ if (*n >= cap) aa_panic(a, "prologue too large");
+ words[(*n)++] = save
+ ? aa64_stp64_soff(cs2->reg, cs->reg, AA_FP, off2 / 8)
+ : aa64_ldp64_soff(cs2->reg, cs->reg, AA_FP, off2 / 8);
+ i += 2u;
+ } else {
+ u32 v = cs->cls == (u8)NATIVE_REG_FP ? 1u : 0u;
+ if (*n >= cap) aa_panic(a, "prologue too large");
+ words[(*n)++] = save ? aa_stur_v(3, v, cs->reg, AA_FP, off)
+ : aa_ldur_v(3, v, cs->reg, AA_FP, off);
+ i += 1u;
+ }
+ }
+}
+
+/* Build the prologue instruction words for `frame_size` into `words` (capacity
+ * `cap`), returning the count. Shared by the NativeDirectTarget patch path
+ * (reserves a fixed worst-case region, then patches it here) and the optimizer
+ * path (emits an exact-size region up front; see aa_emit_prologue). */
+static u32 aa_build_prologue_words(AANativeTarget* a, u32 frame_size, u32* words,
+ u32 cap) {
u32 n = 0;
+ if (!frame_size) return 0;
+ aa_words_sub_sp_frame(a, words, cap, &n, frame_size);
+ aa_words_saved_pair_addr(a, words, cap, &n, frame_size);
+ if (n >= cap) aa_panic(a, "prologue too large");
+ words[n++] = aa64_stp64_soff(AA_FP, AA_LR, AA_TMP1, 0); /* fp,lr @ [x17] */
+ aa_words_frame_ptr_from_sp(a, words, cap, &n, frame_size);
+ /* Save callee-saved registers the allocator used, FP-relative. Their slots
+ * were reserved first (aa_reserve_callee_saves), so offsets fit stur's
+ * signed-9-bit immediate. */
+ aa_words_callee_saves(a, 1, words, cap, &n);
+ return n;
+}
+
+/* Patch the reserved prologue region (`region` words at prologue_pos) with the
+ * real prologue for `frame_size`. Used by the NativeDirectTarget single-pass
+ * path, which reserves AA_PROLOGUE_WORDS up front before the frame is known.
+ * The optimizer path reserves exactly the words it needs, so `region` equals
+ * the real prologue length and no tail remains. */
+static void aa_patch_prologue(AANativeTarget* a, u32 frame_size, u32 region) {
+ u32 words[AA_PROLOGUE_WORDS];
+ u32 n;
ObjSecId sec = a->func->text_section_id;
+ if (region > AA_PROLOGUE_WORDS) aa_panic(a, "prologue region too large");
memset(words, 0, sizeof words);
- if (frame_size) {
- aa_words_sub_sp_frame(a, words, AA_PROLOGUE_WORDS, &n, frame_size);
- aa_words_saved_pair_addr(a, words, AA_PROLOGUE_WORDS, &n, frame_size);
- if (n + 2u > AA_PROLOGUE_WORDS) aa_panic(a, "prologue too large");
- words[n++] = aa_stur_v(3, 0, AA_FP, AA_TMP1, 0);
- words[n++] = aa_stur_v(3, 0, AA_LR, AA_TMP1, 8);
- aa_words_frame_ptr_from_sp(a, words, AA_PROLOGUE_WORDS, &n, frame_size);
- /* Save callee-saved registers the allocator used, FP-relative. Their slots
- * were reserved first (aa_reserve_callee_saves), so offsets fit stur's
- * signed-9-bit immediate. */
- for (u32 i = 0; i < a->ncallee_saves; ++i) {
- const AACalleeSave* cs = &a->callee_saves[i];
- i32 off = -(i32)aa_slot(a, cs->slot)->off;
- if (n >= AA_PROLOGUE_WORDS) aa_panic(a, "prologue too large");
- if (off < -256 || off > 255)
- aa_panic(a, "callee-save offset out of prologue range");
- words[n++] = aa_stur_v(3, cs->cls == (u8)NATIVE_REG_FP ? 1u : 0u, cs->reg,
- AA_FP, off);
- }
- }
- while (n < AA_PROLOGUE_WORDS) words[n++] = 0xd503201fu;
- for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i)
+ n = aa_build_prologue_words(a, frame_size, words, region);
+ /* If the real prologue is shorter than the reserved region (the worst-case
+ * NDT reservation), branch straight to the body rather than leaving the
+ * trailing slots as NOPs that fall through and execute on every call. */
+ if (n < region) {
+ words[n] = aa64_b(region - n);
+ for (u32 i = n + 1u; i < region; ++i) words[i] = 0xd503201fu;
+ }
+ for (u32 i = 0; i < region; ++i)
aa_patch32(a->base.obj, sec, a->prologue_pos + i * 4u, words[i]);
}
+/* Optimizer path: emit an exact-size prologue in place (no reserved NOP
+ * region). The callee-save set and the static frame slots are final by now, so
+ * the prologue's instruction count is fixed; only the frame-size immediates
+ * (sub sp / save-area address / fp = sp+frame) still depend on body-emitted
+ * temporaries and are patched in func_end. We size the region with a frame that
+ * fits add/sub's imm12 (the real frame must too, or func_end's rebuild — capped
+ * at this length — panics). The sret/variadic entry saves follow, as on the
+ * single-pass path. */
+static void aa_emit_prologue(NativeTarget* t) {
+ AANativeTarget* a = aa_of(t);
+ u32 words[AA_PROLOGUE_WORDS];
+ u32 est_frame = align_up_u32(a->cum_off + a->max_outgoing, 16u);
+ u32 n = aa_build_prologue_words(a, est_frame, words, AA_PROLOGUE_WORDS);
+ for (u32 i = 0; i < n; ++i) aa_emit32(t->mc, words[i]);
+ a->minimal_prologue_words = n;
+ aa_emit_entry_saves(a);
+}
+
static void aa_emit_restore_frame(AANativeTarget* a, u32 frame_size) {
MCEmitter* mc = a->base.mc;
if (!frame_size) return;
aa_emit32(mc, aa64_add_imm(1, AA_TMP0, AA_FP, 0, 0));
- aa_emit32(mc, aa_ldur_v(3, 0, AA_FP, AA_TMP0, -16));
- aa_emit32(mc, aa_ldur_v(3, 0, AA_LR, AA_TMP0, -8));
+ aa_emit32(mc, aa64_ldp64_soff(AA_FP, AA_LR, AA_TMP0, -2)); /* fp,lr @ -16 */
aa_emit32(mc, aa64_add_imm(1, AA_SP, AA_TMP0, 0, 0));
}
@@ -1030,19 +1103,12 @@ static void aa_patch_allocas(AANativeTarget* a) {
}
}
-/* Append FP-relative loads that restore the saved callee registers. Shared by
- * the tail-call patch; the function epilogue uses aa_emit_callee_restores. */
+/* Append FP-relative loads that restore the saved callee registers (stp/ldp
+ * paired, same as the prologue saves). Shared by the tail-call patch; the
+ * function epilogue uses aa_emit_callee_restores. */
static void aa_words_callee_restores(AANativeTarget* a, u32* words, u32 cap,
u32* n) {
- for (u32 i = a->ncallee_saves; i > 0; --i) {
- const AACalleeSave* cs = &a->callee_saves[i - 1u];
- i32 off = -(i32)aa_slot(a, cs->slot)->off;
- if (*n >= cap) aa_panic(a, "patch too small for callee restores");
- if (off < -256 || off > 255)
- aa_panic(a, "callee-save offset out of restore range");
- words[(*n)++] = aa_ldur_v(3, cs->cls == (u8)NATIVE_REG_FP ? 1u : 0u, cs->reg,
- AA_FP, off);
- }
+ aa_words_callee_saves(a, 0, words, cap, n);
}
static void aa_patch_tail_sites(AANativeTarget* a, u32 frame_size) {
@@ -1073,15 +1139,20 @@ static void aa_func_end(NativeTarget* t) {
AANativeTarget* a = aa_of(t);
MCEmitter* mc = t->mc;
u32 frame_size = align_up_u32(a->cum_off + a->max_outgoing, 16u);
+ /* Optimizer path emitted an exact-size prologue (minimal_prologue_words);
+ * the single-pass path reserved a fixed worst-case region. Either way the
+ * frame-size immediates are only final now, so patch the region in place. */
+ u32 prologue_region =
+ t->emit_minimal_prologue ? a->minimal_prologue_words : AA_PROLOGUE_WORDS;
mc->label_place(mc, a->epilogue_label);
aa_emit_callee_restores(a);
aa_emit_restore_frame(a, frame_size);
aa_emit32(mc, aa64_ret(AA_LR));
- aa_patch_prologue(a, frame_size);
+ aa_patch_prologue(a, frame_size, prologue_region);
aa_patch_allocas(a);
aa_patch_tail_sites(a, frame_size);
if (mc->cfi_set_next_pc_offset && mc->cfi_def_cfa && mc->cfi_offset) {
- mc->cfi_set_next_pc_offset(mc, AA_PROLOGUE_WORDS * 4u);
+ mc->cfi_set_next_pc_offset(mc, prologue_region * 4u);
mc->cfi_def_cfa(mc, AA_FP, 0);
mc->cfi_offset(mc, AA_FP, -16);
mc->cfi_offset(mc, AA_LR, -8);
@@ -1870,6 +1941,23 @@ static u32 aa_call_stack_size(NativeTarget* t, const NativeCallDesc* desc) {
return align_up_u32(stack, 16u);
}
+/* Stack-argument bytes a call with `fn_type`'s fixed parameters uses. Reuses
+ * aa_call_stack_size by routing the declared params through it (their ABI
+ * classification is independent of the actual operand locations, which
+ * aa_call_stack_size ignores for register/stack placement). */
+static u32 aa_signature_stack_bytes(NativeTarget* t, CfreeCgTypeId fn_type,
+ int* variadic, u32* nparams) {
+ const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fn_type);
+ NativeCallDesc d;
+ if (variadic) *variadic = abi ? (int)abi->variadic : 0;
+ if (nparams) *nparams = abi ? abi->nparams : 0u;
+ memset(&d, 0, sizeof d);
+ d.fn_type = fn_type;
+ d.nargs = abi ? abi->nparams : 0u;
+ if (d.nargs) d.args = arena_zarray(t->c->tu, NativeLoc, d.nargs);
+ return aa_call_stack_size(t, &d);
+}
+
static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc,
NativeCallPlan* plan) {
NativeCallPlanRet* rets;
@@ -2765,6 +2853,10 @@ NativeTarget* aa64_native_target_new(Compiler* c, ObjBuilder* obj,
t->func_begin_known_frame = aa_func_begin_known_frame;
t->note_frame_state = aa_note_frame_state;
t->reserve_callee_saves = aa_reserve_callee_saves;
+ t->emit_prologue = aa_emit_prologue;
+ t->signature_stack_bytes = aa_signature_stack_bytes;
+ t->has_store_zero_reg = 1;
+ t->store_zero_reg = 31u; /* wzr/xzr in the Rt position of a store */
t->func_end = aa_func_end;
t->frame_slot = aa_frame_slot;
t->bind_param = aa_bind_native_param;
diff --git a/src/arch/native_target.h b/src/arch/native_target.h
@@ -279,6 +279,31 @@ struct NativeTarget {
* up front. */
void (*reserve_callee_saves)(NativeTarget*, const u32* used_by_class,
u32 nclasses);
+ /* Optional. When set, the optimizer emit path calls this once — after
+ * func_begin, reserve_callee_saves, and frame-slot mapping, but before the
+ * body — to emit a minimal, exact-size prologue in place (no reserved NOP
+ * region). Frame-size immediates are still patched in func_end, since the
+ * final frame size isn't known until body emission allocates its temporaries.
+ * Backends that leave this NULL fall back to the single-pass reserve-and-patch
+ * prologue used by NativeDirectTarget. Gated by `emit_minimal_prologue`, which
+ * the optimizer emit path sets before func_begin so func_begin can skip the
+ * reserved region. */
+ void (*emit_prologue)(NativeTarget*);
+ u8 emit_minimal_prologue;
+ /* Bytes of stack-passed arguments the fixed parameters of this function
+ * signature use (the part beyond the register arg pools). Sets *variadic to
+ * whether the signature is variadic and *nparams to the fixed parameter
+ * count. Used to decide tail-call (sibling) realizability: the callee's
+ * outgoing stack args must fit the area the caller itself received. Either
+ * out-pointer may be NULL. May itself be NULL. */
+ u32 (*signature_stack_bytes)(NativeTarget*, CfreeCgTypeId fn_type,
+ int* variadic, u32* nparams);
+ /* Integer hardware zero register, if the ISA has one (aa64 wzr/xzr, rv64
+ * x0). When `has_store_zero_reg` is set, the emit path stores a constant 0
+ * straight from `store_zero_reg` instead of materializing 0 into a scratch
+ * with a mov/movz first. */
+ u8 has_store_zero_reg;
+ Reg store_zero_reg;
void (*func_end)(NativeTarget*);
NativeFrameSlot (*frame_slot)(NativeTarget*, const NativeFrameSlotDesc*);
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -23,8 +23,45 @@ typedef struct OptImpl {
NativeTarget* native;
int level;
Writer* dump_writer;
+ /* Registry of functions recorded so far, for tiny-inline callee lookup.
+ * `lowered_cache` is parallel to `cg_by_sym`: a lazily re-lowered
+ * pre-machinize Func for the callee, reused across all call sites. All on
+ * c->tu (persist for the whole translation unit). */
+ CgIrFunc** cg_by_sym;
+ Func** lowered_cache;
+ u32 ncg;
+ u32 cg_cap;
} OptImpl;
+/* Lazily re-lower (and cache) the pre-machinize Func for a recorded callee
+ * symbol. Returns NULL for forward-defined callees not yet recorded. */
+static Func* opt_tiny_callee_lookup(void* ctx, ObjSymId sym) {
+ OptImpl* o = (OptImpl*)ctx;
+ for (u32 i = 0; i < o->ncg; ++i) {
+ if (o->cg_by_sym[i]->desc.sym != sym) continue;
+ if (!o->lowered_cache[i])
+ o->lowered_cache[i] = opt_func_from_cg_ir(o->c, o->cg_by_sym[i]);
+ return o->lowered_cache[i];
+ }
+ return NULL;
+}
+
+static void opt_registry_add(OptImpl* o, CgIrFunc* f) {
+ if (o->ncg == o->cg_cap) {
+ u32 ncap = o->cg_cap ? o->cg_cap * 2u : 16u;
+ CgIrFunc** ncg = arena_array(o->c->tu, CgIrFunc*, ncap);
+ Func** nlc = arena_zarray(o->c->tu, Func*, ncap);
+ if (o->ncg) {
+ memcpy(ncg, o->cg_by_sym, sizeof(ncg[0]) * o->ncg);
+ memcpy(nlc, o->lowered_cache, sizeof(nlc[0]) * o->ncg);
+ }
+ o->cg_by_sym = ncg;
+ o->lowered_cache = nlc;
+ o->cg_cap = ncap;
+ }
+ o->cg_by_sym[o->ncg++] = f;
+}
+
static void opt_dbg_dump(OptImpl* o, Func* f, const char* tag) {
extern char* getenv(const char*);
const char* s = getenv("CFREE_DUMP");
@@ -65,6 +102,21 @@ static void opt_run_o1_native(OptImpl* o, Func* f) {
metrics_scope_begin(o->c, "opt.cfg.simplify_local");
opt_simplify_local(f);
metrics_scope_end(o->c, "opt.cfg.simplify_local");
+
+ metrics_scope_begin(o->c, "opt.o1.tiny_inline");
+ int inlined = opt_try_tiny_inline(f, opt_tiny_callee_lookup, o);
+ metrics_scope_end(o->c, "opt.o1.tiny_inline");
+ if (inlined) {
+ /* inline_call_site invalidated CFG and left preds stale (it maintains
+ * succ + emit_order only); rebuilding is required before verify/machinize.
+ * Then merge the BR-glue chain (pre -> body -> cont) back into
+ * straight-line blocks so regalloc sees no artificial boundaries, mirroring
+ * the prologue's build_cfg -> jump_cleanup -> build_cfg idiom. */
+ opt_build_cfg(f);
+ opt_jump_cleanup(f, OPT_JUMP_CLEANUP_CFG);
+ opt_build_cfg(f);
+ }
+
metrics_scope_begin(o->c, "opt.cfg.verify");
opt_verify(f, "lowering-cfg");
metrics_scope_end(o->c, "opt.cfg.verify");
@@ -98,6 +150,10 @@ static void opt_run_o1_native(OptImpl* o, Func* f) {
metrics_scope_begin(o->c, "opt.build_loop_tree");
opt_build_loop_tree(f);
metrics_scope_end(o->c, "opt.build_loop_tree");
+ metrics_scope_begin(o->c, "opt.o1.hoist_loop_consts");
+ opt_hoist_loop_consts(f);
+ metrics_scope_end(o->c, "opt.o1.hoist_loop_consts");
+ opt_verify(f, "o1-hoist-loop-consts");
metrics_scope_begin(o->c, "opt.live_blocks.pre_dde");
memset(&live, 0, sizeof live);
@@ -171,6 +227,9 @@ static void opt_dbg_dump_cg(OptImpl* o, const CgIrFunc* f) {
static void opt_on_func(void* user, CgIrFunc* cg_func) {
OptImpl* o = (OptImpl*)user;
Func* f;
+ /* Register before lowering so later callers can resolve this (complete)
+ * function as a tiny-inline callee. */
+ opt_registry_add(o, cg_func);
opt_dbg_dump_cg(o, cg_func);
/* The dump writer renders the semantic CG IR tape — the IR as recorded,
* before lowering to the optimizer's CFG form. */
@@ -205,9 +264,31 @@ static int opt_on_local_static_data_begin(void* user,
static const char* opt_on_tail_call_unrealizable_reason(
void* user, const struct CGFuncDesc* caller, const CGCallDesc* call) {
- (void)user;
- (void)caller;
- (void)call;
+ OptImpl* o = (OptImpl*)user;
+ int callee_va = 0, caller_va = 0;
+ u32 callee_nparams = 0;
+ u32 outgoing, incoming;
+ if (!o || !o->native || !o->native->signature_stack_bytes) return NULL;
+ /* A realized sibling call reuses the area this function received for its own
+ * incoming stack arguments to hold the callee's outgoing stack arguments, so
+ * the callee's must fit. If they don't, the tail isn't realizable and CG
+ * falls back to an ordinary call + return. */
+ outgoing = o->native->signature_stack_bytes(o->native, call->fn_type,
+ &callee_va, &callee_nparams);
+ incoming = o->native->signature_stack_bytes(o->native, caller->fn_type,
+ &caller_va, NULL);
+ /* A variadic caller's incoming layout includes the register save / overflow
+ * area, which isn't a simple reusable parameter slab — don't realize. And if
+ * the call actually passes variadic (beyond-fixed) arguments, their stack
+ * footprint isn't captured by the fixed-parameter sizing above, so be
+ * conservative and fall back. A variadic callee invoked with only its fixed
+ * arguments (e.g. `musttail sink(x)`) is fine and uses the byte comparison. */
+ if (caller_va)
+ return "variadic caller cannot host a sibling call";
+ if (callee_va && call->nargs > callee_nparams)
+ return "variadic tail call arguments are not realizable as a sibling call";
+ if (outgoing > incoming)
+ return "tail call stack arguments exceed the caller's parameter area";
return NULL;
}
diff --git a/src/opt/opt.h b/src/opt/opt.h
@@ -29,6 +29,8 @@ void opt_promote_scalar_locals(Func*); /* O1: promote non-escaped scalar
frame slots to mutable PRegs. */
void opt_addr_of_global_cse(Func*); /* O1: hoist duplicate ADDR_OF(global)
defs to a single entry-block compute. */
+void opt_hoist_loop_consts(Func*); /* O1: hoist loop-invariant LOAD_IMM
+ materialization to the entry block. */
void opt_simplify_local(Func*);
void opt_simplify(Func*);
void opt_gvn(Func*); /* incl. constprop, redundant-load elim */
diff --git a/src/opt/opt_internal.h b/src/opt/opt_internal.h
@@ -23,6 +23,7 @@ typedef struct OptPassCtx {
const char* stage;
} OptPassCtx;
+typedef struct FuncSet FuncSet;
struct FuncSet {
Compiler* c;
Arena* arena;
@@ -157,6 +158,15 @@ u32 opt_call_clobber_mask_for(Func*, const Inst*, u8 cls);
int opt_inst_has_side_effect(Func*, const Inst*);
+/* Inlining. opt_inline is the whole-program driver (FuncSet over all retained
+ * funcs). opt_try_tiny_inline is the streaming O1 entry: it inlines tiny
+ * direct callees into one caller, resolving callee bodies through `lookup`
+ * (returns a fresh pre-machinize Func for a symbol, or NULL to skip). Returns
+ * the number of call sites inlined. */
+typedef Func* (*OptInlineCalleeLookup)(void* ctx, ObjSymId callee_sym);
+int opt_try_tiny_inline(Func* caller, OptInlineCalleeLookup lookup, void* ctx);
+void opt_inline(FuncSet*, int max_iters);
+
int opt_hard_empty(const OptHardRegSet*);
int opt_hard_intersects(const OptHardRegSet*, const OptHardRegSet*);
void opt_hard_live_step(OptHardRegSet* live, const OptHardRegSet* use,
diff --git a/src/opt/pass_addr_fold.c b/src/opt/pass_addr_fold.c
@@ -758,3 +758,176 @@ void opt_addr_of_global_cse(Func* f) {
opt_analysis_invalidate(
f, OPT_ANALYSIS_DEF_USE | OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP);
}
+
+/* LICM-lite for constant materialization. A LOAD_IMM has no inputs, so its
+ * result is loop-invariant and valid anywhere the entry block dominates (i.e.
+ * everywhere). When such a def lives inside a loop, the backend would otherwise
+ * re-materialize the constant (movz/movk) on every iteration; hoisting one copy
+ * to the entry block and reusing it removes that per-iteration work, mirroring
+ * opt_addr_of_global_cse for the `adrp`/`add` case.
+ *
+ * Scope/cost guards:
+ * - Only hoist constants that actually appear inside a loop (loop_depth > 0);
+ * non-loop constants gain nothing and would only lengthen live ranges.
+ * - Skip imm 0: the emit path stores a constant 0 from the hardware zero
+ * register (no materialization at all), which beats holding 0 in an
+ * allocated register across the loop.
+ *
+ * Must run after opt_build_loop_tree (reads block loop_depth) and before
+ * register allocation. */
+typedef struct ConstHoistEntry {
+ i64 imm;
+ CfreeCgTypeId type;
+ PReg canonical;
+ u8 cls;
+ u8 in_loop;
+} ConstHoistEntry;
+
+static u32 const_hoist_find_or_add(ConstHoistEntry** entries, u32* n, u32* cap,
+ Arena* arena, i64 imm, CfreeCgTypeId type,
+ u8 cls) {
+ for (u32 i = 0; i < *n; ++i) {
+ if ((*entries)[i].imm == imm && (*entries)[i].type == type &&
+ (*entries)[i].cls == cls)
+ return i;
+ }
+ if (*n == *cap) {
+ u32 ncap = *cap ? *cap * 2u : 16u;
+ ConstHoistEntry* nv = arena_array(arena, ConstHoistEntry, ncap);
+ if (*entries) memcpy(nv, *entries, sizeof(ConstHoistEntry) * (*n));
+ *entries = nv;
+ *cap = ncap;
+ }
+ u32 idx = (*n)++;
+ ConstHoistEntry* e = &(*entries)[idx];
+ memset(e, 0, sizeof *e);
+ e->imm = imm;
+ e->type = type;
+ e->cls = cls;
+ e->canonical = PREG_NONE;
+ return idx;
+}
+
+static void const_hoist_count_def(Func* f, Inst* in, Operand* op, int is_def,
+ void* ud) {
+ u32* counts = (u32*)ud;
+ (void)in;
+ if (!is_def || op->kind != OPK_REG) return;
+ PReg p = (PReg)op->v.reg;
+ if (opt_reg_valid(f, p)) ++counts[p];
+}
+
+/* True for a LOAD_IMM whose constant is a hoist candidate: a non-zero immediate
+ * defining a register that has exactly one def in the function. The single-def
+ * requirement is the safety condition for remapping every use of that register
+ * to one canonical entry def — the PReg form here is non-SSA, so a mutable
+ * local promoted by opt_promote_scalar_locals (e.g. a loop counter initialized
+ * to a constant) is multiply defined; remapping it would discard the other
+ * defs (the increment) and break the loop. */
+static int const_hoist_candidate(const Func* f, const Inst* in,
+ const u32* def_counts) {
+ if ((IROp)in->op != IR_LOAD_IMM || in->nopnds < 1) return 0;
+ if (in->opnds[0].kind != OPK_REG || in->extra.imm == 0) return 0;
+ PReg p = (PReg)in->opnds[0].v.reg;
+ return opt_reg_valid(f, p) && def_counts[p] == 1;
+}
+
+void opt_hoist_loop_consts(Func* f) {
+ if (!f || f->opt_reg_ssa || f->opt_rewritten) return;
+ if (f->nblocks == 0 || f->entry >= f->nblocks) return;
+
+ /* Count defs per PReg so const_hoist_candidate can reject multiply-defined
+ * registers (mutable promoted locals) — see its comment. */
+ u32* def_counts = arena_zarray(f->arena, u32, opt_reg_count(f));
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i)
+ opt_walk_inst_operands(f, &bl->insts[i], const_hoist_count_def,
+ def_counts);
+ }
+
+ /* Pass 1: index LOAD_IMM defs by (imm, type, cls); flag keys seen in a loop. */
+ ConstHoistEntry* entries = NULL;
+ u32 n_entries = 0, cap_entries = 0;
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ int in_loop = bl->loop_depth > 0;
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* in = &bl->insts[i];
+ if (!const_hoist_candidate(f, in, def_counts)) continue;
+ u32 idx = const_hoist_find_or_add(&entries, &n_entries, &cap_entries,
+ f->arena, in->extra.imm,
+ in->opnds[0].type, in->opnds[0].cls);
+ if (in_loop) entries[idx].in_loop = 1;
+ }
+ }
+
+ /* Pass 2: allocate a canonical PReg for each constant that appears in a loop. */
+ u32 dup_count = 0;
+ for (u32 i = 0; i < n_entries; ++i) {
+ if (!entries[i].in_loop) continue;
+ entries[i].canonical = ir_alloc_preg(f, entries[i].type, entries[i].cls);
+ ++dup_count;
+ }
+ if (!dup_count) return;
+
+ /* Pass 3: NOP every def of a hoisted constant, mapping its PReg to the
+ * canonical one (consolidates loop and non-loop occurrences alike). */
+ PReg* remap = arena_zarray(f->arena, PReg, opt_reg_count(f));
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* in = &bl->insts[i];
+ if (!const_hoist_candidate(f, in, def_counts)) continue;
+ u32 idx = const_hoist_find_or_add(&entries, &n_entries, &cap_entries,
+ f->arena, in->extra.imm,
+ in->opnds[0].type, in->opnds[0].cls);
+ if (entries[idx].canonical == PREG_NONE) continue;
+ PReg old = (PReg)in->opnds[0].v.reg;
+ if (opt_reg_valid(f, old)) remap[old] = entries[idx].canonical;
+ in->op = IR_NOP;
+ in->def = VAL_NONE;
+ in->ndefs = 0;
+ in->defs = NULL;
+ in->nopnds = 0;
+ in->opnds = NULL;
+ }
+ }
+
+ /* Pass 4: emit one LOAD_IMM per hoisted constant in the entry block, after
+ * any leading IR_PARAM_DECL prologue. */
+ Block* entry = &f->blocks[f->entry];
+ u32 insert_at = 0;
+ while (insert_at < entry->ninsts &&
+ (IROp)entry->insts[insert_at].op == IR_PARAM_DECL)
+ ++insert_at;
+ Inst* slot = block_insert_at(f, entry, insert_at, dup_count);
+ u32 w = 0;
+ for (u32 i = 0; i < n_entries; ++i) {
+ if (entries[i].canonical == PREG_NONE) continue;
+ Inst* in = &slot[w++];
+ in->op = (u16)IR_LOAD_IMM;
+ in->def = (Val)entries[i].canonical;
+ in->type = entries[i].type;
+ in->extra.imm = entries[i].imm;
+ in->nopnds = 1;
+ in->opnds = arena_array(f->arena, Operand, 1);
+ memset(&in->opnds[0], 0, sizeof(Operand));
+ in->opnds[0].kind = OPK_REG;
+ in->opnds[0].cls = entries[i].cls;
+ in->opnds[0].type = entries[i].type;
+ in->opnds[0].v.reg = entries[i].canonical;
+ ir_assign_inst_id(f, in);
+ }
+
+ /* Pass 5: apply remap to all operand uses (reuses the ADDR_OF-CSE walker,
+ * which also covers IR_CALL aux operands). */
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i)
+ addr_cse_apply_to_inst(&bl->insts[i], remap);
+ }
+
+ opt_analysis_invalidate(
+ f, OPT_ANALYSIS_DEF_USE | OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP);
+}
diff --git a/src/opt/pass_combine.c b/src/opt/pass_combine.c
@@ -778,17 +778,29 @@ static int try_substitute_for_reg(CombineCtx* ctx, Inst* in, i32 i, u8 cls,
src_op = prod->opnds[1];
}
- /* Single-use within the producer's live range: producer's dst must have
- * exactly one further use before the next redef/barrier in this block (the
- * consumer we're about to rewrite). If the live range terminates inside
- * this block, the producer's def is killed before any successor sees it,
- * so the cross-block live-out check can be skipped. */
- int killed = 0;
- int uses_total = count_uses_in_live_range(ctx->f, ctx->bl, prod_idx, &def, &killed);
- if (uses_total != 1) return 0;
- if (!killed && opt_block_live_out_has_phys_reg(ctx->f, ctx->hard_live,
- ctx->bl->id, &def))
- return 0;
+ /* For a register-source IR_COPY this is local copy propagation: rewriting
+ * one use of the copy's dest to its source is value-safe whenever the source
+ * is unchanged between the copy and this consumer — already verified by the
+ * ctx_def_changed_since check above (which treats CALL/ASM/INTRINSIC as a
+ * clobber of every register). The copy itself is not removed here; if all its
+ * uses fold away and it is not live-out, post-combine DCE deletes it. So no
+ * single-use or cross-block live-out restriction is required, which lets the
+ * O1 path collapse the multi-use reg->reg copy chains it would otherwise
+ * leave behind (it never runs the O2 coalescer).
+ *
+ * The immediate / convert producers don't propagate a register value, so
+ * they keep the stricter single-use + live-out gate: substituting them into
+ * every use would duplicate a (possibly multi-instruction) materialization
+ * rather than shorten a copy chain. */
+ if (kind != SK_REG) {
+ int killed = 0;
+ int uses_total =
+ count_uses_in_live_range(ctx->f, ctx->bl, prod_idx, &def, &killed);
+ if (uses_total != 1) return 0;
+ if (!killed && opt_block_live_out_has_phys_reg(ctx->f, ctx->hard_live,
+ ctx->bl->id, &def))
+ return 0;
+ }
/* O1 (no coalescing) folds immediates into IR_COPY; O2 leaves the copy
* register-to-register so coalescing + self-copy removal handles it. */
diff --git a/src/opt/pass_inline.c b/src/opt/pass_inline.c
@@ -36,6 +36,13 @@ typedef struct InlineOrderCtx {
#define INLINE_ABS_GROWTH_LIMIT 64u
#define INLINE_HINT_ABS_GROWTH_LIMIT 128u
+/* Streaming O1 tiny-inline: a much smaller budget than the whole-program
+ * inliner. DEFAULT/HINT callees must fit under this cost; ALWAYS bypasses it.
+ * Bounded by max passes since an inlined straightline body never introduces a
+ * new call site. */
+#define INLINE_TINY_COST_LIMIT 8u
+#define TINY_INLINE_MAX_PASSES 4u
+
static u32 func_inline_cost(Func* f) {
u32 cost = 0;
if (!f) return 0;
@@ -235,7 +242,7 @@ static void map_mem(InlineMap* m, MemAccess* mem) {
}
static Operand map_operand(InlineMap* m, Operand op) {
- switch ((OpKind)op.kind) {
+ switch ((OptOperandKind)op.kind) {
case OPK_REG:
op.v.reg = map_preg(m, (PReg)op.v.reg);
break;
@@ -692,3 +699,79 @@ void opt_inline(FuncSet* fs, int max_iters) {
if (!changed) break;
}
}
+
+/* Streaming single-caller variant for the O1 pipeline. Same gates as
+ * try_inline_call, minus the whole-program growth check (which needs a
+ * base_cost array the streaming path lacks), plus the tiny cost cap. */
+static int try_tiny_inline_call(FuncSet* fs, Func* caller, u32 b, u32 i) {
+ Inst* in = &caller->blocks[b].insts[i];
+ Func* callee = direct_callee(fs, in);
+ u32 cost = 0;
+ CfreeCgInlinePolicy policy;
+ if (!callee) return 0;
+ policy = effective_inline_policy(in, callee);
+ metrics_count(caller->c, "opt.tiny_inline.candidates", 1);
+ if (policy == CFREE_CG_INLINE_NEVER) {
+ metrics_count(caller->c, "opt.tiny_inline.refuse_policy", 1);
+ return 0;
+ }
+ if (recursive_or_scc(fs, caller, callee)) {
+ metrics_count(caller->c, "opt.tiny_inline.refuse_scc", 1);
+ return 0;
+ }
+ if (!callee_inline_shape(callee, policy, &cost)) {
+ metrics_count(caller->c, "opt.tiny_inline.refuse_shape", 1);
+ return 0;
+ }
+ if (policy != CFREE_CG_INLINE_ALWAYS && cost > INLINE_TINY_COST_LIMIT) {
+ metrics_count(caller->c, "opt.tiny_inline.refuse_budget", 1);
+ return 0;
+ }
+ if (!inline_rewrite_supported(callee, in)) {
+ metrics_count(caller->c, "opt.tiny_inline.refuse_rewrite_shape", 1);
+ return 0;
+ }
+ if (!inline_call_site(caller, b, i, callee)) {
+ metrics_count(caller->c, "opt.tiny_inline.refuse_rewrite", 1);
+ return 0;
+ }
+ metrics_count(caller->c, "opt.tiny_inline.inlined", 1);
+ return 1;
+}
+
+int opt_try_tiny_inline(Func* caller, OptInlineCalleeLookup lookup, void* ctx) {
+ int total = 0;
+ if (!caller || !lookup || caller->opt_reg_ssa || caller->opt_rewritten)
+ return 0;
+ for (u32 pass = 0; pass < TINY_INLINE_MAX_PASSES; ++pass) {
+ int changed = 0;
+ for (u32 b = 0; b < caller->nblocks && !changed; ++b) {
+ Block* bl = &caller->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* in = &bl->insts[i];
+ if ((IROp)in->op != IR_CALL) continue;
+ IRCallAux* aux = (IRCallAux*)in->extra.aux;
+ if (!aux || aux->desc.callee.kind != OPK_GLOBAL) continue;
+ Func* callee = lookup(ctx, aux->desc.callee.v.global.sym);
+ if (!callee) continue;
+ /* Ad-hoc 2-element FuncSet so the existing gates (direct_callee,
+ * recursive_or_scc) resolve the callee by symbol. */
+ Func* funcs[2] = {caller, callee};
+ FuncSet fs;
+ memset(&fs, 0, sizeof fs);
+ fs.c = caller->c;
+ fs.arena = caller->arena;
+ fs.funcs = funcs;
+ fs.nfuncs = (caller == callee) ? 1u : 2u;
+ fs.cap = fs.nfuncs;
+ if (try_tiny_inline_call(&fs, caller, b, i)) {
+ ++total;
+ changed = 1; /* indices invalidated by the split; restart scan */
+ break;
+ }
+ }
+ }
+ if (!changed) break;
+ }
+ return total;
+}
diff --git a/src/opt/pass_jump.c b/src/opt/pass_jump.c
@@ -317,6 +317,74 @@ static void cleanup_reorder_for_fallthrough(JumpCleanupCtx* c) {
}
}
+/* Rotate simple counted loops so the test sits at the bottom: the back-edge
+ * becomes the per-iteration conditional branch and the body falls through to
+ * the test, eliminating the unconditional back-jump the head-test shape emits
+ * every iteration.
+ *
+ * Pattern (after the greedy reorder):
+ *
+ * H (emit i): IR_CMP_BRANCH, succ[0]=E (exit, taken), succ[1]=B (body entry,
+ * the fallthrough, placed right after H by the greedy reorder)
+ * ... body chain ...
+ * L (emit k): the latch — a predecessor of H at a later emit position, i.e.
+ * the source of the loop's back-edge to H
+ *
+ * The rewrite moves H from position i to just after L (rotating the emit-order
+ * run [i..k] left by one) and inverts H's test in place: H's taken edge becomes
+ * the back-edge to the body entry (the per-iteration conditional) and its
+ * fallthrough becomes the exit E. No CFG edges move — reordering emit_order is
+ * always valid because pass_native_emit emits an explicit jump for any
+ * successor that isn't the textual next block. After the move L falls through
+ * to H (its back-edge `b H` is stripped), and the preheader keeps its now-
+ * forward `b H` (executed once, serving as the zero-trip guard); the
+ * unconditional back-jump every iteration is gone.
+ *
+ * Conservative guards: the body entry must be H's fallthrough and immediately
+ * follow H (the greedy chain), and H must have exactly one back-edge
+ * predecessor after it in emit order (reducible, single-latch loop). */
+static void cleanup_rotate_loops(JumpCleanupCtx* c) {
+ Func* f = c->f;
+ if (f->emit_order_n < 2u) return;
+ for (u32 i = 0; i + 1u < f->emit_order_n; ++i) {
+ u32 h = f->emit_order[i];
+ if (h >= f->nblocks) continue;
+ Block* H = &f->blocks[h];
+ if (!H->ninsts || H->nsucc != 2) continue;
+ Inst* hterm = &H->insts[H->ninsts - 1u];
+ if ((IROp)hterm->op != IR_CMP_BRANCH) continue;
+ u32 body = H->succ[1]; /* loop body entry (fallthrough) */
+ if (f->emit_order[i + 1u] != body) continue;
+ /* Latch: the single predecessor of H later in emit order (back-edge
+ * source). Zero or multiple => not a simple single-latch loop; skip. */
+ u32 latch_pos = BLOCK_NONE;
+ int ok = 1;
+ for (u32 p = 0; p < H->npreds; ++p) {
+ u32 pos = emit_order_index(c, H->preds[p]);
+ if (pos == BLOCK_NONE || pos <= i) continue;
+ if (latch_pos != BLOCK_NONE) {
+ ok = 0;
+ break;
+ }
+ latch_pos = pos;
+ }
+ if (!ok || latch_pos == BLOCK_NONE) continue;
+ CmpOp inverted;
+ if (!invert_cmp((CmpOp)hterm->extra.imm, &inverted)) continue;
+ u32 exit = H->succ[0];
+ for (u32 k = i; k < latch_pos; ++k) f->emit_order[k] = f->emit_order[k + 1u];
+ f->emit_order[latch_pos] = h;
+ hterm->extra.imm = inverted;
+ H->succ[0] = body; /* taken: back-edge to the loop body */
+ H->succ[1] = exit; /* fallthrough: loop exit */
+ for (u32 b = 0; b < f->nblocks; ++b) c->emit_index_by_block[b] = BLOCK_NONE;
+ for (u32 k = 0; k < f->emit_order_n; ++k) {
+ u32 bb = f->emit_order[k];
+ if (bb < f->nblocks) c->emit_index_by_block[bb] = k;
+ }
+ }
+}
+
static void cleanup_layout_fallthrough_branches(const JumpCleanupCtx* c) {
Func* f = c->f;
for (u32 b = 0; b < f->nblocks; ++b) {
@@ -426,6 +494,7 @@ void opt_jump_cleanup(Func* f, OptJumpCleanupStage stage) {
cleanup_branch_targets(&c);
} else if (stage == OPT_JUMP_CLEANUP_LAYOUT) {
cleanup_reorder_for_fallthrough(&c);
+ cleanup_rotate_loops(&c);
cleanup_invert_taken_fallthrough(&c);
cleanup_layout_fallthrough_branches(&c);
}
diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c
@@ -622,12 +622,25 @@ static void emit_call(NativeEmitCtx* e, Inst* in) {
for (u32 i = 0; i < aux->desc.nargs; ++i)
args[i] = abi_storage_loc(e, &aux->desc.args[i], in->loc);
if (aux->desc.ret.storage.kind) {
+ CfreeCgTypeId rty = aux->desc.ret.type;
+ int scalar = !cg_type_is_aggregate(e->c, rty) &&
+ type_size_or(e->c, rty, 8u) <= 8u;
results = arena_zarray(e->f->arena, NativeLoc, 1);
final_result = abi_storage_loc(e, &aux->desc.ret, in->loc);
- result_slot =
- temp_slot(e, aux->desc.ret.type, in->loc, NATIVE_FRAME_SLOT_SPILL);
- results[0] = loc_frame(aux->desc.ret.type,
- class_for_type(e, aux->desc.ret.type), result_slot);
+ /* Scalar result: hand plan_call the value's real destination (the MIR
+ * result reg, or its spill slot) directly, so it emits one move out of the
+ * ABI result register. Routing every scalar result through a fresh temp
+ * slot — store x0 then immediately reload — was a pure round trip on every
+ * call; emit_ret already avoids the analogous trip on returns. The temp
+ * slot is kept only for aggregate / oversized results, which plan_call /
+ * the callee write in parts and must land in memory. */
+ if (scalar && (final_result.kind == NATIVE_LOC_REG ||
+ final_result.kind == NATIVE_LOC_FRAME)) {
+ results[0] = final_result;
+ } else {
+ result_slot = temp_slot(e, rty, in->loc, NATIVE_FRAME_SLOT_SPILL);
+ results[0] = loc_frame(rty, class_for_type(e, rty), result_slot);
+ }
}
d.fn_type = aux->desc.fn_type;
d.callee = loc_from_operand(e, &aux->desc.callee, in->loc);
@@ -774,6 +787,14 @@ static void emit_inst(NativeEmitCtx* e, u32 block, u32 order_index, Inst* in,
addr = addr_from_operand(e, &in->opnds[0], in->loc);
legalize_addr(e, &addr, in->extra.mem, in->loc);
src = loc_from_operand(e, &in->opnds[1], in->loc);
+ /* Storing a constant 0 from the hardware zero register avoids
+ * materializing 0 into a scratch first (e.g. `strb wzr, [..]` rather than
+ * `movz w9,0; strb w9, [..]`). */
+ if (src.kind == NATIVE_LOC_IMM && src.v.imm == 0 &&
+ e->target->has_store_zero_reg &&
+ class_for_type(e, in->opnds[1].type) == NATIVE_REG_INT)
+ src = loc_reg(in->opnds[1].type, NATIVE_REG_INT,
+ e->target->store_zero_reg);
if (src.kind == NATIVE_LOC_REG && (src.v.reg == addr_base_reg(&addr) ||
src.v.reg == addr_index_reg(&addr))) {
NativeFrameSlot slot =
@@ -1339,9 +1360,16 @@ void opt_emit_native(Compiler* c, Func* f, NativeTarget* target) {
metrics_scope_end(c, "opt.native_emit.setup");
metrics_scope_begin(c, "opt.native_emit.func_begin");
+ /* The optimizer path knows the callee-save set and frame slots before the
+ * body, so the backend can emit an exact-size prologue here rather than
+ * reserving a worst-case NOP region patched at func_end. Signal this before
+ * func_begin (so it skips the reserved region) and emit the prologue once
+ * reserve_callee_saves + map_frame_slots have run. */
+ target->emit_minimal_prologue = target->emit_prologue != NULL;
target->func_begin(target, &fd);
reserve_callee_saves(&e);
map_frame_slots(&e);
+ if (target->emit_minimal_prologue) target->emit_prologue(target);
bind_params(&e);
metrics_scope_end(c, "opt.native_emit.func_begin");
diff --git a/test/opt/run.sh b/test/opt/run.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+# Behavioral check for the O1 tiny-function inliner.
+#
+# Compiles a toy program where __user_main calls the tiny helper add1(x)=x+1,
+# and asserts that at -O1 the `bl add1` is gone from __user_main (the helper
+# was inlined). Also confirms the program still computes the right value via
+# the in-process JIT.
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
+CFREE="${CFREE:-$ROOT/build/cfree}"
+SRC="$ROOT/test/toy/cases/135_inline_cleanup_quality.toy"
+WORK="$ROOT/build/test/opt/inline"
+mkdir -p "$WORK"
+
+"$CFREE" cc -target aarch64-linux-gnu -O1 -c "$SRC" \
+ -o "$WORK/x.o" > "$WORK/cc.out" 2>&1
+"$CFREE" objdump -d "$WORK/x.o" > "$WORK/dis.out" 2>&1
+
+# Slice out the __user_main function body.
+awk '
+ /^[0-9a-f]+ <__user_main>:/ { in_fn = 1; next }
+ /^[0-9a-f]+ </ { in_fn = 0 }
+ in_fn { print }
+' "$WORK/dis.out" > "$WORK/user_main.dis"
+
+if grep -Eq '\bbl\b.*add1' "$WORK/user_main.dis"; then
+ printf 'tiny-inline check FAILED: add1 was not inlined into __user_main:\n' >&2
+ sed 's/^/ | /' "$WORK/user_main.dis" >&2
+ exit 1
+fi
+
+# The inlined body must actually be present (an add of the constant), i.e. the
+# function is more than just a tail jump.
+if ! grep -Eq '\badd\b' "$WORK/user_main.dis"; then
+ printf 'tiny-inline check FAILED: inlined add1 body missing from __user_main:\n' >&2
+ sed 's/^/ | /' "$WORK/user_main.dis" >&2
+ exit 1
+fi
+
+# Runtime correctness through the in-process JIT: main returns add1(41) == 42.
+for opt in -O0 -O1; do
+ if "$CFREE" run "$opt" -e main "$SRC" > /dev/null 2>&1; then
+ rc=0
+ else
+ rc=$?
+ fi
+ if [ "$rc" -ne 42 ]; then
+ printf 'tiny-inline runtime check FAILED at %s: exit %d (want 42)\n' \
+ "$opt" "$rc" >&2
+ exit 1
+ fi
+done
+
+printf 'tiny-inline: ok\n'
diff --git a/test/opt/tiny_inline_test.c b/test/opt/tiny_inline_test.c
@@ -0,0 +1,333 @@
+/* Unit tests for the O1 tiny-function inliner (opt_try_tiny_inline).
+ *
+ * Each case builds a callee and a caller as semantic CG IR via the recorder,
+ * lowers both to the optimizer's pre-machinize Func form with
+ * opt_func_from_cg_ir, then drives opt_try_tiny_inline with a lookup that
+ * resolves the callee symbol. We assert on whether the IR_CALL was replaced. */
+
+#include <cfree/cg.h>
+#include <cfree/core.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "cg/ir.h"
+#include "cg/ir_recorder.h"
+#include "opt/opt.h"
+#include "opt/opt_internal.h"
+
+#undef Operand
+#undef CGFuncDesc
+#undef CGParamDesc
+#undef CGCallDesc
+#undef CGLocalStorage
+
+static void* h_alloc(CfreeHeap* h, size_t n, size_t a) {
+ (void)h;
+ (void)a;
+ return n ? malloc(n) : NULL;
+}
+
+static void* h_realloc(CfreeHeap* h, void* p, size_t o, size_t n, size_t a) {
+ (void)h;
+ (void)o;
+ (void)a;
+ return realloc(p, n);
+}
+
+static void h_free(CfreeHeap* h, void* p, size_t n) {
+ (void)h;
+ (void)n;
+ free(p);
+}
+
+static CfreeHeap g_heap = {h_alloc, h_realloc, h_free, NULL};
+static int g_fails;
+static int g_checks;
+
+static void diag_emit(CfreeDiagSink* s, CfreeDiagKind k, CfreeSrcLoc loc,
+ const char* fmt, va_list ap) {
+ static const char* names[] = {"note", "warning", "error", "fatal"};
+ (void)s;
+ (void)loc;
+ fprintf(stderr, "%s: ", names[k]);
+ vfprintf(stderr, fmt, ap);
+ fputc('\n', stderr);
+}
+
+static CfreeDiagSink g_diag = {diag_emit, NULL, 0, 0};
+
+#define EXPECT(cond, ...) \
+ do { \
+ ++g_checks; \
+ if (!(cond)) { \
+ ++g_fails; \
+ fprintf(stderr, "FAIL %s:%d: ", __FILE__, __LINE__); \
+ fprintf(stderr, __VA_ARGS__); \
+ fputc('\n', stderr); \
+ } \
+ } while (0)
+
+typedef struct TestCtx {
+ CfreeContext ctx;
+ Compiler* c;
+ CfreeCgTypeId i32;
+ CfreeCgTypeId ptr;
+ CfreeCgTypeId fn1; /* i32(i32) */
+} TestCtx;
+
+static void tc_init(TestCtx* tc) {
+ CfreeTarget target;
+ CfreeCgBuiltinTypes b;
+ CfreeCgFuncSig sig;
+ CfreeCgFuncParam params[1];
+ memset(tc, 0, sizeof *tc);
+ tc->ctx.heap = &g_heap;
+ tc->ctx.diag = &g_diag;
+ tc->ctx.now = -1;
+ memset(&target, 0, sizeof target);
+ target.arch = CFREE_ARCH_ARM_64;
+ target.os = CFREE_OS_MACOS;
+ target.obj = CFREE_OBJ_MACHO;
+ target.ptr_size = 8;
+ target.ptr_align = 8;
+ if (cfree_compiler_new(target, &tc->ctx, (CfreeCompiler**)&tc->c) !=
+ CFREE_OK ||
+ !tc->c) {
+ fprintf(stderr, "fatal: compiler allocation failed\n");
+ abort();
+ }
+ b = cfree_cg_builtin_types(tc->c);
+ tc->i32 = b.id[CFREE_CG_BUILTIN_I32];
+ tc->ptr = cfree_cg_type_ptr(tc->c, b.id[CFREE_CG_BUILTIN_VOID], 0);
+ memset(&sig, 0, sizeof sig);
+ memset(params, 0, sizeof params);
+ params[0].type = tc->i32;
+ sig.ret = tc->i32;
+ sig.params = params;
+ sig.nparams = 1;
+ sig.call_conv = CFREE_CG_CC_TARGET_C;
+ tc->fn1 = cfree_cg_type_func((CfreeCompiler*)tc->c, sig);
+}
+
+static void tc_fini(TestCtx* tc) {
+ cfree_compiler_free(tc->c);
+ tc->c = NULL;
+}
+
+static Operand op_local(CGLocal local, CfreeCgTypeId type) {
+ Operand o;
+ memset(&o, 0, sizeof o);
+ o.kind = OPK_LOCAL;
+ o.type = type;
+ o.v.local = local;
+ return o;
+}
+
+static Operand op_global(ObjSymId sym, CfreeCgTypeId type) {
+ Operand o;
+ memset(&o, 0, sizeof o);
+ o.kind = OPK_GLOBAL;
+ o.type = type;
+ o.v.global.sym = sym;
+ return o;
+}
+
+typedef struct CapturedFunc {
+ CgIrFunc* func;
+} CapturedFunc;
+
+static void on_func(void* user, CgIrFunc* func) {
+ ((CapturedFunc*)user)->func = func;
+}
+
+static CgTarget* make_recorder(TestCtx* tc, CapturedFunc* cap) {
+ CgIrRecorderConfig cfg;
+ memset(&cfg, 0, sizeof cfg);
+ cfg.func_recorded = on_func;
+ cfg.user = cap;
+ return cg_ir_recorder_new(tc->c, NULL, &cfg);
+}
+
+/* Callee: i32 f(i32 x) { acc = x + x; acc = acc + x; ... (nbinops total); return
+ * acc; } -> straightline body of cost == nbinops. */
+static CgIrFunc* build_callee(TestCtx* tc, ObjSymId sym, u32 nbinops,
+ CfreeCgInlinePolicy policy) {
+ CapturedFunc cap;
+ CgTarget* t;
+ CGFuncDesc fd;
+ CGParamDesc pd;
+ CGLocal x, acc;
+ memset(&cap, 0, sizeof cap);
+ t = make_recorder(tc, &cap);
+ memset(&fd, 0, sizeof fd);
+ fd.sym = sym;
+ fd.fn_type = tc->fn1;
+ fd.inline_policy = policy;
+ t->func_begin(t, &fd);
+ memset(&pd, 0, sizeof pd);
+ pd.index = 0;
+ pd.type = tc->i32;
+ pd.size = 4;
+ pd.align = 4;
+ x = t->param(t, &pd);
+ acc = t->local(t, &(CGLocalDesc){.type = tc->i32, .size = 4, .align = 4});
+ for (u32 i = 0; i < nbinops; ++i)
+ t->binop(t, BO_IADD, op_local(acc, tc->i32),
+ op_local(i == 0 ? x : acc, tc->i32), op_local(x, tc->i32));
+ t->ret(t, &acc, 1);
+ t->func_end(t);
+ return cap.func;
+}
+
+/* Caller: i32 g(void) { arg = 41; res = callee(arg); return res; } */
+static CgIrFunc* build_caller(TestCtx* tc, ObjSymId callee_sym,
+ CfreeCgInlinePolicy call_policy) {
+ CapturedFunc cap;
+ CgTarget* t;
+ CGFuncDesc fd;
+ CGCallDesc call;
+ CGLocal arg, res;
+ CGLocal cargs[1];
+ CGLocal cres[1];
+ CfreeCgFuncSig sig;
+ memset(&cap, 0, sizeof cap);
+ t = make_recorder(tc, &cap);
+ memset(&fd, 0, sizeof fd);
+ fd.sym = callee_sym + 1u;
+ memset(&sig, 0, sizeof sig);
+ sig.ret = tc->i32;
+ sig.call_conv = CFREE_CG_CC_TARGET_C;
+ fd.fn_type = cfree_cg_type_func((CfreeCompiler*)tc->c, sig);
+ t->func_begin(t, &fd);
+ arg = t->local(t, &(CGLocalDesc){.type = tc->i32, .size = 4, .align = 4});
+ res = t->local(t, &(CGLocalDesc){.type = tc->i32, .size = 4, .align = 4});
+ t->load_imm(t, op_local(arg, tc->i32), 41);
+ memset(&call, 0, sizeof call);
+ cargs[0] = arg;
+ cres[0] = res;
+ call.fn_type = tc->fn1;
+ call.callee = op_global(callee_sym, tc->ptr);
+ call.args = cargs;
+ call.nargs = 1;
+ call.results = cres;
+ call.nresults = 1;
+ call.inline_policy = call_policy;
+ t->call(t, &call);
+ t->ret(t, &res, 1);
+ t->func_end(t);
+ return cap.func;
+}
+
+typedef struct LookupCtx {
+ ObjSymId sym;
+ Func* callee;
+} LookupCtx;
+
+static Func* lookup(void* ctx, ObjSymId sym) {
+ LookupCtx* l = (LookupCtx*)ctx;
+ return sym == l->sym ? l->callee : NULL;
+}
+
+static u32 count_calls(const Func* f) {
+ u32 n = 0;
+ for (u32 b = 0; b < f->nblocks; ++b)
+ for (u32 i = 0; i < f->blocks[b].ninsts; ++i)
+ if ((IROp)f->blocks[b].insts[i].op == IR_CALL) ++n;
+ return n;
+}
+
+static u32 count_binops(const Func* f) {
+ u32 n = 0;
+ for (u32 b = 0; b < f->nblocks; ++b)
+ for (u32 i = 0; i < f->blocks[b].ninsts; ++i)
+ if ((IROp)f->blocks[b].insts[i].op == IR_BINOP) ++n;
+ return n;
+}
+
+/* A tiny callee is inlined: the call is replaced and the callee's body lands in
+ * the caller. */
+static void tiny_callee_is_inlined(void) {
+ TestCtx tc;
+ ObjSymId sym = 1234;
+ tc_init(&tc);
+ Func* callee = opt_func_from_cg_ir(tc.c, build_callee(&tc, sym, 1,
+ CFREE_CG_INLINE_DEFAULT));
+ Func* caller =
+ opt_func_from_cg_ir(tc.c, build_caller(&tc, sym, CFREE_CG_INLINE_DEFAULT));
+ LookupCtx lc = {sym, callee};
+ EXPECT(count_calls(caller) == 1, "precondition: caller should have one call");
+ int n = opt_try_tiny_inline(caller, lookup, &lc);
+ EXPECT(n == 1, "expected one inline, got %d", n);
+ EXPECT(count_calls(caller) == 0, "call should be gone, %u remain",
+ count_calls(caller));
+ EXPECT(count_binops(caller) >= 1, "callee binop should be cloned in");
+ tc_fini(&tc);
+}
+
+/* A DEFAULT callee over the tiny cost cap is refused; the same body marked
+ * always_inline bypasses the cap. */
+static void over_budget_respects_policy(void) {
+ TestCtx tc;
+ ObjSymId sym = 2000;
+ tc_init(&tc);
+ /* cost 12 > INLINE_TINY_COST_LIMIT (8). */
+ Func* big_default = opt_func_from_cg_ir(
+ tc.c, build_callee(&tc, sym, 12, CFREE_CG_INLINE_DEFAULT));
+ Func* caller1 =
+ opt_func_from_cg_ir(tc.c, build_caller(&tc, sym, CFREE_CG_INLINE_DEFAULT));
+ LookupCtx lc1 = {sym, big_default};
+ int n1 = opt_try_tiny_inline(caller1, lookup, &lc1);
+ EXPECT(n1 == 0, "over-budget DEFAULT callee should be refused, got %d", n1);
+ EXPECT(count_calls(caller1) == 1, "call should survive refusal");
+
+ Func* big_always = opt_func_from_cg_ir(
+ tc.c, build_callee(&tc, sym, 12, CFREE_CG_INLINE_ALWAYS));
+ Func* caller2 =
+ opt_func_from_cg_ir(tc.c, build_caller(&tc, sym, CFREE_CG_INLINE_DEFAULT));
+ LookupCtx lc2 = {sym, big_always};
+ int n2 = opt_try_tiny_inline(caller2, lookup, &lc2);
+ EXPECT(n2 == 1, "always_inline should bypass the tiny cap, got %d", n2);
+ EXPECT(count_calls(caller2) == 0, "always_inline call should be inlined");
+ tc_fini(&tc);
+}
+
+/* A NEVER (noinline) callee is refused even when tiny. */
+static void never_policy_is_refused(void) {
+ TestCtx tc;
+ ObjSymId sym = 3000;
+ tc_init(&tc);
+ Func* callee = opt_func_from_cg_ir(
+ tc.c, build_callee(&tc, sym, 1, CFREE_CG_INLINE_NEVER));
+ Func* caller =
+ opt_func_from_cg_ir(tc.c, build_caller(&tc, sym, CFREE_CG_INLINE_DEFAULT));
+ LookupCtx lc = {sym, callee};
+ int n = opt_try_tiny_inline(caller, lookup, &lc);
+ EXPECT(n == 0, "NEVER callee should not be inlined, got %d", n);
+ EXPECT(count_calls(caller) == 1, "NEVER call should survive");
+ tc_fini(&tc);
+}
+
+/* An unresolved (forward-defined) callee is left alone. */
+static void unknown_callee_is_skipped(void) {
+ TestCtx tc;
+ ObjSymId sym = 4000;
+ tc_init(&tc);
+ Func* caller =
+ opt_func_from_cg_ir(tc.c, build_caller(&tc, sym, CFREE_CG_INLINE_DEFAULT));
+ LookupCtx lc = {sym, NULL}; /* lookup never returns a body */
+ int n = opt_try_tiny_inline(caller, lookup, &lc);
+ EXPECT(n == 0, "unresolved callee should be skipped, got %d", n);
+ EXPECT(count_calls(caller) == 1, "unresolved call should survive");
+ tc_fini(&tc);
+}
+
+int main(void) {
+ tiny_callee_is_inlined();
+ over_budget_respects_policy();
+ never_policy_is_refused();
+ unknown_callee_is_skipped();
+ fprintf(stderr, "tiny-inline: %d checks, %d failures\n", g_checks, g_fails);
+ return g_fails ? 1 : 0;
+}
diff --git a/test/test.mk b/test/test.mk
@@ -507,14 +507,26 @@ test-macho: lib $(TEST_RT_DEP) $(ROUNDTRIP_BIN_MACHO) $(LINK_EXE_RUNNER) $(JIT_R
bash test/link/run.sh
OPT_TEST_BIN = build/test/cg_ir_lower_test
+TINY_INLINE_TEST_BIN = build/test/tiny_inline_test
-test-opt: bin $(OPT_TEST_BIN)
+test-opt: bin $(OPT_TEST_BIN) test-opt-tiny-inline test-opt-inline
$(OPT_TEST_BIN)
$(OPT_TEST_BIN): test/opt/cg_ir_lower_test.c $(LIB_OBJS)
@mkdir -p $(dir $@)
$(CC) $(TEST_HOST_CFLAGS) -Isrc test/opt/cg_ir_lower_test.c $(LIB_OBJS) -o $@
+test-opt-tiny-inline: bin $(TINY_INLINE_TEST_BIN)
+ $(TINY_INLINE_TEST_BIN)
+
+$(TINY_INLINE_TEST_BIN): test/opt/tiny_inline_test.c $(LIB_OBJS)
+ @mkdir -p $(dir $@)
+ $(CC) $(TEST_HOST_CFLAGS) -Isrc test/opt/tiny_inline_test.c $(LIB_OBJS) -o $@
+
+# Behavioral disasm check: tiny callee `bl` disappears from its caller at -O1.
+test-opt-inline: bin
+ @CFREE=$(abspath $(BIN)) bash test/opt/run.sh
+
test-parse: test-parse-ok test-parse-err
test-parse-ok: lib $(TEST_RT_DEP) $(PARSE_RUNNER) $(ROUNDTRIP_BIN) $(LINK_EXE_RUNNER) $(JIT_RUNNER)