kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit eff3401d60b7db0ab8de3af4874fd5600fdcba68
parent 0a7004d9ad91577a885b3bdc19f05871fc6756d1
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Wed, 27 May 2026 22:38:11 -0700

opt: O1 codegen wins + tiny-function inliner

Codegen (aa64):
- Minimal prologue on opt path: emit exact-size prologue once callee-saves
  and frame slots are known (new emit_prologue hook gated by
  emit_minimal_prologue); only frame-size immediates are patched at func_end.
  NativeDirectTarget keeps the worst-case-reserve path, now branching over the
  unused tail instead of NOP-filling it. Saves ~17 executed NOPs per call.
- Save/restore fp/lr + integer callee-saves with stp/ldp pairs
  (signed-offset form added to isa.h).
- Eliminate the scalar call-result round trip: emit_call hands plan_call the
  MIR's destination directly so the ABI result reg moves once to its dest,
  not via a temp spill slot.
- Store integer 0 from the hardware zero register via new
  has_store_zero_reg / store_zero_reg hooks (strb wzr, ... on aa64).

Opt passes:
- pass_combine: strengthen the post-RA substitute into full local
  copy-propagation for reg-source IR_COPY (no single-use / live-out gate);
  immediate / convert forms keep the stricter gate.
- pass_addr_fold: new opt_hoist_loop_consts hoists single-def, non-zero
  LOAD_IMM that appears inside a loop to one entry-block def, mirroring
  opt_addr_of_global_cse. Single-def guard avoids remapping mutable
  promoted locals (loop counters initialised to a constant).
- pass_jump: new cleanup_rotate_loops moves a loop header below its latch
  and inverts the test in place, eliminating the per-iteration
  unconditional back-jump (pure emit-order reorder + branch invert, no
  CFG edges move).

Tail calls:
- Implement opt_on_tail_call_unrealizable_reason for the opt path using a
  new NativeTarget signature_stack_bytes hook. Sibling calls whose
  outgoing stack args don't fit the caller's incoming area now correctly
  fall back to call+ret (variadic caller, or variadic callee invoked with
  extra args, also fall back conservatively).

Inliner:
- Wire opt_try_tiny_inline into the O1 pipeline (streaming inline of tiny
  already-recorded callees), with test-opt-tiny-inline + test-opt-inline
  test targets and the run.sh disasm check.

Docs: refresh OPT_O1_PASSES.md for the new passes / emit-path features;
add OPT_O1_PERF_TODO.md and O1_INLINE.md.

Diffstat:
MMakefile | 2+-
Adoc/O1_INLINE.md | 188+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mdoc/OPT_O1_PASSES.md | 825+++++++++++++++++++++++++++++--------------------------------------------------
Adoc/OPT_O1_PERF_TODO.md | 91+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/arch/aa64/isa.h | 23+++++++++++++++++++++++
Msrc/arch/aa64/native.c | 204+++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
Msrc/arch/native_target.h | 25+++++++++++++++++++++++++
Msrc/opt/opt.c | 87++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Msrc/opt/opt.h | 2++
Msrc/opt/opt_internal.h | 10++++++++++
Msrc/opt/pass_addr_fold.c | 173+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/opt/pass_combine.c | 34+++++++++++++++++++++++-----------
Msrc/opt/pass_inline.c | 85++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
Msrc/opt/pass_jump.c | 69+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/opt/pass_native_emit.c | 36++++++++++++++++++++++++++++++++----
Atest/opt/run.sh | 55+++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atest/opt/tiny_inline_test.c | 333+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtest/test.mk | 14+++++++++++++-
18 files changed, 1660 insertions(+), 596 deletions(-)

diff --git a/Makefile b/Makefile @@ -154,7 +154,7 @@ LIB_SRCS_OBJ_COFF := $(filter-out %/archive.c,$(LIB_SRCS_OBJ_COFF)) LIB_SRCS_NONARCH += src/obj/archive_stubs.c endif -LIB_SRCS_OPT = $(filter-out src/opt/pass_inline.c src/opt/pass_o2.c,$(shell find src/opt -name '*.c' 2>/dev/null)) +LIB_SRCS_OPT = $(filter-out src/opt/pass_o2.c,$(shell find src/opt -name '*.c' 2>/dev/null)) LIB_SRCS_WASM_CORE = $(shell find src/wasm -name '*.c' 2>/dev/null) LIB_SRCS_API_AR = src/api/archive.c LIB_SRCS_API_DISASM = src/api/disasm.c diff --git a/doc/O1_INLINE.md b/doc/O1_INLINE.md @@ -0,0 +1,188 @@ +# Plan: Inline tiny functions at O1 + +## Context + +The cfree optimizer currently does **no function inlining in any live compile**. Both +`-O1` and `-O2` are normalized to the single O1 native path (`src/opt/opt.h:11`, +"O2 cutover window"); the SSA-based O2 pipeline (`opt_cleanup`) and the whole-program +inliner (`opt_inline`) exist in the tree but have **no callers** outside the stale, +no-longer-compiling `test/opt/opt_test.c`. Verified empirically: at `-O1`/`-O2` a call to a +trivial helper like `add1(x) = x+1` still emits a `bl` — nothing is inlined. + +We want O1 to inline *tiny* functions — simple `static inline`-style helpers — so callers +don't pay call/frame overhead for one- or two-instruction bodies. This is the single +biggest easy win for the kind of small leaf helpers that dominate idiomatic C. + +The good news: a complete, correct inlining toolkit already exists in +`src/opt/pass_inline.c` and operates on the **pre-machinize PReg form**, which is exactly +the IR shape present in the O1 pipeline before `opt_machinize_native`. This change is +therefore mostly *wiring* that machinery into the live O1 path, gated to tiny callees — +not writing an inliner from scratch. + +### Decisions +- **Policy:** inline `DEFAULT`/`HINT` callees only when cost ≤ `INLINE_TINY_COST_LIMIT` + (8 straightline ops); always refuse `NEVER`; always inline `ALWAYS` + (`always_inline`) regardless of the tiny cap. +- **Tests:** behavioral disasm fixture (reuse toy-135 at `-O1`) **plus** a fresh small + unit test in the `cg_ir_lower_test.c` style. Do **not** revive `opt_test.c` (it no + longer compiles against the current API — references the removed `CGTarget` type). + +## Key facts (verified) + +- `opt_on_func` (`src/opt/opt.c:171`) is the per-function sink callback: lowers each + `CgIrFunc` to a pre-machinize `Func` via `opt_func_from_cg_ir`, then runs + `opt_run_o1_native`. `OptImpl` struct at `opt.c:20`. +- `opt_run_o1_native` (`opt.c:43`): cfg build → jump-cleanup → cfg → `opt_simplify_local` + (line 66) → `opt_verify` (69) → `opt_machinize_native` (73). The window **after line 67 + and before line 68** is pre-machinize PReg form, CFG valid — the inliner's required shape. +- The recorder (`src/cg/ir_recorder.c`) calls `cg_ir_module_add_func` at `func_begin` and + fires `func_recorded` at `func_end`. So when a **caller** is processed, every function + defined **earlier** in the TU is already recorded and `complete`. Static inline helpers + are defined before use ⇒ callee body is available. Forward-defined callees are not + (acceptable limitation). +- Lifetimes: `Func` is allocated on `c->tu` (`ir_func_new`, `src/opt/ir.c:269`) and + `CgIrFunc`s live on the module — nothing is freed per-function. `opt_func_from_cg_ir` is + cheap and re-runnable, producing a fresh pre-machinize `Func` from a persistent `CgIrFunc`. +- Reusable machinery in `pass_inline.c` (all operate on pre-machinize PReg form): + `inline_call_site` (line 500), gates `callee_inline_shape` (164, already enforces a + policy-dependent cost cap and a straightline-op whitelist that **excludes `IR_CALL`**), + `inline_rewrite_supported` (473), `recursive_or_scc` (135), `direct_callee` (85), + `effective_inline_policy` (100). Cost limits at lines 34-37. + +## Implementation + +### 1. `src/opt/pass_inline.c` — add the tiny-inline driver (reuse existing gates) + +Keep all existing static gates as-is; add in the same TU so nothing else needs exposing. + +- Add constant near lines 34-37: + ```c + #define INLINE_TINY_COST_LIMIT 8u + ``` +- Add `try_tiny_inline_call(FuncSet* fs, Func* caller, u32 b, u32 i)` — mirrors + `try_inline_call` (line 607) but: no `base_cost`/`caller_growth_ok` (the streaming path + has no whole-program cost array, and tiny callees can't blow up the caller); add the tiny + gate after `callee_inline_shape`: + ```c + if (policy != CFREE_CG_INLINE_ALWAYS && cost > INLINE_TINY_COST_LIMIT) return 0; + ``` + Emit `opt.tiny_inline.candidates` / `opt.tiny_inline.inlined` metrics. +- Add public driver (callback-based so `opt.c` owns the registry/cache): + ```c + typedef Func* (*OptInlineCalleeLookup)(void* ctx, ObjSymId callee_sym); + int opt_try_tiny_inline(Func* caller, OptInlineCalleeLookup lookup, void* ctx); + ``` + It walks caller blocks/insts; for each `IR_CALL` resolves the callee `ObjSymId` from + `aux->desc.callee.v.global.sym`, calls `lookup(ctx, sym)` to get a fresh/cached + pre-machinize callee `Func`, builds a 2-element ad-hoc `FuncSet {caller, callee}` on + `caller->arena` (so `direct_callee`/`recursive_or_scc` work unchanged), and calls + `try_tiny_inline_call`. Returns the number of inlines performed. + + **Iteration:** fixed-point loop with cap `#define TINY_INLINE_MAX_PASSES 4`. After any + successful `inline_call_site` (which splits the block, invalidating indices), set + `changed` and restart the block scan. Note inlined bodies are straightline-only + (`op_supported_in_straightline_inline` excludes `IR_CALL`) ⇒ inlining never introduces + new calls, so this terminates quickly; the cap is a guard. + + **Recursion safety:** the only real check needed is `caller == callee` (kept via + `recursive_or_scc`). Any callee containing a call is already rejected by + `callee_inline_shape`'s whitelist, so transitive cycles cannot arise. + +### 2. `src/opt/opt_internal.h` — declare the new surface + +Add near the other pass prototypes: +```c +typedef Func* (*OptInlineCalleeLookup)(void* ctx, ObjSymId callee_sym); +int opt_try_tiny_inline(Func* caller, OptInlineCalleeLookup lookup, void* ctx); +void opt_inline(FuncSet*, int max_iters); /* formalize existing symbol */ +``` + +### 3. `src/opt/opt.c` — registry + wiring + +- Extend `OptImpl` (line 20) with a callee registry/cache, all on `c->tu`: + ```c + CgIrFunc** cg_by_sym; /* recorded functions, for callee lookup */ + Func** lowered_cache; /* parallel: lazily re-lowered callee Func */ + u32 ncg; + u32 cg_cap; + ``` + (zero-initialized by the existing `arena_znew` in `opt_cgtarget_new`). +- At the **top of `opt_on_func`** (before lowering the caller), append `cg_func` to + `cg_by_sym` (it is `complete`; earlier-defined funcs already present). Growing array on + `c->tu`. +- Add a lookup function `static Func* opt_tiny_callee_lookup(void* ctx, ObjSymId sym)`: + linear-scan `cg_by_sym` for `desc.sym == sym`; if found and `lowered_cache[idx] == NULL`, + `opt_func_from_cg_ir(o->c, cg_by_sym[idx])` and cache it (one re-lower per helper, reused + across all call sites — `inline_call_site` does not mutate the callee). Return NULL if + absent (forward-defined) so the call is left as-is. +- In `opt_run_o1_native`, between line 67 (`metrics_scope_end "opt.cfg.simplify_local"`) + and line 68: + ```c + metrics_scope_begin(o->c, "opt.o1.tiny_inline"); + int inlined = opt_try_tiny_inline(f, opt_tiny_callee_lookup, o); + metrics_scope_end(o->c, "opt.o1.tiny_inline"); + if (inlined) { + /* REQUIRED for correctness: inline_call_site invalidated CFG and left + * preds stale (it maintains succ + emit_order only). Verify + machinize + * + regalloc all consume preds/dominance. */ + opt_build_cfg(f); + /* QUALITY (optional): merge the single-pred/single-succ BR-glue chain the + * inliner introduced (pre -> callee body -> cont) back into straight-line + * blocks so regalloc doesn't see artificial block boundaries. Mirrors the + * prologue's build_cfg -> jump_cleanup -> build_cfg idiom (opt.c:57-63). + * NOTE: use opt_jump_cleanup, NOT opt_simplify_local — the latter is a + * per-inst peephole and does not merge blocks. Without this, output is + * still correct (post-RA opt_mir_jump_cleanup removes the dead jumps) but + * allocation is worse. */ + opt_jump_cleanup(f, OPT_JUMP_CLEANUP_CFG); + opt_build_cfg(f); + opt_verify(f, "o1-tiny-inline"); + } + ``` + The pass needs `OptImpl* o` for the lookup — `opt_run_o1_native` already has `o`. + +### 4. Tests + +**(a) New unit test** — `test/opt/tiny_inline_test.c`, modeled on +`test/opt/cg_ir_lower_test.c` (same `EXPECT` harness, builds `Func`s via `opt_func_from_cg_ir` +from hand-built `CgIrFunc` tapes, links `$(LIB_OBJS)` with `-Isrc`). Cases: + - Tiny callee (`add1(x)=x+1`, cost 1): drive `opt_try_tiny_inline` with a trivial lookup + closure returning the callee `Func`; assert caller has no `IR_CALL` and contains the + cloned `BINOP`. + - Threshold: a `DEFAULT` callee with cost > 8 is refused (call survives); the same body + marked `CFREE_CG_INLINE_ALWAYS` is inlined. + Add a `test-opt-tiny-inline` target + binary in `test/test.mk` next to `test-opt` + (~line 509), and add it to the aggregate list near line 66. + +**(b) Behavioral disasm** — reuse `test/toy/cases/135_inline_cleanup_quality.toy` +(`add1(x)=x+1`; `__user_main` calls `add1(41)`; `main` calls `__user_main`). Compile at +`-O1`, `cfree objdump -d`, and assert the inlined helper's `bl` is **gone** from its caller. +Realistic O1 assertions (O1 has no constant folding, so do **not** assert "main == constant +42"): `__user_main` contains no `bl ... # add1`, and `main` contains no `bl ... # __user_main`. +`test/opt/run.sh` already contains a (too-strong, orphaned) version of this check — repurpose +it to the no-`bl` assertions and wire it into a `test-opt-inline` make target (it is +currently not invoked by any target). + +## Verification + +1. `make bin && make lib` +2. Behavioral, aarch64 cross (host-arch-independent): + ``` + build/cfree cc -target aarch64-linux-gnu -O1 -c \ + test/toy/cases/135_inline_cleanup_quality.toy -o /tmp/ti.o + build/cfree objdump -d /tmp/ti.o + ``` + Confirm `<__user_main>` has no `bl ... add1` and `<main>` has no `bl ... __user_main` + (compare against the pre-change disasm, which shows both `bl`s). +3. `make test-opt-tiny-inline` (new unit test) and `make test-opt-inline` (disasm) pass. +4. Regression sweep: `make test-opt test-cg-api test-isa test-aa64-inline test-smoke-x64 + test-smoke-rv64` — confirm no behavioral changes elsewhere and that `opt_verify` + ("o1-tiny-inline") never trips. +5. Sanity that NEVER is honored and forward-defined callees are left alone (a quick + hand-written C file with `__attribute__((noinline))` and a forward-declared helper). + +## Out of scope +- O2 / SSA pipeline (entirely disabled). +- Cross-TU / forward-defined-callee inlining. +- Inlining callees with control flow richer than the existing straightline whitelist, or + with multi-part/aggregate ABI args/returns (rejected by `inline_rewrite_supported`). diff --git a/doc/OPT_O1_PASSES.md b/doc/OPT_O1_PASSES.md @@ -1,625 +1,412 @@ # O1 Optimizer Pass Pipeline -This document describes the O1 lowering pipeline, what each pass does, how it -should work, and what was lost or incorrectly ported in the refactor at commit -`f60a16d` (opt: rewrite pipeline to consume CgIrFunc input; replace pass_emit -with NativeTarget). +A reference for cfree's `-O1` lowering pipeline as it currently exists: the +entry path, what each pass does, the key data structures, and the +`NativeTarget` backend contract the emit step relies on. Intended as a map for +performance work (see `doc/OPT_O1_PERF_TODO.md` for the active runtime targets). + +Scope: **aa64 only**. The aarch64 backend is the only `NativeTarget` +implementation; x64/rv64 `NativeTarget` ports do not exist. The O0 path uses a +separate backend (`NativeDirectTarget`, §16) and is not covered here beyond the +boundary. + +History note: this pipeline replaced an older `CGTarget`-replay design (commit +`f60a16d`). The transitional "direct-replay bypass" — which sent functions with +alloca/varargs/asm/aggregates straight to unoptimized emission — **has been +deleted**. Every function now goes through the full pipeline below +unconditionally; varargs, inline asm, and aggregate/sret/byval are all handled +on the optimizer path. --- -## Pipeline overview +## Entry path -The O1 pipeline runs when `opt_cgtarget_new` is called with `level >= 1`. -`CgIrRecorder` records the full function as a `CgIrFunc`. When the function is -complete, `opt_on_func` is called: +`opt_cgtarget_new(level >= 1)` installs the optimizer as the CG sink. The C +frontend drives `CgIrRecorder`, which records each function as a `CgIrFunc` +tape. On function completion, `opt_on_func` (src/opt/opt.c) runs: -1. `opt_func_needs_direct_replay` — bypass check (see §1 below) -2. `opt_func_from_cg_ir` — convert `CgIrFunc` → `Func` (new in refactor) -3. `opt_run_o1_native` — run the pass sequence on `Func` +1. `opt_func_from_cg_ir` (src/opt/cg_ir_lower.c) — convert the recorded + `CgIrFunc` into the optimizer's `Func` CFG/IR, classifying each local as + register (`PReg`) or frame storage (§1). +2. `opt_run_o1_native` (src/opt/opt.c) — run the pass sequence on `Func`. -The pass sequence in `opt_run_o1_native` (src/opt/opt.c): - -``` -opt_build_cfg (×2, with opt_jump_cleanup(CFG) in between) -opt_simplify_local -opt_machinize_native ABI lowering -opt_addr_xform_pregs fold IR_ADDR_OF(local) → OPK_LOCAL in loads/stores -opt_promote_scalar_locals promote non-escaped frame slots → PRegs -opt_addr_of_global_cse hoist duplicate ADDR_OF(global) to entry block -opt_build_loop_tree -opt_live_blocks liveness bitmaps -opt_dead_def_elim_with_live pre-RA DCE -opt_regalloc_locations point-bitmap register allocator -opt_lower_to_mir insert spill/reload, lower PReg → hard reg -opt_mir_combine MIR-level peephole -opt_mir_dce -opt_mir_jump_cleanup(CFG) + opt_mir_build_cfg -opt_mir_jump_cleanup(LAYOUT) block reorder + branch stripping -opt_emit_native emit to NativeTarget -``` +There is no per-function or per-op bypass. --- -## §1 — Direct-replay bypass - -**Source**: `opt_func_needs_direct_replay` in src/opt/opt.c -**Route**: → `opt_replay_cg_ir_direct` → raw CgTarget replay (no optimization) +## Pipeline overview -A function bypasses the optimizer entirely if it contains: -- Aggregate or >8-byte params/results -- `CG_IR_ASM_BLOCK`, `CG_IR_ALLOCA`, `CG_IR_INTRINSIC` -- Varargs ops (`CG_IR_VA_*`) -- Aggregate call arguments or results +`opt_run_o1_native`, in order (verify/dump steps interleaved): -**Before the refactor**: These functions went through the optimizer. The old -`CGTarget` wrapper recorded them and replayed with `opt_emit`. The optimizer -handled them with frame slots from the start. Post-refactor, they are entirely -unoptimized — every function with an `alloca`, inline asm, or vararg is now -compiled at -O0 quality regardless of the requested optimization level. +``` +opt_build_cfg ┐ CFG build + cleanup +opt_jump_cleanup(CFG) │ +opt_build_cfg │ +opt_simplify_local ┘ +opt_try_tiny_inline streaming inline of tiny callees recorded earlier +opt_machinize_native ABI lowering (call/ret/param constraints) +opt_addr_xform_pregs fold ADDR_OF(local) → OPK_LOCAL in load/store +opt_promote_scalar_locals non-escaped frame slot → PReg +opt_addr_of_global_cse hoist duplicate ADDR_OF(global) to entry +opt_build_loop_tree loop nesting (informs spill cost) +opt_hoist_loop_consts hoist single-def LOAD_IMM (imm!=0) in loops to entry +opt_live_blocks per-block PReg liveness +opt_dead_def_elim_with_live pre-RA DCE +opt_regalloc_locations PReg → hard reg / spill slot +opt_lower_to_mir materialize hard regs; insert spill/reload +opt_mir_combine MIR peephole (substitute/addr-synth/sink/ext) +opt_mir_dce post-RA DCE +opt_mir_jump_cleanup(CFG) + build_cfg unreachable/jump-chain cleanup +opt_mir_jump_cleanup(LAYOUT) reorder for fallthrough + rotate loops + invert +opt_emit_native emit to NativeTarget (minimal prologue on opt path) +``` -**What to fix**: The bypass is too conservative. Alloca, varargs, and asm-block -functions can still benefit from register allocation and DCE on the non-alloca -portions. The check should be narrowed to only bypass the parts of the pipeline -that genuinely cannot handle those ops (currently none — `cg_ir_lower.c` -should be extended to handle these). +An `opt_verify` / `opt_mir_verify` runs after most stages (tags: `lowering-cfg`, +`lowering-machinize`, `o1-addr-xform`, `o1-promote-scalar`, +`o1-addr-global-cse`, `post-regalloc`, `lower-mir`, `post-mir-combine`, +`post-mir-dce`, `post-mir-jump-cfg`). `CFREE_DUMP=<tag>` dumps the IR at the +matching stage (`entry` before any pass, `pre-emit` just before emit); see +Debugging aids. --- -## §2 — opt_func_from_cg_ir (new pass, introduced in refactor) - -**Source**: src/opt/cg_ir_lower.c -**Purpose**: Convert a recorded `CgIrFunc` into the `Func` IR that the optimizer -works on. - -### Local classification +## §1 — opt_func_from_cg_ir (CgIrFunc → Func) -The critical function is `lower_locals` (cg_ir_lower.c:147). For each local: +**Source**: src/opt/cg_ir_lower.c -```c -m->address_taken = - local_needs_home(in) || local_address_used_in_cg_ir(l->src, in->id); +Converts the recorded tape into the optimizer `Func` and classifies locals in +`lower_locals`: -if (m->address_taken) { - m->storage.kind = CG_LOCAL_STORAGE_FRAME; // → OPK_LOCAL -} else { - m->storage.kind = CG_LOCAL_STORAGE_REG; // → OPK_REG (PReg) - m->storage.v.reg = (Reg)r; -} -``` +- **Address-taken or memory-required** locals (`local_needs_home`: + `in->address_taken`, `CG_LOCAL_ADDR_TAKEN`, `CG_LOCAL_MEMORY_REQUIRED`), and + **aggregate / >8-byte** locals, get `CG_LOCAL_STORAGE_FRAME` → `OPK_LOCAL`. +- All other locals get `CG_LOCAL_STORAGE_REG` → a fresh `PReg` → `OPK_REG`. -`local_needs_home` returns true if: -- `in->address_taken` — set by the recorder when `addr_of` is called on the local -- `CG_LOCAL_ADDR_TAKEN | CG_LOCAL_MEMORY_REQUIRED` flags +Address-taken locals start in frame storage; the addr-fold + promotion passes +(§5, §6) recover register storage for those whose address does not truly escape. -Address-taken locals get FRAME storage; all others get a PReg. The `opt_addr_xform_pregs` -and `opt_promote_scalar_locals` passes are then responsible for recovering -register storage for locals whose addresses don't truly escape. +va_list operands are lowered as opaque pointer *values* (a `NativeLoc`), not +flagged address-taken — all va-layout knowledge lives behind the `NativeTarget` +va hooks. Aggregate locals are forced to frame. -### Old approach (before refactor) +--- -The old `w_local` handler in the CGTarget wrapper assigned storage differently: +## §2 — opt_build_cfg / opt_jump_cleanup(CFG) / opt_simplify_local -```c -if (o->level < 2 && - (d->flags & (CG_LOCAL_ADDR_TAKEN | CG_LOCAL_MEMORY_REQUIRED)) == 0) { - // REG storage — even if addr_of will be called later -} else { - // FRAME storage -} -``` +**Sources**: src/opt/pass_cfg.c, src/opt/pass_jump.c, src/opt/pass_simplify.c -Address-taken status was determined conservatively at declaration time. Locals -declared without `CG_LOCAL_ADDR_TAKEN` got register storage initially. When -`addr_of` was called on them later, a `home_slot` was allocated alongside. -After recording, `opt_frame_home_addr_taken_locals` ran as part of `w_func_end` -and inserted explicit `IR_LOAD` / `IR_STORE` pairs around every use/def of the -local's register to keep the frame slot in sync. - -This gave the optimizer explicit IR_LOAD/IR_STORE pairs to work with before -running `opt_addr_xform_pregs` and `opt_promote_scalar_locals`. In the new -code, address-taken locals go directly to FRAME storage and the optimizer must -reconstruct the register-friendly form via the addr-fold and promotion passes. -The promotion chain (§3 + §4) should in theory produce equivalent output, but -depends on `opt_addr_xform_pregs` successfully classifying every `IR_ADDR_OF` -use as non-escaping. +Build the CFG, drop unreachable blocks and trivially-foldable branches, and fold +single-successor chains. Runs before `opt_machinize_native` so ABI lowering sees +an accurate live/critical-edge picture. `opt_build_cfg` runs twice with a jump +cleanup between to settle the shape. --- -## §3 — opt_build_cfg / opt_jump_cleanup(CFG) / opt_simplify_local +## §3 — opt_machinize_native -**Sources**: src/opt/pass_cfg.c -**Purpose**: Build the CFG, remove unreachable blocks and trivially-foldable -branches, and fold single-successor blocks into their predecessor. +**Source**: src/opt/pass_machinize.c -These passes are unchanged by the refactor and work the same as before. -They must run before `opt_machinize_native` so that the ABI lowering has an -accurate picture of which blocks are live and which are critical edges. +ABI lowering against the `NativeTarget`. Annotates call/ret/param instructions +with calling-convention constraints (argument/result registers, the call +clobber mask, callee-save markers) so the register allocator knows which hard +registers are killed across each call. Collects the target's register classes +(`machinize_collect_regs`): the allocable set, the reserved scratch set, and the +caller/callee-saved masks (`collect_class`), and checks allocable and scratch +sets do not overlap (`machinize_check_overlap`). --- -## §4 — opt_machinize_native +## §4 — opt_addr_xform_pregs -**Source**: src/opt/pass_machinize.c -**Purpose**: ABI lowering. Inserts calling-convention constraints (scratch regs, -clobbers, callee-save markers) as annotations on call/ret/param instructions so -the register allocator knows which registers are killed across calls. +**Source**: src/opt/pass_addr_fold.c -The function was renamed from `opt_machinize(Func*, CGTarget*)` to -`opt_machinize_native(Func*, NativeTarget*)` in the refactor. The body -(`machinize_reset`, `machinize_prepare_insts`, `machinize_collect_regs`, -`machinize_check_overlap`) is unchanged in structure. The only difference is -the target type — `NativeTarget*` replaces `CGTarget*` — which affects which -register metadata APIs are called to gather clobber and callee-save info. +Folds `IR_ADDR_OF(local)` whose result pointer is used only as the base of +non-observable zero-offset, no-index `IR_LOAD`/`IR_STORE`. When all uses +qualify, the `IR_ADDR_OF` is removed and each use is rewritten to +`OPK_LOCAL(slot)`. Partial folding leaves the `IR_ADDR_OF` alive for any +effective-address-shaped uses. When a slot's `IR_ADDR_OF` defs are all retired, +`FSF_ADDR_TAKEN` is cleared so §6 can promote it. --- -## §5 — opt_addr_xform_pregs +## §5 — opt_promote_scalar_locals -**Source**: src/opt/pass_addr_fold.c -**Purpose**: Fold `IR_ADDR_OF(local)` defs whose result pointer is used only as -the base of non-observable `IR_LOAD` / `IR_STORE` with zero offset and no index. -When all uses qualify, the `IR_ADDR_OF` is NOP'd and each use is rewritten to -`OPK_LOCAL(slot)`. If only some uses qualify, partial folding leaves the -`IR_ADDR_OF` alive for the EA-shaped uses. +**Source**: src/opt/pass_addr_fold.c -After folding, `FSF_ADDR_TAKEN` is cleared on any frame slot whose -`IR_ADDR_OF` defs were all retired. This enables `opt_promote_scalar_locals` -to promote the slot to a PReg. +For each `FS_LOCAL` slot without `FSF_ADDR_TAKEN`/`FSF_VOLATILE` whose every +`OPK_LOCAL(slot)` appearance is a matching-type non-observable `IR_LOAD.opnds[1]` +or `IR_STORE.opnds[0]`, replaces the slot with a fresh `PReg`: stores become +`IR_COPY preg, src`, loads become `IR_COPY dst, preg`. This pulls scalar locals +out of memory into the register namespace before allocation. -**How to be fast**: One forward pass per `IR_ADDR_OF` candidate. The function -is scanned once per candidate to classify all uses, then rewritten in a second -scan. This is O(candidates × insts), but in practice the number of surviving -non-folded `IR_ADDR_OF` defs is small. +**Limitation (perf):** `promote_inst_classify` rejects a slot if `OPK_LOCAL` +appears anywhere else (call aux, `IR_ADDR_OF` operand, any other instruction). +An address-taken local whose address is stored into a pointer and later +dereferenced is not promoted (would need ADDR_OF copy-propagation through +PRegs); genuinely aliased locals correctly stay in memory. --- -## §6 — opt_promote_scalar_locals +## §6 — opt_addr_of_global_cse -**Source**: src/opt/pass_addr_fold.c -**Purpose**: For each `FS_LOCAL` frame slot without `FSF_ADDR_TAKEN` or -`FSF_VOLATILE`, and whose every `OPK_LOCAL(slot)` appearance is either: -- `IR_LOAD.opnds[1]` — a matching-type, non-observable load, or -- `IR_STORE.opnds[0]` — a matching-type, non-observable store, +**Source**: src/opt/pass_addr_fold.c -replace the slot with a fresh PReg: each store becomes `IR_COPY preg, src` -and each load becomes `IR_COPY dst, preg`. The slot becomes unreferenced. +CSE for `IR_ADDR_OF(global{sym, addend})`: when the same (sym, addend) appears +in multiple defs, materialize one in the entry block (after the +`IR_PARAM_DECL` prologue) and reuse it, collapsing per-iteration `adrp`/`add` +pairs in loops. + +--- -**Critical limitation**: `promote_inst_classify` rejects a slot if `OPK_LOCAL(slot)` -appears anywhere other than `IR_LOAD.opnds[1]` or `IR_STORE.opnds[0]`. In -particular, an `OPK_LOCAL` appearing in a call aux (as an argument or result -storage), in an `IR_ADDR_OF` operand, or in any other instruction, blocks -promotion entirely for that slot. +## §7 — opt_build_loop_tree -This creates a coverage gap with the new `opt_func_from_cg_ir`: when a local -is address-taken, `lower_operand_value` returns `OPK_LOCAL` for every VALUE use -of that local (not just load/store address uses). If `opt_addr_xform_pregs` did -not fold away all `IR_ADDR_OF` defs for the local (because at least one use -escaped), the slot keeps `FSF_ADDR_TAKEN` and `opt_promote_scalar_locals` skips -it entirely. +**Source**: src/opt/pass_loop.c -In the old code, the load/store pairs around the local's register were always -IR_LOAD / IR_STORE instructions — exactly the form that `opt_promote_scalar_locals` -can handle — and the only frame-slot operands visible to the pass were those -load/store addresses. +Computes loop nesting depth, used as a spill-cost weight by the register +allocator (values live across deep loops are preferred for hard registers). --- -## §7 — opt_addr_of_global_cse +## §7.5 — opt_hoist_loop_consts -**Source**: src/opt/pass_addr_fold.c -**Purpose**: CSE for `IR_ADDR_OF(global{sym, addend})`. When the same -(sym, addend) pair appears in two or more `IR_ADDR_OF` defs, replace all with a -single def materialized in block 0 (after any `IR_PARAM_DECL` prologue). This -collapses the per-iteration `adrp`/`add` pair in tight loops. +**Source**: src/opt/pass_addr_fold.c -Unchanged by the refactor. +LICM-lite for constant materialization, mirroring `opt_addr_of_global_cse`'s +hoist-to-entry pattern for `LOAD_IMM`. For each distinct `(imm, type, cls)` +that appears inside a loop block (`loop_depth > 0`), allocates a canonical +PReg, emits one `LOAD_IMM` in the entry block (after `IR_PARAM_DECL`s), NOPs +the in-loop and any other defs of that key, and remaps every use to the +canonical PReg. Without this the backend re-materializes the constant +(`movz`/`movk`) on every iteration. + +Safety: only hoists `LOAD_IMM` whose def PReg is **single-defined** (the +LOAD_IMM is its only def). The PReg form here is non-SSA, so a mutable promoted +local initialized to a constant (e.g. a loop counter `i = 1`) is multiply +defined; remapping it would discard the increment defs and break the loop. +Skips `imm == 0` so the backend's store-zero-register path (see §14) keeps the +cheaper `strb wzr, …` form for stored zeros. --- ## §8 — opt_live_blocks + opt_dead_def_elim_with_live -**Sources**: src/opt/pass_analysis.c, src/opt/pass_lower.c -**Purpose**: Dataflow liveness over the PReg namespace; pre-RA dead definition -elimination. - -`opt_live_blocks` computes per-block liveness bitmaps (one bit per PReg) using -backward dataflow. `opt_dead_def_elim_with_live` sweeps each block backward: -any instruction all of whose defined PRegs are dead at its output and which has -no side effects is removed. - -Running DCE before register allocation reduces the number of live ranges and -the number of allocable PRegs, which keeps the point-bitmap allocator fast. +**Sources**: src/opt/pass_live.c, src/opt/pass_lower.c -Unchanged by the refactor. +`opt_live_blocks` computes per-block PReg liveness bitmaps via backward +dataflow. `opt_dead_def_elim_with_live` sweeps each block backward and removes +any side-effect-free instruction whose defined PRegs are all dead at its output. +Running before allocation shrinks live ranges and the allocable set. --- ## §9 — opt_regalloc_locations -**Source**: src/opt/pass_analysis.c -**Purpose**: Point-bitmap linear-scan register allocator. Allocates each PReg -to either a hard register or a spill slot. +**Source**: src/opt/pass_lower.c -Uses a `used_locs[p * loc_words + w]` bitmap (one row per PReg, one column-word -per location) to track which (PReg, location) pairs conflict. For each PReg in -program order, the allocator scans the point bitmaps to find the first -non-conflicting hard register; if none is available, assigns a spill slot. - -Unchanged by the refactor. +Point-bitmap linear-scan allocator. A `used_locs[p * loc_words + w]` bitmap +tracks (PReg, location) conflicts; for each PReg in program order it picks the +first non-conflicting hard register, else a spill slot. The allocable set is the +target's `allocable` list (aa64: caller-saved first so they're preferred, then +callee-saved under pressure). It never assigns a reserved scratch register. --- ## §10 — opt_lower_to_mir -**Source**: src/opt/pass_lower.c -**Purpose**: Rewrite the PReg-namespace Func into a hard-register MIR Func. -Every PReg operand is replaced with the hard register or spill slot assigned by -`opt_regalloc_locations`. Spilled PRegs get `IR_LOAD` (reload) instructions -inserted before each use and `IR_STORE` (spill) instructions inserted after -each def. - -The output `f->mir` is a `MFunc` containing the rewritten blocks. The original -`f->blocks` is also updated to point to the rewritten instruction arrays. +**Source**: src/opt/pass_lower.c -Unchanged by the refactor. +Rewrites the PReg `Func` into a hard-register MIR `MFunc` (`f->mir`): every PReg +operand becomes its assigned hard register or spill slot. Spilled PRegs get an +`IR_LOAD` (reload) before each use and an `IR_STORE` (spill) after each def. +`f->blocks` is updated to the rewritten arrays. From here the IR is non-SSA +(physical registers, multiply defined). --- ## §11 — opt_mir_combine -**Source**: src/opt/pass_combine.c -**Purpose**: MIR-level peephole. Four rewrites run in a per-block fixpoint: - -1. **Substitute**: propagate a trivial copy `r1 ← copy r2` through later uses - of `r1`, removing redundant register-to-register moves. - -2. **Addr-mode synthesis**: fold `r ← addr_of global` + `load/store r` into a - direct `load/store [global]` where the backend supports it. - -3. **Sink**: move a def instruction forward in the block to be adjacent to its - sole use, reducing live range pressure. - -4. **Ext-chain**: fold sign/zero extensions that feed into an instruction that - already performs the extension as part of its semantics. - -Also runs `opt_combine_compact_block` to merge adjacent spill pairs. - -Unchanged by the refactor. +**Source**: src/opt/pass_combine.c + +MIR peephole, per-block fixpoint, four rewrites: + +1. **Substitute** — propagate a copy `r1 ← r2` (or `r1 ← imm` / convert) into + later uses of `r1`, removing redundant moves. The reg-source form is full + local copy propagation: it rewrites every safe use of `r1` to `r2` (gated + only on `r2` being unchanged between the copy and the consumer, which + `ctx_def_changed_since` already verifies with call clobbers folded in); the + stricter single-use + live-out gate is kept only for the immediate/convert + forms, which would otherwise duplicate a multi-instruction materialization. +2. **Addr-mode synthesis** — fold `r ← addr_of`/`r ← base+index*scale` into a + load/store's `OPK_INDIRECT` base/index/scale/offset where the backend + accepts it. +3. **Sink** — move a def adjacent to its sole use to shrink its live range. +4. **Ext-chain** — fold ext-of-ext and ext feeding an instruction that already + extends. + +`opt_combine_compact_block` additionally merges adjacent spill/reload pairs. + +Because the MIR is non-SSA, each rewrite is gated on a **live-range safety +check**: `count_uses_in_live_range` counts uses of the producer's register up to +its next redefinition in the block, and a producer that is **live-out of the +block** (per `opt_block_live_out_has_phys_reg`, backed by the `hard_live` +analysis) is not folded away. Register kills are computed from each +instruction's precise def/clobber set (`inst_kills_phys_reg` → +`opt_hard_inst_use_def`), so a call kills only its caller-saved clobbers — a +value in a callee-saved register survives it. --- ## §12 — opt_mir_dce -**Source**: src/opt/pass_dce.c -**Purpose**: Post-RA dead definition elimination over hard-reg IR. Removes -instructions that define registers that are never used, provided they are -side-effect-free. Runs after `opt_mir_combine` to clean up any copy chains -that were fully substituted away. +**Source**: src/opt/pass_dce.c -Unchanged by the refactor. +Post-RA dead-def elimination over hard-register IR: removes side-effect-free +instructions whose defined registers are never used, cleaning up copy chains +substitution left behind. Uses the same `hard_live` machinery as §11. --- -## §13 — opt_mir_jump_cleanup(CFG) + opt_mir_build_cfg + opt_mir_jump_cleanup(LAYOUT) +## §13 — opt_mir_jump_cleanup(CFG) + build_cfg + jump_cleanup(LAYOUT) -**Source**: src/opt/pass_cfg.c -**Purpose**: -- `OPT_JUMP_CLEANUP_CFG`: Remove unreachable blocks and collapse chains of - unconditional jumps. -- `opt_mir_build_cfg`: Recompute the CFG successor lists. -- `OPT_JUMP_CLEANUP_LAYOUT`: `cleanup_reorder_for_fallthrough` — greedy chain - extension that orders blocks so that the most common taken-branch edge - becomes a fallthrough (eliminating the branch instruction); then strips - redundant unconditional jumps to the textual successor. +**Source**: src/opt/pass_jump.c, src/opt/pass_cfg.c -Unchanged by the refactor. +- `OPT_JUMP_CLEANUP_CFG`: drop unreachable blocks, collapse unconditional-jump + chains. +- `opt_mir_build_cfg`: recompute successor lists. +- `OPT_JUMP_CLEANUP_LAYOUT`: `cleanup_reorder_for_fallthrough` greedily orders + blocks so the hot taken edge falls through, then `cleanup_rotate_loops` + detects simple single-latch loops (header with `IR_CMP_BRANCH`, fallthrough + into the body chain, one back-edge predecessor later in emit order) and + moves the header to just after its latch, inverting the test in place. The + back-edge becomes the conditional branch and the latch falls through to the + header — the per-iteration unconditional back-jump is gone. `cleanup_invert_ + taken_fallthrough` + `cleanup_layout_fallthrough_branches` then strip + trailing `b`-to-next-block and invert any remaining mis-aligned conditional. --- -## §14 — opt_emit_native (replaces old opt_emit) - -**Source**: src/opt/pass_native_emit.c -**Old source (deleted)**: src/opt/pass_emit.c - -### Old implementation: `opt_emit` / `replay_func_to` - -`opt_emit` replayed the optimized MIR through the wrapped `CGTarget` using -`replay_func_to(identity=1)`. The `identity_regs` flag meant that PReg ids -from the optimized IR were used directly as `Reg` values when calling the -backend. The key mechanism: - -- **`xlat_storage`**: Checked `opt_preg_alloc_kind(f, pr)` to map a PReg's - storage to either `CG_LOCAL_STORAGE_REG` (hard register) or - `CG_LOCAL_STORAGE_FRAME` (spill slot). -- **`plan_hard_regs` / `reserve_hard_regs`**: Scanned the replayed IR once to - collect all hard register used, then called the wrapped backend's - `plan_hard_regs` / `reserve_hard_regs` hooks. This allowed the native - backend's frame layout to know the full set of callee-saved registers before - emitting the prologue. -- **`func_begin_known_frame`**: If the target supported it, the frame layout - (slot sizes, alignments, max outgoing call stack) was computed upfront and - passed to the backend in a single `func_begin_known_frame` call. Otherwise - fell back to the streaming `frame_slot` / `func_begin` protocol. - -### New implementation: `opt_emit_native` - -`opt_emit_native` replays the MIR into a `NativeTarget` using `NativeLoc` as -the currency (not `Reg`). The key differences: - -- **`loc_from_operand`**: Converts an `OptOperand` to a `NativeLoc` — - `OPT_OPK_REG` → `NATIVE_LOC_REG`, `OPT_OPK_LOCAL` (frame slot) → - `NATIVE_LOC_FRAME`, `OPT_OPK_IMM` → `NATIVE_LOC_IMM`, etc. -- **`map_frame_slots`**: Pre-allocates all frame slots by calling - `target->frame_slot` for each `IRFrameSlot`. Frame slot mapping is done - upfront via a single scan, similar to the old `known_frame` path. -- **`materialize`**: Loads a non-register `NativeLoc` into a scratch register - when a register operand is required (e.g., for call callee address). -- **`legalize_addr`**: After constructing a `NativeAddr`, calls - `target->addr_legal` to check if the address shape is supported, and if not, - collapses it to a base register via a `load_addr`. - -The `plan_hard_regs` / `reserve_hard_regs` hooks are not called in the new -implementation. This means the native backend must infer callee-saved register -usage from the frame layout pass rather than receiving an explicit list. Whether -this is correct depends on the backend. +## §14 — opt_emit_native + +**Source**: src/opt/pass_native_emit.c + +Replays the MIR into a `NativeTarget` using `NativeLoc` as the operand currency +(not raw `Reg`). Key helpers: + +- **`loc_from_operand`** — `OptOperand` → `NativeLoc`: `OPT_OPK_REG` → + `NATIVE_LOC_REG`, `OPT_OPK_LOCAL` → `NATIVE_LOC_FRAME`, `OPT_OPK_IMM` → + `NATIVE_LOC_IMM`, `OPT_OPK_INDIRECT` → `NATIVE_LOC_ADDR`, etc. +- **`reserve_callee_saves`** — scans the lowered MIR for the callee-saved + registers the allocator actually used and hands them to the backend's + `reserve_callee_saves` hook *before* frame-slot mapping, so the prologue saves + exactly that set. +- **`map_frame_slots`** — pre-allocates every `IRFrameSlot` via + `target->frame_slot` in one scan. +- **`emit_prologue` (minimal prologue)** — once the callee-save set and frame + slots are known the opt path calls this hook on backends that support it + (aa64) and sets `target->emit_minimal_prologue = 1` so `func_begin` skips its + worst-case NOP reservation. The backend emits an exact-size contiguous + prologue in place (no reserved region); the frame-size immediates are still + patched in `func_end` because the final frame isn't known until body emission + allocates its temporaries. `NativeDirectTarget` (O0) leaves `emit_prologue` + unset and keeps the reserve-and-patch path, which now branches over the + unused tail rather than NOP-filling it. +- **call result routing** — for scalar return values `emit_call` now passes the + MIR's destination location (a register or its spill slot) straight to + `plan_call`, so the backend moves the ABI result register directly to its + destination. The old code routed every scalar result through a fresh temp + spill slot (`bl; stur [slot]; ldur`), a round trip on every call; aggregates / + oversized results still use the temp slot path. +- **store-zero register** — when the value operand of an `IR_STORE` is the + integer immediate 0 and the backend advertises a hardware zero register + (`has_store_zero_reg` + `store_zero_reg`), the store goes straight from that + register instead of materializing `movz w,0` into a scratch first + (`strb wzr, …` on aa64). +- **tail-call realizability** — `opt_on_tail_call_unrealizable_reason` returns + a reason when the callee's outgoing stack-arg bytes exceed the area this + function received for its own incoming stack args (via the backend's + `signature_stack_bytes` hook). CG then falls back to an ordinary call + + return for an ALLOWED tail, or panics for `musttail`. +- **`materialize`** — loads a non-register `NativeLoc` into a reserved scratch + register when an instruction needs a register operand. +- **`legalize_addr` / `collapse_addr_to_reg`** — if `target->addr_legal` + rejects an address shape, materialize it into a **reserved scratch register** + via `load_addr` (never the base register — the base may hold a value the + allocator keeps live past this op). --- -## §15 — O0 path - -At `opt_level == 0`, `opt_cgtarget_new` is not called. The CgTarget returned by -`backend->make` is used directly. Before the refactor, the native backends were -CGTarget implementations (`CGTarget`-style). After `edbd83e` (aa64: delete old -CGTarget backend) and related commits, the native backends use `NativeDirectTarget`. - -The O0 regression (cfree O0 is 3.25x slower than gcc-15 -O0 as a geomean across -9 benchmarks) is therefore entirely in `NativeDirectTarget`, which is documented -as a work-in-progress single-pass lowering with many known limitations. +## Backend contract (`NativeTarget`, aa64) + +The emit step depends on these invariants; performance work that touches the +allocator or emit must preserve them. + +- **2-scratch discipline.** An emit hook may use at most the **2 reserved + scratch registers** as private temporaries (aa64 int `x9`/`x10`; the hand- + written-PLT temps `x16`/`x17` = TMP0/TMP1 are reserved separately and used by + address-materialization helpers). Every other register an op needs — operand + bases, results — is **provided by the caller**. The optimizer owns all + allocation; ops clobber nothing the allocator tracks live. +- **Allocable set (aa64).** Int: caller-saved `x8, x11..x15` first (preferred, + low spill cost), then callee-saved `x19..x28` under pressure. FP: caller-saved + `v18, v19` then callee-saved `v8..v15`. Scratch: int `{x9, x10}`, FP + `{v20, v21}` — disjoint from allocable. **Argument registers `x0..x7`/`v0..v7` + are *not* allocable** (see limitations). +- **No `plan_hard_regs`/`reserve_hard_regs`.** Because only caller-saved (plus + optionally-reserved callee-saved) registers are allocable and every emit hook + receives fully-resolved hard regs, the backend never needs an up-front + hard-reg plan. Callee-save planning happens through `reserve_callee_saves`. +- **Shared cores.** aa64 ops are factored so the semantic-operand path + (`NativeDirectTarget`, O0) and the `NativeLoc` path (this pipeline, O1) call + one shared core; the wrappers only convert operands. --- -## Summary of regressions introduced by f60a16d - -### Regression A — Bypassed functions (O1 quality loss) - -Functions with alloca, varargs, inline asm, or aggregate types now bypass all -optimization via `opt_replay_cg_ir_direct`. Before the refactor, these went -through the full optimizer. This is incorrect: alloca functions should still -benefit from register allocation on their non-alloca code; the bypass is a -conservative workaround for unimplemented cases in `cg_ir_lower.c`. +## §15 — O0 path (boundary) -### Regression B — Local classification round-trip (O1 quality loss) - -The new `cg_ir_lower.c` classifies address-taken locals as FRAME storage from -the start, rather than starting them as register storage and inserting -explicit load/store pairs (as `opt_frame_home_addr_taken_locals` did). The -downstream passes `opt_addr_xform_pregs` + `opt_promote_scalar_locals` are -supposed to recover register storage for non-escaping locals, but: - -1. `opt_promote_scalar_locals` rejects a slot if `OPK_LOCAL` appears anywhere - outside `IR_LOAD.opnds[1]` or `IR_STORE.opnds[0]`. But `lower_operand_value` - emits `OPK_LOCAL` for ALL value uses of an address-taken local — including - call args and binop operands — not only through explicit load/store. - -2. Any local whose `IR_ADDR_OF` use is not fully foldable by `opt_addr_xform_pregs` - (because the pointer truly escapes, or because of EA-shaped uses) retains - `FSF_ADDR_TAKEN` and is never promoted. - -The net effect: locals that are address-taken but whose address doesn't truly -escape (very common in C — temporaries, loop iteration variables, short-lived -pointers) stay in FRAME storage at O1 and incur memory traffic that the old -optimizer eliminated. - -### Regression C — Missing plan_hard_regs / reserve_hard_regs (potential correctness issue) - -The old `replay_func_to` called `target->plan_hard_regs` and `target->reserve_hard_regs` -with the complete set of hard registers used by the function. These hooks let the -native backend plan its callee-save region before emitting the prologue. The new -`opt_emit_native` does not call these hooks. If any backend relies on them for -correct callee-save handling, this is a correctness issue, not just a quality -issue. - -### Regression D — O0 path replaced by NativeDirectTarget - -Not introduced by f60a16d itself, but by the series of commits ending at -`edbd83e` (delete old CGTarget backend). The O0 regression is a NativeDirectTarget -limitation, separate from the optimizer. +At `opt_level == 0` the optimizer is not installed; the backend's +`NativeDirectTarget` (src/cg/native_direct_target.c) emits directly in a single +pass. It is a separate, simpler lowering with its own (documented) limitations +and is not part of this pipeline. --- -## Remediation priorities - -1. **Fix local classification** (§2 / Regression B): Either restore the - `opt_frame_home_addr_taken_locals` approach (start as REG, insert sync - pairs lazily), or extend `opt_promote_scalar_locals` to handle the - `OPK_LOCAL`-in-value-position case. The simpler fix is to not use - `OPK_LOCAL` as a value operand in `lower_operand_value` for address-taken - locals; instead emit `IR_LOAD(preg, frame_slot)` inline, matching what - `opt_frame_home_addr_taken_locals` used to produce. - -2. **Restore bypass functions to optimizer** (Regression A): Extend - `cg_ir_lower.c` to handle the cases currently routed to - `opt_replay_cg_ir_direct`. At minimum, alloca and non-aggregate vararg - functions should go through the optimizer. +## Debugging aids -3. **Wire plan_hard_regs / reserve_hard_regs** (Regression C): Survey the - aarch64 / x64 / rv64 native backends to check whether they implement these - hooks and what happens when they are not called. +- `CFREE_DUMP=<tag>` — dump the optimizer IR at a named stage and panic + (`entry`, `pre-emit`, or any `opt_verify` tag above). `CFREE_DUMP=1` dumps at + `entry`. Add an `opt_dbg_dump(o, f, "<tag>")` call next to an existing + `opt_verify` to expose another stage. +- `CFREE_DUMPCG=1` — dump the semantic CG IR tape (pre-lowering). Note the CG-IR + dumper does not print `OPK_INDIRECT` offsets. +- Both panic on the first recorded function; to dump all functions, temporarily + swap the `compiler_panic` in `opt_dbg_dump` / `opt_dbg_dump_cg` for + `cfree_debug_printf`. +- Disassembly: `cfree objdump -d <obj>` or `lldb -b -o "disassemble -n <fn>"`. --- -# Recovery Project: goal, decisions, and checklist - -This section tracks the active work to **fully recover O1 on the new -`CgIrFunc` → `NativeTarget` path and delete the direct-replay path entirely**. -It supersedes the speculative remediation notes above where they conflict. - -## Goal / success criteria - -1. **Completeness (priority 1):** every function compiles through the - optimizer pipeline. No `opt_replay_cg_ir_direct`. No per-op bypass. -2. **Correctness (priority 2):** `CFREE_OPT_LEVELS=1 CFREE_TEST_PATHS=R - ./test/toy/run.sh` fully green, and the default test suites stay green. -3. **Performance (priority 3):** O1 vs `gcc-15 -O0` (baseline in - `doc/OPT_PERF.md`): compile-time ≈5x faster, runtime ≈2x faster (geomean). - Measure only from a **clean `RELEASE=1` build** (the dev build is - ASan/UBSan-instrumented and not representative). - -Scope: **aa64 only** (the only `NativeTarget` implementation; host arch here). -x64/rv64 `NativeTarget` ports are out of scope. - -## Design decisions - -- **Arch/ABI neutrality at the optimizer boundary.** The optimizer (cg_ir_lower, - pass_native_emit, machinize) must make no ABI/layout assumptions. For varargs - it passes the va_list *pointer* opaquely (a `NativeLoc`) and the `va_arg` - type; all layout knowledge (pointer ABI vs AAPCS64 register-save-area, field - offsets, sizes) lives behind the `NativeTarget` va hooks, which query - `abi_va_list_layout`. Same principle applies to asm and aggregates. -- **Backend scratch discipline.** A `NativeTarget` op may use at most the **2 - reserved scratch registers** (aa64 `x16`/`x17` = TMP0/TMP1) as private - temporaries. Every *other* register it needs (operand bases, results) must be - **provided by the caller**, so the optimizer owns allocation/planning and the - op clobbers nothing the register allocator tracks live. This removed any need - for a va clobber-set; it is the contract new hooks must follow. -- **plan/reserve_hard_regs are unnecessary (Regression C is closed).** The aa64 - `NativeTarget` exposes only caller-saved registers as allocable - (`aa_int_allocable = {x8,x11..x15}`, scratch `{x9,x10}`), so the allocator - never assigns a callee-saved register and the prologue correctly saves only - FP/LR. Every emit hook already receives fully-resolved hard regs. The hooks - are not wired and will not be. The *consequence* is a small allocable set - (perf item below), not a correctness gap. -- **Shared cores, two thin wrappers.** aa64 backend ops are factored so the - semantic-operand path (`NativeDirectTarget`, -O0) and the `NativeLoc` path - (optimizer, O1) call one shared core; the wrappers only convert operands. - -## Checklist - -Integration test (drives this work): -`CFREE_OPT_LEVELS=1 CFREE_TEST_PATHS=R ./test/toy/run.sh` -(The `CFREE_NO_DIRECT_REPLAY` gate and the direct-replay path are now deleted — -every function goes through the optimizer unconditionally, so no gate is -needed.) - -Completeness — route all ops through the optimizer: -- [x] **Varargs** — `va_start_/va_arg_/va_end_/va_copy_` hooks on `NativeTarget`; - aa64 shared cores (2-temp discipline); cg_ir_lower lowers va operands as - pointer *values*; `local_address_used_in_cg_ir` no longer flags va - operands as address-taken. -- [x] **Latent `IR_ADDR_OF` spill-writeback bug** in `pass_native_emit.c` - (ADDR_OF result computed into scratch was never stored back). -- [x] **Inline asm** (`IR_ASM_BLOCK`) — DONE. `NativeTarget` `asm_block` binds - the optimizer's pre-allocated operand registers to the template (no - self-allocation; inputs are already live in their regs and outputs are - consumed via use/def data flow). aa64 asm clobber-mask / callee-save / - restore helpers refactored off `NativeDirectTarget` onto `AANativeTarget`. - Toy cases 102,104,105,108,110,19,20 all pass. -- [x] **Aggregates / sret / byval** — DONE. Aggregate locals forced to frame; - per-part ABI typing in plan_call/plan_ret; aggregate results via - copy_bytes; aggregate-typed IR_COPY/IR_LOAD/IR_STORE via copy_bytes; sret - x8 set after argument loads (was clobbering an arg allocated to x8). - 124/130/36/37 all pass. -- [ ] **BREAK_TO / CONTINUE_TO + SCOPE cond** — currently unused by frontends - (toy/c lower break/continue to `BR`+labels), but unwired in emit. Either - lower them to CFG edges in cg_ir_lower or wire emit, for true - completeness once a producer exists. - -Direct-path deletion: -- [x] Delete `opt_func_needs_direct_replay`, `opt_replay_cg_ir_direct`, the - `OptReplay` machinery, and the `replay_*` helpers in `opt.c`. DONE — every - function now goes through the optimizer (opt.c 680 → 269 lines). -- [x] Remove the `CFREE_NO_DIRECT_REPLAY` env gate. DONE. - -Performance (priority 3, after completeness + correctness): -- [x] **Expand aa64 allocable set — callee-saved** — DONE. x19..x28 and d8..d15 - are allocable (caller-saved stay first so they're preferred; callee-saved - chosen only under pressure). `pass_native_emit` scans the lowered MIR for - the callee-saved regs the allocator used and passes them to a new - `reserve_callee_saves` `NativeTarget` hook before frame-slot mapping; the - aa64 backend reserves save slots first (small FP-relative offsets), saves - them in the back-patched prologue and restores them in the epilogue and - the tail-call patch. FP class gained a `callee_saved_mask` (d8..d15). -- [ ] **Expand aa64 allocable set — arg registers** — make x0..x7 / v0..v7 - allocable so values (params, call args, temporaries) can live in them. - No plan yet. Key finding from the attempt (reverted): there is no safe - "entry-only" subset — making arg regs allocable simultaneously exposes a - parallel-move / shuffle hazard in *four* ABI emission paths (call-argument - setup, tail-call argument setup, function entry / param materialization, - and multi-result return). A "forbid arg-reg sources at calls/returns + - let each param keep only its own incoming arg reg" subset got the bypass - probe from 14→11 toy R-O1 failures but newly broke 3 previously-passing - tests, i.e. the subset is unsound. Correct approach: one general - parallel-move-with-memory sequencer (cycle-break via reserved scratch; - x0..x7/v0..v7 currently excluded so sources and destinations are disjoint) - applied uniformly to all four paths, gated by permutation/swap stress - tests. The across-call correctness safety net already exists - (`rewrite_call_save_one` saves caller-saved live-across-call values keyed - on the clobber mask, which includes arg regs). -- [~] **Local classification (Regression B)** — param round-trip CLOSED: - `bind_param` now receives the allocator-chosen destination `NativeLoc`, so - a register-allocated scalar param moves straight from its incoming arg - register into its hard register (no frame store+reload); the - `IR_PARAM_DECL` marker emits nothing. Remaining gap: address-taken locals - whose address is stored into a pointer variable then dereferenced are not - promoted (needs ADDR_OF copy-propagation through PRegs); genuinely aliased - locals correctly stay in memory. `opt_addr_xform_pregs` + - `opt_promote_scalar_locals` already handle direct `ADDR_OF(local)` → - load/store base folding. -- [ ] **Unit tests** — new targeted tests for the `CgIrFunc`→`NativeTarget` - path (local promotion, addr-fold, regalloc, lowered bypass ops); - re-enable a `test-opt` make target (old `test/opt/opt_test.c` is disabled - and uses the pre-refactor API). -- [ ] **Benchmark** — clean `RELEASE=1` build, run `make bench-opt`, confirm the - O1-vs-`gcc-15 -O0` targets; refresh `doc/OPT_PERF.md`. - -## Progress log - -- Varargs landed end-to-end on the optimizer path; `IR_ADDR_OF` writeback fixed. - Bypass-disabled R-path failures: 14 → 11. Default R-path (O0+O1): 408/408. - (commit "opt: route varargs through optimizer path; fix ADDR_OF spill writeback") -- Aggregate ABI, partial (commit "opt: aggregate ABI lowering ... (partial)"): - - Force aggregate / >8-byte locals to frame in `cg_ir_lower` `lower_locals` - (a 16-byte struct result local was being allocated to a single PReg). - - Type each ABI part by its own width in `aa_plan_call`/`aa_plan_ret` direct - paths via `aa_part_scalar_type` (was using the aggregate type → truncating - `mov w0,w9` for i64 fields). - - `emit_call`/`emit_ret`: aggregate/oversized results use `copy_bytes` / hand - `plan_ret` the value's memory location directly (no scalar temp copy). - - Result: no more "scalar too large" panics; sret no longer truncates. But - values still wrong (130→0, 124→40, 36/37→160); still 11 bypass-off failures. - -- Aggregate COPY/LOAD/STORE via `copy_bytes` (commit "opt: handle - aggregate-typed COPY/LOAD/STORE via byte copy"): the root cause of the - truncated/mis-offset aggregate moves was that `IR_COPY`/`IR_LOAD`/`IR_STORE` - on aggregate-typed operands were emitted as scalar moves. 130 and 124 now - pass. Bypass-off R-path failures: 11 → 9 (7 asm + 2 tail-sret). Default - R-path: 408/408. - -- Inline asm + direct-path deletion (commit "opt: route inline asm through - optimizer; delete direct-replay path"): `asm_block` `NativeTarget` hook binds - pre-allocated operand registers; all 7 asm cases pass bypass-off. With asm - done the bypass-off R-path reached 0 failures (tail-sret had already been - fixed), so the entire direct-replay path + `CFREE_NO_DIRECT_REPLAY` gate were - deleted (opt.c 680 → 269 lines). Every function now goes through the - optimizer. Full toy suite (R/L/C/W × O0/O1/O2): 1333 pass, 0 fail, 8 skip. - -- Callee-saved registers allocable (commit "aa64: make callee-saved registers - allocable at O1"): x19..x28 / d8..d15 added to the allocable set with - prologue/epilogue + tail-call save/restore via the new `reserve_callee_saves` - hook. Verified O0==O1 on register-pressure int and fp programs. - -- Parameter-home elimination (commit "opt: route incoming params straight into - their allocated register"): `bind_param` takes a destination `NativeLoc`; - params no longer round-trip through a frame home. Deleted - `allocate_param_home` / `local_home_for_preg` / `param_home_by_preg`. - -- Arg-register allocability attempted and reverted (see the unchecked perf item - above for the finding): the "entry-only" subset is unsound; the change is - indivisible and needs a general parallel-move sequencer across all four ABI - paths. Tree left at the param-home-elimination commit, suite green. - -Debugging aids: `CFREE_NO_DIRECT_REPLAY=1 cfree cc -O1 -c <case>.toy` + -`cfree objdump -d`; `CFREE_DUMP=1` / `CFREE_DUMPCG=1` dump optimizer/CG IR -(they `compiler_panic` on the first recorded function — temporarily swap the -panic in `opt_dbg_dump_cg`/`opt_dbg_dump` for `cfree_debug_printf` to dump all -functions). Note the CG-IR dumper does not print INDIRECT `ofs`. +## Known limitations affecting performance + +Feeds `doc/OPT_O1_PERF_TODO.md`. Recovery of the new pipeline is functionally +complete (every function optimized; varargs/asm/aggregates handled; callee-saved +allocable; params no longer round-trip through a frame home). Remaining +quality gaps: + +- **Argument registers not allocable.** `x0..x7`/`v0..v7` cannot hold + allocator-managed values, shrinking the allocable set and forcing extra spills + in register-pressured code. An "entry-only" subset was attempted and reverted + as unsound: making arg regs allocable simultaneously exposes a parallel-move / + shuffle hazard across four ABI paths (call-arg setup, tail-call arg setup, + function entry / param materialization, multi-result return). The correct fix + is one general parallel-move-with-memory sequencer (cycle-break via reserved + scratch) applied uniformly to all four, gated by permutation/swap stress + tests. +- **Address-taken-local promotion gap.** A local whose address is stored into a + pointer variable and later dereferenced stays in frame storage even when it + does not truly escape (§5). Needs ADDR_OF copy-propagation through PRegs. +- **`BREAK_TO` / `CONTINUE_TO` + SCOPE cond** are unwired in emit (no current + frontend produces them — toy/C lower break/continue to `BR` + labels). Wire + emit or lower to CFG edges once a producer exists. +- **Unit-test coverage** for the `CgIrFunc`→`NativeTarget` path (local + promotion, addr-fold, regalloc, lowered bypass ops) is thin; the old + `test/opt/opt_test.c` uses the pre-refactor API and is disabled. diff --git a/doc/OPT_O1_PERF_TODO.md b/doc/OPT_O1_PERF_TODO.md @@ -0,0 +1,91 @@ +# cfree -O1 performance TODO + +Working list of benchmarks where cfree's `-O1` codegen leaves clear runtime on +the table. These are the high-confidence targets pulled from the full +15-benchmark `scripts/opt_bench.sh` sweep (aarch64, Apple). + +## The bar + +cfree `-O1` runtime should be **at least 10% faster** than *both* reference +points — i.e. cfree runtime ≤ 0.90× theirs (speedup ≥ 1.10×): + +- **gcc-15 -O0** — an unoptimized native baseline; losing here means we trail + even a compiler doing no real optimization. +- **mir-c2m -O1** — a fast lightweight JIT-class optimizer; a peer we should + match or beat. + +A benchmark stays on this list until it clears 1.10× against both (where the +reference exists). + +## Current standings + +Numbers below are from a single-run sweep (`COMPILE_REPEATS=1`, +`RUN_REPEATS=1`), runtime in ms; speedup = reference_time / cfree_time +(>1 means cfree is faster). The four entries here were chosen because their +gaps are large enough to be real signal rather than single-run noise. + +| bench | cfree -O1 | gcc -O0 | vs gcc-O0 | mir -O1 | vs mir-O1 | behind | +| --- | ---: | ---: | ---: | ---: | ---: | --- | +| binary-trees | 3238 | 2562 | **0.79×** (slower) | n/a¹ | — | gcc | +| lists | 8417 | 8583 | 1.02× | 4907 | **0.58×** | gcc, mir | +| hash2 | 5965 | 7110 | 1.19× ✓ | 3765 | **0.63×** | mir | +| sieve | 5066 | 4977 | 0.98× | 4107 | **0.81×** | gcc, mir | + +¹ mir-c2m fails to compile `binary-trees` in our setup, so only the gcc +comparison applies there. + +## Per-benchmark notes + +### binary-trees — slower than unoptimized gcc (highest priority) +The only case where cfree `-O1` is *slower than gcc -O0* (0.79×). Workload is +allocation- and pointer-chasing-heavy (recursive tree build/walk). Likely +suspects: call-heavy inner loops, per-node field access codegen, and whatever +we emit around the allocator calls. Start here. + +### lists — 0.58× vs mir +Matches gcc-O0 (1.02×) but loses badly to mir. Doubly-linked-list traversal and +splice: lots of `p->next`/`p->prev` loads/stores in tight loops. The recently +fixed O1 store-address clobber bug lived in this same code shape; correctness is +now fine, but the field-access/loop codegen is still far behind mir. Compare our +loop bodies against mir's to find the gap. + +### hash2 — 0.63× vs mir +Clears the gcc bar (1.19×) but is the second-worst against mir. Open-addressing +hash probing — integer hashing arithmetic plus indexed loads in the probe loop. +Look at index/address computation and whether the probe loop carries redundant +materialization. + +### sieve — 0.81× vs mir, ~tied with gcc +Behind both. Classic nested sieve loop over a byte/array — bounded counted loops +with strided stores. Suggests loop/IV and array-addressing codegen headroom +(strength reduction, address-mode folding in the inner store). + +## Reproducing + +```sh +# Build the optimized compiler first (clean release): +rm -rf build/release && make RELEASE=1 bin + +# Run just these four, single-run, O0+O1, gcc + cfree + mir: +CFREE="$PWD/build/release/cfree" \ +CFREE_OPT_BENCHES="binary-trees lists hash2 sieve" \ +CFREE_OPT_BENCH_LEVELS="0 1" \ +CFREE_OPT_BENCH_COMPILE_REPEATS=1 CFREE_OPT_BENCH_RUN_REPEATS=1 \ +CFREE_OPT_BENCH_SKIP_GCC=0 CFREE_OPT_BENCH_SKIP_CLANG=1 CFREE_OPT_BENCH_SKIP_MIR=0 \ +bash scripts/opt_bench.sh +``` + +Per-iteration codegen for a single function is easiest to inspect via the +optimizer's staged IR dump (`CFREE_DUMP=pre-emit cfree cc -O1 -c bench.c ...`, +which panics after printing the pre-emit MIR) plus `objdump`/`lldb` +disassembly of the hot function. + +## Notes / caveats + +- Single-run timings: the four here were picked for gap size, but re-run with + repeats before/after a change to confirm movement (especially anything that + lands near the 1.0× line, e.g. `sieve`). +- Not on this list but also marginal (within single-run noise, 0.92–1.05×): + `hash`, `funnkuch-reduce`, `strcat`. Revisit once the four above improve. +- `cfree-run` (JIT) shares this codegen, so its runtimes track `cfree cc -O1`; + fixing these helps both paths. diff --git a/src/arch/aa64/isa.h b/src/arch/aa64/isa.h @@ -839,6 +839,7 @@ static inline u32 aa64_ldp64_pre(u32 Rt, u32 Rt2, u32 Rn, i32 imm7_scaled) { .Rt = Rt}); } + /* ==================================================================== * Hint instructions (NOP / YIELD / WFE / WFI / SEV / SEVL) * 1101 0101 0000 0011 0010 CRm(4) op2(3) 11111 @@ -972,6 +973,28 @@ static inline AA64LdStPSOff aa64_ldstp_soff_unpack(u32 w) { return f; } +/* 64-bit integer STP/LDP, signed offset (no writeback). imm7_scaled is + * byte_offset / 8. Used for the prologue/epilogue frame record and callee-save + * pairs, which address off a fixed base (x17 / fp). */ +static inline u32 aa64_stp64_soff(u32 Rt, u32 Rt2, u32 Rn, i32 imm7_scaled) { + return aa64_ldstp_soff_pack((AA64LdStPSOff){.opc = 2, + .V = 0, + .L = 0, + .imm7 = (u32)imm7_scaled & 0x7fu, + .Rt2 = Rt2, + .Rn = Rn, + .Rt = Rt}); +} +static inline u32 aa64_ldp64_soff(u32 Rt, u32 Rt2, u32 Rn, i32 imm7_scaled) { + return aa64_ldstp_soff_pack((AA64LdStPSOff){.opc = 2, + .V = 0, + .L = 1, + .imm7 = (u32)imm7_scaled & 0x7fu, + .Rt2 = Rt2, + .Rn = Rn, + .Rt = Rt}); +} + /* ==================================================================== * Load/store, unscaled 9-bit signed offset (LDUR / STUR, V=0 and V=1). * size(2) 111 V(1) 00 opc(2) 0 imm9(9) 00 Rn(5) Rt(5) diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c @@ -117,6 +117,7 @@ typedef struct AANativeTarget { u32 func_start; u32 prologue_pos; + u32 minimal_prologue_words; /* opt path: exact prologue length, else 0 */ MCLabel epilogue_label; AACalleeSave callee_saves[AA_MAX_CALLEE_SAVES]; @@ -804,10 +805,11 @@ static void aa_emit_variadic_reg_saves(AANativeTarget* a) { aa_emit_q_frame(a, 0, r, a->va_vr_slot, r * vai.fp_slot_size); } +static void aa_emit_entry_saves(AANativeTarget* a); + static void aa_func_begin(NativeTarget* t, const CGFuncDesc* fd) { AANativeTarget* a = aa_of(t); MCEmitter* mc = t->mc; - const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fd->fn_type); a->func = fd; a->nslots = 0; a->cum_off = AA_FRAME_SAVE_SIZE; @@ -829,8 +831,24 @@ static void aa_func_begin(NativeTarget* t, const CGFuncDesc* fd) { mc_begin_function(mc, fd->sym, fd->text_section_id, a->func_start); if (mc->cfi_startproc) mc->cfi_startproc(mc); a->prologue_pos = mc->pos(mc); - for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) aa_emit32(mc, 0xd503201fu); + a->minimal_prologue_words = 0; a->epilogue_label = mc->label_new(mc); + /* Optimizer path: emit nothing here. The exact-size prologue and the + * sret/variadic entry saves are emitted later by aa_emit_prologue, once the + * callee-save set and frame slots are known. The single-pass path reserves a + * worst-case region (patched in func_end) and emits the entry saves now. */ + if (t->emit_minimal_prologue) return; + for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) aa_emit32(mc, 0xd503201fu); + aa_emit_entry_saves(a); +} + +/* Emit the sret-pointer save (x8 → slot) and, for variadic functions, the + * argument register-save area. Run immediately after the prologue frame setup + * on both the single-pass path (from func_begin) and the optimizer path (from + * aa_emit_prologue). */ +static void aa_emit_entry_saves(AANativeTarget* a) { + NativeTarget* t = &a->base; + const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, a->func->fn_type); if (abi && abi->has_sret) { NativeFrameSlotDesc sd; NativeAddr addr; @@ -901,20 +919,14 @@ static void aa_reserve_callee_saves(NativeTarget* t, const u32* used, } static MemAccess aa_mem_for_type(NativeTarget* t, CfreeCgTypeId type, u32 size); +static void aa_words_callee_saves(AANativeTarget* a, int save, u32* words, + u32 cap, u32* n); static void aa_emit_callee_restores(AANativeTarget* a) { - MemAccess mem; - for (u32 i = a->ncallee_saves; i > 0; --i) { - const AACalleeSave* cs = &a->callee_saves[i - 1u]; - NativeAddr addr; - NativeLoc reg = aa_reg_loc(cs->type, (NativeAllocClass)cs->cls, cs->reg); - memset(&addr, 0, sizeof addr); - addr.base_kind = NATIVE_ADDR_BASE_FRAME; - addr.base.frame = cs->slot; - addr.base_type = cs->type; - mem = aa_mem_for_type(&a->base, cs->type, 8); - aa_emit_mem(a, 1, reg, addr, mem); - } + u32 words[AA_PROLOGUE_WORDS]; + u32 n = 0; + aa_words_callee_saves(a, 0, words, AA_PROLOGUE_WORDS, &n); + for (u32 i = 0; i < n; ++i) aa_emit32(a->base.mc, words[i]); } static void aa_words_load_imm(AANativeTarget* a, u32* words, u32 cap, u32* n, @@ -972,49 +984,110 @@ static void aa_words_saved_pair_addr(AANativeTarget* a, u32* words, u32 cap, static void aa_words_restore_frame(AANativeTarget* a, u32* words, u32 cap, u32* n, u32 frame_size) { if (!frame_size) return; - if (*n + 4u > cap) aa_panic(a, "instruction patch too small"); + if (*n + 3u > cap) aa_panic(a, "instruction patch too small"); words[(*n)++] = aa64_add_imm(1, AA_TMP0, AA_FP, 0, 0); - words[(*n)++] = aa_ldur_v(3, 0, AA_FP, AA_TMP0, -16); - words[(*n)++] = aa_ldur_v(3, 0, AA_LR, AA_TMP0, -8); + words[(*n)++] = aa64_ldp64_soff(AA_FP, AA_LR, AA_TMP0, -2); /* fp,lr @ -16 */ words[(*n)++] = aa64_add_imm(1, AA_SP, AA_TMP0, 0, 0); } -static void aa_patch_prologue(AANativeTarget* a, u32 frame_size) { - u32 words[AA_PROLOGUE_WORDS]; +/* Emit callee-save store (save=1) or restore (save=0) words into `words`, + * pairing adjacent integer registers into a single stp/ldp. reserve_callee_saves + * allocates consecutive 8-byte slots in order, so callee_saves[i] sits 8 bytes + * above callee_saves[i+1]; for an int pair the lower-addressed reg[i+1] is the + * stp's Rt and reg[i] is Rt2. FP registers (and an unpaired trailing int) use + * the single-register stur/ldur form. */ +static void aa_words_callee_saves(AANativeTarget* a, int save, u32* words, + u32 cap, u32* n) { + for (u32 i = 0; i < a->ncallee_saves;) { + const AACalleeSave* cs = &a->callee_saves[i]; + i32 off = -(i32)aa_slot(a, cs->slot)->off; + if (off < -256 || off > 255) + aa_panic(a, "callee-save offset out of prologue range"); + if (i + 1u < a->ncallee_saves && cs->cls == (u8)NATIVE_REG_INT && + a->callee_saves[i + 1u].cls == (u8)NATIVE_REG_INT) { + const AACalleeSave* cs2 = &a->callee_saves[i + 1u]; + i32 off2 = -(i32)aa_slot(a, cs2->slot)->off; /* off - 8, lower address */ + if (*n >= cap) aa_panic(a, "prologue too large"); + words[(*n)++] = save + ? aa64_stp64_soff(cs2->reg, cs->reg, AA_FP, off2 / 8) + : aa64_ldp64_soff(cs2->reg, cs->reg, AA_FP, off2 / 8); + i += 2u; + } else { + u32 v = cs->cls == (u8)NATIVE_REG_FP ? 1u : 0u; + if (*n >= cap) aa_panic(a, "prologue too large"); + words[(*n)++] = save ? aa_stur_v(3, v, cs->reg, AA_FP, off) + : aa_ldur_v(3, v, cs->reg, AA_FP, off); + i += 1u; + } + } +} + +/* Build the prologue instruction words for `frame_size` into `words` (capacity + * `cap`), returning the count. Shared by the NativeDirectTarget patch path + * (reserves a fixed worst-case region, then patches it here) and the optimizer + * path (emits an exact-size region up front; see aa_emit_prologue). */ +static u32 aa_build_prologue_words(AANativeTarget* a, u32 frame_size, u32* words, + u32 cap) { u32 n = 0; + if (!frame_size) return 0; + aa_words_sub_sp_frame(a, words, cap, &n, frame_size); + aa_words_saved_pair_addr(a, words, cap, &n, frame_size); + if (n >= cap) aa_panic(a, "prologue too large"); + words[n++] = aa64_stp64_soff(AA_FP, AA_LR, AA_TMP1, 0); /* fp,lr @ [x17] */ + aa_words_frame_ptr_from_sp(a, words, cap, &n, frame_size); + /* Save callee-saved registers the allocator used, FP-relative. Their slots + * were reserved first (aa_reserve_callee_saves), so offsets fit stur's + * signed-9-bit immediate. */ + aa_words_callee_saves(a, 1, words, cap, &n); + return n; +} + +/* Patch the reserved prologue region (`region` words at prologue_pos) with the + * real prologue for `frame_size`. Used by the NativeDirectTarget single-pass + * path, which reserves AA_PROLOGUE_WORDS up front before the frame is known. + * The optimizer path reserves exactly the words it needs, so `region` equals + * the real prologue length and no tail remains. */ +static void aa_patch_prologue(AANativeTarget* a, u32 frame_size, u32 region) { + u32 words[AA_PROLOGUE_WORDS]; + u32 n; ObjSecId sec = a->func->text_section_id; + if (region > AA_PROLOGUE_WORDS) aa_panic(a, "prologue region too large"); memset(words, 0, sizeof words); - if (frame_size) { - aa_words_sub_sp_frame(a, words, AA_PROLOGUE_WORDS, &n, frame_size); - aa_words_saved_pair_addr(a, words, AA_PROLOGUE_WORDS, &n, frame_size); - if (n + 2u > AA_PROLOGUE_WORDS) aa_panic(a, "prologue too large"); - words[n++] = aa_stur_v(3, 0, AA_FP, AA_TMP1, 0); - words[n++] = aa_stur_v(3, 0, AA_LR, AA_TMP1, 8); - aa_words_frame_ptr_from_sp(a, words, AA_PROLOGUE_WORDS, &n, frame_size); - /* Save callee-saved registers the allocator used, FP-relative. Their slots - * were reserved first (aa_reserve_callee_saves), so offsets fit stur's - * signed-9-bit immediate. */ - for (u32 i = 0; i < a->ncallee_saves; ++i) { - const AACalleeSave* cs = &a->callee_saves[i]; - i32 off = -(i32)aa_slot(a, cs->slot)->off; - if (n >= AA_PROLOGUE_WORDS) aa_panic(a, "prologue too large"); - if (off < -256 || off > 255) - aa_panic(a, "callee-save offset out of prologue range"); - words[n++] = aa_stur_v(3, cs->cls == (u8)NATIVE_REG_FP ? 1u : 0u, cs->reg, - AA_FP, off); - } - } - while (n < AA_PROLOGUE_WORDS) words[n++] = 0xd503201fu; - for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) + n = aa_build_prologue_words(a, frame_size, words, region); + /* If the real prologue is shorter than the reserved region (the worst-case + * NDT reservation), branch straight to the body rather than leaving the + * trailing slots as NOPs that fall through and execute on every call. */ + if (n < region) { + words[n] = aa64_b(region - n); + for (u32 i = n + 1u; i < region; ++i) words[i] = 0xd503201fu; + } + for (u32 i = 0; i < region; ++i) aa_patch32(a->base.obj, sec, a->prologue_pos + i * 4u, words[i]); } +/* Optimizer path: emit an exact-size prologue in place (no reserved NOP + * region). The callee-save set and the static frame slots are final by now, so + * the prologue's instruction count is fixed; only the frame-size immediates + * (sub sp / save-area address / fp = sp+frame) still depend on body-emitted + * temporaries and are patched in func_end. We size the region with a frame that + * fits add/sub's imm12 (the real frame must too, or func_end's rebuild — capped + * at this length — panics). The sret/variadic entry saves follow, as on the + * single-pass path. */ +static void aa_emit_prologue(NativeTarget* t) { + AANativeTarget* a = aa_of(t); + u32 words[AA_PROLOGUE_WORDS]; + u32 est_frame = align_up_u32(a->cum_off + a->max_outgoing, 16u); + u32 n = aa_build_prologue_words(a, est_frame, words, AA_PROLOGUE_WORDS); + for (u32 i = 0; i < n; ++i) aa_emit32(t->mc, words[i]); + a->minimal_prologue_words = n; + aa_emit_entry_saves(a); +} + static void aa_emit_restore_frame(AANativeTarget* a, u32 frame_size) { MCEmitter* mc = a->base.mc; if (!frame_size) return; aa_emit32(mc, aa64_add_imm(1, AA_TMP0, AA_FP, 0, 0)); - aa_emit32(mc, aa_ldur_v(3, 0, AA_FP, AA_TMP0, -16)); - aa_emit32(mc, aa_ldur_v(3, 0, AA_LR, AA_TMP0, -8)); + aa_emit32(mc, aa64_ldp64_soff(AA_FP, AA_LR, AA_TMP0, -2)); /* fp,lr @ -16 */ aa_emit32(mc, aa64_add_imm(1, AA_SP, AA_TMP0, 0, 0)); } @@ -1030,19 +1103,12 @@ static void aa_patch_allocas(AANativeTarget* a) { } } -/* Append FP-relative loads that restore the saved callee registers. Shared by - * the tail-call patch; the function epilogue uses aa_emit_callee_restores. */ +/* Append FP-relative loads that restore the saved callee registers (stp/ldp + * paired, same as the prologue saves). Shared by the tail-call patch; the + * function epilogue uses aa_emit_callee_restores. */ static void aa_words_callee_restores(AANativeTarget* a, u32* words, u32 cap, u32* n) { - for (u32 i = a->ncallee_saves; i > 0; --i) { - const AACalleeSave* cs = &a->callee_saves[i - 1u]; - i32 off = -(i32)aa_slot(a, cs->slot)->off; - if (*n >= cap) aa_panic(a, "patch too small for callee restores"); - if (off < -256 || off > 255) - aa_panic(a, "callee-save offset out of restore range"); - words[(*n)++] = aa_ldur_v(3, cs->cls == (u8)NATIVE_REG_FP ? 1u : 0u, cs->reg, - AA_FP, off); - } + aa_words_callee_saves(a, 0, words, cap, n); } static void aa_patch_tail_sites(AANativeTarget* a, u32 frame_size) { @@ -1073,15 +1139,20 @@ static void aa_func_end(NativeTarget* t) { AANativeTarget* a = aa_of(t); MCEmitter* mc = t->mc; u32 frame_size = align_up_u32(a->cum_off + a->max_outgoing, 16u); + /* Optimizer path emitted an exact-size prologue (minimal_prologue_words); + * the single-pass path reserved a fixed worst-case region. Either way the + * frame-size immediates are only final now, so patch the region in place. */ + u32 prologue_region = + t->emit_minimal_prologue ? a->minimal_prologue_words : AA_PROLOGUE_WORDS; mc->label_place(mc, a->epilogue_label); aa_emit_callee_restores(a); aa_emit_restore_frame(a, frame_size); aa_emit32(mc, aa64_ret(AA_LR)); - aa_patch_prologue(a, frame_size); + aa_patch_prologue(a, frame_size, prologue_region); aa_patch_allocas(a); aa_patch_tail_sites(a, frame_size); if (mc->cfi_set_next_pc_offset && mc->cfi_def_cfa && mc->cfi_offset) { - mc->cfi_set_next_pc_offset(mc, AA_PROLOGUE_WORDS * 4u); + mc->cfi_set_next_pc_offset(mc, prologue_region * 4u); mc->cfi_def_cfa(mc, AA_FP, 0); mc->cfi_offset(mc, AA_FP, -16); mc->cfi_offset(mc, AA_LR, -8); @@ -1870,6 +1941,23 @@ static u32 aa_call_stack_size(NativeTarget* t, const NativeCallDesc* desc) { return align_up_u32(stack, 16u); } +/* Stack-argument bytes a call with `fn_type`'s fixed parameters uses. Reuses + * aa_call_stack_size by routing the declared params through it (their ABI + * classification is independent of the actual operand locations, which + * aa_call_stack_size ignores for register/stack placement). */ +static u32 aa_signature_stack_bytes(NativeTarget* t, CfreeCgTypeId fn_type, + int* variadic, u32* nparams) { + const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fn_type); + NativeCallDesc d; + if (variadic) *variadic = abi ? (int)abi->variadic : 0; + if (nparams) *nparams = abi ? abi->nparams : 0u; + memset(&d, 0, sizeof d); + d.fn_type = fn_type; + d.nargs = abi ? abi->nparams : 0u; + if (d.nargs) d.args = arena_zarray(t->c->tu, NativeLoc, d.nargs); + return aa_call_stack_size(t, &d); +} + static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc, NativeCallPlan* plan) { NativeCallPlanRet* rets; @@ -2765,6 +2853,10 @@ NativeTarget* aa64_native_target_new(Compiler* c, ObjBuilder* obj, t->func_begin_known_frame = aa_func_begin_known_frame; t->note_frame_state = aa_note_frame_state; t->reserve_callee_saves = aa_reserve_callee_saves; + t->emit_prologue = aa_emit_prologue; + t->signature_stack_bytes = aa_signature_stack_bytes; + t->has_store_zero_reg = 1; + t->store_zero_reg = 31u; /* wzr/xzr in the Rt position of a store */ t->func_end = aa_func_end; t->frame_slot = aa_frame_slot; t->bind_param = aa_bind_native_param; diff --git a/src/arch/native_target.h b/src/arch/native_target.h @@ -279,6 +279,31 @@ struct NativeTarget { * up front. */ void (*reserve_callee_saves)(NativeTarget*, const u32* used_by_class, u32 nclasses); + /* Optional. When set, the optimizer emit path calls this once — after + * func_begin, reserve_callee_saves, and frame-slot mapping, but before the + * body — to emit a minimal, exact-size prologue in place (no reserved NOP + * region). Frame-size immediates are still patched in func_end, since the + * final frame size isn't known until body emission allocates its temporaries. + * Backends that leave this NULL fall back to the single-pass reserve-and-patch + * prologue used by NativeDirectTarget. Gated by `emit_minimal_prologue`, which + * the optimizer emit path sets before func_begin so func_begin can skip the + * reserved region. */ + void (*emit_prologue)(NativeTarget*); + u8 emit_minimal_prologue; + /* Bytes of stack-passed arguments the fixed parameters of this function + * signature use (the part beyond the register arg pools). Sets *variadic to + * whether the signature is variadic and *nparams to the fixed parameter + * count. Used to decide tail-call (sibling) realizability: the callee's + * outgoing stack args must fit the area the caller itself received. Either + * out-pointer may be NULL. May itself be NULL. */ + u32 (*signature_stack_bytes)(NativeTarget*, CfreeCgTypeId fn_type, + int* variadic, u32* nparams); + /* Integer hardware zero register, if the ISA has one (aa64 wzr/xzr, rv64 + * x0). When `has_store_zero_reg` is set, the emit path stores a constant 0 + * straight from `store_zero_reg` instead of materializing 0 into a scratch + * with a mov/movz first. */ + u8 has_store_zero_reg; + Reg store_zero_reg; void (*func_end)(NativeTarget*); NativeFrameSlot (*frame_slot)(NativeTarget*, const NativeFrameSlotDesc*); diff --git a/src/opt/opt.c b/src/opt/opt.c @@ -23,8 +23,45 @@ typedef struct OptImpl { NativeTarget* native; int level; Writer* dump_writer; + /* Registry of functions recorded so far, for tiny-inline callee lookup. + * `lowered_cache` is parallel to `cg_by_sym`: a lazily re-lowered + * pre-machinize Func for the callee, reused across all call sites. All on + * c->tu (persist for the whole translation unit). */ + CgIrFunc** cg_by_sym; + Func** lowered_cache; + u32 ncg; + u32 cg_cap; } OptImpl; +/* Lazily re-lower (and cache) the pre-machinize Func for a recorded callee + * symbol. Returns NULL for forward-defined callees not yet recorded. */ +static Func* opt_tiny_callee_lookup(void* ctx, ObjSymId sym) { + OptImpl* o = (OptImpl*)ctx; + for (u32 i = 0; i < o->ncg; ++i) { + if (o->cg_by_sym[i]->desc.sym != sym) continue; + if (!o->lowered_cache[i]) + o->lowered_cache[i] = opt_func_from_cg_ir(o->c, o->cg_by_sym[i]); + return o->lowered_cache[i]; + } + return NULL; +} + +static void opt_registry_add(OptImpl* o, CgIrFunc* f) { + if (o->ncg == o->cg_cap) { + u32 ncap = o->cg_cap ? o->cg_cap * 2u : 16u; + CgIrFunc** ncg = arena_array(o->c->tu, CgIrFunc*, ncap); + Func** nlc = arena_zarray(o->c->tu, Func*, ncap); + if (o->ncg) { + memcpy(ncg, o->cg_by_sym, sizeof(ncg[0]) * o->ncg); + memcpy(nlc, o->lowered_cache, sizeof(nlc[0]) * o->ncg); + } + o->cg_by_sym = ncg; + o->lowered_cache = nlc; + o->cg_cap = ncap; + } + o->cg_by_sym[o->ncg++] = f; +} + static void opt_dbg_dump(OptImpl* o, Func* f, const char* tag) { extern char* getenv(const char*); const char* s = getenv("CFREE_DUMP"); @@ -65,6 +102,21 @@ static void opt_run_o1_native(OptImpl* o, Func* f) { metrics_scope_begin(o->c, "opt.cfg.simplify_local"); opt_simplify_local(f); metrics_scope_end(o->c, "opt.cfg.simplify_local"); + + metrics_scope_begin(o->c, "opt.o1.tiny_inline"); + int inlined = opt_try_tiny_inline(f, opt_tiny_callee_lookup, o); + metrics_scope_end(o->c, "opt.o1.tiny_inline"); + if (inlined) { + /* inline_call_site invalidated CFG and left preds stale (it maintains + * succ + emit_order only); rebuilding is required before verify/machinize. + * Then merge the BR-glue chain (pre -> body -> cont) back into + * straight-line blocks so regalloc sees no artificial boundaries, mirroring + * the prologue's build_cfg -> jump_cleanup -> build_cfg idiom. */ + opt_build_cfg(f); + opt_jump_cleanup(f, OPT_JUMP_CLEANUP_CFG); + opt_build_cfg(f); + } + metrics_scope_begin(o->c, "opt.cfg.verify"); opt_verify(f, "lowering-cfg"); metrics_scope_end(o->c, "opt.cfg.verify"); @@ -98,6 +150,10 @@ static void opt_run_o1_native(OptImpl* o, Func* f) { metrics_scope_begin(o->c, "opt.build_loop_tree"); opt_build_loop_tree(f); metrics_scope_end(o->c, "opt.build_loop_tree"); + metrics_scope_begin(o->c, "opt.o1.hoist_loop_consts"); + opt_hoist_loop_consts(f); + metrics_scope_end(o->c, "opt.o1.hoist_loop_consts"); + opt_verify(f, "o1-hoist-loop-consts"); metrics_scope_begin(o->c, "opt.live_blocks.pre_dde"); memset(&live, 0, sizeof live); @@ -171,6 +227,9 @@ static void opt_dbg_dump_cg(OptImpl* o, const CgIrFunc* f) { static void opt_on_func(void* user, CgIrFunc* cg_func) { OptImpl* o = (OptImpl*)user; Func* f; + /* Register before lowering so later callers can resolve this (complete) + * function as a tiny-inline callee. */ + opt_registry_add(o, cg_func); opt_dbg_dump_cg(o, cg_func); /* The dump writer renders the semantic CG IR tape — the IR as recorded, * before lowering to the optimizer's CFG form. */ @@ -205,9 +264,31 @@ static int opt_on_local_static_data_begin(void* user, static const char* opt_on_tail_call_unrealizable_reason( void* user, const struct CGFuncDesc* caller, const CGCallDesc* call) { - (void)user; - (void)caller; - (void)call; + OptImpl* o = (OptImpl*)user; + int callee_va = 0, caller_va = 0; + u32 callee_nparams = 0; + u32 outgoing, incoming; + if (!o || !o->native || !o->native->signature_stack_bytes) return NULL; + /* A realized sibling call reuses the area this function received for its own + * incoming stack arguments to hold the callee's outgoing stack arguments, so + * the callee's must fit. If they don't, the tail isn't realizable and CG + * falls back to an ordinary call + return. */ + outgoing = o->native->signature_stack_bytes(o->native, call->fn_type, + &callee_va, &callee_nparams); + incoming = o->native->signature_stack_bytes(o->native, caller->fn_type, + &caller_va, NULL); + /* A variadic caller's incoming layout includes the register save / overflow + * area, which isn't a simple reusable parameter slab — don't realize. And if + * the call actually passes variadic (beyond-fixed) arguments, their stack + * footprint isn't captured by the fixed-parameter sizing above, so be + * conservative and fall back. A variadic callee invoked with only its fixed + * arguments (e.g. `musttail sink(x)`) is fine and uses the byte comparison. */ + if (caller_va) + return "variadic caller cannot host a sibling call"; + if (callee_va && call->nargs > callee_nparams) + return "variadic tail call arguments are not realizable as a sibling call"; + if (outgoing > incoming) + return "tail call stack arguments exceed the caller's parameter area"; return NULL; } diff --git a/src/opt/opt.h b/src/opt/opt.h @@ -29,6 +29,8 @@ void opt_promote_scalar_locals(Func*); /* O1: promote non-escaped scalar frame slots to mutable PRegs. */ void opt_addr_of_global_cse(Func*); /* O1: hoist duplicate ADDR_OF(global) defs to a single entry-block compute. */ +void opt_hoist_loop_consts(Func*); /* O1: hoist loop-invariant LOAD_IMM + materialization to the entry block. */ void opt_simplify_local(Func*); void opt_simplify(Func*); void opt_gvn(Func*); /* incl. constprop, redundant-load elim */ diff --git a/src/opt/opt_internal.h b/src/opt/opt_internal.h @@ -23,6 +23,7 @@ typedef struct OptPassCtx { const char* stage; } OptPassCtx; +typedef struct FuncSet FuncSet; struct FuncSet { Compiler* c; Arena* arena; @@ -157,6 +158,15 @@ u32 opt_call_clobber_mask_for(Func*, const Inst*, u8 cls); int opt_inst_has_side_effect(Func*, const Inst*); +/* Inlining. opt_inline is the whole-program driver (FuncSet over all retained + * funcs). opt_try_tiny_inline is the streaming O1 entry: it inlines tiny + * direct callees into one caller, resolving callee bodies through `lookup` + * (returns a fresh pre-machinize Func for a symbol, or NULL to skip). Returns + * the number of call sites inlined. */ +typedef Func* (*OptInlineCalleeLookup)(void* ctx, ObjSymId callee_sym); +int opt_try_tiny_inline(Func* caller, OptInlineCalleeLookup lookup, void* ctx); +void opt_inline(FuncSet*, int max_iters); + int opt_hard_empty(const OptHardRegSet*); int opt_hard_intersects(const OptHardRegSet*, const OptHardRegSet*); void opt_hard_live_step(OptHardRegSet* live, const OptHardRegSet* use, diff --git a/src/opt/pass_addr_fold.c b/src/opt/pass_addr_fold.c @@ -758,3 +758,176 @@ void opt_addr_of_global_cse(Func* f) { opt_analysis_invalidate( f, OPT_ANALYSIS_DEF_USE | OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP); } + +/* LICM-lite for constant materialization. A LOAD_IMM has no inputs, so its + * result is loop-invariant and valid anywhere the entry block dominates (i.e. + * everywhere). When such a def lives inside a loop, the backend would otherwise + * re-materialize the constant (movz/movk) on every iteration; hoisting one copy + * to the entry block and reusing it removes that per-iteration work, mirroring + * opt_addr_of_global_cse for the `adrp`/`add` case. + * + * Scope/cost guards: + * - Only hoist constants that actually appear inside a loop (loop_depth > 0); + * non-loop constants gain nothing and would only lengthen live ranges. + * - Skip imm 0: the emit path stores a constant 0 from the hardware zero + * register (no materialization at all), which beats holding 0 in an + * allocated register across the loop. + * + * Must run after opt_build_loop_tree (reads block loop_depth) and before + * register allocation. */ +typedef struct ConstHoistEntry { + i64 imm; + CfreeCgTypeId type; + PReg canonical; + u8 cls; + u8 in_loop; +} ConstHoistEntry; + +static u32 const_hoist_find_or_add(ConstHoistEntry** entries, u32* n, u32* cap, + Arena* arena, i64 imm, CfreeCgTypeId type, + u8 cls) { + for (u32 i = 0; i < *n; ++i) { + if ((*entries)[i].imm == imm && (*entries)[i].type == type && + (*entries)[i].cls == cls) + return i; + } + if (*n == *cap) { + u32 ncap = *cap ? *cap * 2u : 16u; + ConstHoistEntry* nv = arena_array(arena, ConstHoistEntry, ncap); + if (*entries) memcpy(nv, *entries, sizeof(ConstHoistEntry) * (*n)); + *entries = nv; + *cap = ncap; + } + u32 idx = (*n)++; + ConstHoistEntry* e = &(*entries)[idx]; + memset(e, 0, sizeof *e); + e->imm = imm; + e->type = type; + e->cls = cls; + e->canonical = PREG_NONE; + return idx; +} + +static void const_hoist_count_def(Func* f, Inst* in, Operand* op, int is_def, + void* ud) { + u32* counts = (u32*)ud; + (void)in; + if (!is_def || op->kind != OPK_REG) return; + PReg p = (PReg)op->v.reg; + if (opt_reg_valid(f, p)) ++counts[p]; +} + +/* True for a LOAD_IMM whose constant is a hoist candidate: a non-zero immediate + * defining a register that has exactly one def in the function. The single-def + * requirement is the safety condition for remapping every use of that register + * to one canonical entry def — the PReg form here is non-SSA, so a mutable + * local promoted by opt_promote_scalar_locals (e.g. a loop counter initialized + * to a constant) is multiply defined; remapping it would discard the other + * defs (the increment) and break the loop. */ +static int const_hoist_candidate(const Func* f, const Inst* in, + const u32* def_counts) { + if ((IROp)in->op != IR_LOAD_IMM || in->nopnds < 1) return 0; + if (in->opnds[0].kind != OPK_REG || in->extra.imm == 0) return 0; + PReg p = (PReg)in->opnds[0].v.reg; + return opt_reg_valid(f, p) && def_counts[p] == 1; +} + +void opt_hoist_loop_consts(Func* f) { + if (!f || f->opt_reg_ssa || f->opt_rewritten) return; + if (f->nblocks == 0 || f->entry >= f->nblocks) return; + + /* Count defs per PReg so const_hoist_candidate can reject multiply-defined + * registers (mutable promoted locals) — see its comment. */ + u32* def_counts = arena_zarray(f->arena, u32, opt_reg_count(f)); + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) + opt_walk_inst_operands(f, &bl->insts[i], const_hoist_count_def, + def_counts); + } + + /* Pass 1: index LOAD_IMM defs by (imm, type, cls); flag keys seen in a loop. */ + ConstHoistEntry* entries = NULL; + u32 n_entries = 0, cap_entries = 0; + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + int in_loop = bl->loop_depth > 0; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* in = &bl->insts[i]; + if (!const_hoist_candidate(f, in, def_counts)) continue; + u32 idx = const_hoist_find_or_add(&entries, &n_entries, &cap_entries, + f->arena, in->extra.imm, + in->opnds[0].type, in->opnds[0].cls); + if (in_loop) entries[idx].in_loop = 1; + } + } + + /* Pass 2: allocate a canonical PReg for each constant that appears in a loop. */ + u32 dup_count = 0; + for (u32 i = 0; i < n_entries; ++i) { + if (!entries[i].in_loop) continue; + entries[i].canonical = ir_alloc_preg(f, entries[i].type, entries[i].cls); + ++dup_count; + } + if (!dup_count) return; + + /* Pass 3: NOP every def of a hoisted constant, mapping its PReg to the + * canonical one (consolidates loop and non-loop occurrences alike). */ + PReg* remap = arena_zarray(f->arena, PReg, opt_reg_count(f)); + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* in = &bl->insts[i]; + if (!const_hoist_candidate(f, in, def_counts)) continue; + u32 idx = const_hoist_find_or_add(&entries, &n_entries, &cap_entries, + f->arena, in->extra.imm, + in->opnds[0].type, in->opnds[0].cls); + if (entries[idx].canonical == PREG_NONE) continue; + PReg old = (PReg)in->opnds[0].v.reg; + if (opt_reg_valid(f, old)) remap[old] = entries[idx].canonical; + in->op = IR_NOP; + in->def = VAL_NONE; + in->ndefs = 0; + in->defs = NULL; + in->nopnds = 0; + in->opnds = NULL; + } + } + + /* Pass 4: emit one LOAD_IMM per hoisted constant in the entry block, after + * any leading IR_PARAM_DECL prologue. */ + Block* entry = &f->blocks[f->entry]; + u32 insert_at = 0; + while (insert_at < entry->ninsts && + (IROp)entry->insts[insert_at].op == IR_PARAM_DECL) + ++insert_at; + Inst* slot = block_insert_at(f, entry, insert_at, dup_count); + u32 w = 0; + for (u32 i = 0; i < n_entries; ++i) { + if (entries[i].canonical == PREG_NONE) continue; + Inst* in = &slot[w++]; + in->op = (u16)IR_LOAD_IMM; + in->def = (Val)entries[i].canonical; + in->type = entries[i].type; + in->extra.imm = entries[i].imm; + in->nopnds = 1; + in->opnds = arena_array(f->arena, Operand, 1); + memset(&in->opnds[0], 0, sizeof(Operand)); + in->opnds[0].kind = OPK_REG; + in->opnds[0].cls = entries[i].cls; + in->opnds[0].type = entries[i].type; + in->opnds[0].v.reg = entries[i].canonical; + ir_assign_inst_id(f, in); + } + + /* Pass 5: apply remap to all operand uses (reuses the ADDR_OF-CSE walker, + * which also covers IR_CALL aux operands). */ + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) + addr_cse_apply_to_inst(&bl->insts[i], remap); + } + + opt_analysis_invalidate( + f, OPT_ANALYSIS_DEF_USE | OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP); +} diff --git a/src/opt/pass_combine.c b/src/opt/pass_combine.c @@ -778,17 +778,29 @@ static int try_substitute_for_reg(CombineCtx* ctx, Inst* in, i32 i, u8 cls, src_op = prod->opnds[1]; } - /* Single-use within the producer's live range: producer's dst must have - * exactly one further use before the next redef/barrier in this block (the - * consumer we're about to rewrite). If the live range terminates inside - * this block, the producer's def is killed before any successor sees it, - * so the cross-block live-out check can be skipped. */ - int killed = 0; - int uses_total = count_uses_in_live_range(ctx->f, ctx->bl, prod_idx, &def, &killed); - if (uses_total != 1) return 0; - if (!killed && opt_block_live_out_has_phys_reg(ctx->f, ctx->hard_live, - ctx->bl->id, &def)) - return 0; + /* For a register-source IR_COPY this is local copy propagation: rewriting + * one use of the copy's dest to its source is value-safe whenever the source + * is unchanged between the copy and this consumer — already verified by the + * ctx_def_changed_since check above (which treats CALL/ASM/INTRINSIC as a + * clobber of every register). The copy itself is not removed here; if all its + * uses fold away and it is not live-out, post-combine DCE deletes it. So no + * single-use or cross-block live-out restriction is required, which lets the + * O1 path collapse the multi-use reg->reg copy chains it would otherwise + * leave behind (it never runs the O2 coalescer). + * + * The immediate / convert producers don't propagate a register value, so + * they keep the stricter single-use + live-out gate: substituting them into + * every use would duplicate a (possibly multi-instruction) materialization + * rather than shorten a copy chain. */ + if (kind != SK_REG) { + int killed = 0; + int uses_total = + count_uses_in_live_range(ctx->f, ctx->bl, prod_idx, &def, &killed); + if (uses_total != 1) return 0; + if (!killed && opt_block_live_out_has_phys_reg(ctx->f, ctx->hard_live, + ctx->bl->id, &def)) + return 0; + } /* O1 (no coalescing) folds immediates into IR_COPY; O2 leaves the copy * register-to-register so coalescing + self-copy removal handles it. */ diff --git a/src/opt/pass_inline.c b/src/opt/pass_inline.c @@ -36,6 +36,13 @@ typedef struct InlineOrderCtx { #define INLINE_ABS_GROWTH_LIMIT 64u #define INLINE_HINT_ABS_GROWTH_LIMIT 128u +/* Streaming O1 tiny-inline: a much smaller budget than the whole-program + * inliner. DEFAULT/HINT callees must fit under this cost; ALWAYS bypasses it. + * Bounded by max passes since an inlined straightline body never introduces a + * new call site. */ +#define INLINE_TINY_COST_LIMIT 8u +#define TINY_INLINE_MAX_PASSES 4u + static u32 func_inline_cost(Func* f) { u32 cost = 0; if (!f) return 0; @@ -235,7 +242,7 @@ static void map_mem(InlineMap* m, MemAccess* mem) { } static Operand map_operand(InlineMap* m, Operand op) { - switch ((OpKind)op.kind) { + switch ((OptOperandKind)op.kind) { case OPK_REG: op.v.reg = map_preg(m, (PReg)op.v.reg); break; @@ -692,3 +699,79 @@ void opt_inline(FuncSet* fs, int max_iters) { if (!changed) break; } } + +/* Streaming single-caller variant for the O1 pipeline. Same gates as + * try_inline_call, minus the whole-program growth check (which needs a + * base_cost array the streaming path lacks), plus the tiny cost cap. */ +static int try_tiny_inline_call(FuncSet* fs, Func* caller, u32 b, u32 i) { + Inst* in = &caller->blocks[b].insts[i]; + Func* callee = direct_callee(fs, in); + u32 cost = 0; + CfreeCgInlinePolicy policy; + if (!callee) return 0; + policy = effective_inline_policy(in, callee); + metrics_count(caller->c, "opt.tiny_inline.candidates", 1); + if (policy == CFREE_CG_INLINE_NEVER) { + metrics_count(caller->c, "opt.tiny_inline.refuse_policy", 1); + return 0; + } + if (recursive_or_scc(fs, caller, callee)) { + metrics_count(caller->c, "opt.tiny_inline.refuse_scc", 1); + return 0; + } + if (!callee_inline_shape(callee, policy, &cost)) { + metrics_count(caller->c, "opt.tiny_inline.refuse_shape", 1); + return 0; + } + if (policy != CFREE_CG_INLINE_ALWAYS && cost > INLINE_TINY_COST_LIMIT) { + metrics_count(caller->c, "opt.tiny_inline.refuse_budget", 1); + return 0; + } + if (!inline_rewrite_supported(callee, in)) { + metrics_count(caller->c, "opt.tiny_inline.refuse_rewrite_shape", 1); + return 0; + } + if (!inline_call_site(caller, b, i, callee)) { + metrics_count(caller->c, "opt.tiny_inline.refuse_rewrite", 1); + return 0; + } + metrics_count(caller->c, "opt.tiny_inline.inlined", 1); + return 1; +} + +int opt_try_tiny_inline(Func* caller, OptInlineCalleeLookup lookup, void* ctx) { + int total = 0; + if (!caller || !lookup || caller->opt_reg_ssa || caller->opt_rewritten) + return 0; + for (u32 pass = 0; pass < TINY_INLINE_MAX_PASSES; ++pass) { + int changed = 0; + for (u32 b = 0; b < caller->nblocks && !changed; ++b) { + Block* bl = &caller->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* in = &bl->insts[i]; + if ((IROp)in->op != IR_CALL) continue; + IRCallAux* aux = (IRCallAux*)in->extra.aux; + if (!aux || aux->desc.callee.kind != OPK_GLOBAL) continue; + Func* callee = lookup(ctx, aux->desc.callee.v.global.sym); + if (!callee) continue; + /* Ad-hoc 2-element FuncSet so the existing gates (direct_callee, + * recursive_or_scc) resolve the callee by symbol. */ + Func* funcs[2] = {caller, callee}; + FuncSet fs; + memset(&fs, 0, sizeof fs); + fs.c = caller->c; + fs.arena = caller->arena; + fs.funcs = funcs; + fs.nfuncs = (caller == callee) ? 1u : 2u; + fs.cap = fs.nfuncs; + if (try_tiny_inline_call(&fs, caller, b, i)) { + ++total; + changed = 1; /* indices invalidated by the split; restart scan */ + break; + } + } + } + if (!changed) break; + } + return total; +} diff --git a/src/opt/pass_jump.c b/src/opt/pass_jump.c @@ -317,6 +317,74 @@ static void cleanup_reorder_for_fallthrough(JumpCleanupCtx* c) { } } +/* Rotate simple counted loops so the test sits at the bottom: the back-edge + * becomes the per-iteration conditional branch and the body falls through to + * the test, eliminating the unconditional back-jump the head-test shape emits + * every iteration. + * + * Pattern (after the greedy reorder): + * + * H (emit i): IR_CMP_BRANCH, succ[0]=E (exit, taken), succ[1]=B (body entry, + * the fallthrough, placed right after H by the greedy reorder) + * ... body chain ... + * L (emit k): the latch — a predecessor of H at a later emit position, i.e. + * the source of the loop's back-edge to H + * + * The rewrite moves H from position i to just after L (rotating the emit-order + * run [i..k] left by one) and inverts H's test in place: H's taken edge becomes + * the back-edge to the body entry (the per-iteration conditional) and its + * fallthrough becomes the exit E. No CFG edges move — reordering emit_order is + * always valid because pass_native_emit emits an explicit jump for any + * successor that isn't the textual next block. After the move L falls through + * to H (its back-edge `b H` is stripped), and the preheader keeps its now- + * forward `b H` (executed once, serving as the zero-trip guard); the + * unconditional back-jump every iteration is gone. + * + * Conservative guards: the body entry must be H's fallthrough and immediately + * follow H (the greedy chain), and H must have exactly one back-edge + * predecessor after it in emit order (reducible, single-latch loop). */ +static void cleanup_rotate_loops(JumpCleanupCtx* c) { + Func* f = c->f; + if (f->emit_order_n < 2u) return; + for (u32 i = 0; i + 1u < f->emit_order_n; ++i) { + u32 h = f->emit_order[i]; + if (h >= f->nblocks) continue; + Block* H = &f->blocks[h]; + if (!H->ninsts || H->nsucc != 2) continue; + Inst* hterm = &H->insts[H->ninsts - 1u]; + if ((IROp)hterm->op != IR_CMP_BRANCH) continue; + u32 body = H->succ[1]; /* loop body entry (fallthrough) */ + if (f->emit_order[i + 1u] != body) continue; + /* Latch: the single predecessor of H later in emit order (back-edge + * source). Zero or multiple => not a simple single-latch loop; skip. */ + u32 latch_pos = BLOCK_NONE; + int ok = 1; + for (u32 p = 0; p < H->npreds; ++p) { + u32 pos = emit_order_index(c, H->preds[p]); + if (pos == BLOCK_NONE || pos <= i) continue; + if (latch_pos != BLOCK_NONE) { + ok = 0; + break; + } + latch_pos = pos; + } + if (!ok || latch_pos == BLOCK_NONE) continue; + CmpOp inverted; + if (!invert_cmp((CmpOp)hterm->extra.imm, &inverted)) continue; + u32 exit = H->succ[0]; + for (u32 k = i; k < latch_pos; ++k) f->emit_order[k] = f->emit_order[k + 1u]; + f->emit_order[latch_pos] = h; + hterm->extra.imm = inverted; + H->succ[0] = body; /* taken: back-edge to the loop body */ + H->succ[1] = exit; /* fallthrough: loop exit */ + for (u32 b = 0; b < f->nblocks; ++b) c->emit_index_by_block[b] = BLOCK_NONE; + for (u32 k = 0; k < f->emit_order_n; ++k) { + u32 bb = f->emit_order[k]; + if (bb < f->nblocks) c->emit_index_by_block[bb] = k; + } + } +} + static void cleanup_layout_fallthrough_branches(const JumpCleanupCtx* c) { Func* f = c->f; for (u32 b = 0; b < f->nblocks; ++b) { @@ -426,6 +494,7 @@ void opt_jump_cleanup(Func* f, OptJumpCleanupStage stage) { cleanup_branch_targets(&c); } else if (stage == OPT_JUMP_CLEANUP_LAYOUT) { cleanup_reorder_for_fallthrough(&c); + cleanup_rotate_loops(&c); cleanup_invert_taken_fallthrough(&c); cleanup_layout_fallthrough_branches(&c); } diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c @@ -622,12 +622,25 @@ static void emit_call(NativeEmitCtx* e, Inst* in) { for (u32 i = 0; i < aux->desc.nargs; ++i) args[i] = abi_storage_loc(e, &aux->desc.args[i], in->loc); if (aux->desc.ret.storage.kind) { + CfreeCgTypeId rty = aux->desc.ret.type; + int scalar = !cg_type_is_aggregate(e->c, rty) && + type_size_or(e->c, rty, 8u) <= 8u; results = arena_zarray(e->f->arena, NativeLoc, 1); final_result = abi_storage_loc(e, &aux->desc.ret, in->loc); - result_slot = - temp_slot(e, aux->desc.ret.type, in->loc, NATIVE_FRAME_SLOT_SPILL); - results[0] = loc_frame(aux->desc.ret.type, - class_for_type(e, aux->desc.ret.type), result_slot); + /* Scalar result: hand plan_call the value's real destination (the MIR + * result reg, or its spill slot) directly, so it emits one move out of the + * ABI result register. Routing every scalar result through a fresh temp + * slot — store x0 then immediately reload — was a pure round trip on every + * call; emit_ret already avoids the analogous trip on returns. The temp + * slot is kept only for aggregate / oversized results, which plan_call / + * the callee write in parts and must land in memory. */ + if (scalar && (final_result.kind == NATIVE_LOC_REG || + final_result.kind == NATIVE_LOC_FRAME)) { + results[0] = final_result; + } else { + result_slot = temp_slot(e, rty, in->loc, NATIVE_FRAME_SLOT_SPILL); + results[0] = loc_frame(rty, class_for_type(e, rty), result_slot); + } } d.fn_type = aux->desc.fn_type; d.callee = loc_from_operand(e, &aux->desc.callee, in->loc); @@ -774,6 +787,14 @@ static void emit_inst(NativeEmitCtx* e, u32 block, u32 order_index, Inst* in, addr = addr_from_operand(e, &in->opnds[0], in->loc); legalize_addr(e, &addr, in->extra.mem, in->loc); src = loc_from_operand(e, &in->opnds[1], in->loc); + /* Storing a constant 0 from the hardware zero register avoids + * materializing 0 into a scratch first (e.g. `strb wzr, [..]` rather than + * `movz w9,0; strb w9, [..]`). */ + if (src.kind == NATIVE_LOC_IMM && src.v.imm == 0 && + e->target->has_store_zero_reg && + class_for_type(e, in->opnds[1].type) == NATIVE_REG_INT) + src = loc_reg(in->opnds[1].type, NATIVE_REG_INT, + e->target->store_zero_reg); if (src.kind == NATIVE_LOC_REG && (src.v.reg == addr_base_reg(&addr) || src.v.reg == addr_index_reg(&addr))) { NativeFrameSlot slot = @@ -1339,9 +1360,16 @@ void opt_emit_native(Compiler* c, Func* f, NativeTarget* target) { metrics_scope_end(c, "opt.native_emit.setup"); metrics_scope_begin(c, "opt.native_emit.func_begin"); + /* The optimizer path knows the callee-save set and frame slots before the + * body, so the backend can emit an exact-size prologue here rather than + * reserving a worst-case NOP region patched at func_end. Signal this before + * func_begin (so it skips the reserved region) and emit the prologue once + * reserve_callee_saves + map_frame_slots have run. */ + target->emit_minimal_prologue = target->emit_prologue != NULL; target->func_begin(target, &fd); reserve_callee_saves(&e); map_frame_slots(&e); + if (target->emit_minimal_prologue) target->emit_prologue(target); bind_params(&e); metrics_scope_end(c, "opt.native_emit.func_begin"); diff --git a/test/opt/run.sh b/test/opt/run.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash +# Behavioral check for the O1 tiny-function inliner. +# +# Compiles a toy program where __user_main calls the tiny helper add1(x)=x+1, +# and asserts that at -O1 the `bl add1` is gone from __user_main (the helper +# was inlined). Also confirms the program still computes the right value via +# the in-process JIT. +set -euo pipefail + +ROOT="$(cd "$(dirname "$0")/../.." && pwd)" +CFREE="${CFREE:-$ROOT/build/cfree}" +SRC="$ROOT/test/toy/cases/135_inline_cleanup_quality.toy" +WORK="$ROOT/build/test/opt/inline" +mkdir -p "$WORK" + +"$CFREE" cc -target aarch64-linux-gnu -O1 -c "$SRC" \ + -o "$WORK/x.o" > "$WORK/cc.out" 2>&1 +"$CFREE" objdump -d "$WORK/x.o" > "$WORK/dis.out" 2>&1 + +# Slice out the __user_main function body. +awk ' + /^[0-9a-f]+ <__user_main>:/ { in_fn = 1; next } + /^[0-9a-f]+ </ { in_fn = 0 } + in_fn { print } +' "$WORK/dis.out" > "$WORK/user_main.dis" + +if grep -Eq '\bbl\b.*add1' "$WORK/user_main.dis"; then + printf 'tiny-inline check FAILED: add1 was not inlined into __user_main:\n' >&2 + sed 's/^/ | /' "$WORK/user_main.dis" >&2 + exit 1 +fi + +# The inlined body must actually be present (an add of the constant), i.e. the +# function is more than just a tail jump. +if ! grep -Eq '\badd\b' "$WORK/user_main.dis"; then + printf 'tiny-inline check FAILED: inlined add1 body missing from __user_main:\n' >&2 + sed 's/^/ | /' "$WORK/user_main.dis" >&2 + exit 1 +fi + +# Runtime correctness through the in-process JIT: main returns add1(41) == 42. +for opt in -O0 -O1; do + if "$CFREE" run "$opt" -e main "$SRC" > /dev/null 2>&1; then + rc=0 + else + rc=$? + fi + if [ "$rc" -ne 42 ]; then + printf 'tiny-inline runtime check FAILED at %s: exit %d (want 42)\n' \ + "$opt" "$rc" >&2 + exit 1 + fi +done + +printf 'tiny-inline: ok\n' diff --git a/test/opt/tiny_inline_test.c b/test/opt/tiny_inline_test.c @@ -0,0 +1,333 @@ +/* Unit tests for the O1 tiny-function inliner (opt_try_tiny_inline). + * + * Each case builds a callee and a caller as semantic CG IR via the recorder, + * lowers both to the optimizer's pre-machinize Func form with + * opt_func_from_cg_ir, then drives opt_try_tiny_inline with a lookup that + * resolves the callee symbol. We assert on whether the IR_CALL was replaced. */ + +#include <cfree/cg.h> +#include <cfree/core.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "cg/ir.h" +#include "cg/ir_recorder.h" +#include "opt/opt.h" +#include "opt/opt_internal.h" + +#undef Operand +#undef CGFuncDesc +#undef CGParamDesc +#undef CGCallDesc +#undef CGLocalStorage + +static void* h_alloc(CfreeHeap* h, size_t n, size_t a) { + (void)h; + (void)a; + return n ? malloc(n) : NULL; +} + +static void* h_realloc(CfreeHeap* h, void* p, size_t o, size_t n, size_t a) { + (void)h; + (void)o; + (void)a; + return realloc(p, n); +} + +static void h_free(CfreeHeap* h, void* p, size_t n) { + (void)h; + (void)n; + free(p); +} + +static CfreeHeap g_heap = {h_alloc, h_realloc, h_free, NULL}; +static int g_fails; +static int g_checks; + +static void diag_emit(CfreeDiagSink* s, CfreeDiagKind k, CfreeSrcLoc loc, + const char* fmt, va_list ap) { + static const char* names[] = {"note", "warning", "error", "fatal"}; + (void)s; + (void)loc; + fprintf(stderr, "%s: ", names[k]); + vfprintf(stderr, fmt, ap); + fputc('\n', stderr); +} + +static CfreeDiagSink g_diag = {diag_emit, NULL, 0, 0}; + +#define EXPECT(cond, ...) \ + do { \ + ++g_checks; \ + if (!(cond)) { \ + ++g_fails; \ + fprintf(stderr, "FAIL %s:%d: ", __FILE__, __LINE__); \ + fprintf(stderr, __VA_ARGS__); \ + fputc('\n', stderr); \ + } \ + } while (0) + +typedef struct TestCtx { + CfreeContext ctx; + Compiler* c; + CfreeCgTypeId i32; + CfreeCgTypeId ptr; + CfreeCgTypeId fn1; /* i32(i32) */ +} TestCtx; + +static void tc_init(TestCtx* tc) { + CfreeTarget target; + CfreeCgBuiltinTypes b; + CfreeCgFuncSig sig; + CfreeCgFuncParam params[1]; + memset(tc, 0, sizeof *tc); + tc->ctx.heap = &g_heap; + tc->ctx.diag = &g_diag; + tc->ctx.now = -1; + memset(&target, 0, sizeof target); + target.arch = CFREE_ARCH_ARM_64; + target.os = CFREE_OS_MACOS; + target.obj = CFREE_OBJ_MACHO; + target.ptr_size = 8; + target.ptr_align = 8; + if (cfree_compiler_new(target, &tc->ctx, (CfreeCompiler**)&tc->c) != + CFREE_OK || + !tc->c) { + fprintf(stderr, "fatal: compiler allocation failed\n"); + abort(); + } + b = cfree_cg_builtin_types(tc->c); + tc->i32 = b.id[CFREE_CG_BUILTIN_I32]; + tc->ptr = cfree_cg_type_ptr(tc->c, b.id[CFREE_CG_BUILTIN_VOID], 0); + memset(&sig, 0, sizeof sig); + memset(params, 0, sizeof params); + params[0].type = tc->i32; + sig.ret = tc->i32; + sig.params = params; + sig.nparams = 1; + sig.call_conv = CFREE_CG_CC_TARGET_C; + tc->fn1 = cfree_cg_type_func((CfreeCompiler*)tc->c, sig); +} + +static void tc_fini(TestCtx* tc) { + cfree_compiler_free(tc->c); + tc->c = NULL; +} + +static Operand op_local(CGLocal local, CfreeCgTypeId type) { + Operand o; + memset(&o, 0, sizeof o); + o.kind = OPK_LOCAL; + o.type = type; + o.v.local = local; + return o; +} + +static Operand op_global(ObjSymId sym, CfreeCgTypeId type) { + Operand o; + memset(&o, 0, sizeof o); + o.kind = OPK_GLOBAL; + o.type = type; + o.v.global.sym = sym; + return o; +} + +typedef struct CapturedFunc { + CgIrFunc* func; +} CapturedFunc; + +static void on_func(void* user, CgIrFunc* func) { + ((CapturedFunc*)user)->func = func; +} + +static CgTarget* make_recorder(TestCtx* tc, CapturedFunc* cap) { + CgIrRecorderConfig cfg; + memset(&cfg, 0, sizeof cfg); + cfg.func_recorded = on_func; + cfg.user = cap; + return cg_ir_recorder_new(tc->c, NULL, &cfg); +} + +/* Callee: i32 f(i32 x) { acc = x + x; acc = acc + x; ... (nbinops total); return + * acc; } -> straightline body of cost == nbinops. */ +static CgIrFunc* build_callee(TestCtx* tc, ObjSymId sym, u32 nbinops, + CfreeCgInlinePolicy policy) { + CapturedFunc cap; + CgTarget* t; + CGFuncDesc fd; + CGParamDesc pd; + CGLocal x, acc; + memset(&cap, 0, sizeof cap); + t = make_recorder(tc, &cap); + memset(&fd, 0, sizeof fd); + fd.sym = sym; + fd.fn_type = tc->fn1; + fd.inline_policy = policy; + t->func_begin(t, &fd); + memset(&pd, 0, sizeof pd); + pd.index = 0; + pd.type = tc->i32; + pd.size = 4; + pd.align = 4; + x = t->param(t, &pd); + acc = t->local(t, &(CGLocalDesc){.type = tc->i32, .size = 4, .align = 4}); + for (u32 i = 0; i < nbinops; ++i) + t->binop(t, BO_IADD, op_local(acc, tc->i32), + op_local(i == 0 ? x : acc, tc->i32), op_local(x, tc->i32)); + t->ret(t, &acc, 1); + t->func_end(t); + return cap.func; +} + +/* Caller: i32 g(void) { arg = 41; res = callee(arg); return res; } */ +static CgIrFunc* build_caller(TestCtx* tc, ObjSymId callee_sym, + CfreeCgInlinePolicy call_policy) { + CapturedFunc cap; + CgTarget* t; + CGFuncDesc fd; + CGCallDesc call; + CGLocal arg, res; + CGLocal cargs[1]; + CGLocal cres[1]; + CfreeCgFuncSig sig; + memset(&cap, 0, sizeof cap); + t = make_recorder(tc, &cap); + memset(&fd, 0, sizeof fd); + fd.sym = callee_sym + 1u; + memset(&sig, 0, sizeof sig); + sig.ret = tc->i32; + sig.call_conv = CFREE_CG_CC_TARGET_C; + fd.fn_type = cfree_cg_type_func((CfreeCompiler*)tc->c, sig); + t->func_begin(t, &fd); + arg = t->local(t, &(CGLocalDesc){.type = tc->i32, .size = 4, .align = 4}); + res = t->local(t, &(CGLocalDesc){.type = tc->i32, .size = 4, .align = 4}); + t->load_imm(t, op_local(arg, tc->i32), 41); + memset(&call, 0, sizeof call); + cargs[0] = arg; + cres[0] = res; + call.fn_type = tc->fn1; + call.callee = op_global(callee_sym, tc->ptr); + call.args = cargs; + call.nargs = 1; + call.results = cres; + call.nresults = 1; + call.inline_policy = call_policy; + t->call(t, &call); + t->ret(t, &res, 1); + t->func_end(t); + return cap.func; +} + +typedef struct LookupCtx { + ObjSymId sym; + Func* callee; +} LookupCtx; + +static Func* lookup(void* ctx, ObjSymId sym) { + LookupCtx* l = (LookupCtx*)ctx; + return sym == l->sym ? l->callee : NULL; +} + +static u32 count_calls(const Func* f) { + u32 n = 0; + for (u32 b = 0; b < f->nblocks; ++b) + for (u32 i = 0; i < f->blocks[b].ninsts; ++i) + if ((IROp)f->blocks[b].insts[i].op == IR_CALL) ++n; + return n; +} + +static u32 count_binops(const Func* f) { + u32 n = 0; + for (u32 b = 0; b < f->nblocks; ++b) + for (u32 i = 0; i < f->blocks[b].ninsts; ++i) + if ((IROp)f->blocks[b].insts[i].op == IR_BINOP) ++n; + return n; +} + +/* A tiny callee is inlined: the call is replaced and the callee's body lands in + * the caller. */ +static void tiny_callee_is_inlined(void) { + TestCtx tc; + ObjSymId sym = 1234; + tc_init(&tc); + Func* callee = opt_func_from_cg_ir(tc.c, build_callee(&tc, sym, 1, + CFREE_CG_INLINE_DEFAULT)); + Func* caller = + opt_func_from_cg_ir(tc.c, build_caller(&tc, sym, CFREE_CG_INLINE_DEFAULT)); + LookupCtx lc = {sym, callee}; + EXPECT(count_calls(caller) == 1, "precondition: caller should have one call"); + int n = opt_try_tiny_inline(caller, lookup, &lc); + EXPECT(n == 1, "expected one inline, got %d", n); + EXPECT(count_calls(caller) == 0, "call should be gone, %u remain", + count_calls(caller)); + EXPECT(count_binops(caller) >= 1, "callee binop should be cloned in"); + tc_fini(&tc); +} + +/* A DEFAULT callee over the tiny cost cap is refused; the same body marked + * always_inline bypasses the cap. */ +static void over_budget_respects_policy(void) { + TestCtx tc; + ObjSymId sym = 2000; + tc_init(&tc); + /* cost 12 > INLINE_TINY_COST_LIMIT (8). */ + Func* big_default = opt_func_from_cg_ir( + tc.c, build_callee(&tc, sym, 12, CFREE_CG_INLINE_DEFAULT)); + Func* caller1 = + opt_func_from_cg_ir(tc.c, build_caller(&tc, sym, CFREE_CG_INLINE_DEFAULT)); + LookupCtx lc1 = {sym, big_default}; + int n1 = opt_try_tiny_inline(caller1, lookup, &lc1); + EXPECT(n1 == 0, "over-budget DEFAULT callee should be refused, got %d", n1); + EXPECT(count_calls(caller1) == 1, "call should survive refusal"); + + Func* big_always = opt_func_from_cg_ir( + tc.c, build_callee(&tc, sym, 12, CFREE_CG_INLINE_ALWAYS)); + Func* caller2 = + opt_func_from_cg_ir(tc.c, build_caller(&tc, sym, CFREE_CG_INLINE_DEFAULT)); + LookupCtx lc2 = {sym, big_always}; + int n2 = opt_try_tiny_inline(caller2, lookup, &lc2); + EXPECT(n2 == 1, "always_inline should bypass the tiny cap, got %d", n2); + EXPECT(count_calls(caller2) == 0, "always_inline call should be inlined"); + tc_fini(&tc); +} + +/* A NEVER (noinline) callee is refused even when tiny. */ +static void never_policy_is_refused(void) { + TestCtx tc; + ObjSymId sym = 3000; + tc_init(&tc); + Func* callee = opt_func_from_cg_ir( + tc.c, build_callee(&tc, sym, 1, CFREE_CG_INLINE_NEVER)); + Func* caller = + opt_func_from_cg_ir(tc.c, build_caller(&tc, sym, CFREE_CG_INLINE_DEFAULT)); + LookupCtx lc = {sym, callee}; + int n = opt_try_tiny_inline(caller, lookup, &lc); + EXPECT(n == 0, "NEVER callee should not be inlined, got %d", n); + EXPECT(count_calls(caller) == 1, "NEVER call should survive"); + tc_fini(&tc); +} + +/* An unresolved (forward-defined) callee is left alone. */ +static void unknown_callee_is_skipped(void) { + TestCtx tc; + ObjSymId sym = 4000; + tc_init(&tc); + Func* caller = + opt_func_from_cg_ir(tc.c, build_caller(&tc, sym, CFREE_CG_INLINE_DEFAULT)); + LookupCtx lc = {sym, NULL}; /* lookup never returns a body */ + int n = opt_try_tiny_inline(caller, lookup, &lc); + EXPECT(n == 0, "unresolved callee should be skipped, got %d", n); + EXPECT(count_calls(caller) == 1, "unresolved call should survive"); + tc_fini(&tc); +} + +int main(void) { + tiny_callee_is_inlined(); + over_budget_respects_policy(); + never_policy_is_refused(); + unknown_callee_is_skipped(); + fprintf(stderr, "tiny-inline: %d checks, %d failures\n", g_checks, g_fails); + return g_fails ? 1 : 0; +} diff --git a/test/test.mk b/test/test.mk @@ -507,14 +507,26 @@ test-macho: lib $(TEST_RT_DEP) $(ROUNDTRIP_BIN_MACHO) $(LINK_EXE_RUNNER) $(JIT_R bash test/link/run.sh OPT_TEST_BIN = build/test/cg_ir_lower_test +TINY_INLINE_TEST_BIN = build/test/tiny_inline_test -test-opt: bin $(OPT_TEST_BIN) +test-opt: bin $(OPT_TEST_BIN) test-opt-tiny-inline test-opt-inline $(OPT_TEST_BIN) $(OPT_TEST_BIN): test/opt/cg_ir_lower_test.c $(LIB_OBJS) @mkdir -p $(dir $@) $(CC) $(TEST_HOST_CFLAGS) -Isrc test/opt/cg_ir_lower_test.c $(LIB_OBJS) -o $@ +test-opt-tiny-inline: bin $(TINY_INLINE_TEST_BIN) + $(TINY_INLINE_TEST_BIN) + +$(TINY_INLINE_TEST_BIN): test/opt/tiny_inline_test.c $(LIB_OBJS) + @mkdir -p $(dir $@) + $(CC) $(TEST_HOST_CFLAGS) -Isrc test/opt/tiny_inline_test.c $(LIB_OBJS) -o $@ + +# Behavioral disasm check: tiny callee `bl` disappears from its caller at -O1. +test-opt-inline: bin + @CFREE=$(abspath $(BIN)) bash test/opt/run.sh + test-parse: test-parse-ok test-parse-err test-parse-ok: lib $(TEST_RT_DEP) $(PARSE_RUNNER) $(ROUNDTRIP_BIN) $(LINK_EXE_RUNNER) $(JIT_RUNNER)