commit 9f00fa4fc9d80f10ce4bcc2c7bff5e8d32d59631
parent db8ec5a267e8f4ef1f768879adc779a5f2c48c6f
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Wed, 27 May 2026 05:33:02 -0700
opt: route varargs through optimizer path; fix ADDR_OF spill writeback
Begin recovering O1 on the CgIrFunc->NativeTarget path so the direct-replay
bypass can be deleted. Varargs now compile through the optimizer:
- Add arch-neutral va_start_/va_arg_/va_end_/va_copy_ hooks to NativeTarget.
The optimizer passes the va_list pointer opaquely as a NativeLoc and the
va_arg type; all va_list layout knowledge stays behind the hooks (queried
from abi_va_list_layout).
- Refactor the aa64 va logic into shared cores used by both the direct
(semantic-operand) and optimizer (NativeLoc) paths. The cores use only the
two reserved scratch regs (x16/x17) as private temporaries; the va_list base
register and va_arg result register are caller-provided, so the ops clobber
nothing the register allocator tracks live.
- cg_ir_lower lowers va operands as pointer values, and no longer classifies
va operands as address-taken (which had wrongly forced the pointer temp into
a frame slot).
Also fix a latent IR_ADDR_OF emit bug: a spilled ADDR_OF result was computed
into a scratch register but never written back to its slot.
A temporary CFREE_NO_DIRECT_REPLAY env gate in opt.c forces every function
through the optimizer for testing; it will be removed when the bypass is
deleted. See doc/OPT_O1_PASSES.md for the goal, design decisions, and checklist.
Diffstat:
6 files changed, 789 insertions(+), 95 deletions(-)
diff --git a/doc/OPT_O1_PASSES.md b/doc/OPT_O1_PASSES.md
@@ -0,0 +1,551 @@
+# O1 Optimizer Pass Pipeline
+
+This document describes the O1 lowering pipeline, what each pass does, how it
+should work, and what was lost or incorrectly ported in the refactor at commit
+`f60a16d` (opt: rewrite pipeline to consume CgIrFunc input; replace pass_emit
+with NativeTarget).
+
+---
+
+## Pipeline overview
+
+The O1 pipeline runs when `opt_cgtarget_new` is called with `level >= 1`.
+`CgIrRecorder` records the full function as a `CgIrFunc`. When the function is
+complete, `opt_on_func` is called:
+
+1. `opt_func_needs_direct_replay` — bypass check (see §1 below)
+2. `opt_func_from_cg_ir` — convert `CgIrFunc` → `Func` (new in refactor)
+3. `opt_run_o1_native` — run the pass sequence on `Func`
+
+The pass sequence in `opt_run_o1_native` (src/opt/opt.c):
+
+```
+opt_build_cfg (×2, with opt_jump_cleanup(CFG) in between)
+opt_simplify_local
+opt_machinize_native ABI lowering
+opt_addr_xform_pregs fold IR_ADDR_OF(local) → OPK_LOCAL in loads/stores
+opt_promote_scalar_locals promote non-escaped frame slots → PRegs
+opt_addr_of_global_cse hoist duplicate ADDR_OF(global) to entry block
+opt_build_loop_tree
+opt_live_blocks liveness bitmaps
+opt_dead_def_elim_with_live pre-RA DCE
+opt_regalloc_locations point-bitmap register allocator
+opt_lower_to_mir insert spill/reload, lower PReg → hard reg
+opt_mir_combine MIR-level peephole
+opt_mir_dce
+opt_mir_jump_cleanup(CFG) + opt_mir_build_cfg
+opt_mir_jump_cleanup(LAYOUT) block reorder + branch stripping
+opt_emit_native emit to NativeTarget
+```
+
+---
+
+## §1 — Direct-replay bypass
+
+**Source**: `opt_func_needs_direct_replay` in src/opt/opt.c
+**Route**: → `opt_replay_cg_ir_direct` → raw CgTarget replay (no optimization)
+
+A function bypasses the optimizer entirely if it contains:
+- Aggregate or >8-byte params/results
+- `CG_IR_ASM_BLOCK`, `CG_IR_ALLOCA`, `CG_IR_INTRINSIC`
+- Varargs ops (`CG_IR_VA_*`)
+- Aggregate call arguments or results
+
+**Before the refactor**: These functions went through the optimizer. The old
+`CGTarget` wrapper recorded them and replayed with `opt_emit`. The optimizer
+handled them with frame slots from the start. Post-refactor, they are entirely
+unoptimized — every function with an `alloca`, inline asm, or vararg is now
+compiled at -O0 quality regardless of the requested optimization level.
+
+**What to fix**: The bypass is too conservative. Alloca, varargs, and asm-block
+functions can still benefit from register allocation and DCE on the non-alloca
+portions. The check should be narrowed to only bypass the parts of the pipeline
+that genuinely cannot handle those ops (currently none — `cg_ir_lower.c`
+should be extended to handle these).
+
+---
+
+## §2 — opt_func_from_cg_ir (new pass, introduced in refactor)
+
+**Source**: src/opt/cg_ir_lower.c
+**Purpose**: Convert a recorded `CgIrFunc` into the `Func` IR that the optimizer
+works on.
+
+### Local classification
+
+The critical function is `lower_locals` (cg_ir_lower.c:147). For each local:
+
+```c
+m->address_taken =
+ local_needs_home(in) || local_address_used_in_cg_ir(l->src, in->id);
+
+if (m->address_taken) {
+ m->storage.kind = CG_LOCAL_STORAGE_FRAME; // → OPK_LOCAL
+} else {
+ m->storage.kind = CG_LOCAL_STORAGE_REG; // → OPK_REG (PReg)
+ m->storage.v.reg = (Reg)r;
+}
+```
+
+`local_needs_home` returns true if either of the following holds:
+- `in->address_taken` is set (the recorder sets it when `addr_of` is called on the local)
+- the local carries the `CG_LOCAL_ADDR_TAKEN` or `CG_LOCAL_MEMORY_REQUIRED` flag
+
+Address-taken locals get FRAME storage; all others get a PReg. The `opt_addr_xform_pregs`
+and `opt_promote_scalar_locals` passes are then responsible for recovering
+register storage for locals whose addresses don't truly escape.
+
+### Old approach (before refactor)
+
+The old `w_local` handler in the CGTarget wrapper assigned storage differently:
+
+```c
+if (o->level < 2 &&
+ (d->flags & (CG_LOCAL_ADDR_TAKEN | CG_LOCAL_MEMORY_REQUIRED)) == 0) {
+ // REG storage — even if addr_of will be called later
+} else {
+ // FRAME storage
+}
+```
+
+Address-taken status was determined conservatively at declaration time. Locals
+declared without `CG_LOCAL_ADDR_TAKEN` got register storage initially. When
+`addr_of` was called on them later, a `home_slot` was allocated alongside.
+After recording, `opt_frame_home_addr_taken_locals` ran as part of `w_func_end`
+and inserted explicit `IR_LOAD` / `IR_STORE` pairs around every use/def of the
+local's register to keep the frame slot in sync.
+
+This gave the optimizer explicit IR_LOAD/IR_STORE pairs to work with before
+running `opt_addr_xform_pregs` and `opt_promote_scalar_locals`. In the new
+code, address-taken locals go directly to FRAME storage and the optimizer must
+reconstruct the register-friendly form via the addr-fold and promotion passes.
+The promotion chain (§5 + §6) should in theory produce equivalent output, but
+depends on `opt_addr_xform_pregs` successfully classifying every `IR_ADDR_OF`
+use as non-escaping.
+
+---
+
+## §3 — opt_build_cfg / opt_jump_cleanup(CFG) / opt_simplify_local
+
+**Sources**: src/opt/pass_cfg.c
+**Purpose**: Build the CFG, remove unreachable blocks and trivially-foldable
+branches, and fold single-successor blocks into their predecessor.
+
+These passes are unchanged by the refactor and work the same as before.
+They must run before `opt_machinize_native` so that the ABI lowering has an
+accurate picture of which blocks are live and which edges are critical.
+
+---
+
+## §4 — opt_machinize_native
+
+**Source**: src/opt/pass_machinize.c
+**Purpose**: ABI lowering. Inserts calling-convention constraints (scratch regs,
+clobbers, callee-save markers) as annotations on call/ret/param instructions so
+the register allocator knows which registers are killed across calls.
+
+The function was renamed from `opt_machinize(Func*, CGTarget*)` to
+`opt_machinize_native(Func*, NativeTarget*)` in the refactor. The body
+(`machinize_reset`, `machinize_prepare_insts`, `machinize_collect_regs`,
+`machinize_check_overlap`) is unchanged in structure. The only difference is
+the target type — `NativeTarget*` replaces `CGTarget*` — which affects which
+register metadata APIs are called to gather clobber and callee-save info.
+
+---
+
+## §5 — opt_addr_xform_pregs
+
+**Source**: src/opt/pass_addr_fold.c
+**Purpose**: Fold `IR_ADDR_OF(local)` defs whose result pointer is used only as
+the base of non-observable `IR_LOAD` / `IR_STORE` with zero offset and no index.
+When all uses qualify, the `IR_ADDR_OF` is NOP'd and each use is rewritten to
+`OPK_LOCAL(slot)`. If only some uses qualify, partial folding leaves the
+`IR_ADDR_OF` alive for the EA-shaped uses.
+
+After folding, `FSF_ADDR_TAKEN` is cleared on any frame slot whose
+`IR_ADDR_OF` defs were all retired. This enables `opt_promote_scalar_locals`
+to promote the slot to a PReg.
+
+**How to be fast**: One forward pass per `IR_ADDR_OF` candidate. The function
+is scanned once per candidate to classify all uses, then rewritten in a second
+scan. This is O(candidates × insts), but in practice the number of surviving
+non-folded `IR_ADDR_OF` defs is small.
+
+---
+
+## §6 — opt_promote_scalar_locals
+
+**Source**: src/opt/pass_addr_fold.c
+**Purpose**: For each `FS_LOCAL` frame slot without `FSF_ADDR_TAKEN` or
+`FSF_VOLATILE`, and whose every `OPK_LOCAL(slot)` appearance is either:
+- `IR_LOAD.opnds[1]` — a matching-type, non-observable load, or
+- `IR_STORE.opnds[0]` — a matching-type, non-observable store,
+
+replace the slot with a fresh PReg: each store becomes `IR_COPY preg, src`
+and each load becomes `IR_COPY dst, preg`. The slot becomes unreferenced.
+
+**Critical limitation**: `promote_inst_classify` rejects a slot if `OPK_LOCAL(slot)`
+appears anywhere other than `IR_LOAD.opnds[1]` or `IR_STORE.opnds[0]`. In
+particular, an `OPK_LOCAL` appearing in a call aux (as an argument or result
+storage), in an `IR_ADDR_OF` operand, or in any other instruction, blocks
+promotion entirely for that slot.
+
+This creates a coverage gap with the new `opt_func_from_cg_ir`: when a local
+is address-taken, `lower_operand_value` returns `OPK_LOCAL` for every VALUE use
+of that local (not just load/store address uses). If `opt_addr_xform_pregs` did
+not fold away all `IR_ADDR_OF` defs for the local (because at least one use
+escaped), the slot keeps `FSF_ADDR_TAKEN` and `opt_promote_scalar_locals` skips
+it entirely.
+
+In the old code, the load/store pairs around the local's register were always
+IR_LOAD / IR_STORE instructions — exactly the form that `opt_promote_scalar_locals`
+can handle — and the only frame-slot operands visible to the pass were those
+load/store addresses.
+
+---
+
+## §7 — opt_addr_of_global_cse
+
+**Source**: src/opt/pass_addr_fold.c
+**Purpose**: CSE for `IR_ADDR_OF(global{sym, addend})`. When the same
+(sym, addend) pair appears in two or more `IR_ADDR_OF` defs, replace all with a
+single def materialized in block 0 (after any `IR_PARAM_DECL` prologue). This
+collapses the per-iteration `adrp`/`add` pair in tight loops.
+
+Unchanged by the refactor.
+
+---
+
+## §8 — opt_live_blocks + opt_dead_def_elim_with_live
+
+**Sources**: src/opt/pass_analysis.c, src/opt/pass_lower.c
+**Purpose**: Dataflow liveness over the PReg namespace; pre-RA dead definition
+elimination.
+
+`opt_live_blocks` computes per-block liveness bitmaps (one bit per PReg) using
+backward dataflow. `opt_dead_def_elim_with_live` sweeps each block backward:
+any instruction all of whose defined PRegs are dead at its output and which has
+no side effects is removed.
+
+Running DCE before register allocation reduces the number of live ranges and
+the number of PRegs that need a location, which keeps the point-bitmap allocator fast.
+
+Unchanged by the refactor.
+
+---
+
+## §9 — opt_regalloc_locations
+
+**Source**: src/opt/pass_analysis.c
+**Purpose**: Point-bitmap linear-scan register allocator. Allocates each PReg
+to either a hard register or a spill slot.
+
+Uses a `used_locs[p * loc_words + w]` bitmap (one row per PReg, one column-word
+per location) to track which (PReg, location) pairs conflict. For each PReg in
+program order, the allocator scans the point bitmaps to find the first
+non-conflicting hard register; if none is available, assigns a spill slot.
+
+Unchanged by the refactor.
+
+---
+
+## §10 — opt_lower_to_mir
+
+**Source**: src/opt/pass_lower.c
+**Purpose**: Rewrite the PReg-namespace Func into a hard-register MIR Func.
+Every PReg operand is replaced with the hard register or spill slot assigned by
+`opt_regalloc_locations`. Spilled PRegs get `IR_LOAD` (reload) instructions
+inserted before each use and `IR_STORE` (spill) instructions inserted after
+each def.
+
+The output `f->mir` is a `MFunc` containing the rewritten blocks. The original
+`f->blocks` is also updated to point to the rewritten instruction arrays.
+
+Unchanged by the refactor.
+
+---
+
+## §11 — opt_mir_combine
+
+**Source**: src/opt/pass_combine.c
+**Purpose**: MIR-level peephole. Four rewrites run in a per-block fixpoint:
+
+1. **Substitute**: propagate a trivial copy `r1 ← copy r2` through later uses
+ of `r1`, removing redundant register-to-register moves.
+
+2. **Addr-mode synthesis**: fold `r ← addr_of global` + `load/store r` into a
+ direct `load/store [global]` where the backend supports it.
+
+3. **Sink**: move a def instruction forward in the block to be adjacent to its
+ sole use, reducing live range pressure.
+
+4. **Ext-chain**: fold sign/zero extensions that feed into an instruction that
+ already performs the extension as part of its semantics.
+
+Also runs `opt_combine_compact_block` to merge adjacent spill pairs.
+
+Unchanged by the refactor.
+
+---
+
+## §12 — opt_mir_dce
+
+**Source**: src/opt/pass_dce.c
+**Purpose**: Post-RA dead definition elimination over hard-reg IR. Removes
+instructions that define registers that are never used, provided they are
+side-effect-free. Runs after `opt_mir_combine` to clean up any copy chains
+that were fully substituted away.
+
+Unchanged by the refactor.
+
+---
+
+## §13 — opt_mir_jump_cleanup(CFG) + opt_mir_build_cfg + opt_mir_jump_cleanup(LAYOUT)
+
+**Source**: src/opt/pass_cfg.c
+**Purpose**:
+- `OPT_JUMP_CLEANUP_CFG`: Remove unreachable blocks and collapse chains of
+ unconditional jumps.
+- `opt_mir_build_cfg`: Recompute the CFG successor lists.
+- `OPT_JUMP_CLEANUP_LAYOUT`: `cleanup_reorder_for_fallthrough` — greedy chain
+ extension that orders blocks so that the most common taken-branch edge
+ becomes a fallthrough (eliminating the branch instruction); then strips
+ redundant unconditional jumps to the textual successor.
+
+Unchanged by the refactor.
+
+---
+
+## §14 — opt_emit_native (replaces old opt_emit)
+
+**Source**: src/opt/pass_native_emit.c
+**Old source (deleted)**: src/opt/pass_emit.c
+
+### Old implementation: `opt_emit` / `replay_func_to`
+
+`opt_emit` replayed the optimized MIR through the wrapped `CGTarget` using
+`replay_func_to(identity=1)`. The `identity_regs` flag meant that PReg ids
+from the optimized IR were used directly as `Reg` values when calling the
+backend. The key mechanism:
+
+- **`xlat_storage`**: Checked `opt_preg_alloc_kind(f, pr)` to map a PReg's
+ storage to either `CG_LOCAL_STORAGE_REG` (hard register) or
+ `CG_LOCAL_STORAGE_FRAME` (spill slot).
+- **`plan_hard_regs` / `reserve_hard_regs`**: Scanned the replayed IR once to
+ collect all hard registers used, then called the wrapped backend's
+ `plan_hard_regs` / `reserve_hard_regs` hooks. This allowed the native
+ backend's frame layout to know the full set of callee-saved registers before
+ emitting the prologue.
+- **`func_begin_known_frame`**: If the target supported it, the frame layout
+ (slot sizes, alignments, max outgoing call stack) was computed upfront and
+ passed to the backend in a single `func_begin_known_frame` call. Otherwise
+ fell back to the streaming `frame_slot` / `func_begin` protocol.
+
+### New implementation: `opt_emit_native`
+
+`opt_emit_native` replays the MIR into a `NativeTarget` using `NativeLoc` as
+the currency (not `Reg`). The key differences:
+
+- **`loc_from_operand`**: Converts an `OptOperand` to a `NativeLoc` —
+ `OPT_OPK_REG` → `NATIVE_LOC_REG`, `OPT_OPK_LOCAL` (frame slot) →
+ `NATIVE_LOC_FRAME`, `OPT_OPK_IMM` → `NATIVE_LOC_IMM`, etc.
+- **`map_frame_slots`**: Pre-allocates all frame slots by calling
+ `target->frame_slot` for each `IRFrameSlot`. Frame slot mapping is done
+ upfront via a single scan, similar to the old `known_frame` path.
+- **`materialize`**: Loads a non-register `NativeLoc` into a scratch register
+ when a register operand is required (e.g., for call callee address).
+- **`legalize_addr`**: After constructing a `NativeAddr`, calls
+ `target->addr_legal` to check if the address shape is supported, and if not,
+ collapses it to a base register via a `load_addr`.
+
+The `plan_hard_regs` / `reserve_hard_regs` hooks are not called in the new
+implementation. This means the native backend must infer callee-saved register
+usage from the frame layout pass rather than receiving an explicit list. Whether
+this is correct depends on the backend.
+
+---
+
+## §15 — O0 path
+
+At `opt_level == 0`, `opt_cgtarget_new` is not called. The CgTarget returned by
+`backend->make` is used directly. Before the refactor, the native backends were
+CGTarget implementations (`CGTarget`-style). After `edbd83e` (aa64: delete old
+CGTarget backend) and related commits, the native backends use `NativeDirectTarget`.
+
+The O0 regression (cfree O0 is 3.25x slower than gcc-15 -O0 as a geomean across
+9 benchmarks) is therefore entirely in `NativeDirectTarget`, which is documented
+as a work-in-progress single-pass lowering with many known limitations.
+
+---
+
+## Summary of regressions introduced by f60a16d
+
+### Regression A — Bypassed functions (O1 quality loss)
+
+Functions with alloca, varargs, inline asm, or aggregate types now bypass all
+optimization via `opt_replay_cg_ir_direct`. Before the refactor, these went
+through the full optimizer. This is incorrect: alloca functions should still
+benefit from register allocation on their non-alloca code; the bypass is a
+conservative workaround for unimplemented cases in `cg_ir_lower.c`.
+
+### Regression B — Local classification round-trip (O1 quality loss)
+
+The new `cg_ir_lower.c` classifies address-taken locals as FRAME storage from
+the start, rather than starting them as register storage and inserting
+explicit load/store pairs (as `opt_frame_home_addr_taken_locals` did). The
+downstream passes `opt_addr_xform_pregs` + `opt_promote_scalar_locals` are
+supposed to recover register storage for non-escaping locals, but:
+
+1. `opt_promote_scalar_locals` rejects a slot if `OPK_LOCAL` appears anywhere
+ outside `IR_LOAD.opnds[1]` or `IR_STORE.opnds[0]`. But `lower_operand_value`
+ emits `OPK_LOCAL` for ALL value uses of an address-taken local — including
+ call args and binop operands — not only through explicit load/store.
+
+2. Any local whose `IR_ADDR_OF` use is not fully foldable by `opt_addr_xform_pregs`
+ (because the pointer truly escapes, or because of EA-shaped uses) retains
+ `FSF_ADDR_TAKEN` and is never promoted.
+
+The net effect: locals that are address-taken but whose address doesn't truly
+escape (very common in C — temporaries, loop iteration variables, short-lived
+pointers) stay in FRAME storage at O1 and incur memory traffic that the old
+optimizer eliminated.
+
+### Regression C — Missing plan_hard_regs / reserve_hard_regs (potential correctness issue)
+
+The old `replay_func_to` called `target->plan_hard_regs` and `target->reserve_hard_regs`
+with the complete set of hard registers used by the function. These hooks let the
+native backend plan its callee-save region before emitting the prologue. The new
+`opt_emit_native` does not call these hooks. If any backend relies on them for
+correct callee-save handling, this is a correctness issue, not just a quality
+issue.
+
+### Regression D — O0 path replaced by NativeDirectTarget
+
+Not introduced by f60a16d itself, but by the series of commits ending at
+`edbd83e` (delete old CGTarget backend). The O0 regression is a NativeDirectTarget
+limitation, separate from the optimizer.
+
+---
+
+## Remediation priorities
+
+1. **Fix local classification** (§2 / Regression B): Either restore the
+ `opt_frame_home_addr_taken_locals` approach (start as REG, insert sync
+ pairs lazily), or extend `opt_promote_scalar_locals` to handle the
+ `OPK_LOCAL`-in-value-position case. The simpler fix is to not use
+ `OPK_LOCAL` as a value operand in `lower_operand_value` for address-taken
+ locals; instead emit `IR_LOAD(preg, frame_slot)` inline, matching what
+ `opt_frame_home_addr_taken_locals` used to produce.
+
+2. **Restore bypass functions to optimizer** (Regression A): Extend
+ `cg_ir_lower.c` to handle the cases currently routed to
+ `opt_replay_cg_ir_direct`. At minimum, alloca and non-aggregate vararg
+ functions should go through the optimizer.
+
+3. **Wire plan_hard_regs / reserve_hard_regs** (Regression C): Survey the
+ aarch64 / x64 / rv64 native backends to check whether they implement these
+ hooks and what happens when they are not called.
+
+---
+
+# Recovery Project: goal, decisions, and checklist
+
+This section tracks the active work to **fully recover O1 on the new
+`CgIrFunc` → `NativeTarget` path and delete the direct-replay path entirely**.
+It supersedes the speculative remediation notes above where they conflict.
+
+## Goal / success criteria
+
+1. **Completeness (priority 1):** every function compiles through the
+ optimizer pipeline. No `opt_replay_cg_ir_direct`. No per-op bypass.
+2. **Correctness (priority 2):** `CFREE_OPT_LEVELS=1 CFREE_TEST_PATHS=R
+ ./test/toy/run.sh` fully green, and the default test suites stay green.
+3. **Performance (priority 3):** O1 vs `gcc-15 -O0` (baseline in
+ `doc/OPT_PERF.md`): compile-time ≈5x faster, runtime ≈2x faster (geomean).
+ Measure only from a **clean `RELEASE=1` build** (the dev build is
+ ASan/UBSan-instrumented and not representative).
+
+Scope: **aa64 only** (the only `NativeTarget` implementation; host arch here).
+x64/rv64 `NativeTarget` ports are out of scope.
+
+## Design decisions
+
+- **Arch/ABI neutrality at the optimizer boundary.** The optimizer (cg_ir_lower,
+ pass_native_emit, machinize) must make no ABI/layout assumptions. For varargs
+ it passes the va_list *pointer* opaquely (a `NativeLoc`) and the `va_arg`
+ type; all layout knowledge (pointer ABI vs AAPCS64 register-save-area, field
+ offsets, sizes) lives behind the `NativeTarget` va hooks, which query
+ `abi_va_list_layout`. Same principle applies to asm and aggregates.
+- **Backend scratch discipline.** A `NativeTarget` op may use at most the **2
+ reserved scratch registers** (aa64 `x16`/`x17` = TMP0/TMP1) as private
+ temporaries. Every *other* register it needs (operand bases, results) must be
+ **provided by the caller**, so the optimizer owns allocation/planning and the
+ op clobbers nothing the register allocator tracks live. This removed any need
+ for a va clobber-set; it is the contract new hooks must follow.
+- **plan/reserve_hard_regs are unnecessary (Regression C is closed).** The aa64
+ `NativeTarget` exposes only caller-saved registers as allocable
+ (`aa_int_allocable = {x8,x11..x15}`, scratch `{x9,x10}`), so the allocator
+ never assigns a callee-saved register and the prologue correctly saves only
+ FP/LR. Every emit hook already receives fully-resolved hard regs. The hooks
+ are not wired and will not be. The *consequence* is a small allocable set
+ (perf item below), not a correctness gap.
+- **Shared cores, two thin wrappers.** aa64 backend ops are factored so the
+ semantic-operand path (`NativeDirectTarget`, -O0) and the `NativeLoc` path
+ (optimizer, O1) call one shared core; the wrappers only convert operands.
+
+## Checklist
+
+Integration test (drives this work):
+`CFREE_NO_DIRECT_REPLAY=1 CFREE_OPT_LEVELS=1 CFREE_TEST_PATHS=R ./test/toy/run.sh`
+(`CFREE_NO_DIRECT_REPLAY` is a temporary gate in `opt.c` that forces every
+function through the optimizer; delete it once the bypass is removed.)
+
+Completeness — route all ops through the optimizer:
+- [x] **Varargs** — `va_start_/va_arg_/va_end_/va_copy_` hooks on `NativeTarget`;
+ aa64 shared cores (2-temp discipline); cg_ir_lower lowers va operands as
+ pointer *values*; `local_address_used_in_cg_ir` no longer flags va
+ operands as address-taken.
+- [x] **Latent `IR_ADDR_OF` spill-writeback bug** in `pass_native_emit.c`
+ (ADDR_OF result computed into scratch was never stored back).
+- [ ] **Inline asm** (`IR_ASM_BLOCK`) — add a `NativeTarget` `asm_block` that
+ *binds the optimizer's pre-allocated operand registers* to the template
+ (machinize already fills `out_fixed_regs`/`in_fixed_regs`/`clobber_mask`
+ in `pass_machinize.c`; lower/analysis already apply them). Must NOT
+ self-allocate (the direct path does, which is unsafe when values are live
+ in regs across the asm). Refactor aa64 asm clobber-mask / callee-save /
+ restore helpers off `NativeDirectTarget` (same wrapper pattern as va).
+ Toy cases: 102,104,105,108,110,19,20.
+- [ ] **Aggregates / sret / byval** — ABI lowering gaps in the optimizer path:
+ 124 (slices, wrong value), 130 (record sret, wrong codegen), 36 ("scalar
+ too large" panic), 37 (tail sret). Covers aggregate params/results,
+ sret returns, and aggregate/by-value call arguments.
+- [ ] **BREAK_TO / CONTINUE_TO + SCOPE cond** — currently unused by frontends
+ (toy/c lower break/continue to `BR`+labels), but unwired in emit. Either
+ lower them to CFG edges in cg_ir_lower or wire emit, for true
+ completeness once a producer exists.
+
+Direct-path deletion:
+- [ ] Delete `opt_func_needs_direct_replay`, `opt_replay_cg_ir_direct`, the
+ `OptReplay` machinery, and the `replay_*` helpers in `opt.c`.
+- [ ] Remove the `CFREE_NO_DIRECT_REPLAY` env gate.
+
+Performance (priority 3, after completeness + correctness):
+- [ ] **Expand aa64 allocable set** — only 6 int allocable regs today; add
+ callee-saved x19..x28 (and callee-saved FP) with backend-tracked prologue
+ save/restore (`patch_apply` already rewrites the prologue after the body).
+ Likely the bulk of the current runtime regression.
+- [ ] **Local classification (Regression B)** — verify non-escaping
+ address-taken locals get promoted to registers; close any gap between
+ `opt_addr_xform_pregs` + `opt_promote_scalar_locals` and what the old path
+ achieved.
+- [ ] **Unit tests** — new targeted tests for the `CgIrFunc`→`NativeTarget`
+ path (local promotion, addr-fold, regalloc, lowered bypass ops);
+ re-enable a `test-opt` make target (old `test/opt/opt_test.c` is disabled
+ and uses the pre-refactor API).
+- [ ] **Benchmark** — clean `RELEASE=1` build, run `make bench-opt`, confirm the
+ O1-vs-`gcc-15 -O0` targets; refresh `doc/OPT_PERF.md`.
+
+## Progress log
+
+- Varargs landed end-to-end on the optimizer path; `IR_ADDR_OF` writeback fixed.
+ Bypass-disabled R-path failures: 14 → 11. Default R-path (O0+O1): 408/408.
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -2539,6 +2539,13 @@ static const NativeRegInfo aa_reg_info = {
.nclasses = sizeof aa_classes / sizeof aa_classes[0],
};
+static void aa_va_start_native(NativeTarget* t, NativeLoc ap_ptr);
+static void aa_va_arg_native(NativeTarget* t, NativeLoc dst, NativeLoc ap_ptr,
+ CfreeCgTypeId type);
+static void aa_va_end_native(NativeTarget* t, NativeLoc ap_ptr);
+static void aa_va_copy_native(NativeTarget* t, NativeLoc dst_ap_ptr,
+ NativeLoc src_ap_ptr);
+
NativeTarget* aa64_native_target_new(Compiler* c, ObjBuilder* obj,
MCEmitter* mc) {
AANativeTarget* a = arena_znew(c->tu, AANativeTarget);
@@ -2590,6 +2597,10 @@ NativeTarget* aa64_native_target_new(Compiler* c, ObjBuilder* obj,
t->atomic_rmw = aa_atomic_rmw;
t->atomic_cas = aa_atomic_cas;
t->fence = aa_fence;
+ t->va_start_ = aa_va_start_native;
+ t->va_arg_ = aa_va_arg_native;
+ t->va_end_ = aa_va_end_native;
+ t->va_copy_ = aa_va_copy_native;
t->intrinsic = aa_intrinsic;
t->file_scope_asm = aa_file_scope_asm;
t->trap = aa_trap;
@@ -2779,154 +2790,232 @@ static void aa_load_ap_addr(NativeDirectTarget* d, Operand ap_addr,
d->native->load_addr(d->native, dst, ap);
}
-static void aa_va_start_(NativeDirectTarget* d, Operand ap_addr) {
- AANativeTarget* a = aa_of(d->native);
- ABIVaListInfo vai = abi_va_list_layout(d->base.c->abi);
+/* The va cores use only the reserved, non-allocable registers x16 (TMP0) and
+ * x17 (TMP1) for their private temporaries, so they never clobber a value the
+ * optimizer's register allocator may hold live across the op. The va_list base
+ * register is supplied by the caller (ap.base.reg) and must be distinct from
+ * TMP0/TMP1; va_arg's result register is likewise caller-provided. */
+static u32 aa_va_base_reg(AANativeTarget* a, NativeAddr ap) {
+ if (ap.base_kind != NATIVE_ADDR_BASE_REG)
+ compiler_panic(a->base.c, a->func ? a->func->loc : (SrcLoc){0, 0, 0},
+ "aarch64 native target: va_list pointer not in register");
+ return ap.base.reg;
+}
+
+/* va_list layout is queried from the ABI; the optimizer/direct callers pass the
+ * va_list pointer opaquely. `ap` addresses the va_list object itself. */
+static void aa_va_start_core(AANativeTarget* a, NativeAddr ap) {
+ NativeTarget* t = &a->base;
+ ABIVaListInfo vai = abi_va_list_layout(t->c->abi);
NativeLoc ptr =
aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT, AA_TMP0);
- NativeAddr dst = aa_direct_pointer_addr(d, ap_addr);
if (vai.kind == ABI_VA_LIST_POINTER) {
aa_emit_add_imm(a, AA_TMP0, AA_FP, (i32)a->next_param_stack);
- aa_emit_mem(a, 0, ptr, dst, aa_mem_for_type(d->native, ptr.type, 8));
+ aa_emit_mem(a, 0, ptr, ap, aa_mem_for_type(t, ptr.type, 8));
return;
}
if (vai.kind == ABI_VA_LIST_AAPCS64) {
CfreeCgTypeId i32_ty = builtin_id(CFREE_CG_BUILTIN_I32);
NativeLoc i32tmp = aa_reg_loc(i32_ty, NATIVE_REG_INT, AA_TMP1);
- MemAccess ptr_mem = aa_mem_for_type(d->native, ptr.type, 8);
- MemAccess i32_mem = aa_mem_for_type(d->native, i32_ty, 4);
+ MemAccess ptr_mem = aa_mem_for_type(t, ptr.type, 8);
+ MemAccess i32_mem = aa_mem_for_type(t, i32_ty, 4);
AANativeSlot* gr = aa_slot(a, a->va_gr_slot);
AANativeSlot* vr = aa_slot(a, a->va_vr_slot);
+ u32 base = aa_va_base_reg(a, ap);
u32 used_gr = a->next_param_int < vai.gp_reg_count ? a->next_param_int
: vai.gp_reg_count;
u32 used_vr = a->next_param_fp < vai.fp_reg_count ? a->next_param_fp
: vai.fp_reg_count;
- aa_load_ap_addr(d, ap_addr, 15u);
aa_emit_add_imm(a, AA_TMP0, AA_FP, (i32)a->next_param_stack);
- aa_emit_mem(a, 0, ptr, aa_reg_addr(ptr.type, 15u, (i32)vai.stack_offset),
+ aa_emit_mem(a, 0, ptr, aa_reg_addr(ptr.type, base, (i32)vai.stack_offset),
ptr_mem);
aa_emit_add_imm(a, AA_TMP0, AA_FP,
-(i32)gr->off + (i32)(vai.gp_reg_count * vai.gp_slot_size));
- aa_emit_mem(a, 0, ptr, aa_reg_addr(ptr.type, 15u, (i32)vai.gr_top_offset),
+ aa_emit_mem(a, 0, ptr, aa_reg_addr(ptr.type, base, (i32)vai.gr_top_offset),
ptr_mem);
aa_emit_add_imm(a, AA_TMP0, AA_FP,
-(i32)vr->off + (i32)(vai.fp_reg_count * vai.fp_slot_size));
- aa_emit_mem(a, 0, ptr, aa_reg_addr(ptr.type, 15u, (i32)vai.vr_top_offset),
+ aa_emit_mem(a, 0, ptr, aa_reg_addr(ptr.type, base, (i32)vai.vr_top_offset),
ptr_mem);
- aa_emit_load_imm(a->base.mc, 0, AA_TMP1,
+ aa_emit_load_imm(t->mc, 0, AA_TMP1,
-(i32)((vai.gp_reg_count - used_gr) * vai.gp_slot_size));
- aa_emit_mem(a, 0, i32tmp, aa_reg_addr(i32_ty, 15u, (i32)vai.gr_offs_offset),
+ aa_emit_mem(a, 0, i32tmp, aa_reg_addr(i32_ty, base, (i32)vai.gr_offs_offset),
i32_mem);
- aa_emit_load_imm(a->base.mc, 0, AA_TMP1,
+ aa_emit_load_imm(t->mc, 0, AA_TMP1,
-(i32)((vai.fp_reg_count - used_vr) * vai.fp_slot_size));
- aa_emit_mem(a, 0, i32tmp, aa_reg_addr(i32_ty, 15u, (i32)vai.vr_offs_offset),
+ aa_emit_mem(a, 0, i32tmp, aa_reg_addr(i32_ty, base, (i32)vai.vr_offs_offset),
i32_mem);
return;
}
- {
- compiler_panic(d->base.c, d->loc,
- "aarch64 native target: unsupported va_list layout");
- }
+ compiler_panic(t->c, a->func ? a->func->loc : (SrcLoc){0, 0, 0},
+ "aarch64 native target: unsupported va_list layout");
}
-static void aa_va_arg_(NativeDirectTarget* d, Operand dst_op, Operand ap_addr,
- CfreeCgTypeId type) {
- AANativeTarget* a = aa_of(d->native);
- ABIVaListInfo vai = abi_va_list_layout(d->base.c->abi);
+static void aa_va_arg_core(AANativeTarget* a, NativeLoc dst, NativeAddr ap,
+ CfreeCgTypeId type) {
+ NativeTarget* t = &a->base;
+ ABIVaListInfo vai = abi_va_list_layout(t->c->abi);
NativeLoc cur =
aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT, AA_TMP0);
- NativeLoc val = aa_reg_loc(
- type, cg_type_is_float(d->base.c, type) ? NATIVE_REG_FP : NATIVE_REG_INT,
- cg_type_is_float(d->base.c, type) ? 16u : 9u);
- NativeAddr src, dst;
- MemAccess ptr_mem = aa_mem_for_type(d->native, cur.type, 8);
- MemAccess val_mem =
- aa_mem_for_type(d->native, type, type_size32(d->native, type));
+ /* The fetched value is written directly into the caller-provided register
+ * `dst`, which the caller guarantees is distinct from the va_list base
+ * register. Only TMP0/TMP1 are used as private scratch. */
+ NativeLoc val = dst;
+ NativeAddr src;
+ MemAccess ptr_mem = aa_mem_for_type(t, cur.type, 8);
+ MemAccess val_mem = aa_mem_for_type(t, type, type_size32(t, type));
+ if (dst.kind != NATIVE_LOC_REG)
+ compiler_panic(t->c, a->func ? a->func->loc : (SrcLoc){0, 0, 0},
+ "aarch64 native target: va_arg destination must be a "
+ "register");
if (vai.kind == ABI_VA_LIST_POINTER) {
- NativeAddr ap = aa_direct_pointer_addr(d, ap_addr);
aa_emit_mem(a, 1, cur, ap, ptr_mem);
src = aa_reg_addr(type, AA_TMP0, 0);
+ aa_emit_add_imm(a, AA_TMP1, AA_TMP0, 8);
+ aa_emit_mem(a, 0, aa_reg_loc(cur.type, NATIVE_REG_INT, AA_TMP1), ap,
+ ptr_mem);
aa_emit_mem(a, 1, val, src, val_mem);
- aa_emit_add_imm(a, AA_TMP0, AA_TMP0, 8);
- aa_emit_mem(a, 0, cur, ap, ptr_mem);
- dst = aa_direct_materialize_addr(d, dst_op);
- aa_emit_mem(a, 0, val, dst, val_mem);
return;
}
if (vai.kind == ABI_VA_LIST_AAPCS64) {
CfreeCgTypeId i32_ty = builtin_id(CFREE_CG_BUILTIN_I32);
NativeLoc off = aa_reg_loc(i32_ty, NATIVE_REG_INT, AA_TMP1);
- MemAccess i32_mem = aa_mem_for_type(d->native, i32_ty, 4);
- int is_fp = cg_type_is_float(d->base.c, type);
+ MemAccess i32_mem = aa_mem_for_type(t, i32_ty, 4);
+ int is_fp = cg_type_is_float(t->c, type);
+ u32 base = aa_va_base_reg(a, ap);
u32 offs_field = is_fp ? vai.vr_offs_offset : vai.gr_offs_offset;
u32 top_field = is_fp ? vai.vr_top_offset : vai.gr_top_offset;
u32 slot_size = is_fp ? vai.fp_slot_size : vai.gp_slot_size;
- MCLabel stack_label = d->native->mc->label_new(d->native->mc);
- MCLabel done_label = d->native->mc->label_new(d->native->mc);
- aa_load_ap_addr(d, ap_addr, 15u);
- aa_emit_mem(a, 1, off, aa_reg_addr(i32_ty, 15u, (i32)offs_field), i32_mem);
- aa_emit32(a->base.mc, aa64_subs_imm12(0, AA64_ZR, AA_TMP1, 0, 0));
- aa_emit32(a->base.mc,
- aa64_brcond_pack((AA64BrCond){.cond = cmp_cond(CMP_GE_S)}));
- a->base.mc->emit_label_ref(a->base.mc, stack_label, R_AARCH64_CONDBR19, 4,
- 0);
- aa_emit_mem(a, 1, cur, aa_reg_addr(cur.type, 15u, (i32)top_field), ptr_mem);
- aa_emit32(a->base.mc, aa_sbfm(1, AA_TMP1, AA_TMP1, 0, 31));
- aa_emit32(a->base.mc, aa64_add(1, AA_TMP0, AA_TMP0, AA_TMP1));
+ MCLabel stack_label = t->mc->label_new(t->mc);
+ MCLabel done_label = t->mc->label_new(t->mc);
+ aa_emit_mem(a, 1, off, aa_reg_addr(i32_ty, base, (i32)offs_field), i32_mem);
+ aa_emit32(t->mc, aa64_subs_imm12(0, AA64_ZR, AA_TMP1, 0, 0));
+ aa_emit32(t->mc, aa64_brcond_pack((AA64BrCond){.cond = cmp_cond(CMP_GE_S)}));
+ t->mc->emit_label_ref(t->mc, stack_label, R_AARCH64_CONDBR19, 4, 0);
+ aa_emit_mem(a, 1, cur, aa_reg_addr(cur.type, base, (i32)top_field), ptr_mem);
+ aa_emit32(t->mc, aa_sbfm(1, AA_TMP1, AA_TMP1, 0, 31));
+ aa_emit32(t->mc, aa64_add(1, AA_TMP0, AA_TMP0, AA_TMP1));
aa_emit_mem(a, 1, val, aa_reg_addr(type, AA_TMP0, 0), val_mem);
aa_emit_add_imm(a, AA_TMP1, AA_TMP1, (i32)slot_size);
- aa_emit_mem(a, 0, off, aa_reg_addr(i32_ty, 15u, (i32)offs_field), i32_mem);
- aa_emit32(a->base.mc, aa64_b(0));
- a->base.mc->emit_label_ref(a->base.mc, done_label, R_AARCH64_JUMP26, 4, 0);
- a->base.mc->label_place(a->base.mc, stack_label);
- aa_emit_mem(a, 1, cur, aa_reg_addr(cur.type, 15u, (i32)vai.stack_offset),
+ aa_emit_mem(a, 0, off, aa_reg_addr(i32_ty, base, (i32)offs_field), i32_mem);
+ aa_emit32(t->mc, aa64_b(0));
+ t->mc->emit_label_ref(t->mc, done_label, R_AARCH64_JUMP26, 4, 0);
+ t->mc->label_place(t->mc, stack_label);
+ aa_emit_mem(a, 1, cur, aa_reg_addr(cur.type, base, (i32)vai.stack_offset),
ptr_mem);
aa_emit_mem(a, 1, val, aa_reg_addr(type, AA_TMP0, 0), val_mem);
aa_emit_add_imm(a, AA_TMP0, AA_TMP0, 8);
- aa_emit_mem(a, 0, cur, aa_reg_addr(cur.type, 15u, (i32)vai.stack_offset),
+ aa_emit_mem(a, 0, cur, aa_reg_addr(cur.type, base, (i32)vai.stack_offset),
ptr_mem);
- a->base.mc->label_place(a->base.mc, done_label);
- dst = aa_direct_materialize_addr(d, dst_op);
- aa_emit_mem(a, 0, val, dst, val_mem);
+ t->mc->label_place(t->mc, done_label);
return;
}
- compiler_panic(d->base.c, d->loc,
+ compiler_panic(t->c, a->func ? a->func->loc : (SrcLoc){0, 0, 0},
"aarch64 native target: unsupported va_list layout");
}
-static void aa_va_end_(NativeDirectTarget* d, Operand ap_addr) {
- (void)d;
- (void)ap_addr;
-}
-
-static void aa_va_copy_(NativeDirectTarget* d, Operand dst_ap_addr,
- Operand src_ap_addr) {
- AANativeTarget* a = aa_of(d->native);
- ABIVaListInfo vai = abi_va_list_layout(d->base.c->abi);
+static void aa_va_copy_core(AANativeTarget* a, NativeAddr dst_ap,
+ NativeAddr src_ap) {
+ NativeTarget* t = &a->base;
+ ABIVaListInfo vai = abi_va_list_layout(t->c->abi);
NativeLoc tmp =
aa_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT, AA_TMP0);
- MemAccess mem = aa_mem_for_type(d->native, tmp.type, 8);
+ MemAccess mem = aa_mem_for_type(t, tmp.type, 8);
if (vai.kind == ABI_VA_LIST_POINTER) {
- NativeAddr src = aa_direct_pointer_addr(d, src_ap_addr);
- NativeAddr dst;
- aa_emit_mem(a, 1, tmp, src, mem);
- dst = aa_direct_pointer_addr(d, dst_ap_addr);
- aa_emit_mem(a, 0, tmp, dst, mem);
+ aa_emit_mem(a, 1, tmp, src_ap, mem);
+ aa_emit_mem(a, 0, tmp, dst_ap, mem);
return;
}
if (vai.kind == ABI_VA_LIST_AAPCS64) {
- aa_load_ap_addr(d, src_ap_addr, 14u);
- aa_load_ap_addr(d, dst_ap_addr, 15u);
+ u32 sb = aa_va_base_reg(a, src_ap);
+ u32 db = aa_va_base_reg(a, dst_ap);
for (u32 off = 0; off < vai.type.size; off += 8u) {
- aa_emit_mem(a, 1, tmp, aa_reg_addr(tmp.type, 14u, (i32)off), mem);
- aa_emit_mem(a, 0, tmp, aa_reg_addr(tmp.type, 15u, (i32)off), mem);
+ aa_emit_mem(a, 1, tmp, aa_reg_addr(tmp.type, sb, (i32)off), mem);
+ aa_emit_mem(a, 0, tmp, aa_reg_addr(tmp.type, db, (i32)off), mem);
}
return;
}
- compiler_panic(d->base.c, d->loc,
+ compiler_panic(t->c, a->func ? a->func->loc : (SrcLoc){0, 0, 0},
"aarch64 native target: unsupported va_list layout");
}
+/* ---- Direct-path (NativeDirectTarget) wrappers: convert semantic operands to
+ * NativeAddr/NativeLoc, then call the shared cores above. ---- */
+
+/* The cores reserve x16/x17 (TMP0/TMP1) as private scratch and require the
+ * va_list base register(s) to be distinct from those. aa_direct_pointer_addr
+ * returns the pointer in TMP1, so the direct wrappers instead load the va_list
+ * address into x9/x10 (via aa_load_ap_addr) before calling the cores. */
+static NativeAddr aa_direct_va_base(NativeDirectTarget* d, Operand ap_addr,
+ u32 reg) {
+ aa_load_ap_addr(d, ap_addr, reg);
+ return aa_reg_addr(builtin_id(CFREE_CG_BUILTIN_I64), reg, 0);
+}
+
+static void aa_va_start_(NativeDirectTarget* d, Operand ap_addr) {
+ aa_va_start_core(aa_of(d->native), aa_direct_va_base(d, ap_addr, 10u));
+}
+
+static void aa_va_arg_(NativeDirectTarget* d, Operand dst_op, Operand ap_addr,
+ CfreeCgTypeId type) {
+ AANativeTarget* a = aa_of(d->native);
+ int is_fp = cg_type_is_float(d->base.c, type);
+ NativeLoc res =
+ aa_reg_loc(type, is_fp ? NATIVE_REG_FP : NATIVE_REG_INT, is_fp ? 16u : 9u);
+ MemAccess val_mem =
+ aa_mem_for_type(d->native, type, type_size32(d->native, type));
+ NativeAddr dst;
+ aa_va_arg_core(a, res, aa_direct_va_base(d, ap_addr, 10u), type);
+ dst = aa_direct_materialize_addr(d, dst_op);
+ aa_emit_mem(a, 0, res, dst, val_mem);
+}
+
+static void aa_va_end_(NativeDirectTarget* d, Operand ap_addr) {
+ (void)d;
+ (void)ap_addr;
+}
+
+static void aa_va_copy_(NativeDirectTarget* d, Operand dst_ap_addr,
+ Operand src_ap_addr) {
+ AANativeTarget* a = aa_of(d->native);
+ NativeAddr src = aa_direct_va_base(d, src_ap_addr, 9u);
+ NativeAddr dst = aa_direct_va_base(d, dst_ap_addr, 10u);
+ aa_va_copy_core(a, dst, src);
+}
+
+/* ---- NativeTarget (optimizer) hooks: the optimizer passes the va_list
+ * pointer as a materialized register; layout is resolved inside the cores. ---- */
+
+static NativeAddr aa_va_addr_from_ptr(NativeLoc ap_ptr) {
+ NativeAddr addr;
+ memset(&addr, 0, sizeof addr);
+ addr.base_kind = NATIVE_ADDR_BASE_REG;
+ addr.cls = NATIVE_REG_INT;
+ addr.base.reg = ap_ptr.v.reg;
+ addr.base_type = ap_ptr.type;
+ return addr;
+}
+
+static void aa_va_start_native(NativeTarget* t, NativeLoc ap_ptr) {
+ aa_va_start_core(aa_of(t), aa_va_addr_from_ptr(ap_ptr));
+}
+
+static void aa_va_arg_native(NativeTarget* t, NativeLoc dst, NativeLoc ap_ptr,
+ CfreeCgTypeId type) {
+ aa_va_arg_core(aa_of(t), dst, aa_va_addr_from_ptr(ap_ptr), type);
+}
+
+static void aa_va_end_native(NativeTarget* t, NativeLoc ap_ptr) {
+ (void)t;
+ (void)ap_ptr;
+}
+
+static void aa_va_copy_native(NativeTarget* t, NativeLoc dst_ap_ptr,
+ NativeLoc src_ap_ptr) {
+ aa_va_copy_core(aa_of(t), aa_va_addr_from_ptr(dst_ap_ptr),
+ aa_va_addr_from_ptr(src_ap_ptr));
+}
+
AA_UNUSED_FN static const char* aa_asm_constraint_body(const char* s) {
if (!s) return "";
if (s[0] == '=' && s[1] == '&') return s + 2;
diff --git a/src/arch/native_target.h b/src/arch/native_target.h
@@ -336,6 +336,18 @@ struct NativeTarget {
NativeAddr addr, NativeLoc expected, NativeLoc desired,
MemAccess, MemOrder success, MemOrder failure);
void (*fence)(NativeTarget*, MemOrder);
+ /* Variadic support. The optimizer passes the va_list pointer opaquely as a
+ * NativeLoc materialized into an integer register (holding the address of
+ * the va_list object); va_arg additionally receives the argument type and a
+ * destination register, distinct from the pointer register, for the fetched
+ * value. All va_list layout knowledge (pointer ABI vs register-save-area
+ * ABI, field offsets, sizes) lives behind these hooks, which query the
+ * target ABI -- the optimizer makes no layout assumptions. */
+ void (*va_start_)(NativeTarget*, NativeLoc ap_ptr);
+ void (*va_arg_)(NativeTarget*, NativeLoc dst, NativeLoc ap_ptr,
+ CfreeCgTypeId type);
+ void (*va_end_)(NativeTarget*, NativeLoc ap_ptr);
+ void (*va_copy_)(NativeTarget*, NativeLoc dst_ap_ptr, NativeLoc src_ap_ptr);
void (*intrinsic)(NativeTarget*, IntrinKind, const NativeLoc* dsts, u32 ndst,
const NativeLoc* args, u32 narg);
void (*asm_block)(NativeTarget*, const char* tmpl, const AsmConstraint* outs,
diff --git a/src/opt/cg_ir_lower.c b/src/opt/cg_ir_lower.c
@@ -126,17 +126,15 @@ static int local_address_used_in_cg_ir(const CgIrFunc* f, CGLocal local) {
return 1;
break;
case CG_IR_AGG_COPY:
- case CG_IR_VA_COPY:
if ((in->nopnds > 0u &&
operand_uses_local_addr(&in->opnds[0], local)) ||
(in->nopnds > 1u && operand_uses_local_addr(&in->opnds[1], local)))
return 1;
break;
- case CG_IR_VA_START:
- case CG_IR_VA_END:
- if (in->nopnds > 0u && operand_uses_local_addr(&in->opnds[0], local))
- return 1;
- break;
+ /* VA_START/VA_ARG/VA_END/VA_COPY consume a pointer *value* (the address of
+ * the va_list, produced by an earlier ADDR_OF); they do not take the
+ * address of their pointer operand, so they must not force it to a frame
+ * slot. */
default:
break;
}
@@ -977,10 +975,13 @@ static void lower_one_inst(CgIrLower* l, u32 idx) {
break;
case CG_IR_VA_START:
case CG_IR_VA_END:
- lower_addr_value_ops(l, out, in, 1, 0);
+ /* The operand is a pointer value (the address of the va_list object),
+ * produced by an earlier ADDR_OF. Lower as a value so it can live in a
+ * register; the backend va hook consumes the pointer. */
+ lower_use_ops(l, out, in, 1);
break;
case CG_IR_VA_COPY:
- lower_addr_value_ops(l, out, in, 2, 0);
+ lower_use_ops(l, out, in, 2);
break;
case CG_IR_ATOMIC_RMW:
lower_value_ops(l, out, in, 3);
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -41,6 +41,8 @@ static int opt_type_large_or_aggregate(Compiler* c, CfreeCgTypeId ty) {
}
static int opt_func_needs_direct_replay(OptImpl* o, const CgIrFunc* f) {
+ extern char* getenv(const char*);
+ if (getenv("CFREE_NO_DIRECT_REPLAY")) return 0;
for (u32 i = 0; i < f->desc.nresults; ++i)
if (opt_type_large_or_aggregate(o->c, f->desc.result_types[i])) return 1;
for (u32 i = 0; i < f->desc.nparams; ++i)
diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c
@@ -754,14 +754,19 @@ static void emit_inst(NativeEmitCtx* e, u32 block, u32 order_index, Inst* in,
}
e->target->store(e->target, addr, src, in->extra.mem);
return;
- case IR_ADDR_OF:
- dst = loc_from_operand(e, &in->opnds[0], in->loc);
+ case IR_ADDR_OF: {
+ NativeLoc real = loc_from_operand(e, &in->opnds[0], in->loc);
addr = addr_from_operand(e, &in->opnds[1], in->loc);
+ dst = real;
if (dst.kind != NATIVE_LOC_REG)
- dst = materialize(e, dst, class_for_type(e, in->opnds[0].type),
- in->opnds[0].type, REG_NONE, REG_NONE, in->loc);
+ dst = scratch_loc(e, in->opnds[0].type,
+ class_for_type(e, in->opnds[0].type), REG_NONE,
+ REG_NONE, in->loc);
e->target->load_addr(e->target, dst, addr);
+ if (real.kind != NATIVE_LOC_REG)
+ write_loc(e, real, dst, mem_for_type(e->c, in->opnds[0].type), in->loc);
return;
+ }
case IR_TLS_ADDR_OF: {
IRTlsAux* aux = (IRTlsAux*)in->extra.aux;
dst = loc_from_operand(e, &in->opnds[0], in->loc);
@@ -1064,10 +1069,44 @@ static void emit_inst(NativeEmitCtx* e, u32 block, u32 order_index, Inst* in,
mem_for_type(e->c, in->opnds[1].type), in->loc);
return;
}
- case IR_VA_START:
- case IR_VA_ARG:
- case IR_VA_END:
- case IR_VA_COPY:
+ case IR_VA_START: {
+ NativeLoc ap = materialize(e, loc_from_operand(e, &in->opnds[0], in->loc),
+ NATIVE_REG_INT, in->opnds[0].type, REG_NONE,
+ REG_NONE, in->loc);
+ e->target->va_start_(e->target, ap);
+ return;
+ }
+ case IR_VA_END: {
+ NativeLoc ap = materialize(e, loc_from_operand(e, &in->opnds[0], in->loc),
+ NATIVE_REG_INT, in->opnds[0].type, REG_NONE,
+ REG_NONE, in->loc);
+ e->target->va_end_(e->target, ap);
+ return;
+ }
+ case IR_VA_COPY: {
+ NativeLoc d = materialize(e, loc_from_operand(e, &in->opnds[0], in->loc),
+ NATIVE_REG_INT, in->opnds[0].type, REG_NONE,
+ REG_NONE, in->loc);
+ NativeLoc s = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc),
+ NATIVE_REG_INT, in->opnds[1].type, d.v.reg,
+ REG_NONE, in->loc);
+ e->target->va_copy_(e->target, d, s);
+ return;
+ }
+ case IR_VA_ARG: {
+ CfreeCgTypeId ty = in->opnds[0].type;
+ NativeLoc ap = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc),
+ NATIVE_REG_INT, in->opnds[1].type, REG_NONE,
+ REG_NONE, in->loc);
+ /* The result must land in a register distinct from the va_list pointer;
+ * fetch into a scratch register, then write to the real destination. */
+ NativeLoc res = scratch_loc(e, ty, class_for_type(e, ty), ap.v.reg,
+ REG_NONE, in->loc);
+ e->target->va_arg_(e->target, res, ap, ty);
+ write_loc(e, loc_from_operand(e, &in->opnds[0], in->loc), res,
+ mem_for_type(e->c, ty), in->loc);
+ return;
+ }
case IR_BREAK_TO:
case IR_CONTINUE_TO:
case IR_ASM_BLOCK: