commit 571865addf5f159357059b659a46110f6388010d
parent a126becea27945ea9b961a41958d6d4dbdf93dff
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Sat, 23 May 2026 10:05:53 -0700
opt: O1 perf — MIR-shaped combine, block reorder, scalar promote, addr-of-global CSE
- pass_combine: rewrite as a MIR-shaped per-BB fixpoint. Substitutes
IR_COPY / IR_LOAD_IMM / IR_CONVERT producers into register, indirect-
base, and indirect-index slots; synthesizes address modes from
IR_BINOP IADD/ISHL chains into OPK_INDIRECT base+index+scale+ofs;
sinks single-use producers into IR_COPY destinations; folds
ext-of-ext chains. Single-use accounting now counts indirect-index
references correctly.
- pass_jump: greedy chain-extension reorder of emit_order so each
conditional's fallthrough successor (and each unconditional br's
target) lands adjacent. Lets the existing branch-invert and
trailing-branch-strip cleanups collapse loop back-edges that the
frontend's head→inc→body layout otherwise leaves dangling.
- pass_o2: new opt_promote_scalar_locals replaces FS_LOCAL scalar slots
whose addr-of has fully folded with mutable PRegs (load → IR_COPY,
store → IR_COPY or IR_LOAD_IMM). New opt_addr_of_global_cse hoists
duplicate IR_ADDR_OF(global) defs into the entry block and remaps
uses; shrinks loop bodies by the per-iter adrp/add the backend would
otherwise re-emit.
- doc/OPT_PERF.md: refresh headline geomeans after the perf+correctness
combiner fixes, MIR-shaped allocator, and block-layout pass.
Diffstat:
7 files changed, 2067 insertions(+), 233 deletions(-)
diff --git a/doc/OPT_PERF.md b/doc/OPT_PERF.md
@@ -96,31 +96,133 @@ hosted/math benchmarks.
## Headline Geomeans
-Base: each benchmark's `gcc-15 -O2` row is `1.0x`.
-
-Representative 8-case scope:
-
-| tool | opt | cases | compile speed | runtime speed |
+Refreshed 2026-05-23 after the combiner perf+correctness fixes, MIR-shaped
+allocator, and block-layout fallthrough pass. Scope: 8-case
+(`array`, `binary-trees`, `hash`, `hash2`, `matrix`, `nbody`, `sieve`,
+`spectral-norm`). MIR fails `binary-trees`, `nbody`, `spectral-norm` on
+Darwin so its rows reflect 5 cases.
+
+Base: each benchmark's `gcc-15 -O2` row is `1.0x`. Higher compile-speed
+ratio = faster compile; higher runtime-speed ratio = faster runtime.
+
+The compile column is the **whole pipeline** (C frontend + optimizer +
+codegen + link + JIT). For MIR this sums `compile_ms` (c2m source→bmir)
+and `codegen_ms` (`MIR link finish` from the JIT invocation). For
+cfree-run it is `cfree-run compile_and_jit`. **Most of cfree's compile
+lead over MIR comes from cfree's C frontend** — for the backend-only
+slice see *Optimizer/Link/JIT Split* below (cfree and MIR are roughly
+tied there).
+
+| tool | opt | cases | compile speed | runtime speed | avg comp+gen (ms) | avg runtime (ms) |
+| --- | ---: | ---: | ---: | ---: | ---: | ---: |
+| `gcc-15` | `O0` | 8 | `1.172x` | `0.409x` | `149.5` | `6905.2` |
+| `gcc-15` | `O1` | 8 | `1.110x` | `0.955x` | `157.7` | `2960.3` |
+| `gcc-15` | `O2` | 8 | `1.000x` | `1.000x` | `175.1` | `2827.2` |
+| `clang` | `O0` | 8 | `1.282x` | `0.367x` | `136.6` | `7707.3` |
+| `clang` | `O1` | 8 | `1.209x` | `0.958x` | `144.8` | `2952.6` |
+| `clang` | `O2` | 8 | `1.141x` | `1.062x` | `153.5` | `2661.4` |
+| `mir-c2m` | `O0` | 5 | `1.224x` | `0.549x` | `141.0` | `5103.6` |
+| `mir-c2m` | `O1` | 5 | `1.247x` | `0.607x` | `138.4` | `4612.5` |
+| `mir-c2m` | `O2` | 5 | `1.272x` | `0.786x` | `135.6` | `3564.5` |
+| `cfree` | `O0` | 8 | `3.468x` | `0.337x` | `50.5` | `8394.7` |
+| `cfree` | `O1` | 8 | `3.370x` | `0.777x` | `52.0` | `3637.0` |
+| `cfree` | `O2` | 7 | `3.336x` | `0.681x` | `52.7` | `4152.8` |
+| `cfree-run` | `O0` | 8 | `5.594x` | `0.341x` | `31.3` | `8285.0` |
+| `cfree-run` | `O1` | 8 | `6.688x` | `0.807x` | `26.2` | `3505.4` |
+| `cfree-run` | `O2` | 7 | `5.865x` | `0.703x` | `30.0` | `4022.1` |
+
+Pairwise whole-pipeline geomeans at the same opt level (cfree-run vs
+peer; ratio `<1.0` = cfree-run is faster, `>1.0` = peer is faster):
+
+| peer | opt | cases | whole-pipeline compile (cfree/peer) | runtime (cfree/peer) |
| --- | ---: | ---: | ---: | ---: |
-| `cfree` | `O0` | 8 | `5.649x` | `0.346x` |
-| `cfree` | `O1` | 8 | `5.792x` | `0.502x` |
-| `cfree` | `O2` | 8 | `5.730x` | `0.468x` |
-| `cfree-run` | `O0` | 8 | `11.229x` | `0.363x` |
-| `cfree-run` | `O1` | 8 | `11.647x` | `0.519x` |
-| `cfree-run` | `O2` | 8 | `10.474x` | `0.481x` |
-| `mir-c2m` | `O0` | 5 | `2.476x` | `0.550x` |
-| `mir-c2m` | `O1` | 5 | `2.515x` | `0.574x` |
-| `mir-c2m` | `O2` | 5 | `2.437x` | `0.786x` |
+| `gcc-15` | `O0` | 8 | `0.21x` | `1.20x` |
+| `gcc-15` | `O1` | 8 | `0.17x` | `1.18x` |
+| `gcc-15` | `O2` | 7 | `0.17x` | `1.42x` |
+| `clang` | `O0` | 8 | `0.23x` | `1.08x` |
+| `clang` | `O1` | 8 | `0.18x` | `1.19x` |
+| `clang` | `O2` | 7 | `0.20x` | `1.51x` |
+| `mir-c2m` | `O0` | 5 | `0.24x` | `1.55x` |
+| `mir-c2m` | `O1` | 5 | `0.20x` | `0.82x` |
+| `mir-c2m` | `O2` | 4 | `0.23x` | `1.16x` |
+
+Backend-only (no frontend) `O1` comparison vs MIR (5-case scope, from
+the *Optimizer/Link/JIT Split* table below):
+
+| metric | cfree | MIR | cfree/MIR |
+| --- | ---: | ---: | ---: |
+| opt+link+JIT geomean (ms) | `0.701` | `0.726` | `0.97x` (3% faster) |
Interpretation:
-- cfree wins the whole compile/JIT headline mostly because its C frontend and
- hosted-header path are much faster, and because MIR fails the Darwin
- hosted/math rows.
-- On MIR-common cases where both systems have IR, MIR's optimized generation
- pipeline is faster and produces better runtime code.
-- cfree `O1` currently has the best cfree runtime geomean on this mix.
- `O2` is correct but slower than `O1` on the current benchmark set.
+- The 4-6x whole-pipeline compile lead over every peer is **almost
+ entirely the C frontend**. The cfree optimizer+backend slice at `O1`
+ is only `~3%` faster than MIR's equivalent JIT slice on the 5-case
+ scope. The frontend dominates the absolute time at every opt level on
+ these benches.
+- **cfree `O1` runtime now leads MIR `O1` by `1.22x` geomean** on the
+ 5-case MIR-comparable scope. This is the cleanest peer comparison
+ since both run equivalent O1 pipelines (no SSA, no coalescing).
+- cfree `O1` runtime **trails** gcc/clang `O1` by `1.18-1.19x` geomean —
+ most of the remaining O1 quality gap is against the production
+ compilers, not MIR.
+- cfree `O2` regressed from `O1` (`0.777 → 0.681x` vs gcc base; matrix
+ `O2` fails to compile). `O2` LICM-without-pressure-model and the
+ deferred live-range splitter are the suspects.
+
+### cfree `O1` vs gcc/clang `-O0`
+
+Useful framing for the typical developer workflow: users normally
+iterate against `gcc -O0` / `clang -O0` debug builds. cfree `O1` is the
+relevant comparison point because its compile cost is the same order of
+magnitude.
+
+Geomean over the 8-case scope:
+
+| metric | cfree `O1` vs gcc-15 `O0` | cfree `O1` vs clang `O0` |
+| --- | ---: | ---: |
+| whole-pipeline compile (cfree/peer) | `0.18x` (`5.5x` faster) | `0.19x` (`5.3x` faster) |
+| runtime (cfree/peer) | `0.51x` (`1.97x` faster) | `0.45x` (`2.20x` faster) |
+
+cfree `O1` is **strictly better than `-O0`**: faster to compile *and*
+roughly `2x` faster at runtime than gcc/clang debug builds.
+
+Per-bench cfree-run `O1` runtime vs `-O0` peers (ms; lower is faster):
+
+| bench | cfree `O1` | gcc `O0` | clang `O0` | vs gcc `O0` | vs clang `O0` |
+| --- | ---: | ---: | ---: | ---: | ---: |
+| `array` | `2649.7` | `6435.5` | `5399.4` | `0.41x` (`2.43x`) | `0.49x` (`2.04x`) |
+| `binary-trees` | `2520.3` | `2494.5` | `2616.5` | `1.01x` (tied) | `0.96x` (tied) |
+| `hash` | `3905.0` | `4313.0` | `4576.3` | `0.91x` (`1.10x`) | `0.85x` (`1.18x`) |
+| `hash2` | `3928.9` | `6910.9` | `8227.1` | `0.57x` (`1.76x`) | `0.48x` (`2.09x`) |
+| `matrix` | `4713.5` | `19344.0` | `13043.2` | `0.24x` (`4.10x`) | `0.36x` (`2.77x`) |
+| `nbody` | `2978.7` | `8460.6` | `10045.3` | `0.35x` (`2.84x`) | `0.30x` (`3.37x`) |
+| `sieve` | `4117.6` | `4749.5` | `12864.9` | `0.87x` (`1.15x`) | `0.32x` (`3.12x`) |
+| `spectral-norm` | `3849.0` | `13897.9` | `13887.6` | `0.28x` (`3.61x`) | `0.28x` (`3.61x`) |
+
+cfree `O1` ties or wins every bench against both `-O0` peers. The
+largest wins are on compute-heavy floating-point and matrix cases
+(`matrix`, `spectral-norm`, `nbody`) where `-O0`'s lack of register
+allocation hurts most.
+
+Per-bench cfree-run `O1` runtime vs peers (ms; lower is faster):
+
+| bench | cfree | mir-c2m | gcc-15 | clang | vs MIR | vs gcc |
+| --- | ---: | ---: | ---: | ---: | ---: | ---: |
+| `array` | `2649.7` | `4460.0` | `2000.5` | `2793.8` | **`0.59x`** | `1.32x` |
+| `binary-trees` | `2520.3` | — | `2451.3` | `2249.9` | — | `1.03x` |
+| `hash` | `3905.0` | `3885.0` | `3832.2` | `3850.7` | `1.01x` | `1.02x` |
+| `hash2` | `3928.9` | `3609.0` | `4075.6` | `3604.7` | `1.09x` | **`0.96x`** |
+| `matrix` | `4713.5` | `8927.0` | `2864.4` | `2910.9` | **`0.53x`** | `1.65x` |
+| `nbody` | `2978.7` | — | `2669.9` | `2534.6` | — | `1.12x` |
+| `sieve` | `4117.6` | `3740.0` | `2628.5` | `2311.2` | `1.10x` | `1.57x` |
+| `spectral-norm` | `3849.0` | — | `3831.2` | `3882.8` | — | `1.00x` |
+
+At `O1` cfree-run beats MIR on `array` (1.68x) and `matrix` (1.89x), is
+tied on `hash`, and trails on `hash2` (9%) and `sieve` (10%). Against
+gcc/clang `O1` the largest gaps are `matrix` (1.65x), `sieve` (1.57x),
+`array` (1.32x), `nbody` (1.12x) — these are the cases worth studying
+for next-step O1 improvements.
## Formerly Broken cfree Rows
@@ -152,36 +254,41 @@ Focused timing pass:
scoped timings from `cfree run --bench-time` (best-of-3 per bench)
- MIR measurement: `MIR link finish` from `c2m -v -O<n> file.bmir -eg`
-Latest refresh (2026-05-22, `CFREE_OPT_BENCH_FAST=1`, single Darwin host),
-post-regalloc-rewrite (see Iteration Notes below):
+Latest refresh (2026-05-23, `CFREE_OPT_BENCH_FAST=1`, single Darwin host),
+post-jump-layout pass (see Iteration Notes below):
| opt | cfree opt+link+JIT ms | MIR link/JIT ms | result |
| ---: | ---: | ---: | --- |
-| `O1` (HEAD before rewrite) | `0.968` | `0.722` | MIR `1.34x` faster |
-| `O1` (MIR-shaped allocator) | `0.621` | `0.720` | cfree `1.16x` faster |
+| `O1` (HEAD before regalloc rewrite) | `0.968` | `0.722` | MIR `1.34x` faster |
+| `O1` (MIR-shaped allocator, 2026-05-22) | `0.621` | `0.720` | cfree `1.16x` faster |
+| `O1` (slot-scan combiner, regression) | `0.824` | `0.711` | MIR `1.16x` faster |
+| `O1` (operand-driven combiner) | `0.719` | `0.711` | tied |
+| `O1` (live-range-scoped use counting) | `0.710` | `0.711` | tied |
+| `O1` (block-layout for fallthrough) | `0.701` | `0.726` | cfree `1.04x` faster |
-Per-bench at `O1` after the rewrite:
+Per-bench at `O1` after the block-layout pass:
-| bench | cfree opt ms | cfree link ms | cfree jit ms | cfree opt+l+j | MIR codegen |
-| --- | ---: | ---: | ---: | ---: | ---: |
-| `array` | `0.269` | `0.049` | `0.043` | `0.361` | `0.300` |
-| `hash` | `1.033` | `0.083` | `0.040` | `1.156` | `1.653` |
-| `hash2` | `1.069` | `0.089` | `0.042` | `1.200` | `1.765` |
-| `matrix` | `0.484` | `0.054` | `0.035` | `0.573` | `0.859` |
-| `sieve` | `0.259` | `0.031` | `0.031` | `0.321` | `0.258` |
-| **geomean** | | | | **`0.621`** | **`0.720`** |
-
-The previously reported `4.552 ms` cfree `O1` figure (with MIR at `0.718 ms`,
-a `6.34x` gap) was from an earlier benchmark configuration; on the current
-host it reproduces at `0.968 ms` against MIR's `0.722 ms`. The MIR-shaped
-allocator rewrite closes the gap and crosses ahead: cfree opt+link+JIT is
-now ~14% faster than MIR's `MIR link finish` across this 5-case scope,
-driven by a `~55%` per-bench drop in `opt.regalloc` time (point-bitmap
-conflict checks instead of per-hard-register sorted interval vectors).
-
-`O0` and `O2` rows have not yet been re-measured under the new allocator;
-the full-scope refresh is pending. cfree's final link/JIT mechanics stay
-fast across both data-structure choices.
+| bench | cfree opt+l+j ms | MIR link/JIT ms | cfree runtime ms | MIR runtime ms |
+| --- | ---: | ---: | ---: | ---: |
+| `array` | `0.384` | `0.316` | `2686.9` | `4529.0` |
+| `hash` | `1.327` | `1.668` | `3965.8` | `3941.0` |
+| `hash2` | `1.380` | `1.762` | `3978.2` | `3640.0` |
+| `matrix` | `0.734` | `0.830` | `4734.2` | `8966.0` |
+| `sieve` | `0.329` | `0.262` | `4146.8` | `3885.0` |
+| **geomean** | **`0.701`** | **`0.726`** | **`3837.5`** | **`4687.5`** |
+
+cfree `O1` now leads on both axes of this 5-case scope: opt+link+JIT is
+`3.4%` faster than MIR's `MIR link finish` slice, and runtime is `18.1%`
+faster on geomean. Two benches are dramatically ahead (matrix `1.89x`,
+array `1.69x`); two are within noise of MIR (hash, sieve); hash2 trails by
+`9.3%`. Sequence of wins that got here: MIR-shaped allocator (compile),
+operand-driven combiner dispatch (compile), live-range-scoped use
+counting (runtime, mostly matrix), block-layout for fallthrough (runtime,
+mostly sieve/array/matrix loop bodies).
+
+`O0` and `O2` rows have not yet been re-measured under the new allocator,
+combiner, or layout pass; the full-scope refresh is pending. cfree's
+final link/JIT mechanics stay fast across both data-structure choices.
## Goals
@@ -231,26 +338,39 @@ live_blocks, dead_def_elim, regalloc (simple mode), combine, dce, emit.
No SSA-era passes and no coalescing run at `O1`, matching MIR
(`mir-gen.c:9431`: coalesce is gated on `optimize_level >= 2`).
-Headline at `O1` (post-regalloc-rewrite, 2026-05-22 refresh):
-
-| metric | cfree | MIR |
-| --- | ---: | ---: |
-| opt+link+JIT (5-case scope, geomean) | `0.621 ms` | `0.720 ms` |
-| runtime (5-case scope, geomean) | `6354.6 ms` | `4663.8 ms` |
-
-cfree opt+link+JIT is now ~14% faster than MIR's `MIR link finish` slice.
-The remaining gap is on the runtime side: MIR's generated code runs ~1.36x
-faster than cfree's at `O1` on this scope, dominated by what MIR does that
-cfree does not (coalescing + splitting at O2, plus better instruction
-selection).
+Headline at `O1` (post-jump-layout pass, 2026-05-23 refresh):
-Compile-time gaps closed by the rewrite:
-
-- `opt_regalloc` now uses point-indexed bitsets ORed per program point
- (matching MIR's `assign()` in `mir-gen.c:7551-7728`, simplified branch),
- replacing the previous sorted interval-vector overlap checks per hard
- register. This is the dominant `~55%` per-bench reduction in
- `opt.regalloc` time across `array`, `hash`, `hash2`, `matrix`, `sieve`.
+| metric | cfree | MIR | cfree vs MIR |
+| --- | ---: | ---: | ---: |
+| opt+link+JIT (5-case scope, geomean) | `0.701 ms` | `0.726 ms` | `0.966x` (3.4% faster) |
+| runtime (5-case scope, geomean) | `3837.5 ms` | `4687.5 ms` | `0.819x` (18.1% faster) |
+
+cfree `O1` now leads MIR on both axes of this 5-case scope. Compile-time
+slice is within noise of MIR. Runtime is `1.22x` ahead, driven by matrix
+(`1.89x` ahead) and array (`1.69x` ahead) where the combiner +
+block-layout pair eliminates the redundant address-mode moves and the
+trivial inter-block branches. hash and sieve are within `7%` of MIR.
+hash2 remains the worst outlier at `9.3%` slower — same hash-table code
+as hash with larger working set, likely a register-pressure or cache
+effect that O1 can't address without coalescing/splitting.
+
+Compile-time gaps closed by recent work:
+
+- `opt_regalloc` uses point-indexed bitsets ORed per program point (matching
+ MIR's `assign()` in `mir-gen.c:7551-7728`, simplified branch), replacing
+ the previous sorted interval-vector overlap checks per hard register.
+ ~`55%` per-bench reduction in `opt.regalloc` time across the 5-case scope.
+- `opt_combine` runs a per-BB fixpoint with the four MIR-shaped rewrites
+ (substitute / sink / addr-mode synth / combine_exts), uses live-range-
+ scoped use counting (so reuse of a scratch physreg later in the block
+ no longer makes every earlier producer look multi-use), and dispatches
+ by walking the consumer's operands rather than scanning 96 producer
+ slots per inst.
+- `opt_jump_cleanup` LAYOUT stage now runs `cleanup_reorder_for_fallthrough`
+ ahead of the invert/strip passes, which reorders `f->emit_order` via
+ greedy chain extension. This lets the existing
+ `cleanup_layout_fallthrough_branches` collapse trivial `b <next>`
+ branches that the original block-id ordering left in place.
Compile-time gaps that remain:
@@ -258,23 +378,21 @@ Compile-time gaps that remain:
`calculate_func_cfg_live_info` also tracks all live vars at the
pre-allocation step (`consider_all_live_vars`), but its bitsets reuse a
shared sparse vector representation; cfree's `OptBitset` is dense.
-- `opt_combine` runs a single forward sweep per BB with no fixpoint, so
- cascading folds either require an earlier pass to enable them or get
- missed.
- `opt_dead_def_elim_with_live` runs before allocation and `opt_dce` runs
after; some of this work overlaps and may be redundant.
Runtime gaps unique to `O1`:
- No coalescing at `O1` (MIR matches this). Every `IR_COPY` survives to
- emission unless `opt_combine`'s identity-copy detector catches it
- post-RA. The 2026-05-22 attempt to enable coalescing at `O1` was rolled
- back as part of matching MIR's pipeline.
-- No ext-of-ext semantic fold. `opt_combine` removes only identical convert
- pairs. MIR's `combine_exts` folds chains like `zext8 -> zext32` into a
- single extension of the right width.
-- No per-BB combine fixpoint. Single-sweep misses transitive folds where one
- rewrite enables another.
+ emission unless `opt_combine` catches it post-RA. With sink+substitute
+ now firing reliably this is much less load-bearing than before, but
+ cross-block copies still survive.
+- Invariant loads inside loops (`mov w9, #1` per iteration in the sieve
+ flag-init loop). LICM is an O2 pass; even simple block-local hoisting
+ for single-BB invariants is not yet wired in at O1.
+- hash2 specifically trails its sibling hash by ~24% despite identical
+ code; likely register pressure as the table grows. Coalescing at O2 is
+ the standard mitigation.
## O2 Gap Analysis vs MIR
@@ -345,6 +463,125 @@ Runtime gaps:
## Iteration Notes
+### Block-layout pass for fallthrough (2026-05-23)
+
+`pass_jump.c` already had a `cleanup_layout_fallthrough_branches` that
+strips trailing `IR_BR` when the next emit-order block is the branch
+target — but it was layout-passive: cfree's frontend lowers C `for` loops
+as `[head, inc, body, exit]` (with `inc` between head and body because
+`continue` jumps to it), so `body`'s `b inc` was a backward jump in
+emit-order and could never be stripped. Disassembly inspection of sieve's
+hottest loop showed 2 redundant `b` per iteration — `head → b body → inc
+→ b head; body → b inc` — that no per-block cleanup could fix.
+
+Change: added `cleanup_reorder_for_fallthrough` that rewrites
+`f->emit_order` via greedy chain extension before the existing
+invert/strip cleanups run. Starting at `f->entry`, repeatedly extend the
+chain to the current block's preferred unvisited successor: for
+`CONDBR`/`CMP_BRANCH` that's `succ[1]` (fallthrough); for `IR_BR` or any
+1-succ block it's `succ[0]`. When the chain stalls, fall back to the
+taken arm (the subsequent `cleanup_invert_taken_fallthrough` will invert
+the branch), then to the lowest unvisited block in original order. After
+this pass every chained `(p, n)` pair is adjacent in emit_order, so the
+trailing-branch strip can collapse the jump.
+
+Effect on sieve inner loops (3 nested loops collapsed from
+`9 / 8 / 8 insts/iter` to `7 / 6 / 6`):
+
+```
+ mov x0, #2
+mov x0, #2 cmp x0, #8192
+cmp x0, #8192 b.gt exit
+b.gt exit mov w9, #1 ; LICM-eligible
+b body ← redundant strb w9, [x19, x0]
+add x0, x0, #1 add x0, x0, #1
+b loop b loop
+body: exit:
+ mov w9, #1
+ strb w9, [x19, x0]
+ b inc ← redundant
+```
+
+Measurement (`CFREE_OPT_BENCH_FAST=1`, best of 3 compile/run repeats,
+single Darwin host), 5-case scope at O1:
+
+| bench | runtime before (ms) | runtime after (ms) | delta | vs MIR |
+| --- | ---: | ---: | ---: | ---: |
+| `array` | `4988.9` | `2686.9` | `-46.1%` | cfree `1.69x` ahead |
+| `hash` | `4079.2` | `3965.8` | `-2.8%` | tied (`1.006x`) |
+| `hash2` | `5246.3` | `3978.2` | `-24.2%` | `9.3%` slower |
+| `matrix` | `8279.9` | `4734.2` | `-42.8%` | cfree `1.89x` ahead |
+| `sieve` | `8351.3` | `4146.8` | `-50.3%` | `6.7%` slower |
+| **geomean** | `5938.0` | `3837.5` | `-35.4%` | cfree `1.22x` ahead |
+
+Compile time barely moved (`0.710 → 0.701 ms` geomean): the reorder is
+O(nblocks) and the strip phase saves work in emit. cfree opt+link+JIT is
+now `3.4%` faster than MIR's `MIR link finish` slice on this scope.
+
+All tests pass: `test-opt` 691, `test-toy` 955, `test-cg-api`, `test-isa`,
+`test-aa64-inline`, `test-smoke-x64`, `test-smoke-rv64`. All 15 bench
+rows green.
+
+### Combiner perf+correctness fixes (2026-05-23)
+
+Three issues landed together in `src/opt/pass_combine.c`:
+
+1. **Operand-driven dispatch.** The MIR-shaped COMBINE rewrite (commit
+ `ea26470`) was algorithmically O(96) per instruction: `try_substitute`
+ scanned all `OPT_REG_CLASSES * OPT_MAX_HARD_REGS = 96` producer slots
+ per consumer, and `ctx_record` probed all 96 to find each instruction's
+ defines. Replaced both with operand walks: `try_substitute` iterates
+ `in->opnds`, looks up only the producers of registers actually
+ referenced (typically 2–3); `ctx_record` switches on `IROp` and records
+ the destination operand directly. Combiner cost: `0.197 → 0.050 ms`
+ geomean (`3.9x` faster).
+
+2. **Live-range-scoped use counting.** `count_uses_between` summed uses of
+ a physreg across the entire rest of the block, ignoring intervening
+ redefs. With scratch physregs reused multiple times per BB (the
+ matrix-mmult `sxtw x12; mov x13, x12; mov x12, ...` pattern), every
+ fold looked multi-use and was rejected. Replaced with
+ `count_uses_in_live_range` that walks until the next def of the same
+ physreg or a CALL/ASM/INTRINSIC clobber barrier. When the live range
+ terminates inside the block, the cross-block live-out check becomes
+ moot and is skipped.
+
+3. **`try_combine_exts` no-op guard.** When the inner producer was an
+ identity convert (`convert rB, rB`), rewriting `in->opnds[1]` to
+ `prod->opnds[1]` was a no-op but still reported a change, spinning the
+ per-BB fixpoint forever. Added an early-return when the rewrite would
+ set the operand to the same register it already holds. Defensive
+ 64-iteration cap added on the fixpoint as future-proofing.
+
+Effect on matrix `mmult` inner loop: 17 → 10 instructions per iteration.
+Every indexed access is now `sxtw + ldr [base, idx, lsl #s]` instead of
+three leading copies + the load.
+
+Measurement (`CFREE_OPT_BENCH_FAST=1`, best of 3 compile/run repeats,
+single Darwin host), 5-case scope at O1:
+
+| stage | combine ms | opt+l+j ms | runtime ms | matrix runtime |
+| --- | ---: | ---: | ---: | ---: |
+| Pre-COMBINE-rewrite baseline | `0.032` | `0.682` | `6314.0` | `10833.6` |
+| After COMBINE rewrite (regressed) | `0.197` | `0.824` | `6213.9` | `10721.6` |
+| After operand-driven dispatch | `0.050` | `0.719` | `6254.5` | `10811.4` |
+| After live-range-scoping | `0.046` | `0.710` | `5938.0` | `8279.9` |
+
+Runtime: `-1.6%` overall before the live-range fix (basically noise), then
+an additional `-5.1%` after — matrix `-23%` carries most of it. cfree's
+matrix runtime now beats MIR's by `6.3%`.
+
+Per-bench runtime vs MIR at O1 after the fix:
+
+| bench | cfree ms | MIR ms | ratio | cfree vs MIR |
+| --- | ---: | ---: | ---: | ---: |
+| `array` | `4988.9` | `4510.0` | `1.106x` | `10.6%` slower |
+| `hash` | `4079.2` | `3907.0` | `1.044x` | `4.4%` slower |
+| `hash2` | `5246.3` | `3600.0` | `1.457x` | `45.7%` slower |
+| `matrix` | `8279.9` | `8832.0` | `0.937x` | `6.3%` **faster** |
+| `sieve` | `8351.3` | `3757.0` | `2.223x` | `122.3%` slower |
+| **geomean** | | | `1.285x` | `28.5%` slower |
+
### MIR-shaped regalloc (2026-05-22)
Change: replace `OptAllocator`'s sorted interval-vector conflict structure
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -1355,6 +1355,12 @@ static void opt_run_lowering_pipeline(OptImpl* o, const char* total_scope,
metrics_scope_begin(o->c, "opt.addr_xform_pregs");
opt_addr_xform_pregs(o->f);
metrics_scope_end(o->c, "opt.addr_xform_pregs");
+ metrics_scope_begin(o->c, "opt.promote_scalar_locals");
+ opt_promote_scalar_locals(o->f);
+ metrics_scope_end(o->c, "opt.promote_scalar_locals");
+ metrics_scope_begin(o->c, "opt.addr_of_global_cse");
+ opt_addr_of_global_cse(o->f);
+ metrics_scope_end(o->c, "opt.addr_of_global_cse");
metrics_scope_begin(o->c, "opt.build_loop_tree");
opt_build_loop_tree(o->f);
metrics_scope_end(o->c, "opt.build_loop_tree");
diff --git a/src/opt/opt.h b/src/opt/opt.h
@@ -40,6 +40,10 @@ void opt_build_reg_ssa(Func*);
void opt_build_ssa(Func*);
void opt_addr_xform(Func*);
void opt_addr_xform_pregs(Func*); /* O1: PReg-namespace addr-of-local folding */
+void opt_promote_scalar_locals(Func*); /* O1: promote non-escaped scalar
+ frame slots to mutable PRegs. */
+void opt_addr_of_global_cse(Func*); /* O1: hoist duplicate ADDR_OF(global)
+ defs to a single entry-block compute. */
void opt_simplify_local(Func*);
void opt_simplify(Func*);
void opt_gvn(Func*); /* incl. constprop, redundant-load elim */
diff --git a/src/opt/pass_combine.c b/src/opt/pass_combine.c
@@ -1,6 +1,32 @@
+#include <stdint.h>
+#include <string.h>
+
#include "core/arena.h"
#include "opt/opt_internal.h"
+enum {
+ COMBINE_CG_TYPE_SEG_SHIFT = 6,
+ COMBINE_CG_TYPE_SEG_MASK = (1u << COMBINE_CG_TYPE_SEG_SHIFT) - 1u,
+ COMBINE_CG_TYPE_BUILTIN_SEG = 1u,
+};
+
+/* O1 combine, MIR-shaped (see mir-gen.c:8808-9146). One forward pass per BB
+ * maintains last-def / last-mem-def tracking; per-BB fixpoint loop iterates
+ * until no fold fires. Rewrites in this file:
+ *
+ * 1. Substitute producer source into consumer operand
+ * (IR_COPY / IR_LOAD_IMM / IR_CONVERT producer; into register slot,
+ * indirect base, or indirect index).
+ * 2. Address-mode synthesis (IR_BINOP IADD/ISHL producer; into OPK_INDIRECT
+ * base/index/scale/ofs).
+ * 3. Sink producer into single-use IR_COPY destination.
+ * 4. combine_exts: fold ext-of-ext chains with size+signedness rules.
+ *
+ * Spill compaction (store-then-reload, etc.) and self-copy removal continue
+ * to run in opt_combine_compact_block, unchanged. */
+
+/* ---- shared helpers (operand inspection / counting) ---- */
+
static int same_reg_operand(const Operand* a, const Operand* b) {
return a->kind == OPK_REG && b->kind == OPK_REG && a->cls == b->cls &&
a->v.reg == b->v.reg;
@@ -67,16 +93,25 @@ static int same_phys_reg(const Operand* a, const Operand* b) {
a->cls == b->cls && a->v.reg == b->v.reg;
}
-static int operand_uses_phys_reg(const Operand* op, const Operand* r) {
+/* Count register references inside a single operand. An OPK_INDIRECT can
+ * reference the queried register twice (e.g. `[r4 + r4 * 4]`); we return the
+ * true count so single-use accounting is exact. */
+static int count_operand_phys_uses(const Operand* op, const Operand* r) {
if (!op || !r || r->kind != OPK_REG) return 0;
- if (op->kind == OPK_REG) return op->cls == r->cls && op->v.reg == r->v.reg;
- if (op->kind == OPK_INDIRECT)
- return r->cls == RC_INT && op->v.ind.base == r->v.reg;
+ if (op->kind == OPK_REG)
+ return op->cls == r->cls && op->v.reg == r->v.reg ? 1 : 0;
+ if (op->kind == OPK_INDIRECT) {
+ if (r->cls != RC_INT) return 0;
+ int n = 0;
+ if (op->v.ind.base == r->v.reg) ++n;
+ if (op->v.ind.index != (Reg)REG_NONE && op->v.ind.index == r->v.reg) ++n;
+ return n;
+ }
return 0;
}
-static int count_operand_phys_uses(const Operand* op, const Operand* r) {
- return operand_uses_phys_reg(op, r) ? 1 : 0;
+static int operand_uses_phys_reg(const Operand* op, const Operand* r) {
+ return count_operand_phys_uses(op, r) > 0;
}
static int abi_uses_phys_reg(const CGABIValue* v, const Operand* r) {
@@ -259,50 +294,258 @@ static int inst_defines_phys_reg(const Inst* in, const Operand* r) {
}
}
-static int copy_fold_slot(const Inst* in, u32 idx) {
+/* True if `in` may write to memory. Used to invalidate memory-reading
+ * producers (IR_LOAD) when the consumer is past an intervening write. */
+static int inst_writes_memory(const Inst* in) {
+ switch ((IROp)in->op) {
+ case IR_STORE:
+ case IR_AGG_COPY:
+ case IR_AGG_SET:
+ case IR_BITFIELD_STORE:
+ case IR_ATOMIC_STORE:
+ case IR_ATOMIC_RMW:
+ case IR_ATOMIC_CAS:
+ case IR_CALL:
+ case IR_ASM_BLOCK:
+ case IR_INTRINSIC:
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+static int inst_reads_memory(const Inst* in) {
+ switch ((IROp)in->op) {
+ case IR_LOAD:
+ case IR_BITFIELD_LOAD:
+ case IR_ATOMIC_LOAD:
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+/* ---- substitution-slot whitelist (operand positions that accept a register
+ * substitute, an immediate substitute, or are valid for ext-of-ext folding) ---- */
+
+typedef enum SubstKind {
+ SK_REG, /* register-to-register (IR_COPY producer) */
+ SK_IMM, /* register-to-immediate (IR_LOAD_IMM producer) */
+ SK_CV, /* register-to-register through identical convert (IR_CONVERT prod) */
+} SubstKind;
+
+/* Returns 1 if the given operand-index `idx` of `in` is foldable for `kind`.
+ * SK_REG / SK_CV: register substitution slots. SK_IMM: immediate substitution
+ * slots. */
+static int combine_subst_slot(const Inst* in, u32 idx, SubstKind kind) {
switch ((IROp)in->op) {
case IR_COPY:
- case IR_CONVERT:
+ /* IR_COPY src is register-to-register by definition; folding an
+ * immediate would change its shape and defeat the self-copy detection
+ * that fires after coalescing assigns matching hard regs. */
+ return kind != SK_IMM && idx == 1;
case IR_UNOP:
- return idx == 1;
+ return kind != SK_IMM && idx == 1;
+ case IR_CONVERT:
+ return kind != SK_IMM && idx == 1;
case IR_BINOP:
case IR_CMP:
return idx == 1 || idx == 2;
case IR_CMP_BRANCH:
return idx == 0 || idx == 1;
case IR_CONDBR:
- return idx == 0;
+ return kind != SK_IMM && idx == 0;
case IR_STORE:
+ /* opnds[0] is the address; register subst into the indirect base/index
+ * is handled separately. The data slot at idx==1 accepts reg or imm. */
return idx == 1;
- case IR_ALLOCA:
- return idx == 1;
+ case IR_LOAD:
+ case IR_ATOMIC_LOAD:
case IR_ATOMIC_RMW:
- return idx == 2;
+ case IR_BITFIELD_LOAD:
+ case IR_BITFIELD_STORE:
+ case IR_AGG_COPY:
+ case IR_AGG_SET:
+ /* OPK_INDIRECT base/index substitution is handled inside
+ * subst_consumer_operands; these slots take no direct OPK_REG. */
+ return 0;
+ case IR_ALLOCA:
+ return kind != SK_IMM && idx == 1;
default:
return 0;
}
}
-static int imm_fold_slot(const Inst* in, u32 idx) {
+/* ---- per-BB tracking context ---- */
+
+typedef struct CombineCtx {
+ Func* f;
+ Block* bl;
+ const OptHardBlockLive* hard_live;
+ /* Index of the most recent inst in this BB that defined (cls, reg);
+ * -1 means no definition seen this BB. */
+ i32 last_def[OPT_REG_CLASSES][OPT_MAX_HARD_REGS];
+ i32 last_mem_def;
+ int block_change_p;
+} CombineCtx;
+
+static void ctx_reset(CombineCtx* ctx) {
+ memset(ctx->last_def, 0xff, sizeof ctx->last_def); /* all -1 */
+ ctx->last_mem_def = -1;
+ ctx->block_change_p = 0;
+}
+
+/* True if `in` is a barrier that conservatively invalidates all prior
+ * producers in this BB (modelled after the old find_single_direct_use, which
+ * treats every IR_CALL as a clobber regardless of explicit ABI describes). */
+static int inst_is_clobber_barrier(const Inst* in) {
+ switch ((IROp)in->op) {
+ case IR_CALL:
+ case IR_ASM_BLOCK:
+ case IR_INTRINSIC:
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+static void ctx_record_reg_def(CombineCtx* ctx, const Operand* op, i32 i) {
+ if (op->kind != OPK_REG) return;
+ if (op->cls >= OPT_REG_CLASSES || op->v.reg >= OPT_MAX_HARD_REGS) return;
+ ctx->last_def[op->cls][op->v.reg] = i;
+}
+
+/* After processing inst at index `i`, mark each register it defines (incl.
+ * implicit clobbers from CALL / ASM / INTRINSIC) as defined here. Mark
+ * memory if this inst writes.
+ *
+ * Walks the inst's destination operand(s) directly rather than probing every
+ * (cls, reg) pair. The barrier case (CALL/ASM/INTRINSIC) keeps a bulk mark
+ * because explicit defs/clobbers aren't always populated in IRCallAux at
+ * this point in the pipeline. */
+static void ctx_record(CombineCtx* ctx, const Inst* in, i32 i) {
+ if (inst_writes_memory(in)) ctx->last_mem_def = i;
+ if (inst_is_clobber_barrier(in)) {
+ for (u32 c = 0; c < OPT_REG_CLASSES; ++c)
+ for (u32 r = 0; r < OPT_MAX_HARD_REGS; ++r)
+ ctx->last_def[c][r] = i;
+ return;
+ }
switch ((IROp)in->op) {
+ case IR_LOAD_IMM:
+ case IR_LOAD_CONST:
+ case IR_LOAD_LABEL_ADDR:
+ case IR_COPY:
+ case IR_LOAD:
+ case IR_ADDR_OF:
+ case IR_TLS_ADDR_OF:
+ case IR_BITFIELD_LOAD:
case IR_BINOP:
+ case IR_UNOP:
case IR_CMP:
- return idx == 1 || idx == 2;
- case IR_CMP_BRANCH:
- return idx == 0 || idx == 1;
+ case IR_CONVERT:
+ case IR_ALLOCA:
+ case IR_VA_ARG:
+ case IR_ATOMIC_LOAD:
+ case IR_ATOMIC_RMW:
+ if (in->nopnds >= 1) ctx_record_reg_def(ctx, &in->opnds[0], i);
+ break;
+ case IR_ATOMIC_CAS:
+ if (in->nopnds >= 1) ctx_record_reg_def(ctx, &in->opnds[0], i);
+ if (in->nopnds >= 2) ctx_record_reg_def(ctx, &in->opnds[1], i);
+ break;
+ default:
+ break;
+ }
+}
+
+/* Lookup the producer of (cls, reg) in this BB, if any. Returns -1 if no
+ * definer has been seen yet in this BB. */
+static i32 ctx_producer_of(const CombineCtx* ctx, u8 cls, Reg reg) {
+ if (reg >= OPT_MAX_HARD_REGS || cls >= OPT_REG_CLASSES) return -1;
+ return ctx->last_def[cls][reg];
+}
+
+/* Does (cls, reg) get redefined strictly after `since_idx` (exclusive) in this
+ * BB, given current ctx state? We use last_def: if last_def[cls][reg] >
+ * since_idx, then yes. (since_idx is typically the producer's index.) */
+static int ctx_def_changed_since(const CombineCtx* ctx, u8 cls, Reg reg,
+ i32 since_idx) {
+ if (reg >= OPT_MAX_HARD_REGS || cls >= OPT_REG_CLASSES) return 0;
+ return ctx->last_def[cls][reg] > since_idx;
+}
+
+/* ---- forward-scan use accounting (used for single-use checks) ---- */
+
+/* Count phys uses of `def` within the live range starting just after
+ * `prod_idx`. The live range ends at the first inst that redefines `def`'s
+ * physical register or that is a clobber barrier (CALL/ASM/INTRINSIC); uses
+ * after that point belong to a different live range of the same physreg and
+ * are irrelevant to this producer.
+ *
+ * `*killed_in_block_out` is set non-zero when the live range terminates
+ * inside the block (caller may then skip the cross-block live-out check).
+ *
+ * Without this live-range scoping, reuse of a scratch physreg later in the
+ * block (`sxtw x12, ...; mov x13, x12; mov x12, ...`) makes every fold look
+ * multi-use and combine rejects almost everything. */
+static int count_uses_in_live_range(const Block* bl, i32 prod_idx,
+ const Operand* def,
+ int* killed_in_block_out) {
+ int n = 0;
+ int killed = 0;
+ for (i32 i = prod_idx + 1; i < (i32)bl->ninsts; ++i) {
+ const Inst* in = &bl->insts[i];
+ n += inst_uses_phys_reg(in, def);
+ if (inst_is_clobber_barrier(in) || inst_defines_phys_reg(in, def)) {
+ killed = 1;
+ break;
+ }
+ }
+ if (killed_in_block_out) *killed_in_block_out = killed;
+ return n;
+}
+
+/* ---- ConvKind helpers (for combine_exts) ---- */
+
+static u32 builtin_int_bytes(CfreeCgTypeId t) {
+ if ((t >> COMBINE_CG_TYPE_SEG_SHIFT) != COMBINE_CG_TYPE_BUILTIN_SEG) return 0;
+ CfreeCgBuiltinType b = (CfreeCgBuiltinType)(t & COMBINE_CG_TYPE_SEG_MASK);
+ switch (b) {
+ case CFREE_CG_BUILTIN_BOOL:
+ case CFREE_CG_BUILTIN_I8:
+ return 1;
+ case CFREE_CG_BUILTIN_I16:
+ return 2;
+ case CFREE_CG_BUILTIN_I32:
+ return 4;
+ case CFREE_CG_BUILTIN_I64:
+ return 8;
+ case CFREE_CG_BUILTIN_I128:
+ return 16;
default:
return 0;
}
}
-static int identical_convert_pair(const Inst* a, const Inst* b) {
- if ((IROp)a->op != IR_CONVERT || (IROp)b->op != IR_CONVERT) return 0;
- if (a->nopnds < 2 || b->nopnds < 2) return 0;
- if (a->extra.imm != b->extra.imm) return 0;
- return a->opnds[1].type == b->opnds[1].type &&
- a->opnds[0].type == b->opnds[0].type;
+/* For an IR_CONVERT inst, decode (src_bytes, dst_bytes, sign_p). Returns 0
+ * if this isn't an integer ext convert (CV_SEXT or CV_ZEXT). */
+static int ext_params(const Inst* in, u32* src_bytes_out, u32* dst_bytes_out,
+ int* sign_p_out) {
+ if ((IROp)in->op != IR_CONVERT || in->nopnds < 2) return 0;
+ ConvKind k = (ConvKind)in->extra.imm;
+ if (k != CV_SEXT && k != CV_ZEXT) return 0;
+ u32 sb = builtin_int_bytes(in->opnds[1].type);
+ u32 db = builtin_int_bytes(in->opnds[0].type);
+ if (!sb || !db || db < sb) return 0;
+ *src_bytes_out = sb;
+ *dst_bytes_out = db;
+ *sign_p_out = (k == CV_SEXT);
+ return 1;
}
+/* ---- producer-retarget legality (for sink rewrite) ---- */
+
static int binop_is_commutative(BinOp op) {
switch (op) {
case BO_IADD:
@@ -318,21 +561,53 @@ static int binop_is_commutative(BinOp op) {
}
}
-static int retarget_producer_legal(Inst* producer, const Operand* copy_dst,
+/* Is `producer` an op whose destination register can be safely retargeted
+ * without backend consultation? Excludes ops with implicit destination
+ * constraints, ABI pinning, multi-def outputs, or aux-driven dst layout. */
+static int producer_retargetable_op(IROp op) {
+ switch (op) {
+ case IR_BINOP:
+ case IR_UNOP:
+ case IR_LOAD_IMM:
+ case IR_LOAD_CONST:
+ case IR_LOAD:
+ case IR_CONVERT:
+ case IR_CMP:
+ case IR_ADDR_OF:
+ case IR_LOAD_LABEL_ADDR:
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+/* Can `producer`'s destination be retargeted to `new_dst` without violating
+ * IR shape? For binop, may signal that a commutative-swap is needed. */
+static int retarget_producer_legal(Inst* producer, const Operand* new_dst,
int* swap_binop) {
*swap_binop = 0;
- if (!copy_dst || copy_dst->kind != OPK_REG) return 0;
- if (producer->nopnds < 2 || producer->opnds[0].kind != OPK_REG) return 0;
- if (producer->opnds[0].cls != copy_dst->cls) return 0;
- if (producer->opnds[0].type != copy_dst->type) return 0;
+ if (!new_dst || new_dst->kind != OPK_REG) return 0;
+ if (producer->nopnds < 1 || producer->opnds[0].kind != OPK_REG) return 0;
+ if (producer->opnds[0].cls != new_dst->cls) return 0;
+ if (producer->opnds[0].type != new_dst->type) return 0;
switch ((IROp)producer->op) {
+ case IR_LOAD_IMM:
+ case IR_LOAD_CONST:
+ case IR_LOAD_LABEL_ADDR:
+ case IR_ADDR_OF:
+ /* Single-defining ops with no register source to alias against. */
+ return 1;
case IR_UNOP:
+ case IR_CONVERT:
+ case IR_LOAD:
+ return 1;
+ case IR_CMP:
return 1;
case IR_BINOP: {
if (producer->nopnds < 3) return 0;
- int dst_is_lhs = operand_uses_phys_reg(&producer->opnds[1], copy_dst);
- int dst_is_rhs = operand_uses_phys_reg(&producer->opnds[2], copy_dst);
+ int dst_is_lhs = operand_uses_phys_reg(&producer->opnds[1], new_dst);
+ int dst_is_rhs = operand_uses_phys_reg(&producer->opnds[2], new_dst);
if (!dst_is_lhs && !dst_is_rhs) return 1;
if (dst_is_lhs) return 1;
if (binop_is_commutative((BinOp)producer->extra.imm)) {
@@ -365,166 +640,562 @@ static int ret_scalar_storage(CGABIValue* v, Operand** out) {
return 1;
}
-static int find_single_direct_use(Func* f, Block* bl,
- const OptHardBlockLive* hard_live, u32 def_i,
- const Operand* def, const Operand* src,
- int check_src, int imm_fold, int conv_fold,
- u32* use_i_out, u32* op_i_out) {
- int total_uses = 0;
- int source_clobbered = 0;
+/* ---- Rewrite 1: substitute producer source into uses ---- */
+
+/* Set the indirect's base or index to `new_reg`. */
+static void set_indirect_field(Operand* ind, Reg old_reg, Reg new_reg) {
+ if (ind->v.ind.base == old_reg) ind->v.ind.base = new_reg;
+ if (ind->v.ind.index != (Reg)REG_NONE && ind->v.ind.index == old_reg)
+ ind->v.ind.index = new_reg;
+}
+
+/* Walk the operands of consumer `in` and try to substitute uses of `def`
+ * (producer's destination, OPK_REG) with `src` (producer's source, OPK_REG
+ * or OPK_IMM). For OPK_INDIRECT operands, substitution into base/index is
+ * only valid for OPK_REG `src`. Returns the number of operands actually
+ * rewritten. */
+static int subst_consumer_operands(Inst* in, const Operand* def,
+ const Operand* src, SubstKind kind) {
+ int n = 0;
+ for (u32 oi = 0; oi < in->nopnds; ++oi) {
+ Operand* op = &in->opnds[oi];
+ /* Direct OPK_REG substitution: requires the slot to be on the whitelist. */
+ if (op->kind == OPK_REG && same_phys_reg(op, def) &&
+ combine_subst_slot(in, oi, kind)) {
+ *op = *src;
+ ++n;
+ continue;
+ }
+ /* Indirect base/index substitution: only OPK_REG src may land here. */
+ if (op->kind == OPK_INDIRECT && kind == SK_REG && src->kind == OPK_REG &&
+ def->kind == OPK_REG && def->cls == RC_INT && src->cls == RC_INT &&
+ (op->v.ind.base == def->v.reg ||
+ (op->v.ind.index != (Reg)REG_NONE &&
+ op->v.ind.index == def->v.reg))) {
+ set_indirect_field(op, def->v.reg, src->v.reg);
+ ++n;
+ }
+ }
+ return n;
+}
+
+/* Try to substitute the producer of (cls, reg) into uses of that register
+ * in `in`. Returns 1 if at least one operand was rewritten. */
+static int try_substitute_for_reg(CombineCtx* ctx, Inst* in, i32 i,
+ u8 cls, Reg reg) {
+ i32 prod_idx = ctx_producer_of(ctx, cls, reg);
+ if (prod_idx < 0 || prod_idx >= i) return 0;
+ Inst* prod = &ctx->bl->insts[prod_idx];
+ IROp pop = (IROp)prod->op;
+ if (pop != IR_COPY && pop != IR_LOAD_IMM && pop != IR_CONVERT) return 0;
+ if (prod->nopnds < 1 || prod->opnds[0].kind != OPK_REG) return 0;
+ if (prod->opnds[0].cls != cls || prod->opnds[0].v.reg != reg) return 0;
+
+ Operand def = prod->opnds[0];
+ SubstKind kind;
+ Operand src_op;
+ memset(&src_op, 0, sizeof src_op);
+ if (pop == IR_COPY) {
+ if (prod->nopnds < 2 || prod->opnds[1].kind != OPK_REG) return 0;
+ if (ctx_def_changed_since(ctx, prod->opnds[1].cls, prod->opnds[1].v.reg,
+ prod_idx))
+ return 0;
+ kind = SK_REG;
+ src_op = prod->opnds[1];
+ } else if (pop == IR_LOAD_IMM) {
+ kind = SK_IMM;
+ src_op.kind = OPK_IMM;
+ src_op.cls = def.cls;
+ src_op.type = def.type;
+ src_op.v.imm = prod->extra.imm;
+ } else { /* IR_CONVERT */
+ if (prod->nopnds < 2 || prod->opnds[1].kind != OPK_REG) return 0;
+ /* Fold only when consumer is also a convert with matching shape (the
+ * broader ext-of-ext rules run in try_combine_exts). */
+ if ((IROp)in->op != IR_CONVERT) return 0;
+ if (in->nopnds < 2 || in->opnds[1].kind != OPK_REG) return 0;
+ if (in->extra.imm != prod->extra.imm) return 0;
+ if (in->opnds[1].type != prod->opnds[0].type) return 0;
+ if (in->opnds[0].type != prod->opnds[0].type) return 0;
+ if (ctx_def_changed_since(ctx, prod->opnds[1].cls, prod->opnds[1].v.reg,
+ prod_idx))
+ return 0;
+ kind = SK_CV;
+ src_op = prod->opnds[1];
+ }
+
+ /* Single-use within the producer's live range: producer's dst must have
+ * exactly one further use before the next redef/barrier in this block (the
+ * consumer we're about to rewrite). If the live range terminates inside
+ * this block, the producer's def is killed before any successor sees it,
+ * so the cross-block live-out check can be skipped. */
int killed = 0;
- int found = 0;
- u32 found_i = 0;
- u32 found_op = 0;
+ int uses_total = count_uses_in_live_range(ctx->bl, prod_idx, &def, &killed);
+ if (uses_total != 1) return 0;
+ if (!killed && opt_block_live_out_has_phys_reg(ctx->f, ctx->hard_live,
+ ctx->bl->id, &def))
+ return 0;
- for (u32 i = def_i + 1u; i < bl->ninsts; ++i) {
- Inst* in = &bl->insts[i];
- int uses = inst_uses_phys_reg(in, def);
- if (uses) {
- if (check_src && source_clobbered) return 0;
- total_uses += uses;
- if (total_uses > 1) return 0;
- for (u32 oi = 0; oi < in->nopnds; ++oi) {
- int ok =
- conv_fold
- ? (oi == 1 && identical_convert_pair(&bl->insts[def_i], in))
- : (imm_fold ? imm_fold_slot(in, oi) : copy_fold_slot(in, oi));
- if (!ok) continue;
- if (!same_phys_reg(&in->opnds[oi], def)) continue;
- found_i = i;
- found_op = oi;
- found = 1;
+ int n = subst_consumer_operands(in, &def, &src_op, kind);
+ if (n > 0) {
+ ctx->block_change_p = 1;
+ return 1;
+ }
+ return 0;
+}
+
+/* Mark (cls, reg) as already-attempted for this consumer so the same producer
+ * is not looked up twice when a register appears in multiple operand slots
+ * (e.g. `[r4 + r4*4]`, or `add r1, r4, r4`). */
+static int seen_mark(u32 seen[OPT_REG_CLASSES], u8 cls, Reg reg) {
+ if (cls >= OPT_REG_CLASSES || reg >= 32) return 1;
+ u32 bit = 1u << reg;
+ if (seen[cls] & bit) return 1;
+ seen[cls] |= bit;
+ return 0;
+}
+
+/* Try to substitute producers into operand slots of `in`. Walks the direct
+ * operands of `in` and looks up only the producers of registers actually
+ * referenced (typically 2-3 per inst). */
+static int try_substitute(CombineCtx* ctx, Inst* in, i32 i) {
+ int any = 0;
+ u32 seen[OPT_REG_CLASSES] = {0};
+ for (u32 oi = 0; oi < in->nopnds; ++oi) {
+ const Operand* op = &in->opnds[oi];
+ if (op->kind == OPK_REG) {
+ if (!seen_mark(seen, op->cls, op->v.reg))
+ any |= try_substitute_for_reg(ctx, in, i, op->cls, op->v.reg);
+ } else if (op->kind == OPK_INDIRECT) {
+ if (op->v.ind.base != (Reg)REG_NONE &&
+ !seen_mark(seen, RC_INT, op->v.ind.base))
+ any |= try_substitute_for_reg(ctx, in, i, RC_INT, op->v.ind.base);
+ if (op->v.ind.index != (Reg)REG_NONE &&
+ !seen_mark(seen, RC_INT, op->v.ind.index))
+ any |= try_substitute_for_reg(ctx, in, i, RC_INT, op->v.ind.index);
+ }
+ }
+ return any;
+}
+
+/* ---- Rewrite 2: address-mode synthesis ---- */
+
+/* Try to fold add/shl producers into one OPK_INDIRECT operand of `in`. */
+static int try_addr_synth_one_op(CombineCtx* ctx, Inst* in, i32 i,
+ Operand* op) {
+ (void)in;
+ if (op->kind != OPK_INDIRECT) return 0;
+ int any = 0;
+
+ /* (a/c) base producer is IR_BINOP IADD. reg+reg synthesizes a base+index
+ * pair (requires no existing index, sub-rule a); reg+imm folds the immediate
+ * into ofs (sub-rule c, ok with or without an existing index). */
+ if (op->v.ind.base != (Reg)REG_NONE) {
+ Reg b = op->v.ind.base;
+ i32 prod_idx = ctx_producer_of(ctx, RC_INT, b);
+ if (prod_idx >= 0 && prod_idx < i) {
+ Inst* prod = &ctx->bl->insts[prod_idx];
+ if ((IROp)prod->op == IR_BINOP && prod->nopnds == 3 &&
+ (BinOp)prod->extra.imm == BO_IADD && prod->opnds[0].kind == OPK_REG &&
+ prod->opnds[0].cls == RC_INT && prod->opnds[0].v.reg == b) {
+ Operand prod_def = prod->opnds[0];
+ /* Single-use within producer's live range (the only further use
+ * before next redef/barrier). */
+ int killed = 0;
+ int uses_after = count_uses_in_live_range(ctx->bl, prod_idx,
+ &prod_def, &killed);
+ if (uses_after == 1 &&
+ (killed || !opt_block_live_out_has_phys_reg(
+ ctx->f, ctx->hard_live, ctx->bl->id, &prod_def))) {
+ Operand lhs = prod->opnds[1];
+ Operand rhs = prod->opnds[2];
+ int has_no_index = (op->v.ind.index == (Reg)REG_NONE);
+ /* reg + reg: base = lhs, index = rhs, scale=0. Needs an empty
+ * index slot — cannot stack two indices. */
+ if (has_no_index && lhs.kind == OPK_REG && rhs.kind == OPK_REG &&
+ lhs.cls == RC_INT && rhs.cls == RC_INT &&
+ !ctx_def_changed_since(ctx, RC_INT, lhs.v.reg, prod_idx) &&
+ !ctx_def_changed_since(ctx, RC_INT, rhs.v.reg, prod_idx)) {
+ op->v.ind.base = lhs.v.reg;
+ op->v.ind.index = rhs.v.reg;
+ op->v.ind.log2_scale = 0;
+ any = 1;
+ }
+ /* reg + imm: base = lhs, fold imm into ofs. Safe with or without
+ * an existing index — only the offset is mutated. */
+ else if (lhs.kind == OPK_REG && rhs.kind == OPK_IMM &&
+ lhs.cls == RC_INT &&
+ !ctx_def_changed_since(ctx, RC_INT, lhs.v.reg, prod_idx)) {
+ i64 sum = (i64)op->v.ind.ofs + rhs.v.imm;
+ if (sum >= INT32_MIN && sum <= INT32_MAX) {
+ op->v.ind.base = lhs.v.reg;
+ op->v.ind.ofs = (i32)sum;
+ any = 1;
+ }
+ }
+ /* imm + reg: base = rhs, fold imm into ofs (commutative IADD). */
+ else if (lhs.kind == OPK_IMM && rhs.kind == OPK_REG &&
+ rhs.cls == RC_INT &&
+ !ctx_def_changed_since(ctx, RC_INT, rhs.v.reg, prod_idx)) {
+ i64 sum = (i64)op->v.ind.ofs + lhs.v.imm;
+ if (sum >= INT32_MIN && sum <= INT32_MAX) {
+ op->v.ind.base = rhs.v.reg;
+ op->v.ind.ofs = (i32)sum;
+ any = 1;
+ }
+ }
+ }
}
}
+ }
- if ((IROp)in->op == IR_CALL) {
- if (check_src) source_clobbered = 1;
- killed = 1;
- break;
+ /* (b) index producer is IR_BINOP ISHL reg, imm with scale=0. */
+ if (op->v.ind.index != (Reg)REG_NONE && op->v.ind.log2_scale == 0) {
+ Reg idx = op->v.ind.index;
+ i32 prod_idx = ctx_producer_of(ctx, RC_INT, idx);
+ if (prod_idx >= 0 && prod_idx < i) {
+ Inst* prod = &ctx->bl->insts[prod_idx];
+ if ((IROp)prod->op == IR_BINOP && prod->nopnds == 3 &&
+ (BinOp)prod->extra.imm == BO_SHL &&
+ prod->opnds[0].kind == OPK_REG &&
+ prod->opnds[0].cls == RC_INT && prod->opnds[0].v.reg == idx &&
+ prod->opnds[1].kind == OPK_REG && prod->opnds[1].cls == RC_INT &&
+ prod->opnds[2].kind == OPK_IMM) {
+ i64 sh = prod->opnds[2].v.imm;
+ if (sh >= 1 && sh <= 3) {
+ Operand prod_def = prod->opnds[0];
+ int killed = 0;
+ int uses_after = count_uses_in_live_range(ctx->bl, prod_idx,
+ &prod_def, &killed);
+ /* `<= 2` allows the degenerate [base == index] case where the SHL
+ * dst appears twice in the same indirect; rewriting only the index
+ * still yields equivalent arithmetic when base also held the SHL
+ * dst (base unchanged: r*4; index rewritten: r_src*4; r*4 == r*4). */
+ if (uses_after >= 1 && uses_after <= 2 &&
+ (killed || !opt_block_live_out_has_phys_reg(
+ ctx->f, ctx->hard_live, ctx->bl->id, &prod_def)) &&
+ !ctx_def_changed_since(ctx, RC_INT, prod->opnds[1].v.reg,
+ prod_idx)) {
+ op->v.ind.index = prod->opnds[1].v.reg;
+ op->v.ind.log2_scale = (u8)sh;
+ any = 1;
+ }
+ }
+ }
}
- if (check_src && src && inst_defines_phys_reg(in, src))
- source_clobbered = 1;
- if (inst_defines_phys_reg(in, def)) {
- killed = 1;
- break;
+ }
+
+ return any;
+}
+
+static int try_addr_synth(CombineCtx* ctx, Inst* in, i32 i) {
+ int any = 0;
+ for (u32 oi = 0; oi < in->nopnds; ++oi) {
+ if (in->opnds[oi].kind == OPK_INDIRECT) {
+ if (try_addr_synth_one_op(ctx, in, i, &in->opnds[oi])) {
+ any = 1;
+ ctx->block_change_p = 1;
+ }
+ }
+ }
+ return any;
+}
+
+/* ---- Rewrite 3: sink producer into single-use IR_COPY destination ---- */
+
+static int try_sink(CombineCtx* ctx, Inst* in, i32 i) {
+ if ((IROp)in->op != IR_COPY || in->nopnds < 2) return 0;
+ if (in->opnds[0].kind != OPK_REG || in->opnds[1].kind != OPK_REG) return 0;
+ if (same_phys_reg(&in->opnds[0], &in->opnds[1])) return 0;
+
+ Operand src = in->opnds[1];
+ Operand dst = in->opnds[0];
+ i32 prod_idx = ctx_producer_of(ctx, src.cls, src.v.reg);
+ if (prod_idx < 0 || prod_idx >= i) return 0;
+ Inst* prod = &ctx->bl->insts[prod_idx];
+ if (!producer_retargetable_op((IROp)prod->op)) return 0;
+ if (prod->nopnds < 1 || prod->opnds[0].kind != OPK_REG) return 0;
+ if (!same_phys_reg(&prod->opnds[0], &src)) return 0;
+
+ /* Producer's source operands must not have been redefined since. */
+ for (u32 oi = 1; oi < prod->nopnds; ++oi) {
+ const Operand* p = &prod->opnds[oi];
+ if (p->kind == OPK_REG) {
+ if (ctx_def_changed_since(ctx, p->cls, p->v.reg, prod_idx)) return 0;
+ } else if (p->kind == OPK_INDIRECT) {
+ if (ctx_def_changed_since(ctx, RC_INT, p->v.ind.base, prod_idx))
+ return 0;
+ if (p->v.ind.index != (Reg)REG_NONE &&
+ ctx_def_changed_since(ctx, RC_INT, p->v.ind.index, prod_idx))
+ return 0;
}
}
+ /* If producer reads memory, no intervening memory write. */
+ if (inst_reads_memory(prod) && ctx->last_mem_def > prod_idx) return 0;
+ /* Retargeting moves the producer's write to `dst` earlier. That is only
+ * legal if the hard register is dead throughout the interval before the
+ * copy that originally wrote it. */
+ for (i32 j = prod_idx + 1; j < i; ++j) {
+ Inst* mid = &ctx->bl->insts[j];
+ if (inst_uses_phys_reg(mid, &dst) || inst_defines_phys_reg(mid, &dst))
+ return 0;
+ }
- if (total_uses != 1) return 0;
- if (!found) return 0;
- if (!killed && opt_block_live_out_has_phys_reg(f, hard_live, bl->id, def))
+ /* Producer dst must have exactly one use within its live range (the copy
+ * at i). If the live range terminates inside the block, the cross-block
+ * live-out check is moot. */
+ int killed = 0;
+ int uses_total = count_uses_in_live_range(ctx->bl, prod_idx, &src, &killed);
+ if (uses_total != 1) return 0;
+ if (!killed && opt_block_live_out_has_phys_reg(ctx->f, ctx->hard_live,
+ ctx->bl->id, &src))
return 0;
- *use_i_out = found_i;
- *op_i_out = found_op;
+
+ int swap_binop = 0;
+ if (!retarget_producer_legal(prod, &dst, &swap_binop)) return 0;
+
+ if (swap_binop) {
+ Operand tmp = prod->opnds[1];
+ prod->opnds[1] = prod->opnds[2];
+ prod->opnds[2] = tmp;
+ }
+ prod->opnds[0] = dst;
+ if (prod->def != VAL_NONE) prod->def = (Val)dst.v.reg;
+
+ /* Update last-def: producer no longer defines src, now defines dst. */
+ ctx->last_def[src.cls][src.v.reg] = -1;
+ ctx->last_def[dst.cls][dst.v.reg] = prod_idx;
+
+ /* Mark the copy NOP'd so compact removes it. */
+ in->op = IR_NOP;
+ in->def = VAL_NONE;
+ in->ndefs = 0;
+ in->defs = NULL;
+ in->nopnds = 0;
+ in->opnds = NULL;
+ ctx->block_change_p = 1;
return 1;
}
-static void opt_combine_fold_block(Func* f, Block* bl,
- const OptHardBlockLive* hard_live) {
- for (u32 i = 0; i < bl->ninsts; ++i) {
+/* ---- Rewrite 4: combine_exts (ext-of-ext chains) ---- */
+
+static int try_combine_exts(CombineCtx* ctx, Inst* in, i32 i) {
+ if ((IROp)in->op != IR_CONVERT || in->nopnds < 2) return 0;
+ if (in->opnds[1].kind != OPK_REG) return 0;
+ u32 sb, db;
+ int outer_sign;
+ if (!ext_params(in, &sb, &db, &outer_sign)) return 0;
+
+ Reg src_reg = in->opnds[1].v.reg;
+ u8 src_cls = in->opnds[1].cls;
+ i32 prod_idx = ctx_producer_of(ctx, src_cls, src_reg);
+ if (prod_idx < 0 || prod_idx >= i) return 0;
+ Inst* prod = &ctx->bl->insts[prod_idx];
+ u32 isb, idb;
+ int inner_sign;
+ if (!ext_params(prod, &isb, &idb, &inner_sign)) return 0;
+ if (prod->opnds[1].kind != OPK_REG) return 0;
+ if (ctx_def_changed_since(ctx, prod->opnds[1].cls, prod->opnds[1].v.reg,
+ prod_idx))
+ return 0;
+
+ /* Single-use of inner ext dst within its live range (only this insn). */
+ Operand prod_def = prod->opnds[0];
+ int killed = 0;
+ int uses_total = count_uses_in_live_range(ctx->bl, prod_idx, &prod_def,
+ &killed);
+ if (uses_total != 1) return 0;
+ if (!killed && opt_block_live_out_has_phys_reg(ctx->f, ctx->hard_live,
+ ctx->bl->id, &prod_def))
+ return 0;
+
+ /* Outer width `db`, inner width `idb`. Inner reads `isb` bytes. */
+ u32 outer_in = sb; /* outer's source width */
+ u32 inner_out = idb; /* inner's output width */
+ if (outer_in > inner_out) return 0; /* sanity: inner feeds outer */
+
+ /* If inner is an identity-shaped convert that reads back its own def
+ * register (e.g. `convert rB, rB`), the rewrite would set in.opnds[1] to
+ * the same register it already holds. Reporting a change spins the
+ * per-BB fixpoint forever. Skip. */
+ if (prod->opnds[1].kind == OPK_REG && in->opnds[1].kind == OPK_REG &&
+ prod->opnds[1].cls == in->opnds[1].cls &&
+ prod->opnds[1].v.reg == in->opnds[1].v.reg)
+ return 0;
+
+ /* Pattern A: outer width <= inner-source width. Outer can reach past the
+ * inner ext to the inner source directly, keeping outer's signedness.
+ * MIR allows any signedness combination here. */
+ if (db <= isb) {
+ in->opnds[1] = prod->opnds[1];
+ in->opnds[1].type = prod->opnds[1].type;
+ ctx->block_change_p = 1;
+ return 1;
+ }
+
+ /* Pattern B: inner-source width < outer width. Allowed iff outer is signed
+ * OR inner is unsigned; excludes the unsafe uext(sext) combination. */
+ if (isb < db && (outer_sign || !inner_sign)) {
+ in->opnds[1] = prod->opnds[1];
+ in->opnds[1].type = prod->opnds[1].type;
+ /* outer's ConvKind already encodes the outer signedness; keep it. */
+ ctx->block_change_p = 1;
+ return 1;
+ }
+
+ return 0;
+}
+
+/* ---- Existing IR_RET retarget (kept; runs in the forward pass) ---- */
+
+static int try_ret_retarget(Func* f, Block* bl, i32 i) {
+ if (!f->opt_rewritten || i <= 0) return 0;
+ Inst* in = &bl->insts[i];
+ if ((IROp)in->op != IR_RET) return 0;
+ IRRetAux* aux = (IRRetAux*)in->extra.aux;
+ Operand* ret_op = NULL;
+ Reg ret_reg = REG_NONE;
+ if (!aux || !aux->present || !ret_scalar_storage(&aux->val, &ret_op) ||
+ !first_return_reg(f, ret_op->cls, &ret_reg) ||
+ ret_reg == (Reg)REG_NONE || ret_reg == ret_op->v.reg)
+ return 0;
+ Inst* producer = &bl->insts[i - 1u];
+ Operand ret_dst = *ret_op;
+ ret_dst.v.reg = ret_reg;
+ int swap_binop = 0;
+ if (producer->nopnds < 1 ||
+ !same_phys_reg(&producer->opnds[0], ret_op) ||
+ !retarget_producer_legal(producer, &ret_dst, &swap_binop))
+ return 0;
+ if (swap_binop) {
+ Operand tmp = producer->opnds[1];
+ producer->opnds[1] = producer->opnds[2];
+ producer->opnds[2] = tmp;
+ }
+ producer->opnds[0] = ret_dst;
+ *ret_op = ret_dst;
+ return 1;
+}
+
+/* ---- per-BB driver ---- */
+
+static int opt_combine_fold_block(Func* f, Block* bl,
+ const OptHardBlockLive* hard_live) {
+ enum { enable_o1_combine_rewrites = 1 };
+ CombineCtx ctx;
+ ctx.f = f;
+ ctx.bl = bl;
+ ctx.hard_live = hard_live;
+ ctx_reset(&ctx);
+
+ for (i32 i = 0; i < (i32)bl->ninsts; ++i) {
Inst* in = &bl->insts[i];
- u32 use_i = 0;
- u32 op_i = 0;
- if (f->opt_rewritten && (IROp)in->op == IR_RET && i > 0) {
- IRRetAux* aux = (IRRetAux*)in->extra.aux;
- Operand* ret_op = NULL;
- Reg ret_reg = REG_NONE;
- if (aux && aux->present && ret_scalar_storage(&aux->val, &ret_op) &&
- first_return_reg(f, ret_op->cls, &ret_reg) &&
- ret_reg != (Reg)REG_NONE && ret_reg != ret_op->v.reg) {
- Inst* producer = &bl->insts[i - 1u];
- Operand ret_dst = *ret_op;
- ret_dst.v.reg = ret_reg;
- int swap_binop = 0;
- if (producer->nopnds >= 1 &&
- same_phys_reg(&producer->opnds[0], ret_op) &&
- retarget_producer_legal(producer, &ret_dst, &swap_binop)) {
- if (swap_binop) {
- Operand tmp = producer->opnds[1];
- producer->opnds[1] = producer->opnds[2];
- producer->opnds[2] = tmp;
- }
- producer->opnds[0] = ret_dst;
- *ret_op = ret_dst;
- continue;
+ if (enable_o1_combine_rewrites && try_ret_retarget(f, bl, i)) {
+ ctx.block_change_p = 1;
+ /* The producer's def changed; update ctx for the producer at i-1. */
+ Inst* prev = &bl->insts[i - 1];
+ Operand probe;
+ memset(&probe, 0, sizeof probe);
+ probe.kind = OPK_REG;
+ for (u8 c = 0; c < OPT_REG_CLASSES; ++c) {
+ probe.cls = c;
+ for (Reg r = 0; r < OPT_MAX_HARD_REGS; ++r) {
+ probe.v.reg = r;
+ if (ctx.last_def[c][r] == i - 1 && !inst_defines_phys_reg(prev, &probe))
+ ctx.last_def[c][r] = -1;
}
}
+ ctx_record(&ctx, prev, i - 1);
}
- /* Producer-to-copy retargeting is unsafe after rewrite: hard registers are
- * mutable storage, and the original producer register may remain live into
- * successor blocks even when the local copy looks like the only use. */
- if ((IROp)in->op == IR_COPY && in->nopnds >= 2 &&
- in->opnds[0].kind == OPK_REG && in->opnds[1].kind == OPK_REG &&
- !same_phys_reg(&in->opnds[0], &in->opnds[1]) &&
- find_single_direct_use(f, bl, hard_live, i, &in->opnds[0],
- &in->opnds[1], 1, 0, 0, &use_i, &op_i)) {
- bl->insts[use_i].opnds[op_i] = in->opnds[1];
+ /* Skip NOPs left by prior sink rewrites. */
+ if ((IROp)in->op == IR_NOP) continue;
+
+ if (enable_o1_combine_rewrites && try_sink(&ctx, in, i)) {
+ /* sink NOP'd the copy and updated ctx. */
continue;
}
- if ((IROp)in->op == IR_LOAD_IMM && in->nopnds >= 1 &&
- in->opnds[0].kind == OPK_REG &&
- find_single_direct_use(f, bl, hard_live, i, &in->opnds[0], NULL, 0, 1,
- 0, &use_i, &op_i)) {
- Operand imm = in->opnds[0];
- imm.kind = OPK_IMM;
- imm.v.imm = in->extra.imm;
- bl->insts[use_i].opnds[op_i] = imm;
+ if (enable_o1_combine_rewrites) {
+ try_combine_exts(&ctx, in, i);
+ try_substitute(&ctx, in, i);
+ try_addr_synth(&ctx, in, i);
+ }
+
+ ctx_record(&ctx, in, i);
+ }
+ return ctx.block_change_p;
+}
+
+static int opt_combine_compact_block(Func* f, Block* bl) {
+ u32 w = 0;
+ int changed = 0;
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* in = &bl->insts[i];
+
+ /* Drop NOPs (e.g. copies sunk by try_sink). */
+ if ((IROp)in->op == IR_NOP) {
+ changed = 1;
continue;
}
- if ((IROp)in->op == IR_CONVERT && in->nopnds >= 2 &&
- in->opnds[0].kind == OPK_REG && in->opnds[1].kind == OPK_REG &&
- find_single_direct_use(f, bl, hard_live, i, &in->opnds[0],
- &in->opnds[1], 1, 0, 1, &use_i, &op_i)) {
- bl->insts[use_i].opnds[op_i] = in->opnds[1];
+ if ((IROp)in->op == IR_COPY && in->nopnds == 2 &&
+ same_reg_operand(&in->opnds[0], &in->opnds[1])) {
+ changed = 1;
+ continue;
}
+
+ if (w) {
+ Inst* prev = &bl->insts[w - 1u];
+ if ((IROp)prev->op == IR_STORE && (IROp)in->op == IR_LOAD &&
+ same_spill_slot_and_size(f, prev, in) &&
+ same_reg_operand(&prev->opnds[1], &in->opnds[0])) {
+ changed = 1;
+ continue;
+ }
+ if ((IROp)prev->op == IR_LOAD && (IROp)in->op == IR_STORE &&
+ same_spill_slot_and_size(f, prev, in) &&
+ same_reg_operand(&prev->opnds[0], &in->opnds[1])) {
+ changed = 1;
+ continue;
+ }
+ if ((IROp)prev->op == IR_LOAD && (IROp)in->op == IR_LOAD &&
+ same_spill_slot_and_size(f, prev, in) &&
+ same_reg_operand(&prev->opnds[0], &in->opnds[0])) {
+ changed = 1;
+ continue;
+ }
+ if ((IROp)prev->op == IR_STORE && (IROp)in->op == IR_STORE &&
+ same_spill_access(f, prev, in, NULL)) {
+ bl->insts[w - 1u] = *in;
+ changed = 1;
+ continue;
+ }
+ }
+
+ bl->insts[w++] = *in;
}
+ bl->ninsts = w;
+ return changed;
}
void opt_combine(Func* f) {
if (!f) return;
opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE);
OptHardBlockLive* hard_live = opt_maybe_build_hard_live(f);
+ /* Per-BB fixpoint, matching MIR's combine loop (mir-gen.c:9037-9038). */
+ /* Per-BB fixpoint with a defensive iteration cap. Each rewrite is supposed
+ * to be monotonic (it only fires when it strictly improves the IR), so we
+ * shouldn't hit the cap in practice — but a noisy abort beats a hang if a
+ * future rewrite accidentally oscillates. */
+ enum { MAX_COMBINE_ITERS = 64 };
for (u32 b = 0; b < f->nblocks; ++b) {
Block* bl = &f->blocks[b];
- opt_combine_fold_block(f, bl, hard_live);
- u32 w = 0;
- for (u32 i = 0; i < bl->ninsts; ++i) {
- Inst* in = &bl->insts[i];
- if ((IROp)in->op == IR_COPY && in->nopnds == 2 &&
- same_reg_operand(&in->opnds[0], &in->opnds[1])) {
- continue;
- }
-
- if (w) {
- Inst* prev = &bl->insts[w - 1u];
- if ((IROp)prev->op == IR_STORE && (IROp)in->op == IR_LOAD &&
- same_spill_slot_and_size(f, prev, in) &&
- same_reg_operand(&prev->opnds[1], &in->opnds[0])) {
- continue;
- }
- if ((IROp)prev->op == IR_LOAD && (IROp)in->op == IR_STORE &&
- same_spill_slot_and_size(f, prev, in) &&
- same_reg_operand(&prev->opnds[0], &in->opnds[1])) {
- continue;
- }
- if ((IROp)prev->op == IR_LOAD && (IROp)in->op == IR_LOAD &&
- same_spill_slot_and_size(f, prev, in) &&
- same_reg_operand(&prev->opnds[0], &in->opnds[0])) {
- continue;
- }
- if ((IROp)prev->op == IR_STORE && (IROp)in->op == IR_STORE &&
- same_spill_access(f, prev, in, NULL)) {
- bl->insts[w - 1u] = *in;
- continue;
- }
- }
-
- bl->insts[w++] = *in;
+ for (int iter = 0; iter < MAX_COMBINE_ITERS; ++iter) {
+ int folded = opt_combine_fold_block(f, bl, hard_live);
+ int compacted = opt_combine_compact_block(f, bl);
+ if (!folded && !compacted) break;
}
- bl->ninsts = w;
}
}
diff --git a/src/opt/pass_jump.c b/src/opt/pass_jump.c
@@ -227,6 +227,97 @@ static void cleanup_invert_taken_fallthrough(JumpCleanupCtx* c) {
}
}
+/* Greedy chain-extension reorder: pick the entry block, then repeatedly
+ * extend the current chain to its preferred unvisited successor. For
+ * conditional branches that's `succ[1]` (fallthrough); for IR_BR or any
+ * 1-succ block it's `succ[0]`. When the chain stalls, start a new one from
+ * the lowest unvisited block id. After this pass, every chained pair `(p,
+ * n)` is adjacent in emit_order, so the existing branch-invert and
+ * trailing-branch-strip cleanups can collapse the jump.
+ *
+ * cfree's frontend loop lowering puts `inc` between `head` and `body`
+ * (because `continue` jumps to `inc`), forcing `b body` and `b inc` per
+ * iteration. Without reordering the trailing-branch strip can't fire —
+ * `body`'s next block in original order is `exit`, not `inc`. After
+ * reordering, body→inc is adjacent and the back-edge collapses. */
+static u32 chain_preferred_succ(const Block* bl) {
+ if (!bl->ninsts) {
+ return bl->nsucc ? bl->succ[0] : BLOCK_NONE;
+ }
+ IROp op = (IROp)bl->insts[bl->ninsts - 1u].op;
+ switch (op) {
+ case IR_CMP_BRANCH:
+ case IR_CONDBR:
+ return bl->nsucc >= 2u ? bl->succ[1] : BLOCK_NONE;
+ case IR_BR:
+ return bl->nsucc >= 1u ? bl->succ[0] : BLOCK_NONE;
+ default:
+ /* RET/SWITCH/INDIRECT_BRANCH: no implicit fallthrough preference. */
+ return BLOCK_NONE;
+ }
+}
+
+static void cleanup_reorder_for_fallthrough(JumpCleanupCtx* c) {
+ Func* f = c->f;
+ if (f->nblocks < 2u || f->emit_order_n < 2u) return;
+ u8* visited = arena_zarray(f->arena, u8, f->nblocks);
+ u32* new_order = arena_array(f->arena, u32, f->emit_order_n);
+ u32 w = 0;
+
+ /* Entry must come first regardless. */
+ u32 entry = f->entry;
+ if (entry >= f->nblocks) return;
+ new_order[w++] = entry;
+ visited[entry] = 1;
+ u32 cur = entry;
+
+ /* Extend by preferred successor. When the chain stalls, restart from the
+ * next unvisited block in original emit_order to preserve some locality. */
+ u32 scan_cursor = 0;
+ for (;;) {
+ u32 next = chain_preferred_succ(&f->blocks[cur]);
+ if (next >= f->nblocks || visited[next]) {
+ /* Fall back: any unvisited successor of cur — extends the chain even
+ * if it's the "taken" arm of a conditional. The subsequent
+ * cleanup_invert_taken_fallthrough will invert the branch. */
+ const Block* cb = &f->blocks[cur];
+ next = BLOCK_NONE;
+ for (u32 s = 0; s < cb->nsucc; ++s) {
+ u32 cand = cb->succ[s];
+ if (cand < f->nblocks && !visited[cand]) {
+ next = cand;
+ break;
+ }
+ }
+ }
+ if (next >= f->nblocks || visited[next]) {
+ /* Start a new chain from the lowest unvisited block in original order. */
+ next = BLOCK_NONE;
+ while (scan_cursor < f->emit_order_n) {
+ u32 cand = f->emit_order[scan_cursor++];
+ if (cand < f->nblocks && !visited[cand]) {
+ next = cand;
+ break;
+ }
+ }
+ if (next >= f->nblocks) break;
+ }
+ new_order[w++] = next;
+ visited[next] = 1;
+ cur = next;
+ }
+
+ if (w != f->emit_order_n) return; /* Conservative: bail if we missed any. */
+ memcpy(f->emit_order, new_order, sizeof(u32) * w);
+ /* Refresh the cached emit-index map so subsequent cleanups see the new
+ * order. */
+ for (u32 b = 0; b < f->nblocks; ++b) c->emit_index_by_block[b] = BLOCK_NONE;
+ for (u32 i = 0; i < f->emit_order_n; ++i) {
+ u32 b = f->emit_order[i];
+ if (b < f->nblocks) c->emit_index_by_block[b] = i;
+ }
+}
+
static void cleanup_layout_fallthrough_branches(const JumpCleanupCtx* c) {
Func* f = c->f;
for (u32 b = 0; b < f->nblocks; ++b) {
@@ -336,6 +427,7 @@ void opt_jump_cleanup(Func* f, OptJumpCleanupStage stage) {
cleanup_invert_jump_fallthrough(&c);
cleanup_branch_targets(&c);
} else if (stage == OPT_JUMP_CLEANUP_LAYOUT) {
+ cleanup_reorder_for_fallthrough(&c);
cleanup_invert_taken_fallthrough(&c);
cleanup_layout_fallthrough_branches(&c);
}
diff --git a/src/opt/pass_o2.c b/src/opt/pass_o2.c
@@ -798,21 +798,565 @@ void opt_addr_xform_pregs(Func* f) {
Operand* op = &use->opnds[o];
if (op->kind != OPK_INDIRECT) continue;
if ((PReg)op->v.ind.base != p) continue;
+ if (op->v.ind.ofs != 0 || op->v.ind.index != (Reg)REG_NONE)
+ continue; /* EA-shaped; leave alone */
Operand folded = local;
folded.type = use->extra.mem.type ? use->extra.mem.type : local.type;
*op = folded;
}
}
}
- addr_inst_remove(in);
+ if (!has_ea) addr_inst_remove(in);
changed = 1;
}
}
+ /* After folding, walk all frame slots and clear FSF_ADDR_TAKEN on any
+ * slot whose surviving IR_ADDR_OF defs (if any) have all been retired.
+ * The frontend-set ADDR_TAKEN flag is conservative; if we proved the
+ * address no longer escapes, downstream passes (opt_promote_scalar_locals)
+ * can take advantage of the actual non-escape state. */
+ if (changed) {
+ u8* still_taken =
+ arena_zarray(f->arena, u8, f->nframe_slots ? f->nframe_slots : 1u);
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* in = &bl->insts[i];
+ if ((IROp)in->op != IR_ADDR_OF) continue;
+ if (in->nopnds < 2 || in->opnds[1].kind != OPK_LOCAL) continue;
+ FrameSlot slot = in->opnds[1].v.frame_slot;
+ if (slot && slot <= f->nframe_slots) still_taken[slot - 1u] = 1;
+ }
+ }
+ for (u32 s = 0; s < f->nframe_slots; ++s) {
+ if (!still_taken[s])
+ f->frame_slots[s].flags &= (u16)~FSF_ADDR_TAKEN;
+ }
+ }
if (changed)
opt_analysis_invalidate(
f, OPT_ANALYSIS_DEF_USE | OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP);
}
+/* Scalar local promotion for the O1 pipeline. Runs after
+ * `opt_addr_xform_pregs` has folded zero-EA `OPK_INDIRECT(p)` uses to
+ * `OPK_LOCAL(slot)` and retired non-escaping `IR_ADDR_OF` defs. For each
+ * frame slot that is now only referenced as the base of matching-type,
+ * non-observable `IR_LOAD`/`IR_STORE`, the slot is replaced by a fresh
+ * mutable PReg: each store becomes `IR_COPY P_slot, src` (or `IR_LOAD_IMM`
+ * for an immediate source), each load becomes `IR_COPY dst, P_slot`. The
+ * slot becomes unreferenced and the backend drops it from the frame.
+ *
+ * A mutable PReg in `-O1` IR has the same data-flow semantics as a named
+ * memory cell that does not escape (multiple defs, multiple uses, value at
+ * a use comes from whichever def reaches it via CFG edges). No phis are
+ * required because the IR model has no phis; PReg flow becomes hard-reg
+ * flow after regalloc, and regalloc already handles it.
+ *
+ * Conditions for promotion (per slot):
+ *
+ * 1. Slot kind is FS_LOCAL (real locals, not spills, sret, alloca).
+ * 2. Slot has no FSF_ADDR_TAKEN, FSF_VOLATILE flag (after
+ * `opt_addr_xform_pregs` has cleared the conservative ADDR_TAKEN
+ * flag for slots whose IR_ADDR_OF defs were all retired).
+ * 3. Slot's declared type is scalar (int, float, bool, ptr, enum).
+ * 4. Every appearance of `OPK_LOCAL(slot)` in any instruction operand is
+ * either:
+ * - `IR_LOAD.opnds[1]` with matching `access.type == slot.type`,
+ * no observable mem flags, dst is OPK_REG;
+ * - `IR_STORE.opnds[0]` with matching `access.type == slot.type`,
+ * no observable mem flags, src is OPK_REG or OPK_IMM.
+ * 5. Slot does not appear in any aux operand position (calls, asm, etc.)
+ * or as an OPK_LOCAL anywhere else (e.g., a surviving IR_ADDR_OF).
+ *
+ * Param-slot case: FS_PARAM slots are excluded. The backend prologue is
+ * responsible for moving the ABI-incoming hard reg into the slot, and that
+ * move is not visible in the IR (there is no `IR_STORE OPK_LOCAL(slot)` to
+ * rewrite). At O1 the wrapper already places scalar params in REG storage
+ * when the frontend does not force a memory home, so the param's value
+ * arrives in a PReg without needing this pass. If a future scheme records
+ * the entry-move as a synthetic IR_STORE OPK_LOCAL(slot), this pass would
+ * promote it the same way it promotes any other store-to-slot. */
+
+static int promote_local_type_is_scalar(Func* f, CfreeCgTypeId ty) {
+ if (!ty) return 0;
+ CfreeCgTypeKind kind = cfree_cg_type_kind((CfreeCompiler*)f->c, ty);
+ switch (kind) {
+ case CFREE_CG_TYPE_BOOL:
+ case CFREE_CG_TYPE_INT:
+ case CFREE_CG_TYPE_FLOAT:
+ case CFREE_CG_TYPE_PTR:
+ case CFREE_CG_TYPE_ENUM:
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+static int promote_op_uses_slot(const Operand* op, FrameSlot slot) {
+ return op && op->kind == OPK_LOCAL && op->v.frame_slot == slot;
+}
+
+static int promote_abivalue_uses_slot(const CGABIValue* v, FrameSlot slot) {
+ if (!v) return 0;
+ if (promote_op_uses_slot(&v->storage, slot)) return 1;
+ for (u32 i = 0; i < v->nparts; ++i)
+ if (promote_op_uses_slot((const Operand*)&v->parts[i].op, slot)) return 1;
+ return 0;
+}
+
+static int promote_aux_uses_slot(const Inst* in, FrameSlot slot) {
+ switch ((IROp)in->op) {
+ case IR_CALL: {
+ IRCallAux* aux = (IRCallAux*)in->extra.aux;
+ if (!aux) return 0;
+ if (aux->use_plan_replay) {
+ if (promote_op_uses_slot(&aux->plan.callee, slot)) return 1;
+ for (u32 i = 0; i < aux->plan.nargs; ++i)
+ if (promote_op_uses_slot(&aux->plan.args[i].src, slot)) return 1;
+ for (u32 i = 0; i < aux->plan.nrets; ++i)
+ if (promote_op_uses_slot(&aux->plan.rets[i].dst, slot)) return 1;
+ } else {
+ if (promote_op_uses_slot(&aux->desc.callee, slot)) return 1;
+ for (u32 i = 0; i < aux->desc.nargs; ++i)
+ if (promote_abivalue_uses_slot(
+ (const CGABIValue*)&aux->desc.args[i], slot))
+ return 1;
+ if (promote_abivalue_uses_slot(&aux->desc.ret, slot)) return 1;
+ }
+ return 0;
+ }
+ case IR_RET: {
+ IRRetAux* aux = (IRRetAux*)in->extra.aux;
+ if (!aux || !aux->present) return 0;
+ return promote_abivalue_uses_slot(&aux->val, slot);
+ }
+ case IR_SCOPE_BEGIN: {
+ IRScopeAux* aux = (IRScopeAux*)in->extra.aux;
+ if (!aux) return 0;
+ return promote_op_uses_slot(&aux->desc.cond, slot);
+ }
+ case IR_ASM_BLOCK: {
+ IRAsmAux* aux = (IRAsmAux*)in->extra.aux;
+ if (!aux) return 0;
+ for (u32 i = 0; i < aux->nin; ++i)
+ if (promote_op_uses_slot(&aux->in_ops[i], slot)) return 1;
+ for (u32 i = 0; i < aux->nout; ++i)
+ if (promote_op_uses_slot(&aux->out_ops[i], slot)) return 1;
+ return 0;
+ }
+ case IR_INTRINSIC: {
+ IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux;
+ if (!aux) return 0;
+ for (u32 i = 0; i < aux->narg; ++i)
+ if (promote_op_uses_slot(&aux->args[i], slot)) return 1;
+ for (u32 i = 0; i < aux->ndst; ++i)
+ if (promote_op_uses_slot(&aux->dsts[i], slot)) return 1;
+ return 0;
+ }
+ default:
+ return 0;
+ }
+}
+
+/* Per-inst check. Returns:
+ * 1 = "instruction touches slot in a promotable position" (load/store base).
+ * 0 = "instruction does not touch slot at all".
+ * -1 = "instruction touches slot in a non-promotable way" (e.g., wrong
+ * operand position, type mismatch, observable flags, aux use). */
+static int promote_inst_classify(const Inst* in, FrameSlot slot,
+ CfreeCgTypeId slot_ty) {
+ int touched = 0;
+ /* IR_LOAD: opnds[0]=dst REG, opnds[1]=addr (allowed: OPK_LOCAL slot). */
+ if ((IROp)in->op == IR_LOAD) {
+ if (in->nopnds >= 2 && promote_op_uses_slot(&in->opnds[1], slot)) {
+ if (opt_mem_observable(&in->extra.mem)) return -1;
+ if (in->opnds[0].kind != OPK_REG) return -1;
+ CfreeCgTypeId at = in->extra.mem.type;
+ if (at && at != slot_ty) return -1;
+ touched = 1;
+ }
+ /* opnds[0] is the dst REG — never OPK_LOCAL by construction. */
+ if (in->nopnds >= 1 && promote_op_uses_slot(&in->opnds[0], slot))
+ return -1;
+ } else if ((IROp)in->op == IR_STORE) {
+ if (in->nopnds >= 1 && promote_op_uses_slot(&in->opnds[0], slot)) {
+ if (opt_mem_observable(&in->extra.mem)) return -1;
+ if (in->nopnds < 2) return -1;
+ Operand* src = &in->opnds[1];
+ if (src->kind != OPK_REG && src->kind != OPK_IMM) return -1;
+ CfreeCgTypeId at = in->extra.mem.type;
+ if (at && at != slot_ty) return -1;
+ touched = 1;
+ }
+ /* opnds[1] is the src value — should never be OPK_LOCAL for a scalar. */
+ if (in->nopnds >= 2 && promote_op_uses_slot(&in->opnds[1], slot))
+ return -1;
+ } else {
+ /* Any other instruction with an OPK_LOCAL(slot) operand blocks promotion. */
+ for (u32 o = 0; o < in->nopnds; ++o)
+ if (promote_op_uses_slot(&in->opnds[o], slot)) return -1;
+ }
+ if (promote_aux_uses_slot(in, slot)) return -1;
+ return touched;
+}
+
+/* Rewrite an `IR_STORE OPK_LOCAL(slot), src` into a PReg def. If src is
+ * OPK_IMM, emit IR_LOAD_IMM into preg; otherwise emit IR_COPY. */
+static void promote_rewrite_store(Func* f, Inst* in, PReg preg,
+ CfreeCgTypeId ty, u8 cls) {
+ Operand src = in->opnds[1];
+ Operand* opnds = arena_array(f->arena, Operand, 2);
+ memset(&opnds[0], 0, sizeof opnds[0]);
+ opnds[0].kind = OPK_REG;
+ opnds[0].type = ty;
+ opnds[0].cls = cls;
+ opnds[0].v.reg = (Reg)preg;
+ in->type = ty;
+ in->def = (Val)preg;
+ if (src.kind == OPK_IMM) {
+ in->op = IR_LOAD_IMM;
+ in->nopnds = 1;
+ in->opnds = opnds;
+ in->extra.imm = src.v.imm;
+ } else {
+ opnds[1] = src;
+ opnds[1].type = ty;
+ opnds[1].cls = cls;
+ in->op = IR_COPY;
+ in->nopnds = 2;
+ in->opnds = opnds;
+ memset(&in->extra, 0, sizeof in->extra);
+ }
+}
+
+/* Rewrite an `IR_LOAD dst, OPK_LOCAL(slot)` into `IR_COPY dst, preg`. */
+static void promote_rewrite_load(Func* f, Inst* in, PReg preg,
+ CfreeCgTypeId ty, u8 cls) {
+ Operand dst = in->opnds[0];
+ Operand* opnds = arena_array(f->arena, Operand, 2);
+ opnds[0] = dst;
+ opnds[0].type = ty;
+ opnds[0].cls = cls;
+ memset(&opnds[1], 0, sizeof opnds[1]);
+ opnds[1].kind = OPK_REG;
+ opnds[1].type = ty;
+ opnds[1].cls = cls;
+ opnds[1].v.reg = (Reg)preg;
+ in->op = IR_COPY;
+ in->type = ty;
+ in->nopnds = 2;
+ in->opnds = opnds;
+ memset(&in->extra, 0, sizeof in->extra);
+}
+
+void opt_promote_scalar_locals(Func* f) {
+ if (!f || f->opt_reg_ssa || f->opt_rewritten) return;
+ if (!f->nframe_slots) return;
+ int changed = 0;
+ for (u32 sidx = 0; sidx < f->nframe_slots; ++sidx) {
+ IRFrameSlot* slot = &f->frame_slots[sidx];
+ FrameSlot id = slot->id;
+ /* FS_PARAM slots are owned by the backend prologue (which copies the
+ * ABI-incoming hard reg into the slot before any user IR runs); there
+ * is no IR-level store to rewrite. At O1, the wrapper already places
+ * scalar params in REG storage when the frontend does not force a
+ * memory home, so the FS_PARAM promotion path is normally a no-op.
+ * Only promote FS_LOCAL slots. */
+ if (slot->kind != FS_LOCAL) continue;
+ if (slot->flags & (FSF_ADDR_TAKEN | FSF_VOLATILE)) continue;
+ if (!promote_local_type_is_scalar(f, slot->type)) continue;
+ int touched_count = 0;
+ int rejected = 0;
+ for (u32 b = 0; b < f->nblocks && !rejected; ++b) {
+ Block* bl = &f->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* in = &bl->insts[i];
+ int r = promote_inst_classify(in, id, slot->type);
+ if (r < 0) { rejected = 1; break; }
+ touched_count += r;
+ }
+ }
+ if (rejected || !touched_count) continue;
+ u8 cls = (cfree_cg_type_kind((CfreeCompiler*)f->c, slot->type) ==
+ CFREE_CG_TYPE_FLOAT)
+ ? RC_FP
+ : RC_INT;
+ PReg preg = ir_alloc_preg(f, slot->type, cls);
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* in = &bl->insts[i];
+ if ((IROp)in->op == IR_LOAD && in->nopnds >= 2 &&
+ promote_op_uses_slot(&in->opnds[1], id)) {
+ promote_rewrite_load(f, in, preg, slot->type, cls);
+ } else if ((IROp)in->op == IR_STORE && in->nopnds >= 2 &&
+ promote_op_uses_slot(&in->opnds[0], id)) {
+ promote_rewrite_store(f, in, preg, slot->type, cls);
+ }
+ }
+ }
+ /* The frame slot is now unreferenced. Leave the slot table entry in
+ * place (compaction would require remapping every other slot id);
+ * the backend's frame layout pass simply omits unreferenced slots. */
+ changed = 1;
+ }
+ if (changed)
+ opt_analysis_invalidate(
+ f, OPT_ANALYSIS_DEF_USE | OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP);
+}
+
+/* CSE-style hoist of `IR_ADDR_OF(OPK_GLOBAL{sym, addend})` defs that appear
+ * more than once in the same function. The address is a link-time constant
+ * (TLS and IFUNC live on separate IROps), so all occurrences compute the
+ * same value; consolidating to a single entry-block def shrinks each loop
+ * body by the per-iter `adrp`/`add` pair the backend would otherwise re-emit.
+ *
+ * Implementation:
+ * - Walk all insts, group ADDR_OF defs by (sym, addend).
+ * - For each key with >= 2 defs: allocate a fresh PReg, materialize one
+ * IR_ADDR_OF in block 0 (after any IR_PARAM_DECL prologue), build a
+ * preg-remap from each original def-PReg to the new PReg, and NOP each
+ * original def.
+ * - One IR walk applies the remap to every operand `v.reg` /
+ * `v.ind.base`.
+ *
+ * Runs after opt_addr_xform_pregs so local addr-of has already been folded
+ * out; the remaining IR_ADDR_OF defs are global. */
+
+typedef struct AddrCseEntry {
+ ObjSymId sym;
+ i64 addend;
+ PReg canonical; /* freshly allocated PReg, def in block 0 */
+ CfreeCgTypeId addr_type; /* operand[0].type from the first def */
+ u8 cls; /* operand[0].cls from the first def */
+ u32 count; /* number of original ADDR_OF defs seen */
+} AddrCseEntry;
+
+static u32 addr_cse_find_or_add(AddrCseEntry** entries, u32* n, u32* cap,
+ Arena* arena, ObjSymId sym, i64 addend) {
+ for (u32 i = 0; i < *n; ++i) {
+ if ((*entries)[i].sym == sym && (*entries)[i].addend == addend) return i;
+ }
+ if (*n == *cap) {
+ u32 ncap = *cap ? *cap * 2u : 16u;
+ AddrCseEntry* nv = arena_array(arena, AddrCseEntry, ncap);
+ if (*entries) memcpy(nv, *entries, sizeof(AddrCseEntry) * (*n));
+ *entries = nv;
+ *cap = ncap;
+ }
+ u32 idx = (*n)++;
+ AddrCseEntry* e = &(*entries)[idx];
+ memset(e, 0, sizeof *e);
+ e->sym = sym;
+ e->addend = addend;
+ e->canonical = PREG_NONE;
+ e->count = 0;
+ return idx;
+}
+
+static void addr_cse_apply_to_operand(Operand* op, const PReg* remap) {
+ /* remap is zero-initialized; 0 means "no remap" (preg 0 is reserved as
+ * unused). PREG_NONE = 0xffffffff and would be a valid remap target but
+ * we never produce that. */
+ if (!op) return;
+ if (op->kind == OPK_REG) {
+ PReg p = (PReg)op->v.reg;
+ if (p != PREG_NONE && p != 0 && remap[p] != 0)
+ op->v.reg = remap[p];
+ } else if (op->kind == OPK_INDIRECT) {
+ PReg p = (PReg)op->v.ind.base;
+ if (p != PREG_NONE && p != 0 && remap[p] != 0)
+ op->v.ind.base = remap[p];
+ if (op->v.ind.index != (Reg)REG_NONE) {
+ PReg pi = (PReg)op->v.ind.index;
+ if (pi != PREG_NONE && pi != 0 && remap[pi] != 0)
+ op->v.ind.index = remap[pi];
+ }
+ }
+}
+
+static void addr_cse_apply_to_inst(Inst* in, const PReg* remap) {
+ for (u32 o = 0; o < in->nopnds; ++o)
+ addr_cse_apply_to_operand(&in->opnds[o], remap);
+ /* IR_CALL aux carries operands too; rewrite both replay variants. */
+ if ((IROp)in->op == IR_CALL) {
+ IRCallAux* aux = (IRCallAux*)in->extra.aux;
+ if (!aux) return;
+ if (aux->use_plan_replay) {
+ addr_cse_apply_to_operand(&aux->plan.callee, remap);
+ for (u32 i = 0; i < aux->plan.nargs; ++i)
+ addr_cse_apply_to_operand(&aux->plan.args[i].src, remap);
+ for (u32 i = 0; i < aux->plan.nrets; ++i)
+ addr_cse_apply_to_operand(&aux->plan.rets[i].dst, remap);
+ } else {
+ addr_cse_apply_to_operand(&aux->desc.callee, remap);
+ for (u32 i = 0; i < aux->desc.nargs; ++i) {
+ CGABIValue* v = (CGABIValue*)&aux->desc.args[i];
+ addr_cse_apply_to_operand(&v->storage, remap);
+ for (u32 k = 0; k < v->nparts; ++k)
+ addr_cse_apply_to_operand((Operand*)&v->parts[k].op, remap);
+ }
+ addr_cse_apply_to_operand(&aux->desc.ret.storage, remap);
+ for (u32 k = 0; k < aux->desc.ret.nparts; ++k)
+ addr_cse_apply_to_operand((Operand*)&aux->desc.ret.parts[k].op, remap);
+ }
+ } else if ((IROp)in->op == IR_RET) {
+ IRRetAux* aux = (IRRetAux*)in->extra.aux;
+ if (aux && aux->present) {
+ addr_cse_apply_to_operand(&aux->val.storage, remap);
+ for (u32 k = 0; k < aux->val.nparts; ++k)
+ addr_cse_apply_to_operand((Operand*)&aux->val.parts[k].op, remap);
+ }
+ } else if ((IROp)in->op == IR_ASM_BLOCK) {
+ IRAsmAux* aux = (IRAsmAux*)in->extra.aux;
+ if (!aux) return;
+ for (u32 i = 0; i < aux->nin; ++i)
+ addr_cse_apply_to_operand(&aux->in_ops[i], remap);
+ for (u32 i = 0; i < aux->nout; ++i)
+ addr_cse_apply_to_operand(&aux->out_ops[i], remap);
+ } else if ((IROp)in->op == IR_INTRINSIC) {
+ IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux;
+ if (!aux) return;
+ for (u32 i = 0; i < aux->narg; ++i)
+ addr_cse_apply_to_operand(&aux->args[i], remap);
+ for (u32 i = 0; i < aux->ndst; ++i)
+ addr_cse_apply_to_operand(&aux->dsts[i], remap);
+ }
+}
+
+static Inst* block_insert_at(Func* f, Block* bl, u32 at, u32 k) {
+ if (at > bl->ninsts) at = bl->ninsts;
+ if (bl->ninsts + k > bl->cap) {
+ u32 ncap = bl->cap ? bl->cap : 8u;
+ while (ncap < bl->ninsts + k) ncap *= 2u;
+ Inst* nb = arena_zarray(f->arena, Inst, ncap);
+ if (bl->insts && at) memcpy(nb, bl->insts, sizeof(Inst) * at);
+ if (bl->insts && bl->ninsts > at)
+ memcpy(nb + at + k, bl->insts + at,
+ sizeof(Inst) * (bl->ninsts - at));
+ bl->insts = nb;
+ bl->cap = ncap;
+ } else {
+ if (bl->ninsts > at)
+ memmove(bl->insts + at + k, bl->insts + at,
+ sizeof(Inst) * (bl->ninsts - at));
+ }
+ for (u32 i = 0; i < k; ++i) memset(&bl->insts[at + i], 0, sizeof(Inst));
+ bl->ninsts += k;
+ return &bl->insts[at];
+}
+
+void opt_addr_of_global_cse(Func* f) {
+ if (!f || f->opt_reg_ssa || f->opt_rewritten) return;
+ if (f->nblocks == 0) return;
+
+ /* Pass 1: index ADDR_OF(global) defs by (sym, addend). */
+ AddrCseEntry* entries = NULL;
+ u32 n_entries = 0;
+ u32 cap_entries = 0;
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* in = &bl->insts[i];
+ if ((IROp)in->op != IR_ADDR_OF) continue;
+ if (in->nopnds < 2) continue;
+ if (in->opnds[0].kind != OPK_REG) continue;
+ if (in->opnds[1].kind != OPK_GLOBAL) continue;
+ u32 idx = addr_cse_find_or_add(&entries, &n_entries, &cap_entries,
+ f->arena, in->opnds[1].v.global.sym,
+ in->opnds[1].v.global.addend);
+ AddrCseEntry* e = &entries[idx];
+ if (e->count == 0) {
+ e->addr_type = in->opnds[0].type;
+ e->cls = in->opnds[0].cls;
+ }
+ ++e->count;
+ }
+ }
+ if (!n_entries) return;
+
+ /* Pass 2: for each duplicate key, allocate a canonical PReg. */
+ u32 dup_count = 0;
+ for (u32 i = 0; i < n_entries; ++i) {
+ if (entries[i].count >= 2) {
+ entries[i].canonical = ir_alloc_preg(f, entries[i].addr_type,
+ entries[i].cls);
+ ++dup_count;
+ }
+ }
+ if (!dup_count) return;
+
+ /* Pass 3: walk again, build per-old-PReg remap and NOP duplicate defs. */
+ PReg* remap = arena_zarray(f->arena, PReg, opt_reg_count(f));
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* in = &bl->insts[i];
+ if ((IROp)in->op != IR_ADDR_OF) continue;
+ if (in->nopnds < 2) continue;
+ if (in->opnds[0].kind != OPK_REG) continue;
+ if (in->opnds[1].kind != OPK_GLOBAL) continue;
+ u32 idx = addr_cse_find_or_add(&entries, &n_entries, &cap_entries,
+ f->arena, in->opnds[1].v.global.sym,
+ in->opnds[1].v.global.addend);
+ if (entries[idx].canonical == PREG_NONE) continue; /* singleton */
+ PReg old = (PReg)in->opnds[0].v.reg;
+ if (opt_reg_valid(f, old)) remap[old] = entries[idx].canonical;
+ /* NOP the original def. */
+ in->op = IR_NOP;
+ in->def = VAL_NONE;
+ in->ndefs = 0;
+ in->defs = NULL;
+ in->nopnds = 0;
+ in->opnds = NULL;
+ }
+ }
+
+ /* Pass 4: hoist a single ADDR_OF for each duplicated key to the entry
+ * block, inserted after any leading IR_PARAM_DECL instructions. */
+ if (f->entry >= f->nblocks) return;
+ Block* entry = &f->blocks[f->entry];
+ u32 insert_at = 0;
+ while (insert_at < entry->ninsts &&
+ (IROp)entry->insts[insert_at].op == IR_PARAM_DECL)
+ ++insert_at;
+ Inst* slot = block_insert_at(f, entry, insert_at, dup_count);
+ u32 w = 0;
+ for (u32 i = 0; i < n_entries; ++i) {
+ if (entries[i].canonical == PREG_NONE) continue;
+ Inst* in = &slot[w++];
+ in->op = (u16)IR_ADDR_OF;
+ in->def = (Val)entries[i].canonical;
+ in->type = entries[i].addr_type;
+ in->nopnds = 2;
+ in->opnds = arena_array(f->arena, Operand, 2);
+ memset(&in->opnds[0], 0, sizeof(Operand));
+ in->opnds[0].kind = OPK_REG;
+ in->opnds[0].cls = entries[i].cls;
+ in->opnds[0].type = entries[i].addr_type;
+ in->opnds[0].v.reg = entries[i].canonical;
+ memset(&in->opnds[1], 0, sizeof(Operand));
+ in->opnds[1].kind = OPK_GLOBAL;
+ in->opnds[1].cls = entries[i].cls;
+ in->opnds[1].type = entries[i].addr_type;
+ in->opnds[1].v.global.sym = entries[i].sym;
+ in->opnds[1].v.global.addend = entries[i].addend;
+ ir_assign_inst_id(f, in);
+ }
+
+ /* Pass 5: apply remap to all operand uses in the function. */
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ addr_cse_apply_to_inst(&bl->insts[i], remap);
+ }
+ }
+
+ opt_analysis_invalidate(
+ f, OPT_ANALYSIS_DEF_USE | OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP);
+}
+
static u64 gvn_width_mask(u32 width) {
if (width >= 64u) return ~0ull;
return (1ull << width) - 1ull;
diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c
@@ -5307,6 +5307,22 @@ static void opt_combine_sinks_or_preserves_producer_copy_after_rewrite(void) {
EXPECT(count_op(clobber, IR_COPY) == 1,
"producer retargeting should not cross a destination clobber");
+ Func* dst_live = new_func(&tc);
+ dst_live->opt_rewritten = 1;
+ li = ir_emit(dst_live, dst_live->entry, IR_LOAD_IMM);
+ li->opnds = arena_array(dst_live->arena, Operand, 1);
+ li->opnds[0] = op_reg_(24, tc.i32);
+ li->nopnds = 1;
+ li->extra.imm = 0;
+ emit_phys_copy(dst_live, dst_live->entry, 21, 20, tc.i32);
+ emit_phys_copy(dst_live, dst_live->entry, 20, 24, tc.i32);
+ emit_ret_val(dst_live, dst_live->entry, 20, tc.i32);
+
+ opt_combine(dst_live);
+ EXPECT(dst_live->blocks[dst_live->entry].insts[0].opnds[0].v.reg == 24 &&
+ count_op(dst_live, IR_COPY) == 2,
+ "producer retargeting should not clobber an intervening dst use");
+
Func* liveout = new_func(&tc);
liveout->opt_rewritten = 1;
u32 succ = ir_block_new(liveout);
@@ -5460,6 +5476,267 @@ static void opt_combine_copy_chains_and_convert_pairs(void) {
tc_fini(&tc);
}
+/* Helper: emit IR_LOAD with arbitrary indirect operand. */
+static Inst* tt_emit_load_phys_indirect(Func* f, u32 b, Reg dst, Operand addr,
+ CfreeCgTypeId ty) {
+ Inst* in = ir_emit(f, b, IR_LOAD);
+ in->opnds = arena_array(f->arena, Operand, 2);
+ in->opnds[0] = op_reg_(dst, ty);
+ in->opnds[1] = addr;
+ in->nopnds = 2;
+ in->extra.mem = mem_unknown_(ty, 4);
+ return in;
+}
+
+static void opt_combine_substitutes_into_indirect_base_and_index(void) {
+ TestCtx tc;
+ tc_init(&tc);
+
+ /* Base substitution: `copy r4, r2 ; load r1, [r4 + 8]` → load uses r2. */
+ Func* f = new_func(&tc);
+ f->opt_rewritten = 1;
+ emit_phys_copy(f, f->entry, 4, 2, tc.i32);
+ Operand addr = op_indirect_(4, tc.i32);
+ addr.v.ind.ofs = 8;
+ tt_emit_load_phys_indirect(f, f->entry, 1, addr, tc.i32);
+ emit_ret_val(f, f->entry, 1, tc.i32);
+
+ opt_combine(f);
+ opt_dce(f);
+ Inst* load = NULL;
+ for (u32 i = 0; i < f->blocks[f->entry].ninsts; ++i)
+ if ((IROp)f->blocks[f->entry].insts[i].op == IR_LOAD)
+ load = &f->blocks[f->entry].insts[i];
+ EXPECT(load && load->opnds[1].kind == OPK_INDIRECT &&
+ load->opnds[1].v.ind.base == 2 && load->opnds[1].v.ind.ofs == 8 &&
+ load->opnds[1].v.ind.index == (Reg)REG_NONE,
+ "single-use copy should substitute into indirect base");
+ EXPECT(count_op(f, IR_COPY) == 0,
+ "copy whose only use is an indirect base should be dead-DCE'd");
+
+ /* Index substitution: `copy r4, r2 ; load r1, [r0 + r4 * 4]` */
+ Func* g = new_func(&tc);
+ g->opt_rewritten = 1;
+ emit_phys_copy(g, g->entry, 4, 2, tc.i32);
+ Operand iaddr = op_indexed_indirect_(0, 4, 2, 0, tc.i32);
+ tt_emit_load_phys_indirect(g, g->entry, 1, iaddr, tc.i32);
+ emit_ret_val(g, g->entry, 1, tc.i32);
+
+ opt_combine(g);
+ opt_dce(g);
+ load = NULL;
+ for (u32 i = 0; i < g->blocks[g->entry].ninsts; ++i)
+ if ((IROp)g->blocks[g->entry].insts[i].op == IR_LOAD)
+ load = &g->blocks[g->entry].insts[i];
+ EXPECT(load && load->opnds[1].kind == OPK_INDIRECT &&
+ load->opnds[1].v.ind.base == 0 &&
+ load->opnds[1].v.ind.index == 2 &&
+ load->opnds[1].v.ind.log2_scale == 2,
+ "single-use copy should substitute into indirect index");
+ EXPECT(count_op(g, IR_COPY) == 0,
+ "copy whose only use is an indirect index should be dead-DCE'd");
+ tc_fini(&tc);
+}
+
+static void opt_combine_synthesizes_address_modes(void) {
+ TestCtx tc;
+ tc_init(&tc);
+
+ /* reg + reg: `add r4, r2, r3 ; load r1, [r4]` → `load r1, [r2 + r3*1]` */
+ Func* f = new_func(&tc);
+ f->opt_rewritten = 1;
+ emit_phys_binop(f, f->entry, 4, 2, 3, tc.i32, BO_IADD);
+ Operand addr = op_indirect_(4, tc.i32);
+ tt_emit_load_phys_indirect(f, f->entry, 1, addr, tc.i32);
+ emit_ret_val(f, f->entry, 1, tc.i32);
+
+ opt_combine(f);
+ opt_dce(f);
+ Inst* load = NULL;
+ for (u32 i = 0; i < f->blocks[f->entry].ninsts; ++i)
+ if ((IROp)f->blocks[f->entry].insts[i].op == IR_LOAD)
+ load = &f->blocks[f->entry].insts[i];
+ EXPECT(load && load->opnds[1].kind == OPK_INDIRECT &&
+ load->opnds[1].v.ind.base == 2 &&
+ load->opnds[1].v.ind.index == 3 &&
+ load->opnds[1].v.ind.log2_scale == 0,
+ "add reg+reg producer should synthesize base+index addr mode");
+ EXPECT(count_op(f, IR_BINOP) == 0,
+ "synthesized address mode should kill the producing add");
+
+ /* reg + imm: `add r4, r2, 16 ; load r1, [r4 + 4]` → `load r1, [r2 + 20]` */
+ Func* g = new_func(&tc);
+ g->opt_rewritten = 1;
+ Inst* add = ir_emit(g, g->entry, IR_BINOP);
+ add->opnds = arena_array(g->arena, Operand, 3);
+ add->opnds[0] = op_reg_(4, tc.i32);
+ add->opnds[1] = op_reg_(2, tc.i32);
+ add->opnds[2] = op_imm_(16, tc.i32);
+ add->nopnds = 3;
+ add->extra.imm = BO_IADD;
+ Operand a2 = op_indirect_(4, tc.i32);
+ a2.v.ind.ofs = 4;
+ tt_emit_load_phys_indirect(g, g->entry, 1, a2, tc.i32);
+ emit_ret_val(g, g->entry, 1, tc.i32);
+
+ opt_combine(g);
+ opt_dce(g);
+ load = NULL;
+ for (u32 i = 0; i < g->blocks[g->entry].ninsts; ++i)
+ if ((IROp)g->blocks[g->entry].insts[i].op == IR_LOAD)
+ load = &g->blocks[g->entry].insts[i];
+ EXPECT(load && load->opnds[1].kind == OPK_INDIRECT &&
+ load->opnds[1].v.ind.base == 2 &&
+ load->opnds[1].v.ind.ofs == 20 &&
+ load->opnds[1].v.ind.index == (Reg)REG_NONE,
+ "add reg+imm producer should fold immediate into indirect offset");
+ EXPECT(count_op(g, IR_BINOP) == 0,
+ "folded immediate offset should kill the producing add");
+
+ /* scale-from-shl chained with reg+reg add: produces scaled index. */
+ Func* h = new_func(&tc);
+ h->opt_rewritten = 1;
+ Inst* shl = ir_emit(h, h->entry, IR_BINOP);
+ shl->opnds = arena_array(h->arena, Operand, 3);
+ shl->opnds[0] = op_reg_(4, tc.i32);
+ shl->opnds[1] = op_reg_(3, tc.i32);
+ shl->opnds[2] = op_imm_(2, tc.i32);
+ shl->nopnds = 3;
+ shl->extra.imm = BO_SHL;
+ emit_phys_binop(h, h->entry, 5, 2, 4, tc.i32, BO_IADD);
+ Operand a3 = op_indirect_(5, tc.i32);
+ tt_emit_load_phys_indirect(h, h->entry, 1, a3, tc.i32);
+ emit_ret_val(h, h->entry, 1, tc.i32);
+
+ opt_combine(h);
+ opt_dce(h);
+ load = NULL;
+ for (u32 i = 0; i < h->blocks[h->entry].ninsts; ++i)
+ if ((IROp)h->blocks[h->entry].insts[i].op == IR_LOAD)
+ load = &h->blocks[h->entry].insts[i];
+ EXPECT(load && load->opnds[1].kind == OPK_INDIRECT &&
+ load->opnds[1].v.ind.base == 2 &&
+ load->opnds[1].v.ind.index == 3 &&
+ load->opnds[1].v.ind.log2_scale == 2,
+ "shl+add chain should synthesize base + scaled index across fixpoint");
+ EXPECT(count_op(h, IR_BINOP) == 0,
+ "chained add+shl producers should both die after synthesis");
+
+ /* Blocked when consumer already has an index: cannot stack two indices. */
+ Func* blk = new_func(&tc);
+ blk->opt_rewritten = 1;
+ emit_phys_binop(blk, blk->entry, 5, 2, 3, tc.i32, BO_IADD);
+ Operand a4 = op_indexed_indirect_(5, 6, 0, 0, tc.i32);
+ tt_emit_load_phys_indirect(blk, blk->entry, 1, a4, tc.i32);
+ emit_ret_val(blk, blk->entry, 1, tc.i32);
+
+ opt_combine(blk);
+ load = NULL;
+ for (u32 i = 0; i < blk->blocks[blk->entry].ninsts; ++i)
+ if ((IROp)blk->blocks[blk->entry].insts[i].op == IR_LOAD)
+ load = &blk->blocks[blk->entry].insts[i];
+ EXPECT(load && load->opnds[1].kind == OPK_INDIRECT &&
+ load->opnds[1].v.ind.base == 5 &&
+ load->opnds[1].v.ind.index == 6,
+ "addr-mode synthesis should refuse to stack two indices");
+ EXPECT(count_op(blk, IR_BINOP) == 1,
+ "blocked synthesis should leave the producing add in place");
+
+ /* base IADD reg+imm with a pre-existing index: imm-fold sub-rule (c)
+ * mutates only the offset and is safe even when index != REG_NONE.
+ * `add r5, r2, 12 ; load r1, [r5 + r6*2 + 4]` → `[r2 + r6*2 + 16]`. */
+ Func* m = new_func(&tc);
+ m->opt_rewritten = 1;
+ Inst* madd = ir_emit(m, m->entry, IR_BINOP);
+ madd->opnds = arena_array(m->arena, Operand, 3);
+ madd->opnds[0] = op_reg_(5, tc.i32);
+ madd->opnds[1] = op_reg_(2, tc.i32);
+ madd->opnds[2] = op_imm_(12, tc.i32);
+ madd->nopnds = 3;
+ madd->extra.imm = BO_IADD;
+ Operand a5 = op_indexed_indirect_(5, 6, 1, 4, tc.i32);
+ tt_emit_load_phys_indirect(m, m->entry, 1, a5, tc.i32);
+ emit_ret_val(m, m->entry, 1, tc.i32);
+
+ opt_combine(m);
+ opt_dce(m);
+ load = NULL;
+ for (u32 i = 0; i < m->blocks[m->entry].ninsts; ++i)
+ if ((IROp)m->blocks[m->entry].insts[i].op == IR_LOAD)
+ load = &m->blocks[m->entry].insts[i];
+ EXPECT(load && load->opnds[1].kind == OPK_INDIRECT &&
+ load->opnds[1].v.ind.base == 2 &&
+ load->opnds[1].v.ind.index == 6 &&
+ load->opnds[1].v.ind.log2_scale == 1 &&
+ load->opnds[1].v.ind.ofs == 16,
+ "reg+imm base producer should imm-fold even when an index is set");
+ EXPECT(count_op(m, IR_BINOP) == 0,
+ "imm-fold synthesis with existing index should kill the producing add");
+
+ tc_fini(&tc);
+}
+
+static void opt_combine_full_ext_of_ext_rules(void) {
+ TestCtx tc;
+ tc_init(&tc);
+
+ /* sext-of-sext: outer 8 bytes, inner-source 1 byte. Outer signed, inner
+ * signed. Pattern B (w2 < w with outer signed) fires; outer chains
+ * directly to the inner source. */
+ Func* a = new_func(&tc);
+ a->opt_rewritten = 1;
+ emit_convert_typed(a, a->entry, 10, 11, tc.i32, tc.i8, CV_SEXT);
+ emit_convert_typed(a, a->entry, 12, 10, tc.i64, tc.i32, CV_SEXT);
+ emit_ret_val(a, a->entry, 12, tc.i64);
+
+ opt_combine(a);
+ Inst* cv = &a->blocks[a->entry].insts[1];
+ EXPECT(cv->opnds[1].v.reg == 11,
+ "sext(sext) chain should chain to the innermost source");
+
+ /* uext-of-sext (the unsafe combination): outer unsigned, inner signed,
+ * outer wider than inner source — must be left alone (would lose
+ * sign info). */
+ Func* b = new_func(&tc);
+ b->opt_rewritten = 1;
+ emit_convert_typed(b, b->entry, 10, 11, tc.i32, tc.i8, CV_SEXT);
+ emit_convert_typed(b, b->entry, 12, 10, tc.i64, tc.i32, CV_ZEXT);
+ emit_ret_val(b, b->entry, 12, tc.i64);
+
+ opt_combine(b);
+ cv = &b->blocks[b->entry].insts[1];
+ EXPECT(cv->opnds[1].v.reg == 10,
+ "uext(sext) chain must NOT collapse (would lose sign info)");
+
+ /* sext-of-uext (mixed but safe): outer signed picks up outer signedness;
+ * collapses to inner source. */
+ Func* c = new_func(&tc);
+ c->opt_rewritten = 1;
+ emit_convert_typed(c, c->entry, 10, 11, tc.i32, tc.i8, CV_ZEXT);
+ emit_convert_typed(c, c->entry, 12, 10, tc.i64, tc.i32, CV_SEXT);
+ emit_ret_val(c, c->entry, 12, tc.i64);
+
+ opt_combine(c);
+ cv = &c->blocks[c->entry].insts[1];
+ EXPECT(cv->opnds[1].v.reg == 11,
+ "sext(uext) chain should chain to the innermost source");
+
+ /* Pattern A: outer width <= inner-source width. Outer can chain past the
+ * inner ext regardless of signedness combination. */
+ Func* d = new_func(&tc);
+ d->opt_rewritten = 1;
+ emit_convert_typed(d, d->entry, 10, 11, tc.i64, tc.i32, CV_SEXT);
+ emit_convert_typed(d, d->entry, 12, 10, tc.i32, tc.i32, CV_ZEXT);
+ emit_ret_val(d, d->entry, 12, tc.i32);
+
+ opt_combine(d);
+ cv = &d->blocks[d->entry].insts[1];
+ EXPECT(cv->opnds[1].v.reg == 11,
+ "outer width <= inner-source width should chain past inner ext");
+
+ tc_fini(&tc);
+}
+
static void opt_dce_physical_dead_defs(void) {
TestCtx tc;
tc_init(&tc);
@@ -6709,9 +6986,12 @@ int main(void) {
opt_regalloc_spill_requires_scratch();
opt_combine_spill_peeps();
opt_combine_single_use_copy_and_imm();
- opt_combine_preserves_producer_copy_after_rewrite();
+ opt_combine_sinks_or_preserves_producer_copy_after_rewrite();
opt_combine_keeps_unsafe_and_multiuse_defs();
opt_combine_copy_chains_and_convert_pairs();
+ opt_combine_substitutes_into_indirect_base_and_index();
+ opt_combine_synthesizes_address_modes();
+ opt_combine_full_ext_of_ext_rules();
opt_dce_physical_dead_defs();
opt_dead_def_keeps_observable_loads();
opt_dead_def_elim_test();