commit b946ca4de6884fcfbe3285ff5e6154ebaf0aadc2
parent eff3401d60b7db0ab8de3af4874fd5600fdcba68
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 28 May 2026 06:39:15 -0700
opt: O1 codegen batch — loop-imm hoist, ABI aliasing hints, slim prologue, IMUL strength-reduce, call copy-prop
New pass `opt_lower_loop_imm_operands` (pass_lower_loop_imm.c): before
`opt_hoist_loop_consts` runs, walk loop-body instructions and rewrite
non-foldable inline imm operands (those the backend cannot fold via
`imm_legal`) to a fresh `IR_LOAD_IMM` + `OPK_REG` use. The existing
hoister then lifts them to the entry block. Zero immediates and
backend-foldable immediates are left in place (they produce no loop
materialize). Wired into `opt_run_o1_native` with a verify checkpoint.
pass_simplify: canonicalize commutative integer binops (IADD, IMUL, AND,
OR, XOR) to put an inline OPK_IMM on the rhs. Both the loop-imm hoister
and the aa64 IMUL strength reducer inspect only opnds[2]; this ensures
they see the constant when the front-end happens to emit it as the lhs.
pass_combine: extend substitution to two previously-skipped sites:
- IR_CALL aux operands (plan/desc callee + args): copies feeding a call
argument or the callee operand are now propagated through, reducing
pre-call mov chains.
- Indirect base/index on store opcodes (IR_STORE, IR_ATOMIC_STORE,
IR_BITFIELD_STORE, IR_AGG_COPY, IR_AGG_SET): address-register copies
into store address slots are now substituted (only the address changes,
not the value — safe for all store shapes).
pass_lower: add `apply_abi_aliasing_hints`. Sets a soft
`preferred_hard_reg` hint (new i8 field in OptPRegInfo, -1 = none) on
IR_CALL result PRegs and IR_RET value PRegs toward the ABI return
register (x0 on aa64). `hard_reg_alloc_score` adds +1 to any non-hinted
choice; the allocator also tries the hinted reg outside the standard
allocable set (e.g. x0 reserved on aa64) using `opt_ranges_overlap_kind`
for precise unit-length-overlap allowance. When the allocator lands the
result directly in x0, emit_call / emit_ret elide the materialization
move.
aa64 slim prologue / epilogue (native.c):
- Tier A (slim_prologue): no callee-saves, no alloca, no body spills
(cum_off == AA_FRAME_SAVE_SIZE), no outgoing stack args → 2-insn
prologue `stp x29,x30,[sp,#-16]! ; mov x29,sp` and 1-insn epilogue
`ldp x29,x30,[sp],#16`. CFI switches to SP-anchored CFA (sp+16).
- Small-frame fast path (slim_small_frame): frame_size ≤ 520+16, no
alloca → prologue uses `stp x29,x30,[sp,#N-16]` directly (skips the
x17 scratch), epilogue uses `ldp x29,x30,[sp,#N-16] ; add sp,sp,#N`
(skips the x10 scratch). Applies to almost every non-Tier-A function
with a small frame.
- Post-indexed STP/LDP encoder/decoder (`aa64_stp64_post`,
`aa64_ldp64_post`, AA64_FMT_LDSTP_POST) added to isa.h/isa.c.
aa64 IMUL strength reduction (native.c): `aa64_imul_strength_reducible`
recognizes mul-by-constant shapes that map to a single non-mul
instruction: 0, ±1, ±2^k, 2^k+1, 1−2^k. `aa_emit_mul_const_imm` emits
the corresponding mov/neg/lsl/add-lsl/sub-lsl. Wired into `aa_imm_legal`
(NATIVE_IMM_BINOP/BO_IMUL) and `aa_binop` so the optimizer keeps the
constant inline and the emitter strength-reduces it.
Infrastructure:
- `block_insert_at` renamed to `opt_block_insert_at` (promoted to
opt_internal.h) so pass_lower_loop_imm can reuse it.
- `ranges_overlap_kind` promoted to `opt_ranges_overlap_kind` (declared
in opt_internal.h) for use in the hint-aware allocator path.
doc/OPT_O1_PERF_TODO.md: refresh standings with 3-run numbers; add
disassembly walkthrough of the redundant-mov pattern driving binary-trees
gap; update per-benchmark notes to reflect the lists/hash2/sieve movement.
Diffstat:
14 files changed, 922 insertions(+), 68 deletions(-)
diff --git a/doc/OPT_O1_PERF_TODO.md b/doc/OPT_O1_PERF_TODO.md
@@ -19,46 +19,162 @@ reference exists).
## Current standings
-Numbers below are from a single-run sweep (`COMPILE_REPEATS=1`,
-`RUN_REPEATS=1`), runtime in ms; speedup = reference_time / cfree_time
-(>1 means cfree is faster). The four entries here were chosen because their
-gaps are large enough to be real signal rather than single-run noise.
+Numbers below are from a 3-run sweep on aarch64/Apple (`COMPILE_REPEATS=3`,
+`RUN_REPEATS=3`, best-of), runtime in ms; speedup = reference_time / cfree_time
+(>1 means cfree is faster).
| bench | cfree -O1 | gcc -O0 | vs gcc-O0 | mir -O1 | vs mir-O1 | behind |
| --- | ---: | ---: | ---: | ---: | ---: | --- |
-| binary-trees | 3238 | 2562 | **0.79×** (slower) | n/a¹ | — | gcc |
-| lists | 8417 | 8583 | 1.02× | 4907 | **0.58×** | gcc, mir |
-| hash2 | 5965 | 7110 | 1.19× ✓ | 3765 | **0.63×** | mir |
-| sieve | 5066 | 4977 | 0.98× | 4107 | **0.81×** | gcc, mir |
+| binary-trees | 3106 | 2634 | **0.85×** (slower) | n/a¹ | — | gcc |
+| lists | 5243 | 8817 | 1.68× ✓ | 4978 | 0.95× | mir |
+| hash2 | 5372 | 7365 | 1.37× ✓ | 3841 | **0.72×** | mir |
+| sieve | 4991 | 5023 | 1.01× | 4006 | **0.80×** | gcc (~tied), mir |
¹ mir-c2m fails to compile `binary-trees` in our setup, so only the gcc
comparison applies there.
+Movement since the previous refresh (single-run numbers):
+- `lists` went from 0.58× to 0.95× vs mir and 1.02× to 1.68× vs gcc — the
+ recent O1 codegen wins + tiny-function inliner closed most of the gap. Still
+ not 10% past mir, but no longer the worst case.
+- The other three are roughly unchanged; the bar still hasn't been cleared
+ against mir on hash2/sieve, and binary-trees is still slower than gcc-O0.
+
## Per-benchmark notes
-### binary-trees — slower than unoptimized gcc (highest priority)
-The only case where cfree `-O1` is *slower than gcc -O0* (0.79×). Workload is
-allocation- and pointer-chasing-heavy (recursive tree build/walk). Likely
-suspects: call-heavy inner loops, per-node field access codegen, and whatever
-we emit around the allocator calls. Start here.
-
-### lists — 0.58× vs mir
-Matches gcc-O0 (1.02×) but loses badly to mir. Doubly-linked-list traversal and
-splice: lots of `p->next`/`p->prev` loads/stores in tight loops. The recently
-fixed O1 store-address clobber bug lived in this same code shape; correctness is
-now fine, but the field-access/loop codegen is still far behind mir. Compare our
-loop bodies against mir's to find the gap.
-
-### hash2 — 0.63× vs mir
-Clears the gcc bar (1.19×) but is the second-worst against mir. Open-addressing
-hash probing — integer hashing arithmetic plus indexed loads in the probe loop.
-Look at index/address computation and whether the probe loop carries redundant
-materialization.
-
-### sieve — 0.81× vs mir, ~tied with gcc
-Behind both. Classic nested sieve loop over a byte/array — bounded counted loops
-with strided stores. Suggests loop/IV and array-addressing codegen headroom
-(strength reduction, address-mode folding in the inner store).
+### binary-trees — slower than unoptimized gcc (still highest priority)
+The only case where cfree `-O1` is *slower than gcc -O0* (0.85×). Workload is
+recursive tree build/walk: tiny leaf-ish functions called millions of times
+plus `malloc`/`free` on each node. Most wall-clock is in `malloc`; the
+compiler-visible slack lives in the per-call overhead. Inspecting `ItemCheck`,
+`BottomUpTree`, `DeleteTree`, `NewTreeNode` shows ~4–6 redundant moves per
+recursive call site:
+
+```
+ItemCheck (inner recursive call):
+ ldr x8, [x19]
+ mov x11, x8 <-- redundant copy
+ mov x0, x11 <-- could be `mov x0, x8` straight from the ldr
+ bl ItemCheck
+ mov x8, x0 <-- materialize the return into the SSA reg
+ movz x9, 0x1 <-- the `1 +` constant, not hoisted (only 1 use, fine)
+ add x20, x9, x8
+```
+
+Drivers for the gap, in order:
+1. **Copy coalescing leaves intermediate `mov` chains** at call boundaries:
+ `mov xN, xM; mov x0, xN` instead of folding through. Every recursive call
+ in `ItemCheck`/`BottomUpTree`/`DeleteTree` has at least one such redundant
+ pair. With ~7.6M recursive calls (depth=19) this is tens of millions of
+ wasted ops.
+2. **Standard 9-insn prologue/epilogue** (`sub sp; add x17; stp x29,x30;
+ add x29` + mirror on exit) on tiny non-leaf functions like `NewTreeNode`.
+ Leaf detection + skipping FP save/restore where the callee makes no calls
+ would help (`ItemCheck` recurses so it's not a leaf; `NewTreeNode` *does*
+ call malloc, also not a leaf — but neither needs the FP-frame setup we
+ currently emit).
+3. **`NewTreeNode` spills x19/x20** as callee-saves only to ferry the two
+ incoming args across one `malloc` call — pure overhead vs keeping the
+ live values in caller-save scratch + reload from a small spill slot.
+
+This is where the biggest absolute wall-clock win is, even though the per-op
+codegen looks roughly equivalent to gcc -O0.
+
+### hash2 — 0.72× vs mir
+Clears the gcc bar (1.37×) but is the worst against mir. The hot loop is
+`ht_hashcode`:
+
+```
+0xc88: movz x9, 0x5 <-- loop-invariant constant, NOT hoisted
+0xc8c: mul x14, x9, x11 <-- could be `add x14, x11, x11, lsl #2` (5*v)
+0xc90: ldrb w11, [x8]
+0xc94: sxtb x13, w11
+0xc98: add x11, x13, x14
+0xc9c: add x8, x8, #1
+0xca0: ldrb w13, [x8]
+0xca4: cmp w13, #0
+0xca8: b.ne 0xc88
+```
+
+Two clear wins:
+1. **Loop-invariant immediate operand not hoisted.** The IR carries the `5`
+ as an inline `imm:` operand on the `binop mul`; `opt_machinize_native`
+ leaves it there, the emitter materializes it with `movz x9, 0x5` inside
+ the loop on every iteration. `opt_hoist_loop_consts` only hoists explicit
+ `IR_LOAD_IMM` defs (see `src/opt/pass_addr_fold.c:835`), so this never
+ becomes a hoist candidate.
+
+ Fix sketch: between `opt_machinize_native` and `opt_hoist_loop_consts`,
+ add a pass that lowers non-zero immediate operands on machine-irrelevant
+ positions (mul/add/sub second operand outside imm-fold range, store value
+ operand) to `IR_LOAD_IMM` + reg-use. Then the existing hoist pass picks
+ them up. Same fix unblocks sieve (see below).
+2. **`mul x14, x9, x11` for a constant 5×** — strength-reduce small-integer
+ multiplications to `add x, y, y, lsl #N` / `sub` sequences for power-of-two
+ neighbours. Independent of the hoist fix.
+
+### sieve — 0.80× vs mir, ~tied with gcc
+Behind both. Two hot inner loops; both leave clear codegen on the table.
+
+Init loop (`flags[i] = 1`):
+```
+0x2ac: mov x13, x8 <-- coalesce miss (copy of the IV)
+0x2b0: movz w9, 0x1 <-- loop-invariant, NOT hoisted (same as hash2)
+0x2b4: strb w9, [x19, x13]
+0x2b8: add x8, x8, #1
+0x2bc: cmp x8, #2, lsl #12
+0x2c0: b.le 0x2ac
+```
+
+Mark-multiples loop (`flags[k] = 0`):
+```
+0x2ec: mov x14, x13 <-- coalesce miss; could just use x13 for the index
+0x2f0: strb wzr, [x19, x14] (strb wzr is already good — zero-source fast path
+0x2f4: add x13, x13, x8 fires here)
+0x2f8: cmp x13, #2, lsl #12
+0x2fc: b.le 0x2ec
+```
+
+So:
+1. **Same imm-operand hoist gap** as hash2 — the inline `imm:1` on `store`
+ becomes a fresh `movz w9, 0x1` per iteration. The existing zero-source
+ fast path (`pass_native_emit.c:790`) handles the strb-zero case in the
+ second loop, which is why only the init loop has the `movz`.
+2. **Copy not coalesced for the IV/index** in both loops. The `mov x13, x8`
+ and `mov x14, x13` are pure register-allocator copies that the post-RA
+ passes don't clean up. Worth tracing what `opt_lower_to_mir` + the post-RA
+ jump-cleanup leaves; these moves likely come from how the IR copy ops in
+ block 4 of `sieve_min`'s IR survive RA (`copy def=v9 opnds=[v14,v8]` +
+ `copy def=v10 opnds=[v13,v12]` — two copies feeding one store).
+
+### lists — 0.95× vs mir (no longer bleeding)
+Last refresh was 0.58×; current 0.95×. Now within shot of the 1.10× bar.
+Same imm-hoist + copy-coalesce wins should close the remaining ~15%.
+
+## Cross-cutting fixes (would help multiple benches at once)
+
+These are the two themes that recur across the four benches above:
+
+1. **Hoist loop-invariant *immediate operands*, not just `IR_LOAD_IMM`.**
+ `opt_hoist_loop_consts` in `src/opt/pass_addr_fold.c` only sees defs of
+ `IR_LOAD_IMM`. Constants that arrive at emit as inline `imm:` operands on
+ `binop`/`store`/etc. are re-materialized by the emitter inside the loop
+ every iteration (sieve-init, hash2-hashcode). A pre-hoist lowering pass
+ that walks loop-body insts, takes their non-foldable imm operands, and
+ converts them to `IR_LOAD_IMM + reg use` would let the existing hoist
+ pass do the rest.
+2. **Copy coalescing leaves redundant `mov xN, xM` artifacts.** Seen in
+ sieve (both loops, IV/index copy), binary-trees (every recursive call
+ site has at least one redundant `mov` pair), and likely in lists. Most
+ of these are SSA-level `copy` ops that survive into machine code because
+ post-RA copy elimination is doing less than it could. Worth a focused
+ look at what `opt_mir_combine` / `opt_mir_dce` see for these inputs.
+
+Targeted secondary wins:
+- Strength-reduce small-integer multiplications by constants to `add`/`sub`
+ + shifted-register forms (hash2 `5 *`).
+- Slimmer prologue/epilogue for functions that don't need FP-frame setup
+ (binary-trees).
## Reproducing
@@ -66,11 +182,11 @@ with strided stores. Suggests loop/IV and array-addressing codegen headroom
# Build the optimized compiler first (clean release):
rm -rf build/release && make RELEASE=1 bin
-# Run just these four, single-run, O0+O1, gcc + cfree + mir:
+# Run just these four with 3 repeats (best-of), O0+O1, gcc + cfree + mir:
CFREE="$PWD/build/release/cfree" \
CFREE_OPT_BENCHES="binary-trees lists hash2 sieve" \
CFREE_OPT_BENCH_LEVELS="0 1" \
-CFREE_OPT_BENCH_COMPILE_REPEATS=1 CFREE_OPT_BENCH_RUN_REPEATS=1 \
+CFREE_OPT_BENCH_COMPILE_REPEATS=3 CFREE_OPT_BENCH_RUN_REPEATS=3 \
CFREE_OPT_BENCH_SKIP_GCC=0 CFREE_OPT_BENCH_SKIP_CLANG=1 CFREE_OPT_BENCH_SKIP_MIR=0 \
bash scripts/opt_bench.sh
```
@@ -82,10 +198,11 @@ disassembly of the hot function.
## Notes / caveats
-- Single-run timings: the four here were picked for gap size, but re-run with
- repeats before/after a change to confirm movement (especially anything that
- lands near the 1.0× line, e.g. `sieve`).
-- Not on this list but also marginal (within single-run noise, 0.92–1.05×):
- `hash`, `funnkuch-reduce`, `strcat`. Revisit once the four above improve.
+- Numbers above are best-of-3; re-run with the same `COMPILE_REPEATS=3
+ RUN_REPEATS=3` after a change to confirm movement. Anything near 1.0× (e.g.
+ `sieve` vs gcc) is within noise and should be confirmed with a separate
+ re-run.
+- Not on this list but worth revisiting once the four above improve: `hash`,
+ `funnkuch-reduce`, `strcat` (previously marginal at single-run).
- `cfree-run` (JIT) shares this codegen, so its runtimes track `cfree cc -O1`;
fixing these helps both paths.
diff --git a/src/arch/aa64/isa.c b/src/arch/aa64/isa.c
@@ -339,6 +339,20 @@ const AA64InsnDesc aa64_insn_table[] = {
AA64_ASMFL_SF1,
{0, 0}},
+ /* ----- Load/store pair, post-indexed (opc=10 X / opc=01 D) ----- */
+ {MN("stp"),
+ 0xA8800000u,
+ 0xFFC00000u,
+ AA64_FMT_LDSTP_POST,
+ AA64_ASMFL_SF1,
+ {0, 0}},
+ {MN("ldp"),
+ 0xA8C00000u,
+ 0xFFC00000u,
+ AA64_FMT_LDSTP_POST,
+ AA64_ASMFL_SF1,
+ {0, 0}},
+
/* ----- Unconditional branch (immediate) ----- */
{MN("b"), 0x14000000u, 0xFC000000u, AA64_FMT_BR_IMM, 0, {0, 0}},
{MN("bl"), 0x94000000u, 0xFC000000u, AA64_FMT_BR_IMM, 0, {0, 0}},
@@ -675,6 +689,26 @@ static void print_ldstp_soff(StrBuf* sb, u32 w) {
print_ldstp_common(sb, aa64_ldstp_soff_unpack(w), /*pre=*/0);
}
+/* Post-indexed reuses the pre/soff field layout (same encoder struct). The
+ * common printer renders `[Rn]` (no offset inside brackets) when pre=0; the
+ * "#imm, post" form is rendered by appending after the closing bracket. */
+static void print_ldstp_post(StrBuf* sb, u32 w) {
+ AA64LdStPPre f = aa64_ldstp_pre_unpack(w); /* same field layout */
+ i64 scale = (f.V == 1) ? (f.opc == 0 ? 4 : (f.opc == 1 ? 8 : 16))
+ : (f.opc == 2 ? 8 : 4);
+ /* Render Rt, Rt2, [Rn], #imm — same prefix as the soff form, then a
+ * post-bracket displacement. Reuse the common printer for the prefix
+ * by zeroing imm7 in a copy so it omits the `, #imm` inside `[..]`. */
+ AA64LdStPPre stripped = f;
+ stripped.imm7 = 0;
+ print_ldstp_common(sb, stripped, /*pre=*/0);
+ i64 byte_off = sext((u64)f.imm7, 7) * scale;
+ if (byte_off) {
+ strbuf_puts(sb, ", #");
+ strbuf_put_i64(sb, byte_off);
+ }
+}
+
static void print_br_imm(StrBuf* sb, u32 w, u64 vaddr) {
AA64BrImm f = aa64_brimm_unpack(w);
i64 ofs = sext((u64)f.imm26, 26) * 4;
@@ -825,6 +859,9 @@ void aa64_print_operands(StrBuf* sb, const AA64InsnDesc* desc, u32 word,
case AA64_FMT_LDSTP_SOFF:
print_ldstp_soff(sb, word);
break;
+ case AA64_FMT_LDSTP_POST:
+ print_ldstp_post(sb, word);
+ break;
case AA64_FMT_LDST_SIMM9:
print_ldst_simm9(sb, word, desc);
break;
diff --git a/src/arch/aa64/isa.h b/src/arch/aa64/isa.h
@@ -48,6 +48,7 @@ typedef enum AA64Format {
AA64_FMT_LDST_UIMM, /* load/store, unsigned 12-bit immediate offset */
AA64_FMT_LDSTP_PRE, /* load/store pair, pre-indexed */
AA64_FMT_LDSTP_SOFF, /* load/store pair, signed-offset */
+ AA64_FMT_LDSTP_POST, /* load/store pair, post-indexed */
AA64_FMT_LDST_SIMM9, /* load/store, unscaled 9-bit signed offset
(LDUR / STUR, V=0 and V=1) */
AA64_FMT_BR_IMM, /* unconditional branch (immediate) — B / BL */
@@ -839,6 +840,37 @@ static inline u32 aa64_ldp64_pre(u32 Rt, u32 Rt2, u32 Rn, i32 imm7_scaled) {
.Rt = Rt});
}
+/* Post-indexed STP/LDP — same field layout as the pre-indexed form, only
+ * bits[25:23] differ (001 vs 011); reuse AA64LdStPPre. Used for the slim
+ * prologue's epilogue restore: `ldp x29,x30,[sp],#16`. */
+#define AA64_LDSTP_POST_FAMILY_MATCH 0x28800000u
+#define AA64_LDSTP_POST_FAMILY_MASK 0x7FC00000u /* bits 30:23 */
+
+static inline u32 aa64_ldstp_post_pack(AA64LdStPPre f) {
+ return ((f.opc & 3u) << 30) | AA64_LDSTP_POST_FAMILY_MATCH |
+ ((f.V & 1u) << 26) | ((f.L & 1u) << 22) | ((f.imm7 & 0x7fu) << 15) |
+ ((f.Rt2 & 0x1fu) << 10) | ((f.Rn & 0x1fu) << 5) | (f.Rt & 0x1fu);
+}
+
+static inline u32 aa64_stp64_post(u32 Rt, u32 Rt2, u32 Rn, i32 imm7_scaled) {
+ return aa64_ldstp_post_pack((AA64LdStPPre){.opc = 2,
+ .V = 0,
+ .L = 0,
+ .imm7 = (u32)imm7_scaled & 0x7fu,
+ .Rt2 = Rt2,
+ .Rn = Rn,
+ .Rt = Rt});
+}
+static inline u32 aa64_ldp64_post(u32 Rt, u32 Rt2, u32 Rn, i32 imm7_scaled) {
+ return aa64_ldstp_post_pack((AA64LdStPPre){.opc = 2,
+ .V = 0,
+ .L = 1,
+ .imm7 = (u32)imm7_scaled & 0x7fu,
+ .Rt2 = Rt2,
+ .Rn = Rn,
+ .Rt = Rt});
+}
+
/* ====================================================================
* Hint instructions (NOP / YIELD / WFE / WFI / SEV / SEVL)
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -122,6 +122,21 @@ typedef struct AANativeTarget {
AACalleeSave callee_saves[AA_MAX_CALLEE_SAVES];
u32 ncallee_saves;
+
+ /* Set at func_end when this function qualifies for the slim prologue/epilogue
+ * (Tier A: no body locals/spills, no callee-saves, no alloca, no outgoing
+ * stack args, no sret/variadic). When set, the prologue patch and epilogue
+ * emit a 2-insn `stp x29,x30,[sp,#-16]! ; mov x29,sp` and matching `ldp
+ * x29,x30,[sp],#16 ; ret` instead of the fat 4+3-insn FP-frame form. */
+ u8 slim_prologue;
+ /* Set at func_end when frame_size - 16 fits stp's signed 7-bit scaled
+ * immediate (frame_size <= 520). Skips the `add x17, sp, #(N-16)` scratch
+ * materialization in the prologue (stp x29,x30,[sp,#N-16] instead) and
+ * the matching `add x10, fp, #0` in the epilogue (ldp x29,x30,[sp,#N-16]
+ * + add sp,sp,#N). Mutually exclusive with `slim_prologue` (Tier A wins
+ * when both would apply). Applies to almost every function with a small
+ * frame, including those with callee-saves and locals. */
+ u8 slim_small_frame;
} AANativeTarget;
static AANativeTarget* aa_of(NativeTarget* t) { return (AANativeTarget*)t; }
@@ -674,19 +689,65 @@ static int aa_addr_legal(NativeTarget* t, const NativeAddr* addr,
return addr->log2_scale == sz;
}
+/* True if `mul Rd, Rn, #c` can be replaced by a single non-mul aarch64
+ * instruction using only Rn as a source (no extra scratch reg). Constants
+ * that match: 0, 1, -1, +/-2^k, 2^k+1, 1-2^k for k in [1..width-1]. The
+ * shift exponent must fit imm6 for the operand width (width = 32 if !sf
+ * else 64). The emit side is aa_emit_mul_const_imm. */
+static int aa64_imul_strength_reducible(u32 sf, i64 imm) {
+ u32 max_sh = sf ? 63u : 31u;
+ u64 a;
+ if (imm == 0 || imm == 1 || imm == -1) return 1;
+ /* +2^k */
+ a = (u64)imm;
+ if (imm > 0 && (a & (a - 1u)) == 0u) {
+ u32 k = (u32)__builtin_ctzll(a);
+ return k <= max_sh;
+ }
+ /* -2^k */
+ if (imm < 0) {
+ a = (u64)(-imm);
+ if (a && (a & (a - 1u)) == 0u) {
+ u32 k = (u32)__builtin_ctzll(a);
+ return k >= 1u && k <= max_sh;
+ }
+ }
+ /* 2^k + 1 (k >= 1, so c >= 3) */
+ if (imm >= 3) {
+ u64 m = (u64)(imm - 1);
+ if ((m & (m - 1u)) == 0u) {
+ u32 k = (u32)__builtin_ctzll(m);
+ return k >= 1u && k <= max_sh;
+ }
+ }
+ /* 1 - 2^k (k >= 1, so c <= -1) */
+ if (imm <= -1) {
+ u64 m = (u64)(1 - imm);
+ if (m && (m & (m - 1u)) == 0u) {
+ u32 k = (u32)__builtin_ctzll(m);
+ return k >= 1u && k <= max_sh;
+ }
+ }
+ return 0;
+}
+
/* Which constant operands the backend can fold directly into an instruction
* (so the optimizer can leave them as immediates instead of materializing a
- * register). Currently: add/sub/cmp 12-bit immediates (optionally <<12), and
- * any value for a plain register move (movz/movk synthesizes it). */
+ * register). Currently: add/sub/cmp 12-bit immediates (optionally <<12),
+ * any value for a plain register move (movz/movk synthesizes it), and
+ * strength-reducible mul constants (handled in aa_binop via shift / shifted
+ * add or sub). */
static int aa_imm_legal(NativeTarget* t, NativeImmUse use, u32 op,
CfreeCgTypeId type, i64 imm) {
u32 imm12, sh;
- (void)t;
- (void)type;
switch (use) {
case NATIVE_IMM_BINOP:
if ((BinOp)op == BO_IADD || (BinOp)op == BO_ISUB)
return aa64_addsub_imm_fits(imm < 0 ? -imm : imm, &imm12, &sh);
+ if ((BinOp)op == BO_IMUL) {
+ u32 sf = type_size32(t, type) == 8u ? 1u : 0u;
+ return aa64_imul_strength_reducible(sf, imm);
+ }
return 0;
case NATIVE_IMM_CMP:
/* cmp lowers to subs #imm12; cmn (negative) is not wired, so require a
@@ -825,6 +886,8 @@ static void aa_func_begin(NativeTarget* t, const CGFuncDesc* fd) {
a->ntail_sites = 0;
a->nalloca_patches = 0;
a->ncallee_saves = 0;
+ a->slim_prologue = 0;
+ a->slim_small_frame = 0;
mc->set_section(mc, fd->text_section_id);
mc->emit_align(mc, 4, 0);
a->func_start = mc->pos(mc);
@@ -984,6 +1047,27 @@ static void aa_words_saved_pair_addr(AANativeTarget* a, u32* words, u32 cap,
static void aa_words_restore_frame(AANativeTarget* a, u32* words, u32 cap,
u32* n, u32 frame_size) {
if (!frame_size) return;
+ if (a->slim_prologue) {
+ if (*n + 1u > cap) aa_panic(a, "instruction patch too small");
+ /* `ldp x29, x30, [sp], #16` — pops the saved pair and restores sp. */
+ words[(*n)++] = aa64_ldp64_post(AA_FP, AA_LR, AA_SP, 2);
+ return;
+ }
+ if (a->slim_small_frame) {
+ /* `ldp x29,x30,[sp,#N-16] ; add sp,sp,#N` — skip the `add x10,fp,#0`
+ * scratch the fat path uses. Restoring fp,lr through sp+offset avoids
+ * the scratch entirely; the subsequent `add sp` then unwinds the frame
+ * without depending on the (now-clobbered) old fp. */
+ u32 save_off = frame_size - AA_FRAME_SAVE_SIZE;
+ u32 imm12, sh;
+ if (*n + 2u > cap) aa_panic(a, "instruction patch too small");
+ words[(*n)++] =
+ aa64_ldp64_soff(AA_FP, AA_LR, AA_SP, (i32)(save_off / 8u));
+ if (!aa64_addsub_imm_fits(frame_size, &imm12, &sh))
+ aa_panic(a, "slim_small_frame: frame_size out of addsub imm range");
+ words[(*n)++] = aa64_add_imm(1, AA_SP, AA_SP, imm12, sh);
+ return;
+ }
if (*n + 3u > cap) aa_panic(a, "instruction patch too small");
words[(*n)++] = aa64_add_imm(1, AA_TMP0, AA_FP, 0, 0);
words[(*n)++] = aa64_ldp64_soff(AA_FP, AA_LR, AA_TMP0, -2); /* fp,lr @ -16 */
@@ -1030,10 +1114,29 @@ static u32 aa_build_prologue_words(AANativeTarget* a, u32 frame_size, u32* words
u32 cap) {
u32 n = 0;
if (!frame_size) return 0;
+ if (a->slim_prologue) {
+ if (cap < 2u) aa_panic(a, "prologue too large");
+ /* `stp x29, x30, [sp, #-16]!` — push the saved pair and adjust sp in
+ * one instruction. `mov x29, sp` keeps the AAPCS64 backtrace chain
+ * intact (some unwinders walk x29 directly rather than via DWARF). */
+ words[n++] = aa64_stp64_pre(AA_FP, AA_LR, AA_SP, -2);
+ words[n++] = aa64_add_imm(1, AA_FP, AA_SP, 0, 0);
+ return n;
+ }
aa_words_sub_sp_frame(a, words, cap, &n, frame_size);
- aa_words_saved_pair_addr(a, words, cap, &n, frame_size);
- if (n >= cap) aa_panic(a, "prologue too large");
- words[n++] = aa64_stp64_soff(AA_FP, AA_LR, AA_TMP1, 0); /* fp,lr @ [x17] */
+ if (a->slim_small_frame) {
+ /* `stp x29, x30, [sp, #(frame_size-16)]` — skip the `add x17, sp, #N-16`
+ * scratch step the fat path emits. Valid when (frame_size - 16) fits the
+ * stp signed-7-bit scaled immediate (i.e. frame_size <= 520). */
+ u32 save_off = frame_size - AA_FRAME_SAVE_SIZE;
+ if (n >= cap) aa_panic(a, "prologue too large");
+ words[n++] =
+ aa64_stp64_soff(AA_FP, AA_LR, AA_SP, (i32)(save_off / 8u));
+ } else {
+ aa_words_saved_pair_addr(a, words, cap, &n, frame_size);
+ if (n >= cap) aa_panic(a, "prologue too large");
+ words[n++] = aa64_stp64_soff(AA_FP, AA_LR, AA_TMP1, 0); /* fp,lr @ [x17] */
+ }
aa_words_frame_ptr_from_sp(a, words, cap, &n, frame_size);
/* Save callee-saved registers the allocator used, FP-relative. Their slots
* were reserved first (aa_reserve_callee_saves), so offsets fit stur's
@@ -1086,6 +1189,20 @@ static void aa_emit_prologue(NativeTarget* t) {
static void aa_emit_restore_frame(AANativeTarget* a, u32 frame_size) {
MCEmitter* mc = a->base.mc;
if (!frame_size) return;
+ if (a->slim_prologue) {
+ aa_emit32(mc, aa64_ldp64_post(AA_FP, AA_LR, AA_SP, 2));
+ return;
+ }
+ if (a->slim_small_frame) {
+ u32 save_off = frame_size - AA_FRAME_SAVE_SIZE;
+ u32 imm12, sh;
+ aa_emit32(mc,
+ aa64_ldp64_soff(AA_FP, AA_LR, AA_SP, (i32)(save_off / 8u)));
+ if (!aa64_addsub_imm_fits(frame_size, &imm12, &sh))
+ aa_panic(a, "slim_small_frame: frame_size out of addsub imm range");
+ aa_emit32(mc, aa64_add_imm(1, AA_SP, AA_SP, imm12, sh));
+ return;
+ }
aa_emit32(mc, aa64_add_imm(1, AA_TMP0, AA_FP, 0, 0));
aa_emit32(mc, aa64_ldp64_soff(AA_FP, AA_LR, AA_TMP0, -2)); /* fp,lr @ -16 */
aa_emit32(mc, aa64_add_imm(1, AA_SP, AA_TMP0, 0, 0));
@@ -1144,6 +1261,29 @@ static void aa_func_end(NativeTarget* t) {
* frame-size immediates are only final now, so patch the region in place. */
u32 prologue_region =
t->emit_minimal_prologue ? a->minimal_prologue_words : AA_PROLOGUE_WORDS;
+ /* Slim Tier A eligibility (set before emitting the epilogue / patching the
+ * prologue so the *_restore_frame / *_build_prologue_words helpers pick the
+ * slim form). Conditions: no callee-saves needed, no alloca, no body
+ * locals/spills (cum_off untouched past the reserved fp/lr save area), no
+ * outgoing stack args, and only on the optimizer path (the NDT reserves a
+ * much larger prologue region and isn't on the bench path). sret/variadic
+ * disqualify naturally because their entry-save slots advance cum_off. */
+ a->slim_prologue =
+ t->emit_minimal_prologue && a->ncallee_saves == 0 &&
+ a->nalloca_patches == 0 && a->cum_off == AA_FRAME_SAVE_SIZE &&
+ a->max_outgoing == 0;
+ /* Universal small-frame fast path: skip the x17/x10 scratch when the
+ * saved-pair offset (frame_size - 16) fits stp's signed 7-bit scaled
+ * immediate. Mutually exclusive with the Tier A slim form (Tier A is
+ * strictly tighter — 2-insn prologue, 1-insn restore). Disqualify alloca:
+ * alloca dynamically moves sp during the body, and the fat epilogue's
+ * `add sp, fp, #0` (via x10) is what restores sp from fp. The slim
+ * epilogue's `add sp, sp, #N` only undoes the static frame, leaving sp
+ * pointing into the alloca area. */
+ a->slim_small_frame =
+ !a->slim_prologue && a->nalloca_patches == 0 &&
+ frame_size >= AA_FRAME_SAVE_SIZE &&
+ (frame_size - AA_FRAME_SAVE_SIZE) <= 504u;
mc->label_place(mc, a->epilogue_label);
aa_emit_callee_restores(a);
aa_emit_restore_frame(a, frame_size);
@@ -1153,7 +1293,14 @@ static void aa_func_end(NativeTarget* t) {
aa_patch_tail_sites(a, frame_size);
if (mc->cfi_set_next_pc_offset && mc->cfi_def_cfa && mc->cfi_offset) {
mc->cfi_set_next_pc_offset(mc, prologue_region * 4u);
- mc->cfi_def_cfa(mc, AA_FP, 0);
+ if (a->slim_prologue) {
+ /* After `stp x29,x30,[sp,#-16]!; mov x29,sp`: CFA = sp + 16, fp at
+ * CFA-16, lr at CFA-8. SP-anchored stays correct for the entire body
+ * since slim Tier A has no further sp moves. */
+ mc->cfi_def_cfa(mc, AA_SP, 16);
+ } else {
+ mc->cfi_def_cfa(mc, AA_FP, 0);
+ }
mc->cfi_offset(mc, AA_FP, -16);
mc->cfi_offset(mc, AA_LR, -8);
}
@@ -1482,6 +1629,85 @@ static void aa_set_bytes(NativeTarget* t, NativeAddr dst, NativeLoc byte_value,
aa_store_native(t, aa_addr_plus(dst, off), byte, mem);
}
+static void aa_lsl_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn, u32 sh);
+
+/* Strength-reduce `mul rd, rn, #imm` for the constants accepted by
+ * aa64_imul_strength_reducible into a single non-mul instruction. Callers
+ * must gate on aa64_imul_strength_reducible(sf, imm) — this routine panics
+ * on unhandled constants. */
+static void aa_emit_mul_const_imm(NativeTarget* t, u32 sf, u32 rd, u32 rn,
+ i64 imm) {
+ u64 a;
+ if (imm == 0) {
+ aa_emit32(t->mc, aa64_mov_reg(sf, rd, AA64_ZR));
+ return;
+ }
+ if (imm == 1) {
+ if (rd != rn) aa_emit32(t->mc, aa64_mov_reg(sf, rd, rn));
+ return;
+ }
+ if (imm == -1) {
+ aa_emit32(t->mc, aa64_neg(sf, rd, rn));
+ return;
+ }
+ /* +2^k: lsl rd, rn, #k */
+ a = (u64)imm;
+ if (imm > 0 && (a & (a - 1u)) == 0u) {
+ u32 k = (u32)__builtin_ctzll(a);
+ aa_lsl_imm(t, sf, rd, rn, k);
+ return;
+ }
+ /* -2^k: sub rd, xzr, rn, lsl #k */
+ if (imm < 0) {
+ a = (u64)(-imm);
+ if (a && (a & (a - 1u)) == 0u) {
+ u32 k = (u32)__builtin_ctzll(a);
+ aa_emit32(t->mc, aa64_addsubsr_pack((AA64AddSubSR){.sf = sf,
+ .op = 1u,
+ .S = 0u,
+ .shift = 0u,
+ .Rm = rn,
+ .imm6 = k,
+ .Rn = AA64_ZR,
+ .Rd = rd}));
+ return;
+ }
+ }
+ /* 2^k + 1: add rd, rn, rn, lsl #k */
+ if (imm >= 3) {
+ u64 m = (u64)(imm - 1);
+ if ((m & (m - 1u)) == 0u) {
+ u32 k = (u32)__builtin_ctzll(m);
+ aa_emit32(t->mc, aa64_addsubsr_pack((AA64AddSubSR){.sf = sf,
+ .op = 0u,
+ .S = 0u,
+ .shift = 0u,
+ .Rm = rn,
+ .imm6 = k,
+ .Rn = rn,
+ .Rd = rd}));
+ return;
+ }
+ }
+ /* 1 - 2^k: sub rd, rn, rn, lsl #k */
+ if (imm <= -1) {
+ u64 m = (u64)(1 - imm);
+ if (m && (m & (m - 1u)) == 0u) {
+ u32 k = (u32)__builtin_ctzll(m);
+ aa_emit32(t->mc, aa64_addsubsr_pack((AA64AddSubSR){.sf = sf,
+ .op = 1u,
+ .S = 0u,
+ .shift = 0u,
+ .Rm = rn,
+ .imm6 = k,
+ .Rn = rn,
+ .Rd = rd}));
+ return;
+ }
+ }
+ aa_panic(aa_of(t), "aa_emit_mul_const_imm: unhandled constant");
+}
+
static void aa_binop(NativeTarget* t, BinOp op, NativeLoc dst, NativeLoc lhs,
NativeLoc rhs) {
u32 sf = loc_is_64(t, dst) ? 1u : 0u;
@@ -1519,6 +1745,10 @@ static void aa_binop(NativeTarget* t, BinOp op, NativeLoc dst, NativeLoc lhs,
: aa64_sub_imm(sf, rd, rn, imm12, sh));
return;
}
+ if (rhs.kind == NATIVE_LOC_IMM && op == BO_IMUL) {
+ aa_emit_mul_const_imm(t, sf, rd, rn, rhs.v.imm);
+ return;
+ }
switch (op) {
case BO_IADD:
aa_emit32(t->mc, aa64_add(sf, rd, rn, rm));
diff --git a/src/opt/ir.h b/src/opt/ir.h
@@ -600,9 +600,10 @@ typedef struct OptPRegInfo {
i32 tied_hard_reg; /* -1 means no fixed/tied physical register need. */
Reg hard_reg;
FrameSlot spill_slot;
- u8 alloc_kind; /* OptAllocKind */
- u8 cls; /* RegClass */
- u8 pad[2];
+ u8 alloc_kind; /* OptAllocKind */
+ u8 cls; /* RegClass */
+ i8 preferred_hard_reg;/* soft hint for allocator; -1 = no hint */
+ u8 pad[1];
u32 forbidden_hard_regs; /* bit r means PReg may not allocate hard reg r. */
} OptPRegInfo;
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -150,6 +150,10 @@ static void opt_run_o1_native(OptImpl* o, Func* f) {
metrics_scope_begin(o->c, "opt.build_loop_tree");
opt_build_loop_tree(f);
metrics_scope_end(o->c, "opt.build_loop_tree");
+ metrics_scope_begin(o->c, "opt.o1.lower_loop_imm");
+ opt_lower_loop_imm_operands(f, o->native);
+ metrics_scope_end(o->c, "opt.o1.lower_loop_imm");
+ opt_verify(f, "o1-lower-loop-imm");
metrics_scope_begin(o->c, "opt.o1.hoist_loop_consts");
opt_hoist_loop_consts(f);
metrics_scope_end(o->c, "opt.o1.hoist_loop_consts");
diff --git a/src/opt/opt.h b/src/opt/opt.h
@@ -31,6 +31,10 @@ void opt_addr_of_global_cse(Func*); /* O1: hoist duplicate ADDR_OF(global)
defs to a single entry-block compute. */
void opt_hoist_loop_consts(Func*); /* O1: hoist loop-invariant LOAD_IMM
materialization to the entry block. */
+void opt_lower_loop_imm_operands(
+ Func*, NativeTarget*); /* O1: convert non-foldable inline imm operands in
+ loop bodies into IR_LOAD_IMM + reg uses so the
+ hoister can lift them. */
void opt_simplify_local(Func*);
void opt_simplify(Func*);
void opt_gvn(Func*); /* incl. constprop, redundant-load elim */
diff --git a/src/opt/opt_internal.h b/src/opt/opt_internal.h
@@ -153,6 +153,11 @@ void opt_walk_abivalue(Func*, Inst*, CGABIValue*, int storage_def,
OptOperandWalkFn, void*);
void opt_walk_inst_operands(Func*, Inst*, OptOperandWalkFn, void*);
+/* Make room for k new instructions starting at index `at` in block `bl`.
+ * Returns a pointer to the first new slot (zero-initialized). Grows the
+ * block's inst array (doubling) as needed. */
+Inst* opt_block_insert_at(Func*, Block* bl, u32 at, u32 k);
+
int opt_mem_observable(const MemAccess*);
u32 opt_call_clobber_mask_for(Func*, const Inst*, u8 cls);
@@ -178,5 +183,11 @@ OptHardRegSet opt_hard_live_out_for_block(const OptHardBlockLive*);
int opt_block_live_out_has_phys_reg(Func*, const OptHardBlockLive*, u32 block,
const Operand*);
void opt_coalesce_ranges(Func*, const OptLiveRangeSet*);
+/* Return 0 (no overlap), 1 (a single unit-length overlap), or 2 (real
+ * conflict: an overlap longer than one point, or two or more disjoint
+ * unit-length overlaps). A unit overlap is the safe swap-friendly case
+ * where one PReg dies exactly where another is born (`dst = op(... src ...)`
+ * with dst placed in src's reg). */
+int opt_ranges_overlap_kind(const OptLiveRangeSet*, PReg a, PReg b);
#endif
diff --git a/src/opt/pass_addr_fold.c b/src/opt/pass_addr_fold.c
@@ -628,7 +628,7 @@ static void addr_cse_apply_to_inst(Inst* in, const PReg* remap) {
}
}
-static Inst* block_insert_at(Func* f, Block* bl, u32 at, u32 k) {
+Inst* opt_block_insert_at(Func* f, Block* bl, u32 at, u32 k) {
if (at > bl->ninsts) at = bl->ninsts;
if (bl->ninsts + k > bl->cap) {
u32 ncap = bl->cap ? bl->cap : 8u;
@@ -723,7 +723,7 @@ void opt_addr_of_global_cse(Func* f) {
while (insert_at < entry->ninsts &&
(IROp)entry->insts[insert_at].op == IR_PARAM_DECL)
++insert_at;
- Inst* slot = block_insert_at(f, entry, insert_at, dup_count);
+ Inst* slot = opt_block_insert_at(f, entry, insert_at, dup_count);
u32 w = 0;
for (u32 i = 0; i < n_entries; ++i) {
if (entries[i].canonical == PREG_NONE) continue;
@@ -901,7 +901,7 @@ void opt_hoist_loop_consts(Func* f) {
while (insert_at < entry->ninsts &&
(IROp)entry->insts[insert_at].op == IR_PARAM_DECL)
++insert_at;
- Inst* slot = block_insert_at(f, entry, insert_at, dup_count);
+ Inst* slot = opt_block_insert_at(f, entry, insert_at, dup_count);
u32 w = 0;
for (u32 i = 0; i < n_entries; ++i) {
if (entries[i].canonical == PREG_NONE) continue;
diff --git a/src/opt/pass_coalesce.c b/src/opt/pass_coalesce.c
@@ -51,7 +51,11 @@ static void coalesce_add_related(CoalesceCtx* c, PReg v) {
c->related[c->nrelated++] = v;
}
+int opt_ranges_overlap_kind(const OptLiveRangeSet* ranges, PReg a, PReg b);
static int ranges_overlap_kind(const OptLiveRangeSet* ranges, PReg a, PReg b) {
+ return opt_ranges_overlap_kind(ranges, a, b);
+}
+int opt_ranges_overlap_kind(const OptLiveRangeSet* ranges, PReg a, PReg b) {
/* Returns 0 (no overlap), 1 (a single unit-length overlap), or 2 (real
* conflict: an overlap longer than one point, or two or more disjoint
* unit-length overlaps).
diff --git a/src/opt/pass_combine.c b/src/opt/pass_combine.c
@@ -699,6 +699,34 @@ static void set_indirect_field(Operand* ind, Reg old_reg, Reg new_reg) {
ind->v.ind.index = new_reg;
}
+/* Substitute `def` -> `src` in a single aux register slot (no slot-index
+ * whitelist; caller has already restricted to SK_REG and ensured src is a
+ * register). Returns 1 if rewritten. */
+static int subst_one_aux_operand(Operand* op, const Operand* def,
+ const Operand* src) {
+ if (op->kind == OPK_REG && same_phys_reg(op, def)) {
+ *op = *src;
+ return 1;
+ }
+ if (op->kind == OPK_INDIRECT && src->kind == OPK_REG &&
+ def->cls == RC_INT && src->cls == RC_INT &&
+ (op->v.ind.base == def->v.reg ||
+ (op->v.ind.index != (Reg)REG_NONE && op->v.ind.index == def->v.reg))) {
+ set_indirect_field(op, def->v.reg, src->v.reg);
+ return 1;
+ }
+ return 0;
+}
+
+static int subst_abivalue_uses(CGABIValue* v, const Operand* def,
+ const Operand* src) {
+ int n = 0;
+ n += subst_one_aux_operand(&v->storage, def, src);
+ for (u32 i = 0; i < v->nparts; ++i)
+ n += subst_one_aux_operand((Operand*)&v->parts[i].op, def, src);
+ return n;
+}
+
/* Walk the operands of consumer `in` and try to substitute uses of `def`
* (producer's destination, OPK_REG) with `src` (producer's source, OPK_REG
* or OPK_IMM). For OPK_INDIRECT operands, substitution into base/index is
@@ -717,19 +745,41 @@ static int subst_consumer_operands(Inst* in, const Operand* def,
++n;
continue;
}
- /* Indirect base/index substitution: only OPK_REG src may land here. */
+ /* Indirect base/index substitution: only OPK_REG src may land here. The
+ * substitution only changes which register computes the address, not the
+ * value being stored — safe for IR_STORE / IR_ATOMIC_STORE /
+ * IR_BITFIELD_STORE / IR_AGG_COPY / IR_AGG_SET as well. */
if (op->kind == OPK_INDIRECT && kind == SK_REG && src->kind == OPK_REG &&
def->kind == OPK_REG && def->cls == RC_INT && src->cls == RC_INT &&
(op->v.ind.base == def->v.reg ||
(op->v.ind.index != (Reg)REG_NONE && op->v.ind.index == def->v.reg))) {
- if ((IROp)in->op == IR_STORE || (IROp)in->op == IR_ATOMIC_STORE ||
- (IROp)in->op == IR_BITFIELD_STORE || (IROp)in->op == IR_AGG_COPY ||
- (IROp)in->op == IR_AGG_SET)
- continue;
set_indirect_field(op, def->v.reg, src->v.reg);
++n;
}
}
+
+ /* Aux register uses: IR_CALL plan/desc args + callee. SK_REG only —
+ * substituting an immediate or convert into an ABI-bound use would break
+ * the call lowering. IR_RET is intentionally excluded: the return register
+ * is live-out by ABI, so a producer copying into it stays alive even after
+ * substituting val.storage, and emit_ret would emit a redundant second
+ * move. */
+ if (kind == SK_REG) {
+ if ((IROp)in->op == IR_CALL) {
+ IRCallAux* aux = (IRCallAux*)in->extra.aux;
+ if (aux) {
+ if (aux->use_plan_replay) {
+ n += subst_one_aux_operand(&aux->plan.callee, def, src);
+ for (u32 k = 0; k < aux->plan.nargs; ++k)
+ n += subst_one_aux_operand(&aux->plan.args[k].src, def, src);
+ } else {
+ n += subst_one_aux_operand(&aux->desc.callee, def, src);
+ for (u32 k = 0; k < aux->desc.nargs; ++k)
+ n += subst_abivalue_uses((CGABIValue*)&aux->desc.args[k], def, src);
+ }
+ }
+ }
+ }
return n;
}
@@ -824,24 +874,57 @@ static int seen_mark(u32 seen[OPT_REG_CLASSES], u8 cls, Reg reg) {
return 0;
}
+static void try_substitute_aux_operand(CombineCtx* ctx, Inst* in, i32 i,
+ const Operand* op,
+ u32 seen[OPT_REG_CLASSES], int* any) {
+ if (op->kind == OPK_REG) {
+ if (!seen_mark(seen, op->cls, op->v.reg))
+ *any |= try_substitute_for_reg(ctx, in, i, op->cls, op->v.reg);
+ } else if (op->kind == OPK_INDIRECT) {
+ if (op->v.ind.base != (Reg)REG_NONE &&
+ !seen_mark(seen, RC_INT, op->v.ind.base))
+ *any |= try_substitute_for_reg(ctx, in, i, RC_INT, op->v.ind.base);
+ if (op->v.ind.index != (Reg)REG_NONE &&
+ !seen_mark(seen, RC_INT, op->v.ind.index))
+ *any |= try_substitute_for_reg(ctx, in, i, RC_INT, op->v.ind.index);
+ }
+}
+
+static void try_substitute_aux_abivalue(CombineCtx* ctx, Inst* in, i32 i,
+ const CGABIValue* v,
+ u32 seen[OPT_REG_CLASSES], int* any) {
+ try_substitute_aux_operand(ctx, in, i, &v->storage, seen, any);
+ for (u32 k = 0; k < v->nparts; ++k)
+ try_substitute_aux_operand(ctx, in, i, &v->parts[k].op, seen, any);
+}
+
/* Try to substitute producers into operand slots of `in`. Walks the direct
* operands of `in` and looks up only the producers of registers actually
- * referenced (typically 2-3 per inst). */
+ * referenced (typically 2-3 per inst). Also walks IR_CALL aux register uses
+ * (args + callee) so reg->reg copies feeding call args or the callee get
+ * propagated. See subst_consumer_operands for why IR_RET is excluded. */
static int try_substitute(CombineCtx* ctx, Inst* in, i32 i) {
int any = 0;
u32 seen[OPT_REG_CLASSES] = {0};
for (u32 oi = 0; oi < in->nopnds; ++oi) {
const Operand* op = &in->opnds[oi];
- if (op->kind == OPK_REG) {
- if (!seen_mark(seen, op->cls, op->v.reg))
- any |= try_substitute_for_reg(ctx, in, i, op->cls, op->v.reg);
- } else if (op->kind == OPK_INDIRECT) {
- if (op->v.ind.base != (Reg)REG_NONE &&
- !seen_mark(seen, RC_INT, op->v.ind.base))
- any |= try_substitute_for_reg(ctx, in, i, RC_INT, op->v.ind.base);
- if (op->v.ind.index != (Reg)REG_NONE &&
- !seen_mark(seen, RC_INT, op->v.ind.index))
- any |= try_substitute_for_reg(ctx, in, i, RC_INT, op->v.ind.index);
+ try_substitute_aux_operand(ctx, in, i, op, seen, &any);
+ }
+ if ((IROp)in->op == IR_CALL) {
+ IRCallAux* aux = (IRCallAux*)in->extra.aux;
+ if (aux) {
+ if (aux->use_plan_replay) {
+ try_substitute_aux_operand(ctx, in, i, &aux->plan.callee, seen, &any);
+ for (u32 k = 0; k < aux->plan.nargs; ++k)
+ try_substitute_aux_operand(ctx, in, i, &aux->plan.args[k].src, seen,
+ &any);
+ } else {
+ try_substitute_aux_operand(ctx, in, i, &aux->desc.callee, seen, &any);
+ for (u32 k = 0; k < aux->desc.nargs; ++k)
+ try_substitute_aux_abivalue(ctx, in, i,
+ (const CGABIValue*)&aux->desc.args[k],
+ seen, &any);
+ }
}
}
return any;
diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c
@@ -321,6 +321,73 @@ static int is_caller_saved(Func* f, u8 cls, Reg r) {
return (f->opt_caller_saved[cls] & (1u << r)) != 0;
}
+static Reg first_ret_reg(Func* f, u8 cls) {
+ if (!f || cls >= OPT_REG_CLASSES) return REG_NONE;
+ u32 mask = f->opt_ret_regs[cls];
+ for (Reg r = 0; r < 32; ++r)
+ if (mask & (1u << r)) return r;
+ return REG_NONE;
+}
+
+static void set_preg_pref_to_ret_reg(Func* f, const Operand* op) {
+ if (!op || op->kind != OPK_REG) return;
+ PReg v = (PReg)op->v.reg;
+ if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return;
+ u8 cls = f->preg_info[v].cls;
+ if (cls >= OPT_REG_CLASSES) return;
+ Reg hint = first_ret_reg(f, cls);
+ if (hint == REG_NONE || hint >= 32) return;
+ /* Don't override a real pin. */
+ if (f->preg_info[v].tied_hard_reg >= 0) return;
+ /* apply_param_incoming_register_hazards conservatively forbids incoming
+ * param regs (e.g. x0) for every body PReg, because liveness doesn't
+ * model the implicit entry-move from x0 -> the param's home reg. That
+ * forbid is overly broad for call-result and ret-value PRegs:
+ * - A call-result PReg's def is mid-function (the call writes x0); its
+ * live range starts after every entry move, so it can't alias the
+ * entry-window use of x0.
+ * - A ret-value PReg is consumed at IR_RET (function exit); its live
+ * range can extend through the body but the entry-move into the
+ * param's home reg has already completed by the time any body inst
+ * could define this PReg.
+ * Clear the forbid for the hinted reg so the allocator can actually
+ * pick it. The general conflict check (alloc_group_conflicts_bit) still
+ * excludes intervening clobbers like other calls. */
+ f->preg_info[v].forbidden_hard_regs &= ~(1u << hint);
+ f->preg_info[v].preferred_hard_reg = (i8)hint;
+}
+
+static void set_preg_pref_for_abivalue(Func* f, const CGABIValue* v) {
+ if (!v) return;
+ set_preg_pref_to_ret_reg(f, &v->storage);
+ for (u32 i = 0; i < v->nparts; ++i)
+ set_preg_pref_to_ret_reg(f, &v->parts[i].op);
+}
+
+/* Set a soft "prefer the ABI return reg" hint on:
+ * - IR_CALL result PRegs (so emit_call's `mov result, x0` is elided)
+ * - IR_RET value PRegs (so emit_ret's `mov x0, value` is elided)
+ *
+ * The hint is a tie-breaker only — see hard_reg_alloc_score. The allocator's
+ * existing conflict checks already exclude regs with real interference (e.g.
+ * a result PReg live across another call cannot pick x0). */
+static void apply_abi_aliasing_hints(Func* f) {
+ if (!f || !f->preg_info) return;
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* in = &bl->insts[i];
+ if ((IROp)in->op == IR_CALL) {
+ IRCallAux* aux = (IRCallAux*)in->extra.aux;
+ if (aux) set_preg_pref_for_abivalue(f, &aux->desc.ret);
+ } else if ((IROp)in->op == IR_RET) {
+ IRRetAux* aux = (IRRetAux*)in->extra.aux;
+ if (aux && aux->present) set_preg_pref_for_abivalue(f, &aux->val);
+ }
+ }
+ }
+}
+
/* ---------------------------------------------------------------------------
* Register allocator, MIR-shaped.
*
@@ -486,6 +553,14 @@ static u32 hard_reg_alloc_score(Func* f, const OptAllocator* a,
a->hard_open && bit < a->hard_loc_bits && a->hard_open[bit];
if (!already_open) score += pi ? pi->copy_cost : 50u;
}
+ /* Soft hint: tie-break toward a preferred hard reg by adding +1 to every
+ * non-hinted choice. Used by apply_abi_aliasing_hints to put IR_CALL
+ * results / IR_RET values into the ABI return register so emit_call /
+ * emit_ret can elide the materialization move. Stays well below the
+ * live-across-call (+1000) penalty and the callee-save (+20) gap so it
+ * never overrides real costs. */
+ if (vi->preferred_hard_reg >= 0 && (Reg)vi->preferred_hard_reg != hr)
+ score += 1u;
return score;
}
@@ -557,8 +632,10 @@ static void opt_init_preg_info_from_ranges(Func* f,
i32 tied = old ? old[v].tied_hard_reg : -1;
u32 forbidden = old ? old[v].forbidden_hard_regs : 0;
u32 old_frequency = old ? old[v].frequency : 0;
+ i8 pref = old ? old[v].preferred_hard_reg : (i8)-1;
OptPRegInfo* vi = &info[v];
vi->tied_hard_reg = tied;
+ vi->preferred_hard_reg = pref;
vi->hard_reg = REG_NONE;
vi->spill_slot = FRAME_SLOT_NONE;
vi->alloc_kind = OPT_ALLOC_NONE;
@@ -883,6 +960,57 @@ static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges,
best_score = score;
}
}
+ /* Also consider the preferred hard reg if it's outside the standard
+ * allocable set (e.g. x0 on aa64: reserved as the ABI ret reg, not in
+ * aa_int_allocable). Used by apply_abi_aliasing_hints to let an IR_CALL
+ * result PReg or IR_RET value PReg land directly in x0, eliding the
+ * materialization move emit_call/emit_ret would otherwise emit. The
+ * existing caller-save penalty in hard_reg_alloc_score (+1000) keeps
+ * cross-call values away from x0; only short-lived PRegs benefit. */
+ if (vi->preferred_hard_reg >= 0) {
+ Reg hint = (Reg)vi->preferred_hard_reg;
+ int already_tried = 0;
+ for (u32 r = 0; r < f->opt_hard_reg_count[cls]; ++r) {
+ if (f->opt_hard_regs[cls][r] == hint) {
+ already_tried = 1;
+ break;
+ }
+ }
+ if (!already_tried && hint < 32 &&
+ !(gi.forbidden_hard_regs & (1u << hint))) {
+ u32 bit = hard_loc_bit(cls, hint);
+ int hint_safe = !alloc_group_conflicts_bit(a, bit);
+ /* The bitmap conflict can be falsely positive when an
+ * already-assigned PReg ends exactly where v begins — the
+ * swap-friendly pattern like `sub x0, x21, x0`, where the previous
+ * call's result occupies x0 and the sub both reads it and writes
+ * the new value. Fall back to a precise per-PReg interference check
+ * that allows the unit-length overlap (same rule used by
+ * opt_coalesce_ranges for moves). */
+ if (!hint_safe) {
+ int real_conflict = 0;
+ for (PReg u = 1; u < opt_reg_count(f); ++u) {
+ if (u == v) continue;
+ const OptPRegInfo* ui = &f->preg_info[u];
+ if (ui->alloc_kind != OPT_ALLOC_HARD) continue;
+ if (ui->hard_reg != hint) continue;
+ if (opt_ranges_overlap_kind(ranges, u, v) >= 2) {
+ real_conflict = 1;
+ break;
+ }
+ }
+ if (!real_conflict) hint_safe = 1;
+ }
+ if (hint_safe) {
+ u32 score = hard_reg_alloc_score(f, a, vi, hint);
+ if (!found || score < best_score) {
+ found = 1;
+ best = hint;
+ best_score = score;
+ }
+ }
+ }
+ }
if (found) {
alloc_assign_group_hard(f, a, ranges, v, best);
} else {
@@ -1794,6 +1922,7 @@ static void opt_regalloc_place(Func* f, int allow_live_range_split,
opt_init_preg_info_from_ranges(f, &ranges);
opt_apply_asm_constraints_from_live(f, &live);
apply_param_incoming_register_hazards(f);
+ apply_abi_aliasing_hints(f);
/* MIR coalesces only at -O2 (mir-gen.c:9431); match that here. At O1 the
* point-bitmap allocator emits copies through the natural conflict-free
* path. IRF_NO_COALESCE protects SSA edge copies inserted at O2. */
diff --git a/src/opt/pass_lower_loop_imm.c b/src/opt/pass_lower_loop_imm.c
@@ -0,0 +1,172 @@
+/* O1 pre-hoist lowering for loop-body immediate operands.
+ *
+ * `opt_hoist_loop_consts` consolidates duplicate IR_LOAD_IMM defs of the same
+ * constant out of a loop. But many constants never become IR_LOAD_IMM defs:
+ * they arrive at the emitter as inline `imm:` operands on IR_BINOP / IR_CMP /
+ * IR_CMP_BRANCH / IR_STORE. If the backend can fold the immediate into the
+ * instruction (e.g. `add x, y, #1`), the emitter keeps it inline and there is
+ * nothing to hoist. If it cannot (e.g. `mul x, y, #5` on aarch64), the emitter
+ * materializes the constant into a scratch register inside the loop every
+ * iteration.
+ *
+ * This pass walks loop-body instructions and rewrites those non-foldable inline
+ * imm operands to a fresh IR_LOAD_IMM + OPK_REG use so the existing hoister
+ * picks them up. Stores of non-zero imms are also lowered (the emitter has a
+ * zero-source fast path but materializes every other inline imm).
+ *
+ * Guards:
+ * - Only loop bodies (loop_depth > 0) — extending live ranges outside loops
+ * is unprofitable.
+ * - Skip zero imms — the emitter has zero-reg fast paths (e.g. `strb wzr`)
+ * that beat holding 0 in an allocated register.
+ * - Skip imms the target says it can fold (`imm_legal`) — those produce no
+ * materialize in the loop and lifting them would just lengthen live ranges.
+ *
+ * Must run after opt_machinize_native and opt_build_loop_tree (reads block
+ * loop_depth and target->imm_legal), and before opt_hoist_loop_consts (which
+ * does the hoisting). */
+#include <cfree/cg.h>
+#include <string.h>
+
+#include "opt/opt_internal.h"
+
+typedef enum LowerImmKind {
+ LOWER_IMM_NONE,
+ LOWER_IMM_BINOP, /* foldable iff imm_legal(NATIVE_IMM_BINOP, ...) */
+ LOWER_IMM_CMP, /* foldable iff imm_legal(NATIVE_IMM_CMP, ...) */
+ LOWER_IMM_STORE, /* never foldable as inline imm; emit materializes always */
+} LowerImmKind;
+
+/* For each instruction shape we care about, return which operand index holds
+ * the imm-bearing source, and what use-kind/sub-op to pass to imm_legal. */
+static u32 inst_imm_slot(const Inst* in, LowerImmKind* kind_out, u32* sub_out) {
+ switch ((IROp)in->op) {
+ case IR_BINOP:
+ *kind_out = LOWER_IMM_BINOP;
+ *sub_out = (u32)in->extra.imm;
+ return 2u;
+ case IR_CMP:
+ *kind_out = LOWER_IMM_CMP;
+ *sub_out = (u32)in->extra.imm;
+ return 2u;
+ case IR_CMP_BRANCH:
+ *kind_out = LOWER_IMM_CMP;
+ *sub_out = (u32)in->extra.imm;
+ return 1u;
+ case IR_STORE:
+ *kind_out = LOWER_IMM_STORE;
+ *sub_out = 0u;
+ return 1u;
+ default:
+ *kind_out = LOWER_IMM_NONE;
+ *sub_out = 0u;
+ return 0u;
+ }
+}
+
+static int operand_should_lift(NativeTarget* target, const Operand* op,
+ LowerImmKind kind, u32 sub) {
+ if (op->kind != OPK_IMM) return 0;
+ if (op->v.imm == 0) return 0;
+ switch (kind) {
+ case LOWER_IMM_BINOP:
+ case LOWER_IMM_CMP: {
+ NativeImmUse use = (kind == LOWER_IMM_BINOP) ? NATIVE_IMM_BINOP
+ : NATIVE_IMM_CMP;
+ if (target->imm_legal &&
+ target->imm_legal(target, use, sub, op->type, op->v.imm))
+ return 0;
+ return 1;
+ }
+ case LOWER_IMM_STORE:
+ return 1; /* emitter has no fold path for store value imms */
+ case LOWER_IMM_NONE:
+ default:
+ return 0;
+ }
+}
+
+/* Count how many imm operands in `bl` are lift candidates, so we can grow the
+ * block's inst array once rather than per-lift. */
+static u32 count_lifts_in_block(Func* f, Block* bl, NativeTarget* target) {
+ u32 n = 0;
+ (void)f;
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* in = &bl->insts[i];
+ LowerImmKind kind;
+ u32 sub;
+ u32 slot = inst_imm_slot(in, &kind, &sub);
+ if (kind == LOWER_IMM_NONE) continue;
+ if (slot >= in->nopnds) continue;
+ if (operand_should_lift(target, &in->opnds[slot], kind, sub)) ++n;
+ }
+ return n;
+}
+
+void opt_lower_loop_imm_operands(Func* f, NativeTarget* target) {
+ if (!f || f->opt_reg_ssa || f->opt_rewritten) return;
+ if (!target) return;
+ if (f->nblocks == 0) return;
+
+ /* Process blocks one at a time, inserting per-block in a single pre-grown
+ * batch. Each lift produces one IR_LOAD_IMM directly before its consumer. */
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ if (bl->loop_depth == 0) continue;
+ u32 nlifts = count_lifts_in_block(f, bl, target);
+ if (!nlifts) continue;
+
+ /* Walk forward; for each lift, insert one IR_LOAD_IMM before the consumer
+ * and rewrite the operand. After insert_at the consumer slides down by 1,
+ * so re-fetch via index. We use opt_block_insert_at one slot at a time;
+ * cap-doubling keeps total work O(ninsts + nlifts). */
+ for (u32 i = 0; i < bl->ninsts;) {
+ Inst* in = &bl->insts[i];
+ LowerImmKind kind;
+ u32 sub;
+ u32 slot = inst_imm_slot(in, &kind, &sub);
+ if (kind == LOWER_IMM_NONE || slot >= in->nopnds) {
+ ++i;
+ continue;
+ }
+ Operand* op = &in->opnds[slot];
+ if (!operand_should_lift(target, op, kind, sub)) {
+ ++i;
+ continue;
+ }
+ /* Snapshot the imm operand before the insert moves memory around. */
+ CfreeCgTypeId type = op->type;
+ u8 cls = op->cls;
+ i64 imm = op->v.imm;
+
+ PReg p = ir_alloc_preg(f, type, cls);
+ Inst* slot_in = opt_block_insert_at(f, bl, i, 1u);
+ slot_in->op = (u16)IR_LOAD_IMM;
+ slot_in->def = (Val)p;
+ slot_in->type = type;
+ slot_in->extra.imm = imm;
+ slot_in->loc = bl->insts[i + 1u].loc;
+ slot_in->nopnds = 1u;
+ slot_in->opnds = arena_array(f->arena, Operand, 1);
+ memset(&slot_in->opnds[0], 0, sizeof(Operand));
+ slot_in->opnds[0].kind = OPK_REG;
+ slot_in->opnds[0].cls = cls;
+ slot_in->opnds[0].type = type;
+ slot_in->opnds[0].v.reg = p;
+ ir_assign_inst_id(f, slot_in);
+
+ /* The consumer is now at index i+1; rewrite its imm operand to OPK_REG. */
+ Inst* user = &bl->insts[i + 1u];
+ Operand* uop = &user->opnds[slot];
+ memset(uop, 0, sizeof(Operand));
+ uop->kind = OPK_REG;
+ uop->cls = cls;
+ uop->type = type;
+ uop->v.reg = p;
+
+ i += 2u;
+ }
+ }
+
+ opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE);
+}
diff --git a/src/opt/pass_simplify.c b/src/opt/pass_simplify.c
@@ -129,12 +129,42 @@ static int convert_noop(Func* f, const Inst* in) {
}
}
+/* Commutative integer binops the canonicalizer below may swap operands of.
+ * Floating-point commutative ops are intentionally excluded: the backend
+ * routes them through the fp branch (which doesn't consult imm_legal) so
+ * canonicalization gains nothing, and IEEE-754 may distinguish operand order
+ * for NaN payloads. Non-commutative ops (sub, shifts, div/rem) are skipped. */
+static int is_commutative_binop(BinOp op) {
+ switch (op) {
+ case BO_IADD:
+ case BO_IMUL:
+ case BO_AND:
+ case BO_OR:
+ case BO_XOR:
+ return 1;
+ default:
+ return 0;
+ }
+}
+
static int simplify_binop(Func* f, Inst* in, int ssa) {
if (!in || (IROp)in->op != IR_BINOP || in->flags || in->nopnds < 3) return 0;
if (in->opnds[0].kind != OPK_REG) return 0;
u32 width = simplify_width(f, in->type ? in->type : in->opnds[0].type);
if (!width) return 0;
+ /* Canonicalize commutative binops to put any inline OPK_IMM on the rhs.
+ * The native emitter and the loop-imm-hoist pass both inspect only opnds[2]
+ * for imm-legality; putting the constant there lets aa64 strength-reduce
+ * `mul x, 5` to a shifted-add and lets the hoist pass lift loop-invariant
+ * non-foldable imms. Value-preserving by commutativity. */
+ if (is_commutative_binop((BinOp)in->extra.imm) &&
+ in->opnds[1].kind == OPK_IMM && in->opnds[2].kind != OPK_IMM) {
+ Operand tmp = in->opnds[1];
+ in->opnds[1] = in->opnds[2];
+ in->opnds[2] = tmp;
+ }
+
Operand* a = &in->opnds[1];
Operand* b = &in->opnds[2];
i64 av = 0;