commit 441a877b94f870ce852a54a3e18bafda0dd2f8ba
parent d43cbf6fc02269ac31d3fb6859ce514a86d48b5d
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 21 May 2026 09:32:56 -0700
Add pre-GVN O2 rewrite coverage
Diffstat:
8 files changed, 982 insertions(+), 17 deletions(-)
diff --git a/doc/OPT.md b/doc/OPT.md
@@ -190,6 +190,20 @@ MIR full mode then runs:
and nonalias tags on memory operands. Calls clear or conservatively restrict
memory availability.
+ cfree should land this in two explicit slices. The first slice is scalar GVN
+ only: pure SSA value numbering and replacement for constants, arithmetic,
+ compares, conversions, and other side-effect-free register expressions. It
+ should not eliminate or forward `IR_LOAD`, interpret `IR_STORE`, or make
+ memory-dependent branch decisions.
+
+ Before the second slice touches memory, document and test the memory
+ numbering shape. That design must define memory version tokens by alias
+ class/root, how `MemAccess.alias`, address operands, address spaces, globals,
+ TLS, params, and unknown memory participate in keys, and the invalidation
+ rules for stores, calls, atomics, fences, volatile operations, and inline asm.
+ Redundant-load elimination and store/load reuse start only after that shape
+ is explicit and covered by pass-local dump tests.
+
5. `copy_prop`.
MIR propagates copies, folds multiply/divide by powers of two, and removes
redundant extension chains. cfree should keep this as a separate pass after
@@ -395,6 +409,31 @@ finalize:
loops exist, `pressure_relief`, `make_conventional_ssa`, `ssa_combine`,
`undo_ssa`, and `jump_opt`.
+Current implementation subset:
+
+```
+build_cfg
+jump_cleanup(CFG)
+build_cfg
+block_cloning
+ssa_dce
+copy_cleanup
+build_ssa
+ssa_dce
+copy_cleanup
+addr_xform
+ssa_dce
+copy_cleanup
+make_conventional_ssa
+undo_ssa
+copy_cleanup
+build_cfg
+machinize ... opt_emit
+```
+
+Every CFG or value-use mutation above is followed by the relevant verifier
+checkpoint in `opt_run_o2_pipeline`.
+
### 4.3 Transformations We Do Not Take
`doc/DESIGN.md` is still binding: no UB-exploiting transforms. Do not assume
@@ -431,9 +470,9 @@ Current behavior:
combines, runs post-RA DCE, cleans jumps/layout, and emits through the
wrapped target.
- Level `2` has its own scheduler entry point. It now builds real mem2reg SSA,
- verifies phi/use-def invariants, lowers phis through conventional SSA, undoes
- SSA, and then routes through the same proven lowering path as `-O1`. It still
- has no value-changing SSA transformations enabled.
+ verifies phi/use-def invariants, runs the first small O2 transforms, lowers
+ phis through conventional SSA, undoes SSA, and then routes through the same
+ proven lowering path as `-O1`.
- `opt_regalloc(..., allow_live_range_split)` accepts the O2 policy bit, but
live-range splitting is not implemented yet.
- `opt_build_ssa` is no longer just a shape check: promoted loads/stores are
@@ -450,9 +489,16 @@ Current behavior:
- The first SSA-backed value passes are in the O2 path: `opt_ssa_dce` removes
unused pure SSA defs/phis, and `opt_copy_cleanup` rewrites safe SSA copy users
through def-use chains while avoiding multi-def conventional-SSA edge copies.
+- The next pre-GVN O2 slice is also in the O2 path: `opt_block_cloning` clones
+ tiny non-loop join blocks under strict growth limits when cloned defs remain
+ local and external uses are dominated; `opt_addr_xform` folds simple local
+ `addr_of` pseudos into non-observable load/store memory operands. Volatile,
+ atomic, global, TLS, non-zero indirect-offset, and otherwise non-local address
+ cases remain materialized. Both passes rebuild/invalidate analysis state and
+ are bracketed by `opt_ssa_dce`, `opt_copy_cleanup`, and verifier checkpoints.
-The next implementation work should extend this small-pass path incrementally,
-before attempting GVN/DSE/LICM.
+The next implementation work should continue extending this small-pass path
+incrementally before attempting GVN/DSE/LICM.
---
@@ -570,8 +616,8 @@ Implement in order:
1. [x] `opt_ssa_dce`
2. [x] `opt_copy_cleanup`
-3. `opt_block_cloning`
-4. `opt_addr_xform`
+3. [x] `opt_block_cloning`
+4. [x] `opt_addr_xform`
5. `opt_gvn`
6. `opt_copy_prop`
7. `opt_dse`
@@ -585,14 +631,28 @@ Implement in order:
Do not batch these into one landing. Each pass needs a pass-local corpus case
that fails red without the pass or its bug fix.
-GVN is deliberately not the next milestone. It should wait until SSA DCE and
-copy cleanup have exercised ordinary use-def mutation, verifier coverage, and
-analysis invalidation on small transformations.
+GVN is deliberately after this milestone. It should wait until SSA DCE, copy
+cleanup, block cloning, and address folding have made CFG mutation,
+memory/address operand mutation, verifier coverage, and analysis invalidation
+boring and reliable.
+
+The next GVN step is scalar-only. Keep memory GVN/redundant-load elimination
+out of the first implementation patch. Make alias and memory numbering explicit
+after scalar GVN is green and before `opt_gvn` starts rewriting `IR_LOAD` or
+reasoning across `IR_STORE`, calls, atomics, fences, volatile operations, TLS,
+globals, or unknown memory.
Exit criteria:
-- `CFREE_TOY_OPT_LEVELS="0 1 2" make test-toy` passes for the relevant arch.
-- Pass-local dump tests prove the intended rewrite fires.
+- [x] `make test-opt`
+- [x] Focused toy O2 branch/join/address-memory subset:
+ `CFREE_TEST_FILTER=128_o2_branch_join_addr_mem CFREE_TOY_OPT_LEVELS="0 1 2"
+ make test-toy`
+- [x] Full `CFREE_TOY_OPT_LEVELS="0 1 2" make test-toy`
+- [x] Disassembly sanity check shows narrow churn: existing
+ `10_pointer_addr_deref` removes separate address materialization in O2, while
+ the new branch/join/address fixture has only minor register-choice churn.
+- [x] Pass-local dump tests prove each later intended rewrite fires.
### Phase E - Inlining and Cleanup
@@ -656,9 +716,11 @@ src/opt/
ir.c Func/Block/Inst plumbing
ir_print.c stable dumps for pass tests
pass_cfg.c CFG, unreachable cleanup
- pass_clone.c block cloning
+ pass_o2.c early O2 transforms: block cloning, address transform,
+ temporary GVN/DSE/cleanup stubs
+ pass_clone.c block cloning (split out when pass size warrants it)
pass_ssa.c SSA build, conventional SSA, undo SSA
- pass_addr.c address transformation
+ pass_addr.c address transformation (split out when pass size warrants it)
pass_gvn.c GVN, constprop, redundant-load elimination
pass_copy.c copy propagation, extension cleanup
pass_dse.c dead store elimination
@@ -713,7 +775,11 @@ Use pass dumps for red-green tests:
- CFG: block/successor/predecessor shape.
- SSA: phi placement and def-use chains.
-- GVN: redundant expression/load replaced and constant branch folded.
+- Scalar GVN: redundant pure expression replaced and constant branch folded,
+ with def-use rebuilt and stale-analysis verifier coverage.
+- Memory GVN: redundant load/store-load reuse only after alias and memory
+ numbering are specified; tests must include volatile/atomic/call/unknown
+ memory barriers.
- DSE: dead store removed while escaping stores remain.
- LICM: safe invariant moved; trapping division and memory ops remain.
- RA: expected spill/reload/coalescing counters.
diff --git a/src/opt/ir_print.c b/src/opt/ir_print.c
@@ -0,0 +1,281 @@
+#include <stdio.h>
+#include <string.h>
+
+#include "opt/opt.h"
+
+static void dump_write(Writer* w, const char* s) {
+ cfree_writer_write(w, s, strlen(s));
+}
+
+static const char* op_name(IROp op) {
+ switch (op) {
+ case IR_NOP:
+ return "nop";
+ case IR_CONST_I:
+ return "const_i";
+ case IR_CONST_BYTES:
+ return "const_bytes";
+ case IR_PARAM_DECL:
+ return "param_decl";
+ case IR_LOAD_IMM:
+ return "load_imm";
+ case IR_LOAD_CONST:
+ return "load_const";
+ case IR_COPY:
+ return "copy";
+ case IR_LOAD:
+ return "load";
+ case IR_STORE:
+ return "store";
+ case IR_ADDR_OF:
+ return "addr_of";
+ case IR_TLS_ADDR_OF:
+ return "tls_addr_of";
+ case IR_AGG_COPY:
+ return "agg_copy";
+ case IR_AGG_SET:
+ return "agg_set";
+ case IR_BITFIELD_LOAD:
+ return "bitfield_load";
+ case IR_BITFIELD_STORE:
+ return "bitfield_store";
+ case IR_BINOP:
+ return "binop";
+ case IR_UNOP:
+ return "unop";
+ case IR_CMP:
+ return "cmp";
+ case IR_CONVERT:
+ return "convert";
+ case IR_CALL:
+ return "call";
+ case IR_PHI:
+ return "phi";
+ case IR_BR:
+ return "br";
+ case IR_CONDBR:
+ return "condbr";
+ case IR_CMP_BRANCH:
+ return "cmp_branch";
+ case IR_SWITCH:
+ return "switch";
+ case IR_INDIRECT_BRANCH:
+ return "indirect_branch";
+ case IR_LOAD_LABEL_ADDR:
+ return "load_label_addr";
+ case IR_RET:
+ return "ret";
+ case IR_SCOPE_BEGIN:
+ return "scope_begin";
+ case IR_SCOPE_ELSE:
+ return "scope_else";
+ case IR_SCOPE_END:
+ return "scope_end";
+ case IR_BREAK_TO:
+ return "break_to";
+ case IR_CONTINUE_TO:
+ return "continue_to";
+ case IR_ALLOCA:
+ return "alloca";
+ case IR_VA_START:
+ return "va_start";
+ case IR_VA_ARG:
+ return "va_arg";
+ case IR_VA_END:
+ return "va_end";
+ case IR_VA_COPY:
+ return "va_copy";
+ case IR_ATOMIC_LOAD:
+ return "atomic_load";
+ case IR_ATOMIC_STORE:
+ return "atomic_store";
+ case IR_ATOMIC_RMW:
+ return "atomic_rmw";
+ case IR_ATOMIC_CAS:
+ return "atomic_cas";
+ case IR_FENCE:
+ return "fence";
+ case IR_ASM_BLOCK:
+ return "asm_block";
+ case IR_INTRINSIC:
+ return "intrinsic";
+ default:
+ return "unknown";
+ }
+}
+
+static const char* alias_name(u8 kind) {
+ switch ((AliasKind)kind) {
+ case ALIAS_UNKNOWN:
+ return "unknown";
+ case ALIAS_LOCAL:
+ return "local";
+ case ALIAS_GLOBAL:
+ return "global";
+ case ALIAS_PARAM:
+ return "param";
+ case ALIAS_HEAP:
+ return "heap";
+ case ALIAS_STRING:
+ return "string";
+ default:
+ return "alias?";
+ }
+}
+
+static void dump_alias(Writer* w, const AliasRoot* a) {
+ char buf[64];
+ dump_write(w, alias_name(a->kind));
+ switch ((AliasKind)a->kind) {
+ case ALIAS_LOCAL:
+ snprintf(buf, sizeof buf, "#%d", (int)a->v.local_id);
+ dump_write(w, buf);
+ break;
+ case ALIAS_GLOBAL:
+ snprintf(buf, sizeof buf, "#%u", (unsigned)a->v.global);
+ dump_write(w, buf);
+ break;
+ case ALIAS_PARAM:
+ snprintf(buf, sizeof buf, "#%u", (unsigned)a->v.param_idx);
+ dump_write(w, buf);
+ break;
+ case ALIAS_STRING:
+ snprintf(buf, sizeof buf, "#%u", (unsigned)a->v.string_id);
+ dump_write(w, buf);
+ break;
+ default:
+ break;
+ }
+}
+
+static void dump_operand(Writer* w, const Operand* op) {
+ char buf[96];
+ if (!op) {
+ dump_write(w, "-");
+ return;
+ }
+ switch ((OpKind)op->kind) {
+ case OPK_IMM:
+ snprintf(buf, sizeof buf, "imm:%lld", (long long)op->v.imm);
+ dump_write(w, buf);
+ break;
+ case OPK_REG:
+ snprintf(buf, sizeof buf, "v%u", (unsigned)op->v.reg);
+ dump_write(w, buf);
+ break;
+ case OPK_LOCAL:
+ snprintf(buf, sizeof buf, "local#%u", (unsigned)op->v.frame_slot);
+ dump_write(w, buf);
+ break;
+ case OPK_GLOBAL:
+ snprintf(buf, sizeof buf, "global#%u%+lld", (unsigned)op->v.global.sym,
+ (long long)op->v.global.addend);
+ dump_write(w, buf);
+ break;
+ case OPK_INDIRECT:
+ snprintf(buf, sizeof buf, "[v%u%+d]", (unsigned)op->v.ind.base,
+ (int)op->v.ind.ofs);
+ dump_write(w, buf);
+ break;
+ default:
+ dump_write(w, "op?");
+ break;
+ }
+}
+
+static void dump_operands(Writer* w, const Inst* in) {
+ dump_write(w, " opnds=[");
+ for (u32 i = 0; i < in->nopnds; ++i) {
+ if (i) dump_write(w, ",");
+ dump_operand(w, &in->opnds[i]);
+ }
+ dump_write(w, "]");
+}
+
+static void dump_mem(Writer* w, const MemAccess* m) {
+ char buf[128];
+ snprintf(buf, sizeof buf,
+ " mem=size%u align%u flags=0x%x alias=", (unsigned)m->size,
+ (unsigned)m->align, (unsigned)m->flags);
+ dump_write(w, buf);
+ dump_alias(w, &m->alias);
+}
+
+static void dump_phi(Writer* w, const Inst* in) {
+ char buf[96];
+ IRPhiAux* aux = (IRPhiAux*)in->extra.aux;
+ snprintf(buf, sizeof buf, " slot=%u preds=[",
+ aux ? (unsigned)aux->slot_id : 0u);
+ dump_write(w, buf);
+ if (aux) {
+ for (u32 p = 0; p < aux->npreds; ++p) {
+ snprintf(buf, sizeof buf, "%sb%u:v%u", p ? "," : "",
+ (unsigned)aux->pred_blocks[p], (unsigned)aux->pred_vals[p]);
+ dump_write(w, buf);
+ }
+ }
+ dump_write(w, "]");
+}
+
+static void dump_ret(Writer* w, const Inst* in) {
+ IRRetAux* aux = (IRRetAux*)in->extra.aux;
+ if (!aux || !aux->present) {
+ dump_write(w, " ret=void");
+ return;
+ }
+ dump_write(w, " ret=");
+ dump_operand(w, &aux->val.storage);
+}
+
+void opt_ir_dump(Func* f, Writer* w) {
+ if (!f || !w) return;
+ char buf[160];
+ snprintf(buf, sizeof buf, "ir blocks=%u vals=%u\n", (unsigned)f->nblocks,
+ (unsigned)f->nvals);
+ dump_write(w, buf);
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ snprintf(buf, sizeof buf, "block %u preds=[", (unsigned)b);
+ dump_write(w, buf);
+ for (u32 p = 0; p < bl->npreds; ++p) {
+ snprintf(buf, sizeof buf, "%sb%u", p ? "," : "", (unsigned)bl->preds[p]);
+ dump_write(w, buf);
+ }
+ dump_write(w, "] succs=[");
+ for (u32 s = 0; s < bl->nsucc; ++s) {
+ snprintf(buf, sizeof buf, "%sb%u", s ? "," : "", (unsigned)bl->succ[s]);
+ dump_write(w, buf);
+ }
+ snprintf(buf, sizeof buf, "] insts=%u\n", (unsigned)bl->ninsts);
+ dump_write(w, buf);
+
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* in = &bl->insts[i];
+ snprintf(buf, sizeof buf, " %u %s", (unsigned)i, op_name((IROp)in->op));
+ dump_write(w, buf);
+ if (in->def != VAL_NONE) {
+ snprintf(buf, sizeof buf, " def=v%u", (unsigned)in->def);
+ dump_write(w, buf);
+ }
+ if (in->ndefs) {
+ dump_write(w, " defs=[");
+ for (u32 d = 0; d < in->ndefs; ++d) {
+ snprintf(buf, sizeof buf, "%sv%u", d ? "," : "",
+ (unsigned)in->defs[d]);
+ dump_write(w, buf);
+ }
+ dump_write(w, "]");
+ }
+ if (in->nopnds) dump_operands(w, in);
+ if ((IROp)in->op == IR_LOAD || (IROp)in->op == IR_STORE)
+ dump_mem(w, &in->extra.mem);
+ if ((IROp)in->op == IR_LOAD_IMM || (IROp)in->op == IR_CONST_I) {
+ snprintf(buf, sizeof buf, " imm=%lld", (long long)in->extra.imm);
+ dump_write(w, buf);
+ }
+ if ((IROp)in->op == IR_PHI) dump_phi(w, in);
+ if ((IROp)in->op == IR_RET) dump_ret(w, in);
+ dump_write(w, "\n");
+ }
+ }
+}
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -1407,12 +1407,22 @@ static void opt_run_o2_pipeline(OptImpl* o) {
opt_jump_cleanup(o->f, OPT_JUMP_CLEANUP_CFG);
opt_build_cfg(o->f);
opt_verify(o->f, "o2-pre-ssa-cfg");
+ opt_block_cloning(o->f);
+ opt_verify(o->f, "o2-block-clone-cfg");
+ opt_ssa_dce(o->f);
+ opt_copy_cleanup(o->f);
+ opt_verify(o->f, "o2-block-clone-cleanup");
opt_build_ssa(o->f);
opt_verify(o->f, "o2-ssa");
opt_ssa_dce(o->f);
- opt_verify(o->f, "o2-ssa-dce");
opt_copy_cleanup(o->f);
- opt_verify(o->f, "o2-copy-cleanup");
+ opt_verify(o->f, "o2-pre-addr-cleanup");
+ opt_addr_xform(o->f);
+ opt_verify(o->f, "o2-addr-xform-ssa");
+ opt_ssa_dce(o->f);
+ opt_verify(o->f, "o2-addr-xform-dce");
+ opt_copy_cleanup(o->f);
+ opt_verify(o->f, "o2-addr-xform-copy-cleanup");
opt_ssa_dce(o->f);
opt_verify(o->f, "o2-copy-dce");
opt_make_conventional_ssa(o->f);
diff --git a/src/opt/opt.h b/src/opt/opt.h
@@ -163,6 +163,7 @@ void opt_live_blocks(Func*, OptLiveInfo*);
void opt_live_dump_blocks(Func*, const OptLiveInfo*, Writer*);
void opt_live_ranges_build(Func*, const OptLiveInfo*, OptLiveRangeSet*);
void opt_live_dump_ranges(Func*, const OptLiveRangeSet*, Writer*);
+void opt_ir_dump(Func*, Writer*);
void opt_ssa_dump(Func*, Writer*);
void opt_rewrite_dump(Func*, Writer*);
void opt_coalesce(Func*);
diff --git a/src/opt/pass_o2.c b/src/opt/pass_o2.c
@@ -0,0 +1,383 @@
+#include <string.h>
+
+#include "opt/opt_internal.h"
+
+#define OPT_BLOCK_NONE 0xffffffffu
+#define OPT_CLONE_MAX_BLOCK_INSTS 4u
+#define OPT_CLONE_MAX_PREDS 4u
+#define OPT_CLONE_ABS_GROWTH_LIMIT 32u
+
+typedef struct CloneValMap {
+ Val old_val;
+ Val new_val;
+} CloneValMap;
+
+static u32 opt_inst_count(Func* f) {
+ u32 n = 0;
+ for (u32 b = 0; b < f->nblocks; ++b) n += f->blocks[b].ninsts;
+ return n;
+}
+
+static int o2_is_terminator(const Inst* in) {
+ switch ((IROp)in->op) {
+ case IR_BR:
+ case IR_CONDBR:
+ case IR_CMP_BRANCH:
+ case IR_SWITCH:
+ case IR_INDIRECT_BRANCH:
+ case IR_RET:
+ case IR_BREAK_TO:
+ case IR_CONTINUE_TO:
+ return 1;
+ case IR_INTRINSIC: {
+ IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux;
+ return aux && (aux->kind == INTRIN_LONGJMP || aux->kind == INTRIN_TRAP ||
+ aux->kind == INTRIN_UNREACHABLE);
+ }
+ default:
+ return 0;
+ }
+}
+
+static int o2_cloneable_inst(const Inst* in) {
+ if (in->ndefs) return 0;
+ switch ((IROp)in->op) {
+ case IR_NOP:
+ case IR_LOAD_IMM:
+ case IR_LOAD_CONST:
+ case IR_COPY:
+ case IR_LOAD:
+ case IR_STORE:
+ case IR_ADDR_OF:
+ case IR_BINOP:
+ case IR_UNOP:
+ case IR_CMP:
+ case IR_CONVERT:
+ case IR_BR:
+ case IR_RET:
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+static int block_has_phi(Func* f, u32 b) {
+ if (b >= f->nblocks) return 0;
+ Block* bl = &f->blocks[b];
+ return bl->ninsts && (IROp)bl->insts[0].op == IR_PHI;
+}
+
+static int block_defs_are_local(Func* f, u32 b) {
+ Block* bl = &f->blocks[b];
+ opt_rebuild_def_use(f);
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* in = &bl->insts[i];
+ if (in->def == VAL_NONE) continue;
+ for (u32 u = f->opt_first_use_by_val[in->def]; u != OPT_USE_NONE;
+ u = f->opt_uses[u].next_for_val) {
+ if (f->opt_uses[u].block != b) return 0;
+ }
+ }
+ return 1;
+}
+
+static int block_defines_val(const Block* bl, Val v) {
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ const Inst* in = &bl->insts[i];
+ if (in->def == v) return 1;
+ for (u32 d = 0; d < in->ndefs; ++d)
+ if (in->defs[d] == v) return 1;
+ }
+ return 0;
+}
+
+static int block_external_uses_dominated(Func* f, const OptAnalysis* a, u32 b) {
+ Block* bl = &f->blocks[b];
+ opt_rebuild_def_use(f);
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ for (u32 u = 0; u < f->opt_nuses; ++u) {
+ OptUse* use = &f->opt_uses[u];
+ if (use->block != b || use->inst != i) continue;
+ Val v = use->val;
+ if (block_defines_val(bl, v)) continue;
+ if (v == VAL_NONE || v >= f->nvals) return 0;
+ if (!opt_analysis_dominates(a, f->val_def_block[v], b)) return 0;
+ }
+ }
+ return 1;
+}
+
+static int clone_candidate(Func* f, const OptAnalysis* a, u32 block) {
+ if (block >= f->nblocks || block == f->entry) return 0;
+ Block* bl = &f->blocks[block];
+ if (!a->reachable || !a->reachable[block]) return 0;
+ if (bl->npreds < 2 || bl->npreds > OPT_CLONE_MAX_PREDS) return 0;
+ if (bl->ninsts == 0 || bl->ninsts > OPT_CLONE_MAX_BLOCK_INSTS) return 0;
+ if (bl->nsucc > 1) return 0;
+ if (bl->loop_depth) return 0;
+ if (block_has_phi(f, block)) return 0;
+ if (bl->nsucc == 1 && block_has_phi(f, bl->succ[0])) return 0;
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ if (!o2_cloneable_inst(&bl->insts[i])) return 0;
+ if (i + 1u != bl->ninsts && o2_is_terminator(&bl->insts[i])) return 0;
+ }
+ for (u32 p = 0; p < bl->npreds; ++p) {
+ u32 pred = bl->preds[p];
+ if (pred >= f->nblocks) return 0;
+ if (f->blocks[pred].loop_depth) return 0;
+ if (opt_analysis_dominates(a, block, pred)) return 0;
+ }
+ return block_defs_are_local(f, block) &&
+ block_external_uses_dominated(f, a, block);
+}
+
+static Val map_lookup(const CloneValMap* map, u32 nmap, Val v) {
+ for (u32 i = 0; i < nmap; ++i)
+ if (map[i].old_val == v) return map[i].new_val;
+ return VAL_NONE;
+}
+
+static void remap_operand(Func* f, Inst* in, Operand* op, int is_def,
+ void* arg) {
+ (void)in;
+ (void)is_def;
+ CloneValMap* map = (CloneValMap*)arg;
+ u32 nmap = map[0].old_val;
+ if (op->kind != OPK_REG) return;
+ Val repl = map_lookup(map + 1, nmap, (Val)op->v.reg);
+ if (repl == VAL_NONE) return;
+ op->v.reg = (Reg)repl;
+ op->type = f->val_type[repl];
+ op->cls = f->val_cls[repl];
+}
+
+static IRRetAux* clone_ret_aux(Func* f, const IRRetAux* src) {
+ if (!src) return NULL;
+ IRRetAux* dst = arena_znew(f->arena, IRRetAux);
+ *dst = *src;
+ if (src->val.nparts) {
+ CGABIPart* parts = arena_array(f->arena, CGABIPart, src->val.nparts);
+ memcpy(parts, src->val.parts, sizeof(parts[0]) * src->val.nparts);
+ dst->val.parts = parts;
+ }
+ return dst;
+}
+
+static void clone_inst_aux(Func* f, Inst* dst, const Inst* src) {
+ if ((IROp)src->op == IR_RET) {
+ dst->extra.aux = clone_ret_aux(f, (const IRRetAux*)src->extra.aux);
+ }
+}
+
+static u32 emit_order_pos(Func* f, u32 block) {
+ for (u32 i = 0; i < f->emit_order_n; ++i)
+ if (f->emit_order[i] == block) return i;
+ return OPT_BLOCK_NONE;
+}
+
+static void emit_order_insert_after(Func* f, u32 after, u32 block) {
+ if (emit_order_pos(f, block) != OPT_BLOCK_NONE) return;
+ if (f->emit_order_n == f->emit_order_cap) {
+ u32 ncap = f->emit_order_cap ? f->emit_order_cap * 2u : 8u;
+ u32* order = arena_array(f->arena, u32, ncap);
+ if (f->emit_order)
+ memcpy(order, f->emit_order, sizeof(order[0]) * f->emit_order_n);
+ f->emit_order = order;
+ f->emit_order_cap = ncap;
+ }
+ u32 pos = f->emit_order_n;
+ u32 after_pos = emit_order_pos(f, after);
+ if (after_pos != OPT_BLOCK_NONE) pos = after_pos + 1u;
+ for (u32 i = f->emit_order_n; i > pos; --i)
+ f->emit_order[i] = f->emit_order[i - 1u];
+ f->emit_order[pos] = block;
+ ++f->emit_order_n;
+}
+
+static void replace_succ_ref(Func* f, u32 pred, u32 old_succ, u32 new_succ) {
+ Block* bl = &f->blocks[pred];
+ for (u32 s = 0; s < bl->nsucc; ++s)
+ if (bl->succ[s] == old_succ) bl->succ[s] = new_succ;
+ if (!bl->ninsts) return;
+ Inst* term = &bl->insts[bl->ninsts - 1u];
+ if ((IROp)term->op == IR_SWITCH) {
+ IRSwitchAux* aux = (IRSwitchAux*)term->extra.aux;
+ if (!aux) return;
+ for (u32 i = 0; i < aux->ncases; ++i)
+ if (aux->cases[i].block == old_succ) aux->cases[i].block = new_succ;
+ if (aux->default_block == old_succ) aux->default_block = new_succ;
+ } else if ((IROp)term->op == IR_INDIRECT_BRANCH) {
+ IRIndirectAux* aux = (IRIndirectAux*)term->extra.aux;
+ if (!aux) return;
+ for (u32 i = 0; i < aux->ntargets; ++i)
+ if (aux->targets[i] == old_succ) aux->targets[i] = new_succ;
+ }
+}
+
+static u32 clone_block_for_pred(Func* f, u32 block, u32 pred) {
+ Block* src = &f->blocks[block];
+ u32 nb = ir_block_new(f);
+ Block* dst = &f->blocks[nb];
+ ir_block_set_nsucc(f, nb, src->nsucc);
+ for (u32 s = 0; s < src->nsucc; ++s) dst->succ[s] = src->succ[s];
+ dst->loop_depth = src->loop_depth;
+ dst->frequency = src->frequency;
+
+ CloneValMap* map = arena_zarray(f->arena, CloneValMap, src->ninsts + 1u);
+ u32 nmap = 0;
+ for (u32 i = 0; i < src->ninsts; ++i) {
+ Val old = src->insts[i].def;
+ if (old == VAL_NONE) continue;
+ map[++nmap].old_val = old;
+ map[nmap].new_val = ir_alloc_val(f, f->val_type[old], f->val_cls[old]);
+ }
+ map[0].old_val = nmap;
+
+ for (u32 i = 0; i < src->ninsts; ++i) {
+ Inst* in = ir_emit(f, nb, (IROp)src->insts[i].op);
+ InstId id = in->id;
+ *in = src->insts[i];
+ in->id = id;
+ if (in->nopnds) {
+ Operand* opnds = arena_array(f->arena, Operand, in->nopnds);
+ memcpy(opnds, src->insts[i].opnds, sizeof(opnds[0]) * in->nopnds);
+ in->opnds = opnds;
+ }
+ clone_inst_aux(f, in, &src->insts[i]);
+ if (in->def != VAL_NONE) in->def = map_lookup(map + 1, nmap, in->def);
+ opt_walk_inst_operands(f, in, remap_operand, map);
+ if (in->def != VAL_NONE && in->def < f->nvals) {
+ f->val_def_block[in->def] = nb;
+ f->val_def_inst[in->def] = dst->ninsts - 1u;
+ }
+ }
+
+ replace_succ_ref(f, pred, block, nb);
+ emit_order_insert_after(f, pred, nb);
+ return nb;
+}
+
+void opt_block_cloning(Func* f) {
+ if (!f || f->opt_rewritten) return;
+ opt_build_loop_tree(f);
+ OptAnalysis a;
+ memset(&a, 0, sizeof a);
+ opt_analysis_build_dominators(f, &a);
+
+ u32 base_insts = opt_inst_count(f);
+ u32 max_extra = base_insts / 4u + 8u;
+ if (max_extra > OPT_CLONE_ABS_GROWTH_LIMIT)
+ max_extra = OPT_CLONE_ABS_GROWTH_LIMIT;
+ u32 extra = 0;
+ int changed = 0;
+
+ u32 original_blocks = f->nblocks;
+ for (u32 b = 0; b < original_blocks; ++b) {
+ if (!clone_candidate(f, &a, b)) continue;
+ Block* bl = &f->blocks[b];
+ u32 cost = bl->ninsts;
+ if (cost == 0 || cost * bl->npreds + extra > max_extra) continue;
+ u32 npreds = bl->npreds;
+ u32* preds = arena_array(f->arena, u32, npreds);
+ memcpy(preds, bl->preds, sizeof(preds[0]) * npreds);
+ for (u32 p = 0; p < npreds; ++p) {
+ clone_block_for_pred(f, b, preds[p]);
+ extra += cost;
+ }
+ changed = 1;
+ }
+
+ if (changed) {
+ opt_analysis_invalidate(f, OPT_ANALYSIS_CFG | OPT_ANALYSIS_DEF_USE |
+ OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP);
+ opt_build_cfg(f);
+ }
+ opt_rebuild_def_use(f);
+}
+
+static int addr_def_inst(Func* f, Val v, Inst** out) {
+ if (v == VAL_NONE || v >= f->nvals) return 0;
+ u32 b = f->val_def_block[v];
+ u32 i = f->val_def_inst[v];
+ if (b >= f->nblocks || i >= f->blocks[b].ninsts) return 0;
+ Inst* in = &f->blocks[b].insts[i];
+ if ((IROp)in->op != IR_ADDR_OF || in->def != v || in->nopnds < 2) return 0;
+ *out = in;
+ return 1;
+}
+
+static int addr_use_foldable(Func* f, const OptUse* use) {
+ if (!use || use->kind != OPT_USE_INDIRECT_BASE) return 0;
+ if (use->block >= f->nblocks || use->inst >= f->blocks[use->block].ninsts)
+ return 0;
+ Inst* in = &f->blocks[use->block].insts[use->inst];
+ if ((IROp)in->op != IR_LOAD && (IROp)in->op != IR_STORE) return 0;
+ if (opt_mem_observable(&in->extra.mem)) return 0;
+ if (!use->operand || use->operand->kind != OPK_INDIRECT) return 0;
+ if (use->operand->v.ind.ofs != 0) return 0;
+ if ((IROp)in->op == IR_LOAD && use->operand_index != 1u) return 0;
+ if ((IROp)in->op == IR_STORE && use->operand_index != 0u) return 0;
+ return 1;
+}
+
+static int addr_all_uses_foldable(Func* f, Val v) {
+ u32 nuses = 0;
+ for (u32 u = f->opt_first_use_by_val[v]; u != OPT_USE_NONE;
+ u = f->opt_uses[u].next_for_val) {
+ ++nuses;
+ if (!addr_use_foldable(f, &f->opt_uses[u])) return 0;
+ }
+ return nuses != 0;
+}
+
+static void addr_inst_remove(Inst* in) {
+ in->op = IR_NOP;
+ in->def = VAL_NONE;
+ in->ndefs = 0;
+ in->defs = NULL;
+ in->nopnds = 0;
+ in->opnds = NULL;
+}
+
+void opt_addr_xform(Func* f) {
+ if (!f || f->opt_rewritten) return;
+ opt_rebuild_def_use(f);
+ int changed = 0;
+ u32 nvals = f->nvals;
+ for (Val v = 1; v < nvals; ++v) {
+ Inst* def = NULL;
+ if (!addr_def_inst(f, v, &def)) continue;
+ Operand lv = def->opnds[1];
+ if (lv.kind != OPK_LOCAL) continue;
+ if (!addr_all_uses_foldable(f, v)) continue;
+
+ for (u32 u = f->opt_first_use_by_val[v]; u != OPT_USE_NONE;
+ u = f->opt_uses[u].next_for_val) {
+ OptUse* use = &f->opt_uses[u];
+ Inst* mem = &f->blocks[use->block].insts[use->inst];
+ Operand folded = lv;
+ folded.type = mem->extra.mem.type ? mem->extra.mem.type : lv.type;
+ *use->operand = folded;
+ }
+ addr_inst_remove(def);
+ changed = 1;
+ }
+ if (changed)
+ opt_analysis_invalidate(
+ f, OPT_ANALYSIS_DEF_USE | OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP);
+ opt_rebuild_def_use(f);
+}
+
+void opt_gvn(Func* f) {
+ if (f) opt_rebuild_def_use(f);
+}
+
+void opt_dse(Func* f) {
+ if (f) opt_rebuild_def_use(f);
+}
+
+void opt_cleanup(Func* f) {
+ if (!f) return;
+ opt_ssa_dce(f);
+ opt_copy_cleanup(f);
+}
diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c
@@ -328,6 +328,36 @@ static Inst* emit_store_local(Func* f, u32 b, FrameSlot fs, Val src,
return in;
}
+static Inst* emit_addr_of_local(Func* f, u32 b, Val dst, FrameSlot fs,
+ CfreeCgTypeId ptr_ty, CfreeCgTypeId local_ty) {
+ Inst* in = ir_emit(f, b, IR_ADDR_OF);
+ in->opnds = arena_array(f->arena, Operand, 2);
+ in->opnds[0] = op_reg_(dst, ptr_ty);
+ in->opnds[1] = op_local_(fs, local_ty);
+ in->nopnds = 2;
+ in->def = dst;
+ in->type = ptr_ty;
+ f->val_def_block[dst] = b;
+ f->val_def_inst[dst] = f->blocks[b].ninsts - 1u;
+ return in;
+}
+
+static Inst* emit_load_indirect(Func* f, u32 b, Val dst, Val base,
+ CfreeCgTypeId ty, u16 flags) {
+ Inst* in = ir_emit(f, b, IR_LOAD);
+ in->opnds = arena_array(f->arena, Operand, 2);
+ in->opnds[0] = op_reg_(dst, ty);
+ in->opnds[1] = op_indirect_((Reg)base, ty);
+ in->nopnds = 2;
+ in->def = dst;
+ in->type = ty;
+ in->extra.mem = mem_unknown_(ty, 4);
+ in->extra.mem.flags = flags;
+ f->val_def_block[dst] = b;
+ f->val_def_inst[dst] = f->blocks[b].ninsts - 1u;
+ return in;
+}
+
static Inst* emit_call_void(Func* f, u32 b) {
Inst* in = ir_emit(f, b, IR_CALL);
IRCallAux* aux = arena_znew(f->arena, IRCallAux);
@@ -373,6 +403,24 @@ static int bytes_contains(const unsigned char* data, size_t len,
return 0;
}
+static void expect_ir_dump_eq(Func* f, const char* expected,
+ const char* label) {
+ CfreeWriter* w = NULL;
+ (void)cfree_writer_mem(&g_heap, &w);
+ opt_ir_dump(f, w);
+ size_t len = 0;
+ const unsigned char* bytes = cfree_writer_mem_bytes(w, &len);
+ size_t expected_len = strlen(expected);
+ int ok = bytes && len == expected_len &&
+ memcmp(bytes, expected, expected_len) == 0;
+ EXPECT(ok, "%s dump should match golden", label);
+ if (!ok) {
+ fprintf(stderr, "expected:\n%sactual:\n%.*s\n", expected, (int)len,
+ bytes ? (const char*)bytes : "");
+ }
+ cfree_writer_close(w);
+}
+
static void ensure_test_val_info(Func* f) {
if (f->val_info) return;
f->val_info = arena_zarray(f->arena, OptValInfo, f->nvals ? f->nvals : 1u);
@@ -2168,6 +2216,161 @@ static void opt_copy_cleanup_rewrites_users(void) {
tc_fini(&tc);
}
+static void opt_block_cloning_clones_small_join_blocks(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ Func* f = new_func(&tc);
+ u32 entry = f->entry;
+ u32 then_b = ir_block_new(f);
+ u32 else_b = ir_block_new(f);
+ u32 join = ir_block_new(f);
+ ir_note_emit(f, then_b);
+ ir_note_emit(f, else_b);
+ ir_note_emit(f, join);
+
+ Val out = add_val(f, tc.i32);
+ emit_test_branch(f, entry, then_b, else_b, tc.i32);
+ emit_br_to(f, then_b, join);
+ emit_br_to(f, else_b, join);
+ emit_load_imm(f, join, out, tc.i32, 7);
+ emit_ret_val(f, join, out, tc.i32);
+
+ opt_build_cfg(f);
+ u32 before_blocks = f->nblocks;
+ opt_block_cloning(f);
+ opt_verify(f, "test-block-clone-join");
+
+ EXPECT(f->nblocks > before_blocks,
+ "block cloning should create per-predecessor join clones");
+ EXPECT(f->blocks[join].ninsts == 0,
+ "original fully cloned join should become unreachable");
+ EXPECT(count_op(f, IR_RET) == 2,
+ "cloned join should leave two reachable return blocks");
+ expect_ir_dump_eq(f,
+ "ir blocks=6 vals=4\n"
+ "block 0 preds=[] succs=[b1,b2] insts=1\n"
+ " 0 cmp_branch opnds=[imm:1,imm:0]\n"
+ "block 1 preds=[b0] succs=[b4] insts=1\n"
+ " 0 br\n"
+ "block 2 preds=[b0] succs=[b5] insts=1\n"
+ " 0 br\n"
+ "block 3 preds=[] succs=[] insts=0\n"
+ "block 4 preds=[b1] succs=[] insts=2\n"
+ " 0 load_imm def=v2 opnds=[v2] imm=7\n"
+ " 1 ret ret=v2\n"
+ "block 5 preds=[b2] succs=[] insts=2\n"
+ " 0 load_imm def=v3 opnds=[v3] imm=7\n"
+ " 1 ret ret=v3\n",
+ "block clone");
+ tc_fini(&tc);
+}
+
+static void opt_block_cloning_skips_loop_backedges(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ Func* f = new_func(&tc);
+ u32 entry = f->entry;
+ u32 header = ir_block_new(f);
+ u32 body = ir_block_new(f);
+ u32 exit = ir_block_new(f);
+ ir_note_emit(f, header);
+ ir_note_emit(f, body);
+ ir_note_emit(f, exit);
+
+ Val out = add_val(f, tc.i32);
+ emit_br_to(f, entry, header);
+ emit_test_branch(f, header, body, exit, tc.i32);
+ emit_br_to(f, body, header);
+ emit_load_imm(f, exit, out, tc.i32, 3);
+ emit_ret_val(f, exit, out, tc.i32);
+
+ opt_build_cfg(f);
+ u32 before_blocks = f->nblocks;
+ opt_block_cloning(f);
+ opt_verify(f, "test-block-clone-loop-skip");
+ EXPECT(f->nblocks == before_blocks,
+ "block cloning should skip loop headers/backedges");
+ tc_fini(&tc);
+}
+
+static void opt_addr_xform_folds_local_addr_into_memory_operand(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ Func* f = new_func(&tc);
+ FrameSlot fs = add_frame_slot(f, tc.i32, FS_LOCAL, 4, FSF_ADDR_TAKEN);
+ CfreeCgTypeId ptr_ty = cfree_cg_type_ptr(tc.c, tc.i32, 0);
+ Val addr = add_val(f, ptr_ty);
+ Val out = add_val(f, tc.i32);
+ emit_addr_of_local(f, f->entry, addr, fs, ptr_ty, tc.i32);
+ emit_load_indirect(f, f->entry, out, addr, tc.i32, 0);
+ emit_ret_val(f, f->entry, out, tc.i32);
+
+ opt_build_cfg(f);
+ opt_build_ssa(f);
+ opt_addr_xform(f);
+ opt_verify(f, "test-addr-xform-local");
+ Inst* load = &f->blocks[f->entry].insts[1];
+ EXPECT((IROp)load->op == IR_LOAD && load->opnds[1].kind == OPK_LOCAL,
+ "address transform should fold local addr_of into load operand");
+ expect_ir_dump_eq(f,
+ "ir blocks=1 vals=3\n"
+ "block 0 preds=[] succs=[] insts=3\n"
+ " 0 nop\n"
+ " 1 load def=v2 opnds=[v2,local#1] mem=size4 align4 "
+ "flags=0x0 alias=unknown\n"
+ " 2 ret ret=v2\n",
+ "address transform");
+ opt_ssa_dce(f);
+ opt_copy_cleanup(f);
+ opt_verify(f, "test-addr-xform-local-cleanup");
+ EXPECT(count_op(f, IR_ADDR_OF) == 0,
+ "cleanup after addr_xform should remove dead addr_of pseudo");
+ tc_fini(&tc);
+}
+
+static void opt_addr_xform_preserves_volatile_and_globals(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ CfreeCgTypeId ptr_ty = cfree_cg_type_ptr(tc.c, tc.i32, 0);
+
+ Func* f = new_func(&tc);
+ FrameSlot fs = add_frame_slot(f, tc.i32, FS_LOCAL, 4, FSF_ADDR_TAKEN);
+ Val addr = add_val(f, ptr_ty);
+ Val out = add_val(f, tc.i32);
+ emit_addr_of_local(f, f->entry, addr, fs, ptr_ty, tc.i32);
+ emit_load_indirect(f, f->entry, out, addr, tc.i32, MF_VOLATILE);
+ emit_ret_val(f, f->entry, out, tc.i32);
+ opt_build_cfg(f);
+ opt_build_ssa(f);
+ opt_addr_xform(f);
+ opt_verify(f, "test-addr-xform-volatile");
+ EXPECT(count_op(f, IR_ADDR_OF) == 1,
+ "address transform should preserve volatile memory addresses");
+
+ Func* g = new_func(&tc);
+ ObjSymId sym = 1;
+ Val gaddr = add_val(g, ptr_ty);
+ Val gout = add_val(g, tc.i32);
+ Inst* a = ir_emit(g, g->entry, IR_ADDR_OF);
+ a->opnds = arena_array(g->arena, Operand, 2);
+ a->opnds[0] = op_reg_(gaddr, ptr_ty);
+ a->opnds[1] = op_global_(sym, 0, tc.i32);
+ a->nopnds = 2;
+ a->def = gaddr;
+ a->type = ptr_ty;
+ g->val_def_block[gaddr] = g->entry;
+ g->val_def_inst[gaddr] = g->blocks[g->entry].ninsts - 1u;
+ emit_load_indirect(g, g->entry, gout, gaddr, tc.i32, 0);
+ emit_ret_val(g, g->entry, gout, tc.i32);
+ opt_build_cfg(g);
+ opt_build_ssa(g);
+ opt_addr_xform(g);
+ opt_verify(g, "test-addr-xform-global");
+ EXPECT(count_op(g, IR_ADDR_OF) == 1,
+ "address transform should leave global address materialization alone");
+ tc_fini(&tc);
+}
+
static void opt_jump_cleanup_forwards_branch_targets(void) {
TestCtx tc;
tc_init(&tc);
@@ -4212,6 +4415,10 @@ int main(void) {
opt_verify_catches_stale_def_use();
opt_ssa_dce_removes_dead_defs_and_phi();
opt_copy_cleanup_rewrites_users();
+ opt_block_cloning_clones_small_join_blocks();
+ opt_block_cloning_skips_loop_backedges();
+ opt_addr_xform_folds_local_addr_into_memory_operand();
+ opt_addr_xform_preserves_volatile_and_globals();
opt_jump_cleanup_forwards_branch_targets();
opt_jump_cleanup_inverts_to_remove_jump_block();
opt_jump_cleanup_keeps_conditional_fallthrough_block();
diff --git a/test/toy/cases/128_o2_branch_join_addr_mem.expected b/test/toy/cases/128_o2_branch_join_addr_mem.expected
@@ -0,0 +1 @@
+52
diff --git a/test/toy/cases/128_o2_branch_join_addr_mem.toy b/test/toy/cases/128_o2_branch_join_addr_mem.toy
@@ -0,0 +1,16 @@
+fn choose(flag: i64): i64 {
+ var x: i64 = 5;
+ if flag != 0 {
+ x = x + 10;
+ } else {
+ x = x + 20;
+ }
+ let p: *i64 = &x;
+ return p[0] + 6;
+}
+
+fn __user_main(): i64 {
+ return choose(1) + choose(0);
+}
+
+fn main(): i32 { return __user_main() as i32; }