commit 3d23cd16ae3c4e78bf1feff6be03bf9e6b3fe9c3
parent 3a249786f7441806f9ba1eb92ce1dd40ba07da04
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 21 May 2026 15:13:28 -0700
opt: add SSA copy propagation
Diffstat:
7 files changed, 295 insertions(+), 5 deletions(-)
diff --git a/doc/OPT.md b/doc/OPT.md
@@ -433,6 +433,10 @@ ssa_dce
verify(o2-addr-xform-dce)
copy_cleanup
verify(o2-addr-xform-copy-cleanup)
+gvn
+verify(o2-gvn-ssa)
+copy_prop
+verify(o2-copy-prop-ssa)
ssa_dce
verify(o2-copy-dce)
make_conventional_ssa
@@ -526,6 +530,10 @@ Current behavior:
atomic, global, TLS, non-zero indirect-offset, and otherwise non-local address
cases remain materialized. Both passes rebuild/invalidate analysis state and
are bracketed by `opt_ssa_dce`, `opt_copy_cleanup`, and verifier checkpoints.
+- The first post-address value passes are in the O2 path: scalar-only
+ `opt_gvn` rewrites dominated duplicate pure expressions and folds safe
+ integer constants/branches without interpreting memory, and `opt_copy_prop`
+ propagates SSA copy chains plus collapses redundant integer extension chains.
Temporary defensive/pessimizing checks to lift:
@@ -690,8 +698,8 @@ Implement in order:
3. [x] `opt_block_cloning`
4. [x] `opt_addr_xform`
5. [x] scalar-only `opt_gvn`
-6. [ ] `opt_copy_prop`
-7. [ ] memory-aware `opt_gvn` / redundant load elimination design and tests
+6. [x] `opt_copy_prop`
+7. [ ] memory-aware `opt_gvn` / redundant load elimination tests
8. [ ] `opt_dse`
9. [ ] `opt_licm`
10. [ ] `opt_pressure_relief`
@@ -727,12 +735,56 @@ Completed pre-GVN slice exit criteria:
- [x] Pass-local dump tests prove the implemented SSA DCE, copy cleanup, block
cloning, and address-folding rewrites fire.
+Memory numbering and alias invalidation model for memory GVN:
+
+- Memory value numbering is keyed by `(address-space, alias root, byte range,
+ memory version, load type, size, alignment, flags)`. The address operand is
+ still part of the key unless the alias root is a singleton local/global/string
+ object with a proven constant offset. This keeps two unknown pointers in the
+ same alias class from being treated as the same location.
+- A memory version token is maintained per alias root plus one conservative
+ `unknown` token. `ALIAS_LOCAL`, `ALIAS_GLOBAL`, `ALIAS_PARAM`, `ALIAS_HEAP`,
+ and `ALIAS_STRING` each use their root id as the fine-grained version key;
+ `ALIAS_UNKNOWN`, missing alias data, unsupported address forms, and escaped
+ allocas use the unknown token. Address space participates before alias-root
+ comparison, so different address spaces never share a memory value number.
+- A store to a precise root increments that root's version. A store through
+ unknown memory increments the unknown token and invalidates every non-invariant
+ root that may alias unknown memory. A store to volatile memory is never
+ forwarded from or to, and also acts as an observable memory barrier for other
+ volatile operations.
+- Calls increment the unknown token and invalidate all roots that can escape:
+ globals, params, heap, TLS/global address roots, address-taken locals, and
+ any local whose address has flowed to unknown memory. Non-address-taken
+ promoted locals may keep their root version across a call. Later call attrs
+ can narrow this, but the first memory GVN slice should assume calls clobber.
+- Atomics, fences, inline asm with memory effects, `IR_AGG_COPY`,
+ `IR_AGG_SET`, bitfield stores, varargs memory operations, and runtime memory
+ intrinsics are full memory barriers until each has a precise alias rule.
+ Atomic loads are not redundant-load candidates unless a later model proves
+ the exact ordering legality.
+- TLS addresses are treated like globals for alias roots but are call-clobbered
+ unless the access is proven local-exec and the call cannot observe the thread
+ object. The first slice should conservatively invalidate them at calls.
+- Redundant-load elimination may replace `IR_LOAD` only when the prior value's
+ def dominates the use, the memory key including version is identical, neither
+ access is volatile/atomic, the load type and byte range match, and no
+ invalidating barrier lies between the two through the version model. Store/load
+ reuse uses the same key and additionally requires the stored operand's value
+ to dominate the load.
+
Remaining Phase D exit criteria:
- [x] Scalar GVN pass-local dump tests prove redundant pure expressions are
replaced and constant branches fold.
-- [ ] Memory numbering and alias invalidation are documented before any memory
+- [x] Memory numbering and alias invalidation are documented before any memory
GVN or redundant-load rewrite lands.
+- [x] `opt_copy_prop` pass-local tests prove SSA copy propagation and redundant
+ extension-chain cleanup, with a focused toy O2 fixture.
+- [x] AArch64 O2 disassembly sanity checks show `130_o2_copy_prop_ext`
+ collapsing the widening/copy chain to a single byte load/zero-extend path,
+ while `128_o2_branch_join_addr_mem` still keeps the stack load pending memory
+ GVN as expected.
- [ ] DSE, LICM, pressure relief, SSA combine, and full O2 jump optimization
each have focused red-green tests before they enter the default O2 schedule.
@@ -799,7 +851,7 @@ src/opt/
ir_print.c stable dumps for pass tests
pass_cfg.c CFG, unreachable cleanup
pass_o2.c early O2 transforms: block cloning, address transform,
- temporary GVN/DSE/cleanup stubs
+ scalar GVN, temporary DSE/cleanup stubs
pass_clone.c block cloning (split out when pass size warrants it)
pass_ssa.c SSA build, conventional SSA, undo SSA
pass_addr.c address transformation (split out when pass size warrants it)
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -1429,6 +1429,8 @@ static void opt_run_o2_pipeline(OptImpl* o) {
opt_verify(o->f, "o2-addr-xform-copy-cleanup");
opt_gvn(o->f);
opt_verify(o->f, "o2-gvn-ssa");
+ opt_copy_prop(o->f);
+ opt_verify(o->f, "o2-copy-prop-ssa");
opt_ssa_dce(o->f);
opt_verify(o->f, "o2-copy-dce");
opt_make_conventional_ssa(o->f);
diff --git a/src/opt/pass_copy.c b/src/opt/pass_copy.c
@@ -1,5 +1,16 @@
+#include <string.h>
+
#include "opt/opt_internal.h"
+typedef struct ConvertStep {
+ Val src;
+ CfreeCgTypeId src_type;
+ CfreeCgTypeId dst_type;
+ u32 src_width;
+ u32 dst_width;
+ u8 kind;
+} ConvertStep;
+
static int same_val_shape(Func* f, Val a, Val b) {
if (a == VAL_NONE || b == VAL_NONE || a >= f->nvals || b >= f->nvals)
return 0;
@@ -85,6 +96,113 @@ static void compact_copies(Func* f) {
}
}
+static Inst* val_def_inst(Func* f, Val v) {
+ if (!f || v == VAL_NONE || v >= f->nvals) return NULL;
+ u32 b = f->val_def_block[v];
+ u32 i = f->val_def_inst[v];
+ if (b >= f->nblocks || i >= f->blocks[b].ninsts) return NULL;
+ return &f->blocks[b].insts[i];
+}
+
+static int int_width(Func* f, CfreeCgTypeId ty, u32* out) {
+ u32 width = cfree_cg_type_int_width((CfreeCompiler*)f->c, ty);
+ if (!width || width > 64u) return 0;
+ *out = width;
+ return 1;
+}
+
+static int convert_step(Func* f, const Inst* in, ConvertStep* out) {
+ if (!f || !in || !out || (IROp)in->op != IR_CONVERT || in->nopnds < 2)
+ return 0;
+ if (in->def == VAL_NONE || in->def >= f->nvals) return 0;
+ if (in->opnds[0].kind != OPK_REG || in->opnds[1].kind != OPK_REG) return 0;
+
+ memset(out, 0, sizeof *out);
+ out->src = (Val)in->opnds[1].v.reg;
+ if (out->src == VAL_NONE || out->src >= f->nvals) return 0;
+ out->src_type = in->opnds[1].type;
+ out->dst_type = in->opnds[0].type;
+ out->kind = (u8)in->extra.imm;
+ if (!int_width(f, out->src_type, &out->src_width) ||
+ !int_width(f, out->dst_type, &out->dst_width))
+ return 0;
+ return 1;
+}
+
+static int conversion_is_noop(const ConvertStep* s) {
+ if (!s || s->src_width != s->dst_width || s->src_type != s->dst_type)
+ return 0;
+ switch ((ConvKind)s->kind) {
+ case CV_ZEXT:
+ case CV_SEXT:
+ case CV_TRUNC:
+ case CV_BITCAST:
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+static int extension_kind(u8 kind) {
+ return kind == CV_ZEXT || kind == CV_SEXT;
+}
+
+static void set_convert_source(Func* f, Inst* in, Val src) {
+ in->opnds[1].v.reg = (Reg)src;
+ in->opnds[1].type = f->val_type[src];
+ in->opnds[1].cls = f->val_cls[src];
+}
+
+static void make_copy_from(Func* f, Inst* in, Val src) {
+ Val dst = in->def;
+ Operand* opnds = in->opnds;
+ if (!opnds) opnds = arena_array(f->arena, Operand, 2);
+ memset(&opnds[0], 0, sizeof opnds[0]);
+ memset(&opnds[1], 0, sizeof opnds[1]);
+ opnds[0].kind = OPK_REG;
+ opnds[0].type = f->val_type[dst];
+ opnds[0].cls = f->val_cls[dst];
+ opnds[0].v.reg = (Reg)dst;
+ opnds[1].kind = OPK_REG;
+ opnds[1].type = f->val_type[src];
+ opnds[1].cls = f->val_cls[src];
+ opnds[1].v.reg = (Reg)src;
+ in->op = IR_COPY;
+ in->type = f->val_type[dst];
+ in->opnds = opnds;
+ in->nopnds = 2;
+}
+
+static int simplify_convert_chain(Func* f, Inst* outer, const ConvertStep* o,
+ const ConvertStep* inner) {
+ if (!f || !outer || !o || !inner) return 0;
+
+ if (conversion_is_noop(o)) {
+ make_copy_from(f, outer, o->src);
+ return 1;
+ }
+
+ if (extension_kind(o->kind) && o->kind == inner->kind &&
+ inner->src_width <= inner->dst_width &&
+ inner->dst_width <= o->dst_width) {
+ set_convert_source(f, outer, inner->src);
+ return 1;
+ }
+
+ if (o->kind == CV_TRUNC && extension_kind(inner->kind) &&
+ o->dst_width <= inner->dst_width) {
+ set_convert_source(f, outer, inner->src);
+ if (o->dst_width == inner->src_width && o->dst_type == inner->src_type) {
+ make_copy_from(f, outer, inner->src);
+ } else if (o->dst_width > inner->src_width) {
+ outer->extra.imm = inner->kind;
+ }
+ return 1;
+ }
+
+ return 0;
+}
+
static int cleanup_one_copy(Func* f) {
opt_rebuild_def_use(f);
for (u32 b = 0; b < f->nblocks; ++b) {
@@ -108,9 +226,48 @@ static int cleanup_one_copy(Func* f) {
return 0;
}
+static int cleanup_one_extension(Func* f) {
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* outer = &bl->insts[i];
+ ConvertStep o;
+ if (!convert_step(f, outer, &o)) continue;
+
+ Inst* inner_inst = val_def_inst(f, o.src);
+ ConvertStep inner;
+ if (!convert_step(f, inner_inst, &inner)) {
+ if (!conversion_is_noop(&o)) continue;
+ make_copy_from(f, outer, o.src);
+ } else if (!simplify_convert_chain(f, outer, &o, &inner)) {
+ continue;
+ }
+
+ opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE);
+ return 1;
+ }
+ }
+ return 0;
+}
+
void opt_copy_cleanup(Func* f) {
if (!f || f->opt_rewritten) return;
while (cleanup_one_copy(f)) {
}
opt_rebuild_def_use(f);
}
+
+void opt_copy_prop(Func* f) {
+ if (!f || f->opt_rewritten) return;
+ if (!f->opt_reg_ssa && f->npregs > 1) {
+ opt_copy_cleanup(f);
+ return;
+ }
+
+ opt_copy_cleanup(f);
+ while (cleanup_one_extension(f)) {
+ opt_rebuild_def_use(f);
+ opt_copy_cleanup(f);
+ }
+ opt_copy_cleanup(f);
+}
diff --git a/src/opt/pass_o2.c b/src/opt/pass_o2.c
@@ -1097,5 +1097,5 @@ void opt_dse(Func* f) {
void opt_cleanup(Func* f) {
if (!f) return;
opt_ssa_dce(f);
- opt_copy_cleanup(f);
+ opt_copy_prop(f);
}
diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c
@@ -66,6 +66,8 @@ static int g_checks;
typedef struct TestCtx {
Compiler cc;
Compiler* c;
+ CfreeCgTypeId i8;
+ CfreeCgTypeId i16;
CfreeCgTypeId i32;
CfreeCgTypeId i64;
CfreeCgTypeId f64;
@@ -93,6 +95,8 @@ static void tc_init_target(TestCtx* tc, CfreeArchKind arch, CfreeOSKind os) {
tc->c = &tc->cc;
{
CfreeCgBuiltinTypes b = cfree_cg_builtin_types(tc->c);
+ tc->i8 = b.id[CFREE_CG_BUILTIN_I8];
+ tc->i16 = b.id[CFREE_CG_BUILTIN_I16];
tc->i32 = b.id[CFREE_CG_BUILTIN_I32];
tc->i64 = b.id[CFREE_CG_BUILTIN_I64];
tc->f64 = b.id[CFREE_CG_BUILTIN_F64];
@@ -2289,6 +2293,64 @@ static void opt_copy_cleanup_rewrites_users(void) {
tc_fini(&tc);
}
+static void opt_copy_prop_rewrites_ssa_copy_chain(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ Func* f = new_func(&tc);
+ Val a = add_val(f, tc.i32);
+ Val b = add_val(f, tc.i32);
+ Val c = add_val(f, tc.i32);
+ emit_load_imm(f, f->entry, a, tc.i32, 42);
+ emit_copy(f, f->entry, b, a, tc.i32);
+ emit_copy(f, f->entry, c, b, tc.i32);
+ emit_ret_val(f, f->entry, c, tc.i32);
+
+ opt_build_cfg(f);
+ opt_copy_prop(f);
+ opt_verify(f, "test-copy-prop-chain");
+ EXPECT(count_op(f, IR_COPY) == 0,
+ "copy propagation should remove SSA copy chains");
+ EXPECT(ret_val(f, f->entry) == a,
+ "copy propagation should rewrite users to the original value");
+ tc_fini(&tc);
+}
+
+static void opt_copy_prop_collapses_redundant_extension_chain(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ Func* f = new_func(&tc);
+ Val x = add_val(f, tc.i8);
+ Val w16 = add_val(f, tc.i16);
+ Val w32 = add_val(f, tc.i32);
+ Val w64 = add_val(f, tc.i64);
+ emit_scalar_input(f, f->entry, x, tc.i8);
+ emit_convert_typed(f, f->entry, w16, x, tc.i16, tc.i8, CV_ZEXT);
+ emit_convert_typed(f, f->entry, w32, w16, tc.i32, tc.i16, CV_ZEXT);
+ emit_convert_typed(f, f->entry, w64, w32, tc.i64, tc.i32, CV_ZEXT);
+ emit_ret_val(f, f->entry, w64, tc.i64);
+
+ opt_build_cfg(f);
+ opt_copy_prop(f);
+ opt_verify(f, "test-copy-prop-ext-chain");
+
+ Inst* wide = def_inst(f, w64);
+ EXPECT(wide && (IROp)wide->op == IR_CONVERT && wide->nopnds == 2,
+ "extension cleanup should preserve the final widening conversion");
+ EXPECT(wide && wide->opnds[1].kind == OPK_REG &&
+ wide->opnds[1].v.reg == (Reg)x,
+ "extension cleanup should bypass redundant intermediate extensions");
+ expect_ir_dump_eq(f,
+ "ir blocks=1 vals=5\n"
+ "block 0 preds=[] succs=[] insts=5\n"
+ " 0 param_decl def=v1 opnds=[v1]\n"
+ " 1 convert def=v2 opnds=[v2,v1]\n"
+ " 2 convert def=v3 opnds=[v3,v1]\n"
+ " 3 convert def=v4 opnds=[v4,v1]\n"
+ " 4 ret ret=v4\n",
+ "copy prop extension chain");
+ tc_fini(&tc);
+}
+
static void opt_block_cloning_clones_small_join_blocks(void) {
TestCtx tc;
tc_init(&tc);
@@ -4860,6 +4922,8 @@ int main(void) {
opt_verify_catches_stale_def_use();
opt_ssa_dce_removes_dead_defs_and_phi();
opt_copy_cleanup_rewrites_users();
+ opt_copy_prop_rewrites_ssa_copy_chain();
+ opt_copy_prop_collapses_redundant_extension_chain();
opt_block_cloning_clones_small_join_blocks();
opt_block_cloning_skips_loop_backedges();
opt_addr_xform_folds_local_addr_into_memory_operand();
diff --git a/test/toy/cases/130_o2_copy_prop_ext.expected b/test/toy/cases/130_o2_copy_prop_ext.expected
@@ -0,0 +1 @@
+42
diff --git a/test/toy/cases/130_o2_copy_prop_ext.toy b/test/toy/cases/130_o2_copy_prop_ext.toy
@@ -0,0 +1,14 @@
+fn widen_copy(x: i8): i64 {
+ let a: i16 = @zext<i16>(x);
+ let b: i32 = @zext<i32>(a);
+ let c: i64 = @zext<i64>(b);
+ let d: i64 = c;
+ let e: i64 = d;
+ return e;
+}
+
+fn __user_main(): i64 {
+ return widen_copy(42 as i8);
+}
+
+fn main(): i32 { return __user_main() as i32; }