commit 5dbd91a562625c2f87035eeb010468eb5ea91d4e
parent 3d23cd16ae3c4e78bf1feff6be03bf9e6b3fe9c3
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 21 May 2026 15:43:05 -0700
opt: add first memory-aware gvn slice
Diffstat:
4 files changed, 719 insertions(+), 23 deletions(-)
diff --git a/doc/OPT.md b/doc/OPT.md
@@ -530,10 +530,14 @@ Current behavior:
atomic, global, TLS, non-zero indirect-offset, and otherwise non-local address
cases remain materialized. Both passes rebuild/invalidate analysis state and
are bracketed by `opt_ssa_dce`, `opt_copy_cleanup`, and verifier checkpoints.
-- The first post-address value passes are in the O2 path: scalar-only
- `opt_gvn` rewrites dominated duplicate pure expressions and folds safe
- integer constants/branches without interpreting memory, and `opt_copy_prop`
- propagates SSA copy chains plus collapses redundant integer extension chains.
+- The first post-address value passes are in the O2 path: scalar GVN rewrites
+ dominated duplicate pure expressions and folds safe integer
+ constants/branches; the first memory-aware GVN slice rewrites exact
+ straight-line redundant loads and store/load reuse through the documented
+ alias/version model; and `opt_copy_prop` propagates SSA copy chains plus
+ collapses redundant integer extension chains. Memory GVN is still local to
+ the current dominating value table and does not yet model path-specific memory
+ states at join blocks.
Temporary defensive/pessimizing checks to lift:
@@ -699,14 +703,16 @@ Implement in order:
4. [x] `opt_addr_xform`
5. [x] scalar-only `opt_gvn`
6. [x] `opt_copy_prop`
-7. [ ] memory-aware `opt_gvn` / redundant load elimination tests
-8. [ ] `opt_dse`
-9. [ ] `opt_licm`
-10. [ ] `opt_pressure_relief`
-11. [x] `opt_make_conventional_ssa`
-12. [ ] `opt_ssa_combine`
-13. [x] `opt_undo_ssa`
-14. [ ] full O2 `opt_jump_opt` beyond the shared O1 jump cleanup
+7. [x] first memory-aware `opt_gvn` / redundant load elimination tests
+8. [ ] scalar/address identity simplification before memory GVN
+9. [ ] extended memory tracker for joins/path state
+10. [ ] `opt_dse`
+11. [ ] `opt_licm`
+12. [ ] `opt_pressure_relief`
+13. [x] `opt_make_conventional_ssa`
+14. [ ] `opt_ssa_combine`
+15. [x] `opt_undo_ssa`
+16. [ ] full O2 `opt_jump_opt` beyond the shared O1 jump cleanup
Do not batch these into one landing. Each pass needs a pass-local corpus case
that fails red without the pass or its bug fix.
@@ -716,11 +722,11 @@ cleanup, block cloning, and address folding have made CFG mutation,
memory/address operand mutation, verifier coverage, and analysis invalidation
boring and reliable.
-The next GVN step is scalar-only. Keep memory GVN/redundant-load elimination
-out of the first implementation patch. Make alias and memory numbering explicit
-after scalar GVN is green and before `opt_gvn` starts rewriting `IR_LOAD` or
-reasoning across `IR_STORE`, calls, atomics, fences, volatile operations, TLS,
-globals, or unknown memory.
+The first memory GVN step is now landed as a straight-line, dominance-table
+slice. Keep later memory rewrites tied to the documented alias and memory
+numbering model: do not expand `IR_LOAD` rewrites or reason across `IR_STORE`,
+calls, atomics, fences, volatile operations, TLS, globals, or unknown memory
+without an explicit invalidation rule and pass-local tests.
Completed pre-GVN slice exit criteria:
@@ -773,6 +779,37 @@ Memory numbering and alias invalidation model for memory GVN:
reuse uses the same key and additionally requires the stored operand's value
to dominate the load.
+Current memory GVN slice:
+
+- [x] Pass-local tests cover repeated exact local loads, store/load reuse for
+ the same precise local, unknown-store barriers, call barriers, and
+ volatile/atomic exclusions.
+- [x] The pass maintains per-root plus unknown versions, treats calls and
+ conservative memory ops as barriers, and can see through simple `addr_of` plus
+ zero-index address materialization when both memory operations refer to the
+ same precise root.
+- [x] AArch64 disassembly sanity check: a direct address-taken store followed by
+ a load of the same address reuses the stored value, removing the post-store
+ `ldur`.
+- [ ] Extend the tracker from a single RPO walk table to memory state snapshots
+ at block boundaries. Join blocks need a memory-phi/equivalent availability
+ model so stores from all predecessors to the same root/version can make a
+ joined load available. This is why `128_o2_branch_join_addr_mem` still keeps
+ the final stack load after the branch join.
+- [ ] Record root escape information precisely enough that calls can preserve
+ non-escaping address-taken locals, while globals/params/heap/TLS and unknown
+ roots remain conservatively clobbered.
+
+Scalar/address identity simplification should run before memory GVN, and the
+memory key builder should also tolerate the normalized forms. Red-green cases
+should include integer `x + 0`, `0 + x`, `x - 0`, `x * 1`, `1 * x`, `x / 1`,
+bitwise `x | 0`, `x ^ 0`, `x & -1`, shifts by zero, copies through identity
+conversions, and pointer/index forms such as `base + 0`, `gep/index 0`, and
+address arithmetic that only changes representation. These rewrites should be
+typed and legality-aware: do not erase operations with trapping, overflow, FP
+rounding/NaN/sign-zero semantics, volatile/atomic memory effects, or target
+addressing constraints that have not been modeled.
+
Remaining Phase D exit criteria:
- [x] Scalar GVN pass-local dump tests prove redundant pure expressions are
@@ -783,9 +820,11 @@ Remaining Phase D exit criteria:
extension-chain cleanup, with a focused toy O2 fixture.
- [x] AArch64 O2 disassembly sanity checks show `130_o2_copy_prop_ext`
collapsing the widening/copy chain to a single byte load/zero-extend path,
- while `128_o2_branch_join_addr_mem` still keeps the stack load pending memory
- GVN as expected.
-- [ ] DSE, LICM, pressure relief, SSA combine, and full O2 jump optimization
+ and the first memory GVN slice removing a straight-line post-store stack load.
+ `128_o2_branch_join_addr_mem` still keeps the join load pending extended
+ memory-state tracking.
+- [ ] Identity simplification, extended memory tracking, DSE, LICM, pressure
+ relief, SSA combine, and full O2 jump optimization
each have focused red-green tests before they enter the default O2 schedule.
### Phase E - Inlining and Cleanup
diff --git a/src/opt/pass_o2.c b/src/opt/pass_o2.c
@@ -31,6 +31,22 @@ typedef struct GvnOperandKey {
} v;
} GvnOperandKey;
+typedef struct GvnMemKey {
+ u8 valid;
+ u8 root_kind;
+ u8 has_addr;
+ u8 pad0;
+ u16 addr_space;
+ u16 flags;
+ CfreeCgTypeId mem_type;
+ u32 size;
+ u32 align;
+ i64 root_id;
+ i64 offset;
+ u32 root_version;
+ u32 unknown_version;
+} GvnMemKey;
+
typedef struct GvnKey {
u16 op;
u16 nops;
@@ -38,6 +54,7 @@ typedef struct GvnKey {
u8 cls;
u8 pad[3];
i64 tag;
+ GvnMemKey mem;
GvnOperandKey ops[2];
} GvnKey;
@@ -58,12 +75,28 @@ typedef struct GvnTable {
u32 cap;
} GvnTable;
+typedef struct GvnMemVersion {
+ u8 root_kind;
+ u8 pad0;
+ u16 addr_space;
+ i64 root_id;
+ u32 version;
+} GvnMemVersion;
+
+typedef struct GvnMemState {
+ GvnMemVersion* roots;
+ u32 nroots;
+ u32 cap;
+ u32 unknown_version;
+} GvnMemState;
+
typedef struct GvnCtx {
Func* f;
OptAnalysis* analysis;
Val* parent;
GvnConst* constants;
GvnTable table;
+ GvnMemState mem;
int changed;
int cfg_changed;
} GvnCtx;
@@ -366,6 +399,17 @@ static int addr_def_inst(Func* f, Val v, Inst** out) {
return 1;
}
+static int val_def_inst(Func* f, Val v, Inst** out) {
+ if (v == VAL_NONE || v >= f->nvals) return 0;
+ u32 b = f->val_def_block[v];
+ u32 i = f->val_def_inst[v];
+ if (b >= f->nblocks || i >= f->blocks[b].ninsts) return 0;
+ Inst* in = &f->blocks[b].insts[i];
+ if (in->def != v) return 0;
+ *out = in;
+ return 1;
+}
+
static int addr_use_foldable(Func* f, const OptUse* use) {
if (!use || use->kind != OPT_USE_INDIRECT_BASE) return 0;
if (use->block >= f->nblocks || use->inst >= f->blocks[use->block].ninsts)
@@ -705,12 +749,29 @@ static u64 gvn_key_hash(const GvnKey* k) {
h = gvn_mix_u64(h, k->type);
h = gvn_mix_u64(h, k->cls);
h = gvn_mix_u64(h, (u64)k->tag);
+ h = gvn_mix_u64(h, k->mem.valid);
+ if (k->mem.valid) {
+ h = gvn_mix_u64(h, k->mem.root_kind);
+ h = gvn_mix_u64(h, k->mem.has_addr);
+ h = gvn_mix_u64(h, k->mem.addr_space);
+ h = gvn_mix_u64(h, k->mem.flags);
+ h = gvn_mix_u64(h, k->mem.mem_type);
+ h = gvn_mix_u64(h, k->mem.size);
+ h = gvn_mix_u64(h, k->mem.align);
+ h = gvn_mix_u64(h, (u64)k->mem.root_id);
+ h = gvn_mix_u64(h, (u64)k->mem.offset);
+ h = gvn_mix_u64(h, k->mem.root_version);
+ h = gvn_mix_u64(h, k->mem.unknown_version);
+ }
for (u32 i = 0; i < k->nops; ++i) {
h = gvn_mix_u64(h, k->ops[i].kind);
h = gvn_mix_u64(h, k->ops[i].cls);
h = gvn_mix_u64(h, k->ops[i].type);
- h = gvn_mix_u64(h, k->ops[i].kind == OPK_REG ? k->ops[i].v.reg
- : (u64)k->ops[i].v.imm);
+ h = gvn_mix_u64(h,
+ (k->ops[i].kind == OPK_REG ||
+ k->ops[i].kind == OPK_INDIRECT)
+ ? k->ops[i].v.reg
+ : (u64)k->ops[i].v.imm);
}
return h;
}
@@ -718,7 +779,8 @@ static u64 gvn_key_hash(const GvnKey* k) {
static int gvn_operand_key_equal(const GvnOperandKey* a,
const GvnOperandKey* b) {
if (a->kind != b->kind || a->cls != b->cls || a->type != b->type) return 0;
- if (a->kind == OPK_REG) return a->v.reg == b->v.reg;
+ if (a->kind == OPK_REG || a->kind == OPK_INDIRECT)
+ return a->v.reg == b->v.reg;
return a->v.imm == b->v.imm;
}
@@ -726,6 +788,17 @@ static int gvn_key_equal(const GvnKey* a, const GvnKey* b) {
if (a->op != b->op || a->nops != b->nops || a->type != b->type ||
a->cls != b->cls || a->tag != b->tag)
return 0;
+ if (a->mem.valid != b->mem.valid) return 0;
+ if (a->mem.valid &&
+ (a->mem.root_kind != b->mem.root_kind ||
+ a->mem.has_addr != b->mem.has_addr ||
+ a->mem.addr_space != b->mem.addr_space ||
+ a->mem.flags != b->mem.flags || a->mem.mem_type != b->mem.mem_type ||
+ a->mem.size != b->mem.size || a->mem.align != b->mem.align ||
+ a->mem.root_id != b->mem.root_id || a->mem.offset != b->mem.offset ||
+ a->mem.root_version != b->mem.root_version ||
+ a->mem.unknown_version != b->mem.unknown_version))
+ return 0;
for (u32 i = 0; i < a->nops; ++i)
if (!gvn_operand_key_equal(&a->ops[i], &b->ops[i])) return 0;
return 1;
@@ -817,6 +890,249 @@ static int gvn_make_operand_key(GvnCtx* ctx, const Operand* op,
return 1;
}
+static int gvn_make_addr_operand_key(GvnCtx* ctx, const Operand* op,
+ GvnOperandKey* out) {
+ memset(out, 0, sizeof *out);
+ if (!op) return 0;
+ out->kind = op->kind;
+ out->cls = op->cls;
+ out->type = op->type;
+ switch ((OpKind)op->kind) {
+ case OPK_REG: {
+ Val v = gvn_find(ctx, (Val)op->v.reg);
+ if (v == VAL_NONE || v >= ctx->f->nvals) return 0;
+ out->v.reg = v;
+ return 1;
+ }
+ case OPK_INDIRECT: {
+ Val v = gvn_find(ctx, (Val)op->v.ind.base);
+ if (v == VAL_NONE || v >= ctx->f->nvals) return 0;
+ out->v.reg = v;
+ return 1;
+ }
+ case OPK_LOCAL:
+ out->v.imm = (i64)op->v.frame_slot;
+ return 1;
+ case OPK_GLOBAL:
+ out->v.imm = (i64)op->v.global.sym;
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+static int gvn_mem_root_from_addr_val(GvnCtx* ctx, Val addr_val, u32 depth,
+ u8* kind_out, i64* id_out,
+ i64* offset_out,
+ int* singleton_out) {
+ Inst* def = NULL;
+ if (!ctx || depth > 4u) return 0;
+ addr_val = gvn_find(ctx, addr_val);
+ if (!val_def_inst(ctx->f, addr_val, &def)) return 0;
+
+ if ((IROp)def->op == IR_ADDR_OF && def->nopnds >= 2) {
+ Operand* lv = &def->opnds[1];
+ if (lv->kind == OPK_LOCAL) {
+ *kind_out = ALIAS_LOCAL;
+ *id_out = (i64)lv->v.frame_slot;
+ *offset_out = 0;
+ *singleton_out = 1;
+ return 1;
+ }
+ if (lv->kind == OPK_GLOBAL) {
+ *kind_out = ALIAS_GLOBAL;
+ *id_out = (i64)lv->v.global.sym;
+ *offset_out = lv->v.global.addend;
+ *singleton_out = 1;
+ return 1;
+ }
+ return 0;
+ }
+
+ if ((IROp)def->op == IR_COPY && def->nopnds >= 2 &&
+ def->opnds[1].kind == OPK_REG) {
+ return gvn_mem_root_from_addr_val(ctx, (Val)def->opnds[1].v.reg,
+ depth + 1u, kind_out, id_out, offset_out,
+ singleton_out);
+ }
+
+ if ((IROp)def->op == IR_BINOP && def->nopnds >= 3 &&
+ (BinOp)def->extra.imm == BO_IADD) {
+ i64 c = 0;
+ if (def->opnds[1].kind == OPK_REG &&
+ gvn_const_for_operand(ctx, &def->opnds[2], &c) && c == 0) {
+ return gvn_mem_root_from_addr_val(ctx, (Val)def->opnds[1].v.reg,
+ depth + 1u, kind_out, id_out,
+ offset_out, singleton_out);
+ }
+ if (def->opnds[2].kind == OPK_REG &&
+ gvn_const_for_operand(ctx, &def->opnds[1], &c) && c == 0) {
+ return gvn_mem_root_from_addr_val(ctx, (Val)def->opnds[2].v.reg,
+ depth + 1u, kind_out, id_out,
+ offset_out, singleton_out);
+ }
+ }
+
+ return 0;
+}
+
+static int gvn_mem_root_from_access(GvnCtx* ctx, const Operand* addr,
+ const MemAccess* mem,
+ u8* kind_out, i64* id_out,
+ i64* offset_out, int* singleton_out) {
+ u8 kind = ALIAS_UNKNOWN;
+ i64 id = 0;
+ i64 offset = 0;
+ int singleton = 0;
+
+ if (addr) {
+ switch ((OpKind)addr->kind) {
+ case OPK_LOCAL:
+ kind = ALIAS_LOCAL;
+ id = (i64)addr->v.frame_slot;
+ singleton = 1;
+ break;
+ case OPK_GLOBAL:
+ kind = ALIAS_GLOBAL;
+ id = (i64)addr->v.global.sym;
+ offset = addr->v.global.addend;
+ singleton = 1;
+ break;
+ case OPK_INDIRECT:
+ offset = addr->v.ind.ofs;
+ if (ctx) {
+ Val base = gvn_find(ctx, (Val)addr->v.ind.base);
+ u8 akind;
+ i64 aid;
+ i64 aofs;
+ int asing;
+ if (gvn_mem_root_from_addr_val(ctx, base, 0, &akind, &aid, &aofs,
+ &asing)) {
+ kind = akind;
+ id = aid;
+ offset += aofs;
+ singleton = asing;
+ }
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (kind == ALIAS_UNKNOWN && mem) {
+ kind = mem->alias.kind;
+ switch ((AliasKind)kind) {
+ case ALIAS_LOCAL:
+ id = mem->alias.v.local_id;
+ break;
+ case ALIAS_GLOBAL:
+ id = (i64)mem->alias.v.global;
+ break;
+ case ALIAS_PARAM:
+ id = (i64)mem->alias.v.param_idx;
+ break;
+ case ALIAS_STRING:
+ id = (i64)mem->alias.v.string_id;
+ break;
+ case ALIAS_HEAP:
+ id = 0;
+ break;
+ default:
+ kind = ALIAS_UNKNOWN;
+ id = 0;
+ break;
+ }
+ }
+
+ *kind_out = kind;
+ *id_out = id;
+ *offset_out = offset;
+ *singleton_out = singleton;
+ return kind != ALIAS_UNKNOWN;
+}
+
+static GvnMemVersion* gvn_mem_version(GvnCtx* ctx, u8 kind, u16 addr_space,
+ i64 root_id) {
+ GvnMemState* s = &ctx->mem;
+ for (u32 i = 0; i < s->nroots; ++i) {
+ GvnMemVersion* r = &s->roots[i];
+ if (r->root_kind == kind && r->addr_space == addr_space &&
+ r->root_id == root_id)
+ return r;
+ }
+ if (s->nroots == s->cap) {
+ u32 ncap = s->cap ? s->cap * 2u : 16u;
+ GvnMemVersion* roots = arena_array(ctx->f->arena, GvnMemVersion, ncap);
+ if (s->roots) memcpy(roots, s->roots, sizeof(roots[0]) * s->nroots);
+ s->roots = roots;
+ s->cap = ncap;
+ }
+ GvnMemVersion* r = &s->roots[s->nroots++];
+ memset(r, 0, sizeof *r);
+ r->root_kind = kind;
+ r->addr_space = addr_space;
+ r->root_id = root_id;
+ return r;
+}
+
+static void gvn_mem_barrier(GvnCtx* ctx) {
+ ++ctx->mem.unknown_version;
+}
+
+static void gvn_mem_bump_store(GvnCtx* ctx, u8 kind, u16 addr_space,
+ i64 root_id) {
+ if (kind == ALIAS_UNKNOWN) {
+ gvn_mem_barrier(ctx);
+ return;
+ }
+ ++gvn_mem_version(ctx, kind, addr_space, root_id)->version;
+}
+
+static int gvn_make_memory_key(GvnCtx* ctx, const Inst* in, u32 addr_idx,
+ CfreeCgTypeId val_ty, u8 val_cls,
+ GvnKey* key) {
+ const MemAccess* mem;
+ const Operand* addr;
+ u8 root_kind;
+ i64 root_id;
+ i64 offset;
+ int singleton;
+
+ if (!in || addr_idx >= in->nopnds) return 0;
+ mem = &in->extra.mem;
+ addr = &in->opnds[addr_idx];
+ if (opt_mem_observable(mem)) return 0;
+
+ memset(key, 0, sizeof *key);
+ key->op = IR_LOAD;
+ key->type = val_ty;
+ key->cls = val_cls;
+ key->mem.valid = 1;
+ key->mem.addr_space = mem->addr_space;
+ key->mem.flags = mem->flags;
+ key->mem.mem_type = mem->type;
+ key->mem.size = mem->size;
+ key->mem.align = mem->align;
+ (void)gvn_mem_root_from_access(ctx, addr, mem, &root_kind, &root_id, &offset,
+ &singleton);
+ key->mem.root_kind = root_kind;
+ key->mem.root_id = root_id;
+ key->mem.offset = offset;
+ key->mem.unknown_version = ctx->mem.unknown_version;
+ if (root_kind != ALIAS_UNKNOWN) {
+ key->mem.root_version =
+ gvn_mem_version(ctx, root_kind, mem->addr_space, root_id)->version;
+ }
+
+ if (!singleton) {
+ key->mem.has_addr = 1;
+ key->nops = 1;
+ if (!gvn_make_addr_operand_key(ctx, addr, &key->ops[0])) return 0;
+ }
+ return 1;
+}
+
static int gvn_operand_key_less(const GvnOperandKey* a,
const GvnOperandKey* b) {
if (a->kind != b->kind) return a->kind < b->kind;
@@ -994,6 +1310,90 @@ static int gvn_fold_branch(GvnCtx* ctx, u32 b, Inst* in) {
return 1;
}
+static int gvn_inst_memory_barrier(const Inst* in) {
+ switch ((IROp)in->op) {
+ case IR_CALL:
+ case IR_AGG_COPY:
+ case IR_AGG_SET:
+ case IR_BITFIELD_LOAD:
+ case IR_BITFIELD_STORE:
+ case IR_VA_START:
+ case IR_VA_ARG:
+ case IR_VA_END:
+ case IR_VA_COPY:
+ case IR_ATOMIC_LOAD:
+ case IR_ATOMIC_STORE:
+ case IR_ATOMIC_RMW:
+ case IR_ATOMIC_CAS:
+ case IR_FENCE:
+ case IR_ASM_BLOCK:
+ case IR_INTRINSIC:
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+static int gvn_visit_memory_inst(GvnCtx* ctx, u32 b, u32 i, Inst* in) {
+ if ((IROp)in->op == IR_LOAD) {
+ if (opt_mem_observable(&in->extra.mem)) {
+ gvn_mem_barrier(ctx);
+ return 1;
+ }
+ if (in->def == VAL_NONE || in->def >= ctx->f->nvals) return 1;
+ GvnKey key;
+ if (!gvn_make_memory_key(ctx, in, 1u, ctx->f->val_type[in->def],
+ ctx->f->val_cls[in->def], &key))
+ return 1;
+ Val leader = VAL_NONE;
+ if (gvn_table_find(ctx, &key, b, i, &leader)) {
+ if (gvn_replace_dominated_uses(ctx, in->def, leader,
+ ctx->f->val_def_block[leader],
+ ctx->f->val_def_inst[leader]))
+ ctx->changed = 1;
+ return 1;
+ }
+ gvn_table_add(&ctx->table, &key, in->def, b, i);
+ return 1;
+ }
+
+ if ((IROp)in->op == IR_STORE) {
+ u8 root_kind;
+ i64 root_id;
+ i64 offset;
+ int singleton;
+ Val src;
+ if (opt_mem_observable(&in->extra.mem)) {
+ gvn_mem_barrier(ctx);
+ return 1;
+ }
+ if (in->nopnds < 2) {
+ gvn_mem_barrier(ctx);
+ return 1;
+ }
+ (void)gvn_mem_root_from_access(ctx, &in->opnds[0], &in->extra.mem,
+ &root_kind, &root_id, &offset, &singleton);
+ (void)offset;
+ (void)singleton;
+ gvn_mem_bump_store(ctx, root_kind, in->extra.mem.addr_space, root_id);
+ if (in->opnds[1].kind != OPK_REG) return 1;
+ src = gvn_find(ctx, (Val)in->opnds[1].v.reg);
+ if (src == VAL_NONE || src >= ctx->f->nvals) return 1;
+ GvnKey key;
+ if (!gvn_make_memory_key(ctx, in, 0u, ctx->f->val_type[src],
+ ctx->f->val_cls[src], &key))
+ return 1;
+ gvn_table_add(&ctx->table, &key, src, b, i);
+ return 1;
+ }
+
+ if (gvn_inst_memory_barrier(in)) {
+ gvn_mem_barrier(ctx);
+ return 1;
+ }
+ return 0;
+}
+
static void gvn_visit_inst(GvnCtx* ctx, u32 b, u32 i) {
Inst* in = &ctx->f->blocks[b].insts[i];
if (gvn_fold_branch(ctx, b, in)) {
@@ -1001,6 +1401,7 @@ static void gvn_visit_inst(GvnCtx* ctx, u32 b, u32 i) {
ctx->cfg_changed = 1;
return;
}
+ if (gvn_visit_memory_inst(ctx, b, i, in)) return;
if (!gvn_scalar_candidate(in)) return;
if (gvn_try_fold_inst(ctx, in)) ctx->changed = 1;
diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c
@@ -381,6 +381,18 @@ static Inst* emit_store_local(Func* f, u32 b, FrameSlot fs, Val src,
return in;
}
+static Inst* emit_store_indirect(Func* f, u32 b, Val base, Val src,
+ CfreeCgTypeId ty, u16 flags) {
+ Inst* in = ir_emit(f, b, IR_STORE);
+ in->opnds = arena_array(f->arena, Operand, 2);
+ in->opnds[0] = op_indirect_((Reg)base, ty);
+ in->opnds[1] = op_reg_(src, ty);
+ in->nopnds = 2;
+ in->extra.mem = mem_unknown_(ty, 4);
+ in->extra.mem.flags = flags;
+ return in;
+}
+
static Inst* emit_addr_of_local(Func* f, u32 b, Val dst, FrameSlot fs,
CfreeCgTypeId ptr_ty, CfreeCgTypeId local_ty) {
Inst* in = ir_emit(f, b, IR_ADDR_OF);
@@ -395,6 +407,24 @@ static Inst* emit_addr_of_local(Func* f, u32 b, Val dst, FrameSlot fs,
return in;
}
+static Inst* emit_atomic_load_local(Func* f, u32 b, Val dst, FrameSlot fs,
+ CfreeCgTypeId ty) {
+ Inst* in = ir_emit(f, b, IR_ATOMIC_LOAD);
+ IRAtomicAux* aux = arena_znew(f->arena, IRAtomicAux);
+ aux->mem = mem_local_(fs, ty, 4, MF_ATOMIC);
+ aux->mo = MO_SEQ_CST;
+ in->opnds = arena_array(f->arena, Operand, 2);
+ in->opnds[0] = op_reg_(dst, ty);
+ in->opnds[1] = op_local_(fs, ty);
+ in->nopnds = 2;
+ in->def = dst;
+ in->type = ty;
+ in->extra.aux = aux;
+ f->val_def_block[dst] = b;
+ f->val_def_inst[dst] = f->blocks[b].ninsts - 1u;
+ return in;
+}
+
static Inst* emit_load_indirect(Func* f, u32 b, Val dst, Val base,
CfreeCgTypeId ty, u16 flags) {
Inst* in = ir_emit(f, b, IR_LOAD);
@@ -2665,6 +2695,210 @@ static void opt_gvn_folds_safe_scalar_constants(void) {
tc_fini(&tc);
}
+static void opt_gvn_rewrites_redundant_local_load(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ Func* f = new_func(&tc);
+ FrameSlot fs = add_frame_slot(f, tc.i32, FS_LOCAL, 4, FSF_ADDR_TAKEN);
+ Val first = add_val(f, tc.i32);
+ Val second = add_val(f, tc.i32);
+ emit_load_local(f, f->entry, first, fs, tc.i32, 0);
+ emit_load_local(f, f->entry, second, fs, tc.i32, 0);
+ emit_ret_val(f, f->entry, second, tc.i32);
+
+ opt_build_cfg(f);
+ opt_gvn(f);
+ opt_verify(f, "test-gvn-memory-redundant-local-load");
+ EXPECT(ret_val(f, f->entry) == first,
+ "memory GVN should rewrite repeated local loads");
+ expect_ir_dump_eq(f,
+ "ir blocks=1 vals=3\n"
+ "block 0 preds=[] succs=[] insts=3\n"
+ " 0 load def=v1 opnds=[v1,local#1] mem=size4 align4 "
+ "flags=0x0 alias=local#1\n"
+ " 1 load def=v2 opnds=[v2,local#1] mem=size4 align4 "
+ "flags=0x0 alias=local#1\n"
+ " 2 ret ret=v1\n",
+ "memory GVN repeated local load");
+ tc_fini(&tc);
+}
+
+static void opt_gvn_reuses_store_to_local_load(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ Func* f = new_func(&tc);
+ FrameSlot fs = add_frame_slot(f, tc.i32, FS_LOCAL, 4, FSF_ADDR_TAKEN);
+ Val src = add_val(f, tc.i32);
+ Val loaded = add_val(f, tc.i32);
+ emit_scalar_input(f, f->entry, src, tc.i32);
+ emit_store_local(f, f->entry, fs, src, tc.i32, 0);
+ emit_load_local(f, f->entry, loaded, fs, tc.i32, 0);
+ emit_ret_val(f, f->entry, loaded, tc.i32);
+
+ opt_build_cfg(f);
+ opt_gvn(f);
+ opt_verify(f, "test-gvn-memory-store-load-local");
+ EXPECT(ret_val(f, f->entry) == src,
+ "memory GVN should reuse a dominating store to the same local");
+ expect_ir_dump_eq(f,
+ "ir blocks=1 vals=3\n"
+ "block 0 preds=[] succs=[] insts=4\n"
+ " 0 param_decl def=v1 opnds=[v1]\n"
+ " 1 store opnds=[local#1,v1] mem=size4 align4 "
+ "flags=0x0 alias=local#1\n"
+ " 2 load def=v2 opnds=[v2,local#1] mem=size4 align4 "
+ "flags=0x0 alias=local#1\n"
+ " 3 ret ret=v1\n",
+ "memory GVN store load local");
+ tc_fini(&tc);
+}
+
+static void opt_gvn_reuses_store_to_addr_of_zero_index_load(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ Func* f = new_func(&tc);
+ FrameSlot fs = add_frame_slot(f, tc.i32, FS_LOCAL, 4, FSF_ADDR_TAKEN);
+ CfreeCgTypeId ptr_ty = cfree_cg_type_ptr(tc.c, tc.i32, 0);
+ Val src = add_val(f, tc.i32);
+ Val addr_store = add_val(f, ptr_ty);
+ Val zero_store = add_val(f, ptr_ty);
+ Val indexed_store = add_val(f, ptr_ty);
+ Val addr_load = add_val(f, ptr_ty);
+ Val zero_load = add_val(f, ptr_ty);
+ Val indexed_load = add_val(f, ptr_ty);
+ Val loaded = add_val(f, tc.i32);
+ emit_scalar_input(f, f->entry, src, tc.i32);
+ emit_addr_of_local(f, f->entry, addr_store, fs, ptr_ty, tc.i32);
+ emit_load_imm(f, f->entry, zero_store, ptr_ty, 0);
+ emit_binop(f, f->entry, indexed_store, addr_store, zero_store, ptr_ty);
+ emit_store_indirect(f, f->entry, indexed_store, src, tc.i32, 0);
+ emit_addr_of_local(f, f->entry, addr_load, fs, ptr_ty, tc.i32);
+ emit_load_imm(f, f->entry, zero_load, ptr_ty, 0);
+ emit_binop(f, f->entry, indexed_load, addr_load, zero_load, ptr_ty);
+ emit_load_indirect(f, f->entry, loaded, indexed_load, tc.i32, 0);
+ emit_ret_val(f, f->entry, loaded, tc.i32);
+
+ opt_build_cfg(f);
+ opt_gvn(f);
+ opt_verify(f, "test-gvn-memory-addr-of-zero-index-load");
+ EXPECT(ret_val(f, f->entry) == src,
+ "memory GVN should see through addr_of plus zero index");
+ tc_fini(&tc);
+}
+
+static void opt_gvn_preserves_load_across_unknown_store(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ CfreeCgTypeId ptr_ty = cfree_cg_type_ptr(tc.c, tc.i32, 0);
+ Func* f = new_func(&tc);
+ FrameSlot fs = add_frame_slot(f, tc.i32, FS_LOCAL, 4, FSF_ADDR_TAKEN);
+ Val ptr = add_val(f, ptr_ty);
+ Val src = add_val(f, tc.i32);
+ Val first = add_val(f, tc.i32);
+ Val second = add_val(f, tc.i32);
+ emit_scalar_input(f, f->entry, ptr, ptr_ty);
+ emit_scalar_input(f, f->entry, src, tc.i32);
+ emit_load_local(f, f->entry, first, fs, tc.i32, 0);
+ emit_store_indirect(f, f->entry, ptr, src, tc.i32, 0);
+ emit_load_local(f, f->entry, second, fs, tc.i32, 0);
+ emit_ret_val(f, f->entry, second, tc.i32);
+
+ opt_build_cfg(f);
+ opt_gvn(f);
+ opt_verify(f, "test-gvn-memory-unknown-store-barrier");
+ EXPECT(ret_val(f, f->entry) == second,
+ "memory GVN should not rewrite across an unknown store");
+ expect_ir_dump_eq(f,
+ "ir blocks=1 vals=5\n"
+ "block 0 preds=[] succs=[] insts=6\n"
+ " 0 param_decl def=v1 opnds=[v1]\n"
+ " 1 param_decl def=v2 opnds=[v2]\n"
+ " 2 load def=v3 opnds=[v3,local#1] mem=size4 align4 "
+ "flags=0x0 alias=local#1\n"
+ " 3 store opnds=[[v1+0],v2] mem=size4 align4 flags=0x0 "
+ "alias=unknown\n"
+ " 4 load def=v4 opnds=[v4,local#1] mem=size4 align4 "
+ "flags=0x0 alias=local#1\n"
+ " 5 ret ret=v4\n",
+ "memory GVN unknown store barrier");
+ tc_fini(&tc);
+}
+
+static void opt_gvn_preserves_load_across_call(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ Func* f = new_func(&tc);
+ FrameSlot fs = add_frame_slot(f, tc.i32, FS_LOCAL, 4, FSF_ADDR_TAKEN);
+ Val first = add_val(f, tc.i32);
+ Val second = add_val(f, tc.i32);
+ emit_load_local(f, f->entry, first, fs, tc.i32, 0);
+ emit_call_void(f, f->entry);
+ emit_load_local(f, f->entry, second, fs, tc.i32, 0);
+ emit_ret_val(f, f->entry, second, tc.i32);
+
+ opt_build_cfg(f);
+ opt_gvn(f);
+ opt_verify(f, "test-gvn-memory-call-barrier");
+ EXPECT(ret_val(f, f->entry) == second,
+ "memory GVN should not rewrite across calls");
+ expect_ir_dump_eq(f,
+ "ir blocks=1 vals=3\n"
+ "block 0 preds=[] succs=[] insts=4\n"
+ " 0 load def=v1 opnds=[v1,local#1] mem=size4 align4 "
+ "flags=0x0 alias=local#1\n"
+ " 1 call\n"
+ " 2 load def=v2 opnds=[v2,local#1] mem=size4 align4 "
+ "flags=0x0 alias=local#1\n"
+ " 3 ret ret=v2\n",
+ "memory GVN call barrier");
+ tc_fini(&tc);
+}
+
+static void opt_gvn_preserves_observable_memory_loads(void) {
+ TestCtx tc;
+ tc_init(&tc);
+
+ Func* volatile_f = new_func(&tc);
+ FrameSlot vfs =
+ add_frame_slot(volatile_f, tc.i32, FS_LOCAL, 4, FSF_ADDR_TAKEN);
+ Val vfirst = add_val(volatile_f, tc.i32);
+ Val vsecond = add_val(volatile_f, tc.i32);
+ emit_load_local(volatile_f, volatile_f->entry, vfirst, vfs, tc.i32,
+ MF_VOLATILE);
+ emit_load_local(volatile_f, volatile_f->entry, vsecond, vfs, tc.i32,
+ MF_VOLATILE);
+ emit_ret_val(volatile_f, volatile_f->entry, vsecond, tc.i32);
+ opt_build_cfg(volatile_f);
+ opt_gvn(volatile_f);
+ opt_verify(volatile_f, "test-gvn-memory-volatile-loads");
+ EXPECT(ret_val(volatile_f, volatile_f->entry) == vsecond,
+ "memory GVN should not rewrite volatile loads");
+ expect_ir_dump_eq(volatile_f,
+ "ir blocks=1 vals=3\n"
+ "block 0 preds=[] succs=[] insts=3\n"
+ " 0 load def=v1 opnds=[v1,local#1] mem=size4 align4 "
+ "flags=0x1 alias=local#1\n"
+ " 1 load def=v2 opnds=[v2,local#1] mem=size4 align4 "
+ "flags=0x1 alias=local#1\n"
+ " 2 ret ret=v2\n",
+ "memory GVN volatile loads");
+
+ Func* atomic_f = new_func(&tc);
+ FrameSlot afs = add_frame_slot(atomic_f, tc.i32, FS_LOCAL, 4, FSF_ADDR_TAKEN);
+ Val afirst = add_val(atomic_f, tc.i32);
+ Val asecond = add_val(atomic_f, tc.i32);
+ emit_atomic_load_local(atomic_f, atomic_f->entry, afirst, afs, tc.i32);
+ emit_atomic_load_local(atomic_f, atomic_f->entry, asecond, afs, tc.i32);
+ emit_ret_val(atomic_f, atomic_f->entry, asecond, tc.i32);
+ opt_build_cfg(atomic_f);
+ opt_gvn(atomic_f);
+ opt_verify(atomic_f, "test-gvn-memory-atomic-loads");
+ EXPECT(ret_val(atomic_f, atomic_f->entry) == asecond,
+ "memory GVN should not rewrite atomic loads");
+
+ tc_fini(&tc);
+}
+
static void opt_jump_cleanup_forwards_branch_targets(void) {
TestCtx tc;
tc_init(&tc);
@@ -4933,6 +5167,12 @@ int main(void) {
opt_gvn_preserves_nondominated_scalar_duplicates();
opt_gvn_canonicalizes_commutative_scalar_operands();
opt_gvn_folds_safe_scalar_constants();
+ opt_gvn_rewrites_redundant_local_load();
+ opt_gvn_reuses_store_to_local_load();
+ opt_gvn_reuses_store_to_addr_of_zero_index_load();
+ opt_gvn_preserves_load_across_unknown_store();
+ opt_gvn_preserves_load_across_call();
+ opt_gvn_preserves_observable_memory_loads();
opt_jump_cleanup_forwards_branch_targets();
opt_jump_cleanup_inverts_to_remove_jump_block();
opt_jump_cleanup_keeps_conditional_fallthrough_block();
diff --git a/test/toy/cases/131_o2_mem_gvn_addr_load.toy b/test/toy/cases/131_o2_mem_gvn_addr_load.toy
@@ -0,0 +1,16 @@
+fn choose(flag: i64): i64 {
+ var x: i64 = 11;
+ if flag != 0 {
+ x = x + 31;
+ } else {
+ x = x + 47;
+ }
+ let p: *i64 = &x;
+ return p[0];
+}
+
+fn __user_main(): i64 {
+ return (choose(1) - 42) + (choose(0) - 58);
+}
+
+fn main(): i32 { return __user_main() as i32; }