Implement production SSA for O2 - kit

commit 87eb85cdc69c09fe71ad21e4d29c0f832bf05b77
parent cfe4be02ba8c5cb7e022bb550458fbcd40ef7d14
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 21 May 2026 07:24:16 -0700

Implement production SSA for O2

Diffstat:
M src/opt/ir.h  | 22 ++++++++++++++++++++++
M src/opt/opt.c  | 15 +++++++++++++--
M src/opt/opt_internal.h  | 3 +++
M src/opt/pass_analysis.c  | 150 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/opt/pass_ssa.c  | 878 ++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
M test/opt/opt_test.c  | 181 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

6 files changed, 927 insertions(+), 322 deletions(-)
diff --git a/src/opt/ir.h b/src/opt/ir.h
@@ -331,6 +331,24 @@ typedef struct OptValInfo {
   u32 forbidden_hard_regs; /* bit r means Val may not allocate hard reg r. */
 } OptValInfo;
 
+typedef enum OptUseKind {
+  OPT_USE_OPERAND,
+  OPT_USE_INDIRECT_BASE,
+  OPT_USE_PHI_INPUT,
+} OptUseKind;
+
+typedef struct OptUse {
+  Val val;
+  u32 block;
+  u32 inst;
+  u32 next_for_val;
+  u32 operand_index;
+  u32 phi_pred_index;
+  Operand* operand;
+  u8 kind; /* OptUseKind */
+  u8 pad[3];
+} OptUse;
+
 typedef struct Func {
   Arena* arena;
   Compiler* c;
@@ -392,6 +410,10 @@ typedef struct Func {
   u64 opt_dde_live_words_touched;
   OptValInfo* val_info; /* indexed by Val, length nvals */
 
+  OptUse* opt_uses;
+  u32 opt_nuses, opt_uses_cap;
+  u32* opt_first_use_by_val; /* indexed by Val, OPT_USE_NONE if no uses */
+
   Reg opt_hard_regs[OPT_REG_CLASSES][OPT_MAX_HARD_REGS];
   u32 opt_hard_reg_count[OPT_REG_CLASSES];
   CGPhysRegInfo opt_phys_regs[OPT_REG_CLASSES][OPT_MAX_HARD_REGS];
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -1400,8 +1400,19 @@ static void opt_run_o1_pipeline(OptImpl* o) {
 }
 
 static void opt_run_o2_pipeline(OptImpl* o) {
-  /* O2 has a distinct scheduler now, but its value/SSA pre-lowering passes
-   * are still disabled until their shared analysis substrate lands. */
+  metrics_scope_begin(o->c, "opt.o2.ssa");
+  opt_build_cfg(o->f);
+  opt_jump_cleanup(o->f, OPT_JUMP_CLEANUP_CFG);
+  opt_build_cfg(o->f);
+  opt_verify(o->f, "o2-pre-ssa-cfg");
+  opt_build_ssa(o->f);
+  opt_verify(o->f, "o2-ssa");
+  opt_make_conventional_ssa(o->f);
+  opt_verify(o->f, "o2-conventional-ssa");
+  opt_undo_ssa(o->f);
+  opt_build_cfg(o->f);
+  opt_verify(o->f, "o2-undo-ssa");
+  metrics_scope_end(o->c, "opt.o2.ssa");
   opt_run_lowering_pipeline(o, "opt.o2.total", 1);
 }
 
diff --git a/src/opt/opt_internal.h b/src/opt/opt_internal.h
@@ -3,6 +3,8 @@
 
 #include "opt/opt.h"
 
+#define OPT_USE_NONE 0xffffffffu
+
 typedef struct OptHardRegSet {
   u32 cls[OPT_REG_CLASSES];
 } OptHardRegSet;
@@ -49,6 +51,7 @@ void opt_analysis_build_order(Func*, OptAnalysis*);
 void opt_analysis_build_dominators(Func*, OptAnalysis*);
 void opt_analysis_build_dom_frontier(Func*, OptAnalysis*);
 int opt_analysis_dominates(const OptAnalysis*, u32 dom, u32 node);
+void opt_rebuild_def_use(Func*);
 void opt_verify(Func*, const char* stage);
 
 int opt_val_in_inst_defs(const Inst*, Val);
diff --git a/src/opt/pass_analysis.c b/src/opt/pass_analysis.c
@@ -179,6 +179,123 @@ static int fixed_terminator_succ_count(const Inst* in, u32* count_out) {
   }
 }
 
+static void opt_use_add(Func* f, Val v, u32 b, u32 i, u8 kind, u32 op_idx,
+                        u32 pred_idx, Operand* op) {
+  if (v == VAL_NONE || v >= f->nvals) return;
+  if (f->opt_nuses == f->opt_uses_cap) {
+    u32 ncap = f->opt_uses_cap ? f->opt_uses_cap * 2u : 32u;
+    OptUse* uses = arena_zarray(f->arena, OptUse, ncap);
+    if (f->opt_uses)
+      memcpy(uses, f->opt_uses, sizeof(uses[0]) * f->opt_nuses);
+    f->opt_uses = uses;
+    f->opt_uses_cap = ncap;
+  }
+  u32 id = f->opt_nuses++;
+  OptUse* u = &f->opt_uses[id];
+  u->val = v;
+  u->block = b;
+  u->inst = i;
+  u->kind = kind;
+  u->operand_index = op_idx;
+  u->phi_pred_index = pred_idx;
+  u->operand = op;
+  u->next_for_val = f->opt_first_use_by_val[v];
+  f->opt_first_use_by_val[v] = id;
+}
+
+static void opt_use_add_operand(Func* f, u32 b, u32 i, u32 op_idx,
+                                Operand* op, int is_def) {
+  if (!op || is_def) return;
+  if (op->kind == OPK_REG) {
+    opt_use_add(f, (Val)op->v.reg, b, i, OPT_USE_OPERAND, op_idx,
+                OPT_USE_NONE, op);
+  } else if (op->kind == OPK_INDIRECT) {
+    opt_use_add(f, (Val)op->v.ind.base, b, i, OPT_USE_INDIRECT_BASE, op_idx,
+                OPT_USE_NONE, op);
+  }
+}
+
+static void opt_use_add_abivalue(Func* f, u32 b, u32 i, CGABIValue* v,
+                                 int storage_def) {
+  if (!v) return;
+  opt_use_add_operand(f, b, i, OPT_USE_NONE, &v->storage, storage_def);
+  for (u32 p = 0; p < v->nparts; ++p)
+    opt_use_add_operand(f, b, i, p, (Operand*)&v->parts[p].op, storage_def);
+}
+
+static void opt_collect_inst_uses(Func* f, u32 b, u32 i, Inst* in) {
+  for (u32 o = 0; o < in->nopnds; ++o) {
+    int is_def = 0;
+    if (in->opnds[o].kind == OPK_REG)
+      is_def = o == 0 && opt_val_in_inst_defs(in, (Val)in->opnds[o].v.reg);
+    opt_use_add_operand(f, b, i, o, &in->opnds[o], is_def);
+  }
+
+  switch ((IROp)in->op) {
+    case IR_PHI: {
+      IRPhiAux* aux = (IRPhiAux*)in->extra.aux;
+      if (!aux) break;
+      for (u32 p = 0; p < aux->npreds; ++p)
+        opt_use_add(f, aux->pred_vals[p], b, i, OPT_USE_PHI_INPUT,
+                    OPT_USE_NONE, p, NULL);
+      break;
+    }
+    case IR_CALL: {
+      IRCallAux* aux = (IRCallAux*)in->extra.aux;
+      if (!aux) break;
+      if (aux->use_plan_replay) {
+        opt_use_add_operand(f, b, i, OPT_USE_NONE, &aux->plan.callee, 0);
+        for (u32 a = 0; a < aux->plan.nargs; ++a)
+          opt_use_add_operand(f, b, i, a, &aux->plan.args[a].src, 0);
+      } else {
+        opt_use_add_operand(f, b, i, OPT_USE_NONE, &aux->desc.callee, 0);
+        for (u32 a = 0; a < aux->desc.nargs; ++a)
+          opt_use_add_abivalue(f, b, i, (CGABIValue*)&aux->desc.args[a], 0);
+      }
+      break;
+    }
+    case IR_RET: {
+      IRRetAux* aux = (IRRetAux*)in->extra.aux;
+      if (aux && aux->present) opt_use_add_abivalue(f, b, i, &aux->val, 0);
+      break;
+    }
+    case IR_SCOPE_BEGIN: {
+      IRScopeAux* aux = (IRScopeAux*)in->extra.aux;
+      if (aux) opt_use_add_operand(f, b, i, OPT_USE_NONE, &aux->desc.cond, 0);
+      break;
+    }
+    case IR_ASM_BLOCK: {
+      IRAsmAux* aux = (IRAsmAux*)in->extra.aux;
+      if (!aux) break;
+      for (u32 a = 0; a < aux->nin; ++a)
+        opt_use_add_operand(f, b, i, a, &aux->in_ops[a], 0);
+      break;
+    }
+    case IR_INTRINSIC: {
+      IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux;
+      if (!aux) break;
+      for (u32 a = 0; a < aux->narg; ++a)
+        opt_use_add_operand(f, b, i, a, &aux->args[a], 0);
+      break;
+    }
+    default:
+      break;
+  }
+}
+
+void opt_rebuild_def_use(Func* f) {
+  if (!f) return;
+  f->opt_nuses = 0;
+  f->opt_first_use_by_val =
+      arena_array(f->arena, u32, f->nvals ? f->nvals : 1u);
+  for (u32 v = 0; v < f->nvals; ++v) f->opt_first_use_by_val[v] = OPT_USE_NONE;
+  for (u32 b = 0; b < f->nblocks; ++b) {
+    Block* bl = &f->blocks[b];
+    for (u32 i = 0; i < bl->ninsts; ++i)
+      opt_collect_inst_uses(f, b, i, &bl->insts[i]);
+  }
+}
+
 static void verify_operand(Func* f, Inst* in, Operand* op, int is_def,
                            void* arg) {
   (void)in;
@@ -207,6 +324,12 @@ static void verify_values(Func* f, const char* stage) {
       if ((IROp)in->op == IR_PHI) {
         IRPhiAux* aux = (IRPhiAux*)in->extra.aux;
         if (!aux) opt_fail(f, stage, "phi missing aux", b, i);
+        if (in->def == VAL_NONE) opt_fail(f, stage, "phi missing def", b, i);
+        if (in->nopnds || in->opnds)
+          opt_fail(f, stage, "phi should not carry operands", b, i);
+        if (aux->slot_id > f->nframe_slots)
+          opt_fail(f, stage, "phi bad slot id", aux->slot_id,
+                   f->nframe_slots);
         if (aux->npreds != bl->npreds)
           opt_fail(f, stage, "phi pred count mismatch", aux->npreds,
                    bl->npreds);
@@ -222,6 +345,32 @@ static void verify_values(Func* f, const char* stage) {
   }
 }
 
+static void verify_def_use(Func* f, const char* stage) {
+  if (f->opt_rewritten) return;
+  opt_rebuild_def_use(f);
+  for (u32 u = 0; u < f->opt_nuses; ++u) {
+    OptUse* use = &f->opt_uses[u];
+    if (use->val == VAL_NONE || use->val >= f->nvals)
+      opt_fail(f, stage, "def-use bad use val", use->val, f->nvals);
+    if (use->block >= f->nblocks)
+      opt_fail(f, stage, "def-use bad block", use->block, f->nblocks);
+    if (use->inst >= f->blocks[use->block].ninsts)
+      opt_fail(f, stage, "def-use bad inst", use->inst,
+               f->blocks[use->block].ninsts);
+    if (f->val_def_block[use->val] >= f->nblocks)
+      opt_fail(f, stage, "def-use bad def block", use->val,
+               f->val_def_block[use->val]);
+  }
+  for (Val v = 1; v < f->nvals; ++v) {
+    for (u32 u = f->opt_first_use_by_val[v]; u != OPT_USE_NONE;
+         u = f->opt_uses[u].next_for_val) {
+      if (u >= f->opt_nuses) opt_fail(f, stage, "def-use bad next", v, u);
+      if (f->opt_uses[u].val != v)
+        opt_fail(f, stage, "def-use wrong value list", v, u);
+    }
+  }
+}
+
 void opt_verify(Func* f, const char* stage) {
   if (!f) return;
   if (f->nblocks && f->entry >= f->nblocks)
@@ -264,4 +413,5 @@ void opt_verify(Func* f, const char* stage) {
     seen_emit[b] = 1;
   }
   verify_values(f, stage);
+  verify_def_use(f, stage);
 }
diff --git a/src/opt/pass_ssa.c b/src/opt/pass_ssa.c
@@ -1,427 +1,665 @@
-/* pass_ssa.c — mem2reg + dominance-frontier SSA construction.
- *
- * Goal for Phase 3 (doc/OPT.md): build SSA without consuming it. The
- * output is discarded before replay, so this pass's job is shape
- * checking — no panics on the corpus.
- *
- * Algorithm (Cooper-Harvey-Kennedy iterative dominators + Cytron et al.
- * dominance-frontier phi insertion):
- *
- *   1. Postorder + reverse-postorder traversal of the CFG.
- *   2. Compute idom[] iteratively via the two-finger intersect.
- *   3. Compute DF[] from idom[] in one pass.
- *   4. For each promotable FrameSlot (no FSF_ADDR_TAKEN), compute the
- *      iterated dominance frontier of its defining blocks; insert
- *      IR_PHI at the start of each block in the IDF.
- *   5. Rename: DFS the dominator tree, maintain a stack per slot; on
- *      store push the stored Val, on load record (load_def → top) into
- *      a rename map, on each successor fill in this block's slot in
- *      that successor's phis. After processing children, pop.
- *
- * The rename map is built but its uses are intentionally NOT walked
- * across other instructions in the dry-run — that is the part that
- * mutates the IR for downstream passes, and Phase 3 discards the IR.
- * The phi-insertion + slot-stack walk is the part that exercises the
- * IR shape, which is what the dry-run is checking. */
-
-#include "opt/ir.h"
-#include "opt/opt.h"
+/* pass_ssa.c - mem2reg SSA construction and phi destruction for O2. */
+
+#include "opt/opt_internal.h"
 
 #include <string.h>
 
 #include "core/arena.h"
 #include "core/core.h"
 
-#define BLK_NONE 0xffffffffu
-
-/* ---- postorder ---- */
+typedef struct SlotStack {
+  Val* vals;
+  u32 n;
+  u32 cap;
+} SlotStack;
 
-typedef struct PostorderCtx {
+typedef struct RenameCtx {
   Func* f;
-  u32* po;     /* po[i] = block id at postorder position i */
-  u32* po_idx; /* po_idx[block] = postorder position of block */
-  u8* visited;
-  u32 count;
-} PostorderCtx;
-
-static void postorder_dfs(PostorderCtx* ctx, u32 b) {
-  if (ctx->visited[b]) return;
-  ctx->visited[b] = 1;
-  Block* bl = &ctx->f->blocks[b];
-  for (u32 s = 0; s < bl->nsucc; ++s) {
-    u32 t = bl->succ[s];
-    if (t < ctx->f->nblocks) postorder_dfs(ctx, t);
-  }
-  ctx->po[ctx->count] = b;
-  ctx->po_idx[b] = ctx->count;
-  ctx->count++;
+  OptAnalysis* analysis;
+  u8* promoted;
+  Val* repl;
+  u32 repl_cap;
+  SlotStack* stacks;
+} RenameCtx;
+
+typedef struct EdgeMove {
+  Val dst;
+  Val src;
+  CfreeCgTypeId type;
+  u8 cls;
+} EdgeMove;
+
+static u8 ssa_type_class(Func* f, CfreeCgTypeId ty) {
+  CfreeCgTypeKind kind = cfree_cg_type_kind((CfreeCompiler*)f->c, ty);
+  return kind == CFREE_CG_TYPE_FLOAT ? RC_FP : RC_INT;
+}
+
+static u32 opnd_slot_id(const Operand* op) {
+  if (!op || op->kind != OPK_LOCAL) return 0;
+  return (u32)op->v.frame_slot;
 }
 
-/* ---- dominators (Cooper-Harvey-Kennedy) ---- */
+static int base_slot_promotable(const Func* f, u32 slot_id) {
+  if (slot_id == 0 || slot_id > f->nframe_slots) return 0;
+  const IRFrameSlot* s = &f->frame_slots[slot_id - 1u];
+  if (s->kind != FS_LOCAL) return 0;
+  if (s->flags & (FSF_ADDR_TAKEN | FSF_VOLATILE)) return 0;
+  return 1;
+}
 
-static u32 dom_intersect(u32 b1, u32 b2, const u32* idom, const u32* po_idx) {
-  while (b1 != b2) {
-    while (po_idx[b1] < po_idx[b2]) b1 = idom[b1];
-    while (po_idx[b2] < po_idx[b1]) b2 = idom[b2];
-  }
-  return b1;
+static int operand_has_slot(const Operand* op, u32 slot_id) {
+  return op && op->kind == OPK_LOCAL && opnd_slot_id(op) == slot_id;
 }
 
-static u32* compute_idom(Func* f, const u32* po, const u32* po_idx, u32 ncount,
-                        u32 entry) {
-  u32* idom = arena_array(f->arena, u32, f->nblocks);
-  for (u32 b = 0; b < f->nblocks; ++b) idom[b] = BLK_NONE;
-  idom[entry] = entry;
-
-  int changed = 1;
-  while (changed) {
-    changed = 0;
-    /* Reverse postorder, skip the entry block. */
-    for (i32 i = (i32)ncount - 1; i >= 0; --i) {
-      u32 b = po[i];
-      if (b == entry) continue;
-      Block* bl = &f->blocks[b];
-      u32 new_idom = BLK_NONE;
-      for (u32 p = 0; p < bl->npreds; ++p) {
-        u32 pp = bl->preds[p];
-        if (idom[pp] != BLK_NONE) {
-          new_idom = (new_idom == BLK_NONE)
-                         ? pp
-                         : dom_intersect(pp, new_idom, idom, po_idx);
-        }
-      }
-      if (new_idom != BLK_NONE && idom[b] != new_idom) {
-        idom[b] = new_idom;
-        changed = 1;
+static int abivalue_has_slot(const CGABIValue* v, u32 slot_id) {
+  if (!v) return 0;
+  if (operand_has_slot(&v->storage, slot_id)) return 1;
+  for (u32 i = 0; i < v->nparts; ++i)
+    if (operand_has_slot(&v->parts[i].op, slot_id)) return 1;
+  return 0;
+}
+
+static int aux_has_slot(const Inst* in, u32 slot_id) {
+  switch ((IROp)in->op) {
+    case IR_CALL: {
+      IRCallAux* aux = (IRCallAux*)in->extra.aux;
+      if (!aux) return 0;
+      if (aux->use_plan_replay) {
+        if (operand_has_slot(&aux->plan.callee, slot_id)) return 1;
+        for (u32 i = 0; i < aux->plan.nargs; ++i)
+          if (operand_has_slot(&aux->plan.args[i].src, slot_id)) return 1;
+        for (u32 i = 0; i < aux->plan.nrets; ++i)
+          if (operand_has_slot(&aux->plan.rets[i].dst, slot_id)) return 1;
+      } else {
+        if (operand_has_slot(&aux->desc.callee, slot_id)) return 1;
+        for (u32 i = 0; i < aux->desc.nargs; ++i)
+          if (abivalue_has_slot(&aux->desc.args[i], slot_id)) return 1;
+        if (abivalue_has_slot(&aux->desc.ret, slot_id)) return 1;
       }
+      break;
+    }
+    case IR_RET: {
+      IRRetAux* aux = (IRRetAux*)in->extra.aux;
+      return aux && aux->present && abivalue_has_slot(&aux->val, slot_id);
+    }
+    case IR_SCOPE_BEGIN: {
+      IRScopeAux* aux = (IRScopeAux*)in->extra.aux;
+      return aux && operand_has_slot(&aux->desc.cond, slot_id);
+    }
+    case IR_ASM_BLOCK: {
+      IRAsmAux* aux = (IRAsmAux*)in->extra.aux;
+      if (!aux) return 0;
+      for (u32 i = 0; i < aux->nin; ++i)
+        if (operand_has_slot(&aux->in_ops[i], slot_id)) return 1;
+      for (u32 i = 0; i < aux->nout; ++i)
+        if (operand_has_slot(&aux->out_ops[i], slot_id)) return 1;
+      break;
     }
+    case IR_INTRINSIC: {
+      IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux;
+      if (!aux) return 0;
+      for (u32 i = 0; i < aux->narg; ++i)
+        if (operand_has_slot(&aux->args[i], slot_id)) return 1;
+      for (u32 i = 0; i < aux->ndst; ++i)
+        if (operand_has_slot(&aux->dsts[i], slot_id)) return 1;
+      break;
+    }
+    default:
+      break;
   }
-  return idom;
+  return 0;
 }
 
-/* ---- dominance frontier ---- */
-
-typedef struct DfSet {
-  u32* members;
-  u32 n, cap;
-} DfSet;
-
-static void df_add(Arena* a, DfSet* s, u32 b) {
-  for (u32 i = 0; i < s->n; ++i)
-    if (s->members[i] == b) return;
-  if (s->n == s->cap) {
-    u32 ncap = s->cap ? s->cap * 2u : 4u;
-    u32* nb = arena_array(a, u32, ncap);
-    if (s->members) memcpy(nb, s->members, sizeof(u32) * s->n);
-    s->members = nb;
-    s->cap = ncap;
+static int slot_access_promotable(const Inst* in, u32 slot_id) {
+  if ((IROp)in->op == IR_LOAD) {
+    if (in->nopnds < 2 || opnd_slot_id(&in->opnds[1]) != slot_id) return 1;
+    return in->opnds[0].kind == OPK_REG && !opt_mem_observable(&in->extra.mem);
   }
-  s->members[s->n++] = b;
+  if ((IROp)in->op == IR_STORE) {
+    if (in->nopnds < 2 || opnd_slot_id(&in->opnds[0]) != slot_id) return 1;
+    if (opt_mem_observable(&in->extra.mem)) return 0;
+    return in->opnds[1].kind == OPK_REG || in->opnds[1].kind == OPK_IMM;
+  }
+  for (u32 i = 0; i < in->nopnds; ++i)
+    if (opnd_slot_id(&in->opnds[i]) == slot_id) return 0;
+  if (aux_has_slot(in, slot_id)) return 0;
+  return 1;
 }
 
-static DfSet* compute_df(Func* f, const u32* idom) {
-  DfSet* df = arena_zarray(f->arena, DfSet, f->nblocks);
+static u8* find_promoted_slots(Func* f) {
+  u8* promoted = arena_zarray(f->arena, u8, f->nframe_slots + 1u);
+  for (u32 sid = 1; sid <= f->nframe_slots; ++sid) {
+    if (!base_slot_promotable(f, sid)) continue;
+    promoted[sid] = 1;
+  }
   for (u32 b = 0; b < f->nblocks; ++b) {
     Block* bl = &f->blocks[b];
-    if (bl->npreds < 2) continue;
-    if (idom[b] == BLK_NONE) continue;
-    for (u32 p = 0; p < bl->npreds; ++p) {
-      u32 runner = bl->preds[p];
-      while (runner != idom[b] && runner != BLK_NONE) {
-        df_add(f->arena, &df[runner], b);
-        runner = idom[runner];
+    for (u32 i = 0; i < bl->ninsts; ++i) {
+      Inst* in = &bl->insts[i];
+      for (u32 sid = 1; sid <= f->nframe_slots; ++sid) {
+        if (promoted[sid] && !slot_access_promotable(in, sid))
+          promoted[sid] = 0;
       }
     }
   }
-  return df;
+  return promoted;
 }
 
-/* ---- promotable slots ---- */
+static void stack_push(Arena* a, SlotStack* s, Val v) {
+  if (s->n == s->cap) {
+    u32 ncap = s->cap ? s->cap * 2u : 4u;
+    Val* vals = arena_array(a, Val, ncap);
+    if (s->vals) memcpy(vals, s->vals, sizeof(vals[0]) * s->n);
+    s->vals = vals;
+    s->cap = ncap;
+  }
+  s->vals[s->n++] = v;
+}
 
-/* Identify the FrameSlot a load/store address operand refers to.
- * Returns 0 if the operand is not OPK_LOCAL (i.e., not a direct
- * frame-slot reference) — those addresses route through computed Val
- * pointers and cannot be promoted by a slot-keyed pass. */
-static u32 opnd_slot_id(const Operand* op) {
-  if (op->kind != OPK_LOCAL) return 0;
-  return (u32)op->v.frame_slot;
+static Val stack_top(const SlotStack* s) {
+  return s && s->n ? s->vals[s->n - 1u] : VAL_NONE;
 }
 
-static int slot_promotable(const Func* f, u32 slot_id) {
-  if (slot_id == 0 || slot_id > f->nframe_slots) return 0;
-  const IRFrameSlot* s = &f->frame_slots[slot_id - 1];
-  if (s->flags & FSF_ADDR_TAKEN) return 0;
-  if (s->flags & FSF_VOLATILE) return 0;
-  /* Only locals are promotable for v1; params live in slots too but
-   * promoting them is a separate transform. */
-  if (s->kind != FS_LOCAL) return 0;
-  return 1;
+static Val resolve_repl(const RenameCtx* ctx, Val v) {
+  while (v != VAL_NONE && v < ctx->repl_cap && ctx->repl[v] != VAL_NONE &&
+         ctx->repl[v] != v) {
+    v = ctx->repl[v];
+  }
+  return v;
 }
 
-/* ---- phi insertion ---- */
+static void replace_use(Func* f, Inst* in, Operand* op, int is_def, void* arg) {
+  (void)in;
+  RenameCtx* ctx = (RenameCtx*)arg;
+  if (is_def || op->kind != OPK_REG) return;
+  Val old = (Val)op->v.reg;
+  if (old == VAL_NONE || old >= f->nvals) return;
+  Val repl = resolve_repl(ctx, old);
+  if (repl != VAL_NONE && repl != old) {
+    op->v.reg = (Reg)repl;
+    op->type = f->val_type[repl];
+    op->cls = f->val_cls[repl];
+  }
+}
 
-/* Insert n_new phi instructions at the start of block b, each tagged
- * with its slot via opnds[0] = synthetic "slot tag" Val (we reuse
- * IRPhiAux to carry pred info). The stored slot id lives in
- * extra.imm. */
-static void insert_phis(Func* f, u32 b, u32 n_new, const u32* phi_slots,
-                        const u32* phi_blocks_for_slot) {
-  if (!n_new) return;
+static void insert_phis(Func* f, u32 b, const u8* needs_phi) {
   Block* bl = &f->blocks[b];
-  u32 old = bl->ninsts;
-  u32 nnew = old + n_new;
-  Inst* nb = arena_zarray(f->arena, Inst, nnew);
-  /* Phis go first. */
-  for (u32 i = 0; i < n_new; ++i) {
-    Inst* in = &nb[i];
-    u32 slot_id = phi_slots[i];
-    const IRFrameSlot* s = &f->frame_slots[slot_id - 1];
+  u32 nphi = 0;
+  for (u32 sid = 1; sid <= f->nframe_slots; ++sid)
+    if (needs_phi[sid * f->nblocks + b]) ++nphi;
+  if (!nphi) return;
+
+  u32 old_nvals = f->nvals;
+  Inst* insts = arena_zarray(f->arena, Inst, bl->ninsts + nphi);
+  u32 w = 0;
+  for (u32 sid = 1; sid <= f->nframe_slots; ++sid) {
+    if (!needs_phi[sid * f->nblocks + b]) continue;
+    const IRFrameSlot* slot = &f->frame_slots[sid - 1u];
+    Inst* in = &insts[w++];
     in->op = IR_PHI;
-    in->type = s->type;
-    in->extra.imm = (i64)slot_id;
-    /* Allocate IRPhiAux with one slot per pred, initialized to
-     * VAL_NONE. The rename pass fills pred_vals later. */
+    in->type = slot->type;
+    in->def = ir_alloc_val(f, slot->type, ssa_type_class(f, slot->type));
+    f->val_def_block[in->def] = b;
+    f->val_def_inst[in->def] = w - 1u;
     IRPhiAux* aux = arena_znew(f->arena, IRPhiAux);
+    aux->slot_id = sid;
     aux->npreds = bl->npreds;
     if (bl->npreds) {
       aux->pred_blocks = arena_array(f->arena, u32, bl->npreds);
       aux->pred_vals = arena_zarray(f->arena, Val, bl->npreds);
       memcpy(aux->pred_blocks, bl->preds, sizeof(u32) * bl->npreds);
     }
-    /* Reuse extra union: imm carries slot, but we also need aux. We
-     * stash the IRPhiAux in opnds: opnds[0] is a sentinel pointer cast
-     * — that breaks the Val* type. Instead, we use a parallel side
-     * table rooted on the inst. To keep this self-contained without
-     * altering the Inst layout, we point in->opnds at a Val array of
-     * length 0 and rely on the aux pointer through the extra union.
-     *
-     * Layout choice: extra.aux = aux (carry the IRPhiAux pointer);
-     * imm-as-slot lives in a side table. */
     in->extra.aux = aux;
-    in->nopnds = 0;
-    in->opnds = NULL;
-    /* def Val: allocate a fresh value typed as the slot's type. */
-    /* We can't use val_alloc directly (it's static); set up the val
-     * table manually via an emit-equivalent. Simpler: piggy-back on
-     * ir_emit_const_i style by appending after current state. But we're
-     * mid-rebuild of the block. Defer: encode def slot via re-reading
-     * the val table. */
-    /* For Phase 3 dry-run, leave def = VAL_NONE on phis. The rename
-     * pass uses extra.aux->pred_vals to inspect phi shape; the
-     * dry-run discards before downstream passes need def. */
-    in->def = VAL_NONE;
-  }
-  /* Existing instructions shifted right. */
-  if (old) memcpy(nb + n_new, bl->insts, sizeof(Inst) * old);
-  bl->insts = nb;
-  bl->ninsts = nnew;
-  bl->cap = nnew;
-  /* val_def_inst for any val defined in this block has shifted by
-   * n_new. Walk the val table and update. */
-  for (u32 v = 1; v < f->nvals; ++v) {
-    if (f->val_def_block[v] == b) f->val_def_inst[v] += n_new;
   }
-  (void)phi_blocks_for_slot;
+  if (bl->ninsts) memcpy(insts + nphi, bl->insts, sizeof(Inst) * bl->ninsts);
+  bl->insts = insts;
+  bl->ninsts += nphi;
+  bl->cap = bl->ninsts;
+  for (Val v = 1; v < old_nvals; ++v)
+    if (f->val_def_block[v] == b) f->val_def_inst[v] += nphi;
 }
 
-/* ---- rename ---- */
+static void mark_def_blocks(Func* f, const u8* promoted, u8* def_blocks) {
+  for (u32 b = 0; b < f->nblocks; ++b) {
+    Block* bl = &f->blocks[b];
+    for (u32 i = 0; i < bl->ninsts; ++i) {
+      Inst* in = &bl->insts[i];
+      if ((IROp)in->op != IR_STORE || in->nopnds < 2) continue;
+      u32 sid = opnd_slot_id(&in->opnds[0]);
+      if (sid && promoted[sid]) def_blocks[sid * f->nblocks + b] = 1;
+    }
+  }
+}
 
-typedef struct SlotStack {
-  Val* stack;
-  u32 n, cap;
-} SlotStack;
+static void compute_phi_sites(Func* f, OptAnalysis* a, const u8* promoted,
+                              u8* needs_phi) {
+  u8* def_blocks = arena_zarray(f->arena, u8,
+                                (f->nframe_slots + 1u) * f->nblocks);
+  mark_def_blocks(f, promoted, def_blocks);
+  u32* work = arena_array(f->arena, u32, f->nblocks ? f->nblocks : 1u);
+  u8* queued = arena_zarray(f->arena, u8, f->nblocks ? f->nblocks : 1u);
 
-static void slot_push(Arena* a, SlotStack* s, Val v) {
-  if (s->n == s->cap) {
-    u32 ncap = s->cap ? s->cap * 2u : 4u;
-    Val* nb = arena_array(a, Val, ncap);
-    if (s->stack) memcpy(nb, s->stack, sizeof(Val) * s->n);
-    s->stack = nb;
-    s->cap = ncap;
+  for (u32 sid = 1; sid <= f->nframe_slots; ++sid) {
+    if (!promoted[sid]) continue;
+    memset(queued, 0, f->nblocks);
+    u32 wn = 0;
+    for (u32 b = 0; b < f->nblocks; ++b) {
+      if (!def_blocks[sid * f->nblocks + b]) continue;
+      queued[b] = 1;
+      work[wn++] = b;
+    }
+    while (wn) {
+      u32 x = work[--wn];
+      OptBlockList* df = &a->dom_frontier[x];
+      for (u32 i = 0; i < df->n; ++i) {
+        u32 y = df->items[i];
+        if (needs_phi[sid * f->nblocks + y]) continue;
+        needs_phi[sid * f->nblocks + y] = 1;
+        if (!queued[y]) {
+          queued[y] = 1;
+          work[wn++] = y;
+        }
+      }
+    }
   }
-  s->stack[s->n++] = v;
 }
 
-static Val slot_top(const SlotStack* s) {
-  return s->n ? s->stack[s->n - 1] : VAL_NONE;
+static void rewrite_store_immediate(RenameCtx* ctx, Inst* in, u32 b, u32 i,
+                                    u32 sid) {
+  Operand src = in->opnds[1];
+  Val v = ir_alloc_val(ctx->f, src.type, src.cls);
+  Operand* opnds = arena_array(ctx->f->arena, Operand, 1);
+  opnds[0] = src;
+  opnds[0].kind = OPK_REG;
+  opnds[0].v.reg = (Reg)v;
+  in->op = IR_LOAD_IMM;
+  in->type = src.type;
+  in->def = v;
+  in->opnds = opnds;
+  in->nopnds = 1;
+  in->extra.imm = src.v.imm;
+  ctx->f->val_def_block[v] = b;
+  ctx->f->val_def_inst[v] = i;
+  stack_push(ctx->f->arena, &ctx->stacks[sid], v);
 }
 
-static void rename_dfs(Func* f, u32 b, const u32* idom, SlotStack* slots) {
+static void rename_block(RenameCtx* ctx, u32 b) {
+  Func* f = ctx->f;
   Block* bl = &f->blocks[b];
-  /* Track per-slot push count so we can pop on exit. */
-  u32* pushed = arena_zarray(f->arena, u32, f->nframe_slots + 1);
+  u32* pushed = arena_zarray(f->arena, u32, f->nframe_slots + 1u);
 
-  /* 1. Process phis in this block: each phi has extra.imm = slot id.
-   * Push a synthetic phi value (VAL_NONE in dry-run; a "fresh Val"
-   * once we wire renaming). */
   for (u32 i = 0; i < bl->ninsts; ++i) {
     Inst* in = &bl->insts[i];
-    if (in->op != IR_PHI) break;
-    u32 slot_id = (u32)in->extra.imm;
-    if (slot_id == 0 || slot_id > f->nframe_slots) continue;
-    /* For dry-run, the phi's def is VAL_NONE; we still push so the
-     * stack has the right depth. Downstream passes (Phase 4+) will
-     * allocate a real Val here. */
-    slot_push(f->arena, &slots[slot_id], VAL_NONE);
-    pushed[slot_id]++;
+    if ((IROp)in->op != IR_PHI) break;
+    IRPhiAux* aux = (IRPhiAux*)in->extra.aux;
+    if (!aux || !aux->slot_id || !ctx->promoted[aux->slot_id]) continue;
+    stack_push(f->arena, &ctx->stacks[aux->slot_id], in->def);
+    pushed[aux->slot_id]++;
   }
 
-  /* 2. Process the rest of the block. */
   for (u32 i = 0; i < bl->ninsts; ++i) {
     Inst* in = &bl->insts[i];
-    if (in->op == IR_PHI) continue;
-    if (in->op == IR_STORE) {
-      /* IR_STORE opnds: [0] = addr, [1] = src. */
-      if (in->nopnds < 2) continue;
+    if ((IROp)in->op == IR_PHI) continue;
+    if ((IROp)in->op == IR_STORE && in->nopnds >= 2) {
       u32 sid = opnd_slot_id(&in->opnds[0]);
-      if (sid && slot_promotable(f, sid)) {
-        const Operand* src = &in->opnds[1];
-        Val v = (src->kind == OPK_REG) ? (Val)src->v.reg : VAL_NONE;
-        slot_push(f->arena, &slots[sid], v);
+      if (sid && ctx->promoted[sid]) {
+        Operand src = in->opnds[1];
+        if (src.kind == OPK_REG) {
+          stack_push(f->arena, &ctx->stacks[sid],
+                     resolve_repl(ctx, (Val)src.v.reg));
+          in->op = IR_NOP;
+          in->nopnds = 0;
+          in->opnds = NULL;
+          in->def = VAL_NONE;
+        } else {
+          rewrite_store_immediate(ctx, in, b, i, sid);
+        }
         pushed[sid]++;
+        continue;
       }
-      continue;
     }
-    if (in->op == IR_LOAD) {
-      /* IR_LOAD opnds: [0] = dst REG, [1] = addr. */
-      if (in->nopnds < 2) continue;
+    if ((IROp)in->op == IR_LOAD && in->nopnds >= 2) {
       u32 sid = opnd_slot_id(&in->opnds[1]);
-      if (sid && slot_promotable(f, sid)) {
-        /* Touching slot_top exercises the stack invariant — that's
-         * the shape check we want. The rewrite map proper would walk
-         * uses; we skip that in the dry-run (output discarded). */
-        (void)slot_top(&slots[sid]);
+      if (sid && ctx->promoted[sid] && in->def != VAL_NONE) {
+        Val cur = stack_top(&ctx->stacks[sid]);
+        if (cur == VAL_NONE) continue;
+        if (in->def < ctx->repl_cap)
+          ctx->repl[in->def] = resolve_repl(ctx, cur);
+        in->op = IR_NOP;
+        in->nopnds = 0;
+        in->opnds = NULL;
+        in->def = VAL_NONE;
+        continue;
       }
-      continue;
     }
+    opt_walk_inst_operands(f, in, replace_use, ctx);
   }
 
-  /* 3. Fill our slot in each successor's phis. */
   for (u32 s = 0; s < bl->nsucc; ++s) {
     u32 succ = bl->succ[s];
     if (succ >= f->nblocks) continue;
     Block* sb = &f->blocks[succ];
-    /* Find which pred index `b` is in succ. */
-    u32 pred_idx = 0;
-    int found = 0;
+    u32 pred_idx = OPT_USE_NONE;
     for (u32 p = 0; p < sb->npreds; ++p) {
       if (sb->preds[p] == b) {
         pred_idx = p;
-        found = 1;
         break;
       }
     }
-    if (!found) continue;
+    if (pred_idx == OPT_USE_NONE) continue;
     for (u32 i = 0; i < sb->ninsts; ++i) {
-      Inst* in = &sb->insts[i];
-      if (in->op != IR_PHI) break;
-      u32 slot_id = (u32)in->extra.imm;
-      if (slot_id == 0 || slot_id > f->nframe_slots) continue;
-      IRPhiAux* aux = (IRPhiAux*)in->extra.aux;
-      if (!aux || pred_idx >= aux->npreds) continue;
-      aux->pred_vals[pred_idx] = slot_top(&slots[slot_id]);
+      Inst* phi = &sb->insts[i];
+      if ((IROp)phi->op != IR_PHI) break;
+      IRPhiAux* aux = (IRPhiAux*)phi->extra.aux;
+      if (!aux || !aux->slot_id || !ctx->promoted[aux->slot_id]) continue;
+      aux->pred_vals[pred_idx] =
+          resolve_repl(ctx, stack_top(&ctx->stacks[aux->slot_id]));
     }
   }
 
-  /* 4. Recurse into immediate dom children. */
-  for (u32 c = 0; c < f->nblocks; ++c) {
-    if (c == b) continue;
-    if (idom[c] == b) rename_dfs(f, c, idom, slots);
-  }
+  OptBlockList* children = &ctx->analysis->dom_children[b];
+  for (u32 i = 0; i < children->n; ++i) rename_block(ctx, children->items[i]);
 
-  /* 5. Pop. */
-  for (u32 sid = 0; sid <= f->nframe_slots; ++sid) {
+  for (u32 sid = 1; sid <= f->nframe_slots; ++sid) {
     while (pushed[sid]--) {
-      if (slots[sid].n) slots[sid].n--;
+      if (ctx->stacks[sid].n) --ctx->stacks[sid].n;
     }
   }
 }
 
-/* ---- main entry ---- */
+static void replace_phi_inputs(Func* f, RenameCtx* ctx) {
+  for (u32 b = 0; b < f->nblocks; ++b) {
+    Block* bl = &f->blocks[b];
+    for (u32 i = 0; i < bl->ninsts; ++i) {
+      Inst* in = &bl->insts[i];
+      if ((IROp)in->op != IR_PHI) break;
+      IRPhiAux* aux = (IRPhiAux*)in->extra.aux;
+      if (!aux) continue;
+      for (u32 p = 0; p < aux->npreds; ++p)
+        aux->pred_vals[p] = resolve_repl(ctx, aux->pred_vals[p]);
+    }
+  }
+}
 
 void opt_build_ssa(Func* f) {
-  if (f->nblocks == 0) return;
-
-  /* Postorder traversal from entry. */
-  PostorderCtx pctx;
-  pctx.f = f;
-  pctx.po = arena_array(f->arena, u32, f->nblocks);
-  pctx.po_idx = arena_array(f->arena, u32, f->nblocks);
-  pctx.visited = arena_zarray(f->arena, u8, f->nblocks);
-  pctx.count = 0;
-  for (u32 b = 0; b < f->nblocks; ++b) pctx.po_idx[b] = 0;
-  postorder_dfs(&pctx, f->entry);
-  /* Unreachable blocks: never visited. Skip them in the dom analysis;
-   * the dry-run shouldn't crash on them. */
-
-  u32* idom = compute_idom(f, pctx.po, pctx.po_idx, pctx.count, f->entry);
-  DfSet* df = compute_df(f, idom);
-
-  /* For each promotable slot, find defining blocks → iterated DF. */
-  if (f->nframe_slots == 0) return;
-  u8* needs_phi_storage =
-      arena_zarray(f->arena, u8, (f->nframe_slots + 1) * f->nblocks);
-#define NEEDS_PHI(slot, blk) \
-  needs_phi_storage[((slot) * f->nblocks) + (blk)]
-
-  /* Worklist algo per slot. */
-  u32* worklist = arena_array(f->arena, u32, f->nblocks);
-  u8* on_worklist = arena_zarray(f->arena, u8, f->nblocks);
+  if (!f || f->nblocks == 0 || f->nframe_slots == 0) {
+    if (f) opt_rebuild_def_use(f);
+    return;
+  }
 
-  for (u32 sid = 1; sid <= f->nframe_slots; ++sid) {
-    if (!slot_promotable(f, sid)) continue;
-    /* Reset per-slot worklist state. */
-    for (u32 i = 0; i < f->nblocks; ++i) on_worklist[i] = 0;
-    u32 wn = 0;
-    /* Seed with blocks containing a store to this slot. */
-    for (u32 b = 0; b < f->nblocks; ++b) {
-      Block* bl = &f->blocks[b];
-      for (u32 i = 0; i < bl->ninsts; ++i) {
-        Inst* in = &bl->insts[i];
-        if (in->op == IR_STORE && in->nopnds >= 1 &&
-            opnd_slot_id(&in->opnds[0]) == sid) {
-          if (!on_worklist[b]) {
-            on_worklist[b] = 1;
-            worklist[wn++] = b;
-          }
-          break;
+  OptAnalysis a;
+  opt_analysis_build_order(f, &a);
+  opt_analysis_build_dominators(f, &a);
+  opt_analysis_build_dom_frontier(f, &a);
+
+  u8* promoted = find_promoted_slots(f);
+  u8* needs_phi =
+      arena_zarray(f->arena, u8, (f->nframe_slots + 1u) * f->nblocks);
+  compute_phi_sites(f, &a, promoted, needs_phi);
+  for (u32 b = 0; b < f->nblocks; ++b) insert_phis(f, b, needs_phi);
+
+  RenameCtx ctx;
+  memset(&ctx, 0, sizeof ctx);
+  ctx.f = f;
+  ctx.analysis = &a;
+  ctx.promoted = promoted;
+  ctx.repl_cap = f->vals_cap ? f->vals_cap : f->nvals;
+  ctx.repl = arena_zarray(f->arena, Val, ctx.repl_cap ? ctx.repl_cap : 1u);
+  ctx.stacks = arena_zarray(f->arena, SlotStack, f->nframe_slots + 1u);
+  rename_block(&ctx, f->entry);
+
+  for (u32 b = 0; b < f->nblocks; ++b) {
+    Block* bl = &f->blocks[b];
+    for (u32 i = 0; i < bl->ninsts; ++i)
+      opt_walk_inst_operands(f, &bl->insts[i], replace_use, &ctx);
+  }
+  replace_phi_inputs(f, &ctx);
+  opt_rebuild_def_use(f);
+}
+
+static int ssa_is_terminator(const Inst* in) {
+  switch ((IROp)in->op) {
+    case IR_BR:
+    case IR_CONDBR:
+    case IR_CMP_BRANCH:
+    case IR_SWITCH:
+    case IR_INDIRECT_BRANCH:
+    case IR_RET:
+    case IR_BREAK_TO:
+    case IR_CONTINUE_TO:
+      return 1;
+    case IR_INTRINSIC: {
+      IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux;
+      return aux && (aux->kind == INTRIN_LONGJMP ||
+                     aux->kind == INTRIN_TRAP ||
+                     aux->kind == INTRIN_UNREACHABLE);
+    }
+    default:
+      return 0;
+  }
+}
+
+static Inst make_copy_inst(Func* f, Val dst, Val src, CfreeCgTypeId ty, u8 cls) {
+  Inst in;
+  memset(&in, 0, sizeof in);
+  in.op = IR_COPY;
+  in.type = ty;
+  in.def = dst;
+  in.nopnds = 2;
+  in.opnds = arena_array(f->arena, Operand, 2);
+  in.opnds[0].kind = OPK_REG;
+  in.opnds[0].type = ty;
+  in.opnds[0].cls = cls;
+  in.opnds[0].v.reg = (Reg)dst;
+  in.opnds[1].kind = OPK_REG;
+  in.opnds[1].type = ty;
+  in.opnds[1].cls = cls;
+  in.opnds[1].v.reg = (Reg)src;
+  return in;
+}
+
+static void insert_edge_moves(Func* f, u32 b, const EdgeMove* moves, u32 n) {
+  if (!n) return;
+  Block* bl = &f->blocks[b];
+  u32 term = bl->ninsts && ssa_is_terminator(&bl->insts[bl->ninsts - 1u]);
+  u32 insert_at = bl->ninsts - term;
+  u32 extra = n == 1 ? 1u : n * 2u;
+  Inst* insts = arena_zarray(f->arena, Inst, bl->ninsts + extra);
+  if (insert_at) memcpy(insts, bl->insts, sizeof(Inst) * insert_at);
+  u32 w = insert_at;
+  if (n == 1) {
+    insts[w++] = make_copy_inst(f, moves[0].dst, moves[0].src, moves[0].type,
+                                moves[0].cls);
+  } else {
+    Val* temps = arena_array(f->arena, Val, n);
+    for (u32 i = 0; i < n; ++i) {
+      temps[i] = ir_alloc_val(f, moves[i].type, moves[i].cls);
+      f->val_def_block[temps[i]] = b;
+      f->val_def_inst[temps[i]] = w;
+      insts[w++] = make_copy_inst(f, temps[i], moves[i].src, moves[i].type,
+                                  moves[i].cls);
+    }
+    for (u32 i = 0; i < n; ++i) {
+      insts[w] = make_copy_inst(f, moves[i].dst, temps[i], moves[i].type,
+                                moves[i].cls);
+      f->val_def_block[moves[i].dst] = b;
+      f->val_def_inst[moves[i].dst] = w;
+      ++w;
+    }
+  }
+  if (term) insts[w++] = bl->insts[bl->ninsts - 1u];
+  bl->insts = insts;
+  bl->ninsts = w;
+  bl->cap = w;
+  if (n == 1) {
+    f->val_def_block[moves[0].dst] = b;
+    f->val_def_inst[moves[0].dst] = insert_at;
+  }
+}
+
+static void replace_succ_ref(Func* f, u32 pred, u32 old_succ, u32 new_succ) {
+  Block* bl = &f->blocks[pred];
+  for (u32 s = 0; s < bl->nsucc; ++s) {
+    if (bl->succ[s] == old_succ) bl->succ[s] = new_succ;
+  }
+  if (!bl->ninsts) return;
+  Inst* term = &bl->insts[bl->ninsts - 1u];
+  if ((IROp)term->op == IR_SWITCH) {
+    IRSwitchAux* aux = (IRSwitchAux*)term->extra.aux;
+    if (!aux) return;
+    for (u32 i = 0; i < aux->ncases; ++i)
+      if (aux->cases[i].block == old_succ) aux->cases[i].block = new_succ;
+    if (aux->default_block == old_succ) aux->default_block = new_succ;
+  } else if ((IROp)term->op == IR_INDIRECT_BRANCH) {
+    IRIndirectAux* aux = (IRIndirectAux*)term->extra.aux;
+    if (!aux) return;
+    for (u32 i = 0; i < aux->ntargets; ++i)
+      if (aux->targets[i] == old_succ) aux->targets[i] = new_succ;
+  }
+}
+
+static void emit_order_insert_after(Func* f, u32 after, u32 block) {
+  for (u32 i = 0; i < f->emit_order_n; ++i)
+    if (f->emit_order[i] == block) return;
+  if (f->emit_order_n == f->emit_order_cap) {
+    u32 ncap = f->emit_order_cap ? f->emit_order_cap * 2u : 8u;
+    u32* order = arena_array(f->arena, u32, ncap);
+    if (f->emit_order)
+      memcpy(order, f->emit_order, sizeof(order[0]) * f->emit_order_n);
+    f->emit_order = order;
+    f->emit_order_cap = ncap;
+  }
+  u32 pos = f->emit_order_n;
+  for (u32 i = 0; i < f->emit_order_n; ++i) {
+    if (f->emit_order[i] == after) {
+      pos = i + 1u;
+      break;
+    }
+  }
+  for (u32 i = f->emit_order_n; i > pos; --i)
+    f->emit_order[i] = f->emit_order[i - 1u];
+  f->emit_order[pos] = block;
+  ++f->emit_order_n;
+}
+
+static int edge_is_fallthrough(Func* f, u32 pred, u32 succ) {
+  Block* bl = &f->blocks[pred];
+  if (!bl->ninsts) return 0;
+  IROp op = (IROp)bl->insts[bl->ninsts - 1u].op;
+  if (op != IR_CMP_BRANCH && op != IR_CONDBR) return 0;
+  return bl->nsucc >= 2 && bl->succ[1] == succ;
+}
+
+static u32 split_edge(Func* f, u32 pred, u32 succ) {
+  u32 edge = ir_block_new(f);
+  Block* eb = &f->blocks[edge];
+  Inst* br = ir_emit(f, edge, IR_BR);
+  (void)br;
+  eb->succ[0] = succ;
+  eb->nsucc = 1;
+  int place_after = edge_is_fallthrough(f, pred, succ);
+  replace_succ_ref(f, pred, succ, edge);
+  if (succ < f->nblocks) {
+    Block* sb = &f->blocks[succ];
+    for (u32 i = 0; i < sb->ninsts; ++i) {
+      Inst* phi = &sb->insts[i];
+      if ((IROp)phi->op != IR_PHI) break;
+      IRPhiAux* aux = (IRPhiAux*)phi->extra.aux;
+      if (!aux) continue;
+      for (u32 p = 0; p < aux->npreds; ++p) {
+        if (aux->pred_blocks[p] == pred) {
+          aux->pred_blocks[p] = edge;
+          aux->pred_vals[p] = phi->def;
         }
       }
     }
-    /* Iterated DF. */
-    while (wn) {
-      u32 x = worklist[--wn];
-      DfSet* d = &df[x];
-      for (u32 i = 0; i < d->n; ++i) {
-        u32 y = d->members[i];
-        if (NEEDS_PHI(sid, y)) continue;
-        NEEDS_PHI(sid, y) = 1;
-        if (!on_worklist[y]) {
-          on_worklist[y] = 1;
-          worklist[wn++] = y;
+  }
+  if (place_after) {
+    emit_order_insert_after(f, pred, edge);
+  } else {
+    ir_note_emit(f, edge);
+  }
+  return edge;
+}
+
+static void realign_phi_preds(Func* f) {
+  for (u32 b = 0; b < f->nblocks; ++b) {
+    Block* bl = &f->blocks[b];
+    for (u32 i = 0; i < bl->ninsts; ++i) {
+      Inst* phi = &bl->insts[i];
+      if ((IROp)phi->op != IR_PHI) break;
+      IRPhiAux* aux = (IRPhiAux*)phi->extra.aux;
+      if (!aux || aux->npreds == bl->npreds) {
+        if (!aux) continue;
+      }
+      u32 old_n = aux->npreds;
+      u32* old_blocks = aux->pred_blocks;
+      Val* old_vals = aux->pred_vals;
+      u32* pred_blocks = bl->npreds ? arena_array(f->arena, u32, bl->npreds)
+                                    : NULL;
+      Val* pred_vals = bl->npreds ? arena_zarray(f->arena, Val, bl->npreds)
+                                  : NULL;
+      for (u32 p = 0; p < bl->npreds; ++p) {
+        pred_blocks[p] = bl->preds[p];
+        pred_vals[p] = phi->def;
+        for (u32 old = 0; old < old_n; ++old) {
+          if (old_blocks && old_blocks[old] == bl->preds[p]) {
+            pred_vals[p] = old_vals ? old_vals[old] : VAL_NONE;
+            break;
+          }
         }
       }
+      aux->npreds = bl->npreds;
+      aux->pred_blocks = pred_blocks;
+      aux->pred_vals = pred_vals;
     }
   }
+}
 
-  /* Insert phis per block. */
+void opt_make_conventional_ssa(Func* f) {
+  if (!f) return;
   for (u32 b = 0; b < f->nblocks; ++b) {
-    u32 nphi = 0;
-    for (u32 sid = 1; sid <= f->nframe_slots; ++sid) {
-      if (NEEDS_PHI(sid, b)) nphi++;
-    }
-    if (!nphi) continue;
-    u32* slots_arr = arena_array(f->arena, u32, nphi);
-    u32 k = 0;
-    for (u32 sid = 1; sid <= f->nframe_slots; ++sid) {
-      if (NEEDS_PHI(sid, b)) slots_arr[k++] = sid;
+    Block* bl = &f->blocks[b];
+    if (!bl->ninsts || (IROp)bl->insts[0].op != IR_PHI) continue;
+    for (u32 p = 0; p < bl->npreds; ++p) {
+      u32 pred = bl->preds[p];
+      EdgeMove* moves = arena_array(f->arena, EdgeMove, bl->ninsts);
+      u32 n = 0;
+      for (u32 i = 0; i < bl->ninsts; ++i) {
+        Inst* phi = &bl->insts[i];
+        if ((IROp)phi->op != IR_PHI) break;
+        IRPhiAux* aux = (IRPhiAux*)phi->extra.aux;
+        if (!aux || p >= aux->npreds) continue;
+        Val src = aux->pred_vals[p];
+        if (src == VAL_NONE || src == phi->def) continue;
+        moves[n].dst = phi->def;
+        moves[n].src = src;
+        moves[n].type = phi->type;
+        moves[n].cls = f->val_cls[phi->def];
+        ++n;
+      }
+      if (!n) continue;
+      u32 target = pred;
+      if (pred >= f->nblocks) continue;
+      if (f->blocks[pred].nsucc != 1) target = split_edge(f, pred, b);
+      insert_edge_moves(f, target, moves, n);
     }
-    insert_phis(f, b, nphi, slots_arr, NULL);
   }
+  opt_build_cfg(f);
+  realign_phi_preds(f);
+  opt_rebuild_def_use(f);
+}
 
-  /* Rename phase: DFS from entry over the dominator tree. */
-  SlotStack* slots = arena_zarray(f->arena, SlotStack, f->nframe_slots + 1);
-  rename_dfs(f, f->entry, idom, slots);
-
-#undef NEEDS_PHI
+void opt_undo_ssa(Func* f) {
+  if (!f) return;
+  for (u32 b = 0; b < f->nblocks; ++b) {
+    Block* bl = &f->blocks[b];
+    u32 w = 0;
+    for (u32 i = 0; i < bl->ninsts; ++i) {
+      if ((IROp)bl->insts[i].op == IR_PHI) continue;
+      bl->insts[w++] = bl->insts[i];
+    }
+    bl->ninsts = w;
+  }
+  opt_rebuild_def_use(f);
 }
diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c
@@ -317,6 +317,17 @@ static Inst* emit_load_local(Func* f, u32 b, Val dst, FrameSlot fs,
   return in;
 }
 
+static Inst* emit_store_local(Func* f, u32 b, FrameSlot fs, Val src,
+                              CfreeCgTypeId ty, u16 flags) {
+  Inst* in = ir_emit(f, b, IR_STORE);
+  in->opnds = arena_array(f->arena, Operand, 2);
+  in->opnds[0] = op_local_(fs, ty);
+  in->opnds[1] = op_reg_(src, ty);
+  in->nopnds = 2;
+  in->extra.mem = mem_local_(fs, ty, 4, flags);
+  return in;
+}
+
 static Inst* emit_call_void(Func* f, u32 b) {
   Inst* in = ir_emit(f, b, IR_CALL);
   IRCallAux* aux = arena_znew(f->arena, IRCallAux);
@@ -1875,6 +1886,172 @@ static void opt_analysis_dominators_and_frontier(void) {
   tc_fini(&tc);
 }
 
+static void opt_ssa_diamond_mem2reg_phi(void) {
+  TestCtx tc;
+  tc_init(&tc);
+  Func* f = new_func(&tc);
+  FrameSlot fs = add_frame_slot(f, tc.i32, FS_LOCAL, 4, 0);
+  u32 entry = f->entry;
+  u32 then_b = ir_block_new(f);
+  u32 else_b = ir_block_new(f);
+  u32 join = ir_block_new(f);
+  ir_note_emit(f, then_b);
+  ir_note_emit(f, else_b);
+  ir_note_emit(f, join);
+
+  emit_test_branch(f, entry, then_b, else_b, tc.i32);
+  Val tv = add_val(f, tc.i32);
+  Val ev = add_val(f, tc.i32);
+  Val out = add_val(f, tc.i32);
+  emit_load_imm(f, then_b, tv, tc.i32, 11);
+  emit_store_local(f, then_b, fs, tv, tc.i32, 0);
+  emit_br_to(f, then_b, join);
+  emit_load_imm(f, else_b, ev, tc.i32, 22);
+  emit_store_local(f, else_b, fs, ev, tc.i32, 0);
+  emit_br_to(f, else_b, join);
+  emit_load_local(f, join, out, fs, tc.i32, 0);
+  emit_ret_val(f, join, out, tc.i32);
+
+  opt_build_cfg(f);
+  opt_build_ssa(f);
+  opt_verify(f, "test-ssa-diamond");
+  EXPECT(count_op(f, IR_PHI) == 1, "diamond should insert one phi");
+  EXPECT(count_op(f, IR_STORE) == 0, "promoted stores should be removed");
+  EXPECT(count_op(f, IR_LOAD) == 0, "promoted load should be removed");
+  Inst* phi = &f->blocks[join].insts[0];
+  IRPhiAux* aux = (IRPhiAux*)phi->extra.aux;
+  EXPECT((IROp)phi->op == IR_PHI && phi->def != VAL_NONE,
+         "phi should have a real def");
+  EXPECT(aux && aux->slot_id == (u32)fs, "phi should carry slot id in aux");
+  EXPECT(aux && aux->npreds == f->blocks[join].npreds,
+         "phi predecessor count should match block preds");
+
+  opt_make_conventional_ssa(f);
+  opt_verify(f, "test-ssa-diamond-conventional");
+  opt_undo_ssa(f);
+  opt_verify(f, "test-ssa-diamond-undo");
+  EXPECT(count_op(f, IR_PHI) == 0, "undo_ssa should remove phis");
+  EXPECT(count_op(f, IR_COPY) >= 2, "phi lowering should insert edge copies");
+  tc_fini(&tc);
+}
+
+static void opt_ssa_loop_carried_phi(void) {
+  TestCtx tc;
+  tc_init(&tc);
+  Func* f = new_func(&tc);
+  FrameSlot fs = add_frame_slot(f, tc.i32, FS_LOCAL, 4, 0);
+  u32 entry = f->entry;
+  u32 header = ir_block_new(f);
+  u32 body = ir_block_new(f);
+  u32 exit = ir_block_new(f);
+  ir_note_emit(f, header);
+  ir_note_emit(f, body);
+  ir_note_emit(f, exit);
+
+  Val zero = add_val(f, tc.i32);
+  Val cur = add_val(f, tc.i32);
+  Val one = add_val(f, tc.i32);
+  Val next = add_val(f, tc.i32);
+  Val out = add_val(f, tc.i32);
+  emit_load_imm(f, entry, zero, tc.i32, 0);
+  emit_store_local(f, entry, fs, zero, tc.i32, 0);
+  emit_br_to(f, entry, header);
+  emit_load_local(f, header, cur, fs, tc.i32, 0);
+  emit_test_branch(f, header, body, exit, tc.i32);
+  emit_load_imm(f, body, one, tc.i32, 1);
+  emit_binop(f, body, next, cur, one, tc.i32);
+  emit_store_local(f, body, fs, next, tc.i32, 0);
+  emit_br_to(f, body, header);
+  emit_load_local(f, exit, out, fs, tc.i32, 0);
+  emit_ret_val(f, exit, out, tc.i32);
+
+  opt_build_cfg(f);
+  opt_build_ssa(f);
+  opt_verify(f, "test-ssa-loop");
+  EXPECT(count_op(f, IR_PHI) == 1, "loop should insert one carried phi");
+  EXPECT((IROp)f->blocks[header].insts[0].op == IR_PHI,
+         "loop-carried phi should be in the header");
+  EXPECT(count_op(f, IR_STORE) == 0, "loop promoted stores should be removed");
+  EXPECT(count_op(f, IR_LOAD) == 0, "loop promoted loads should be removed");
+  opt_make_conventional_ssa(f);
+  opt_undo_ssa(f);
+  opt_verify(f, "test-ssa-loop-undo");
+  tc_fini(&tc);
+}
+
+static void opt_ssa_non_promotable_slots_stay_memory(void) {
+  TestCtx tc;
+  tc_init(&tc);
+  Func* f = new_func(&tc);
+  FrameSlot fs = add_frame_slot(f, tc.i32, FS_LOCAL, 4, FSF_ADDR_TAKEN);
+  Val v = add_val(f, tc.i32);
+  Val out = add_val(f, tc.i32);
+  emit_load_imm(f, f->entry, v, tc.i32, 7);
+  emit_store_local(f, f->entry, fs, v, tc.i32, 0);
+  emit_load_local(f, f->entry, out, fs, tc.i32, 0);
+  emit_ret_val(f, f->entry, out, tc.i32);
+  opt_build_cfg(f);
+  opt_build_ssa(f);
+  opt_verify(f, "test-ssa-addr-taken");
+  EXPECT(count_op(f, IR_STORE) == 1 && count_op(f, IR_LOAD) == 1,
+         "address-taken slot should remain in memory");
+
+  Func* g = new_func(&tc);
+  fs = add_frame_slot(g, tc.i32, FS_LOCAL, 4, FSF_VOLATILE);
+  v = add_val(g, tc.i32);
+  out = add_val(g, tc.i32);
+  emit_load_imm(g, g->entry, v, tc.i32, 9);
+  emit_store_local(g, g->entry, fs, v, tc.i32, MF_VOLATILE);
+  emit_load_local(g, g->entry, out, fs, tc.i32, MF_VOLATILE);
+  emit_ret_val(g, g->entry, out, tc.i32);
+  opt_build_cfg(g);
+  opt_build_ssa(g);
+  opt_verify(g, "test-ssa-volatile");
+  EXPECT(count_op(g, IR_STORE) == 1 && count_op(g, IR_LOAD) == 1,
+         "volatile slot should remain in memory");
+  tc_fini(&tc);
+}
+
+static void opt_ssa_conventional_splits_critical_edge(void) {
+  TestCtx tc;
+  tc_init(&tc);
+  Func* f = new_func(&tc);
+  FrameSlot fs = add_frame_slot(f, tc.i32, FS_LOCAL, 4, 0);
+  u32 entry = f->entry;
+  u32 other = ir_block_new(f);
+  u32 join = ir_block_new(f);
+  u32 exit = ir_block_new(f);
+  ir_note_emit(f, other);
+  ir_note_emit(f, join);
+  ir_note_emit(f, exit);
+
+  Val a = add_val(f, tc.i32);
+  Val b = add_val(f, tc.i32);
+  Val out = add_val(f, tc.i32);
+  emit_load_imm(f, entry, a, tc.i32, 1);
+  emit_store_local(f, entry, fs, a, tc.i32, 0);
+  emit_test_branch(f, entry, join, other, tc.i32);
+  emit_load_imm(f, other, b, tc.i32, 2);
+  emit_store_local(f, other, fs, b, tc.i32, 0);
+  emit_br_to(f, other, join);
+  emit_load_local(f, join, out, fs, tc.i32, 0);
+  emit_ret_val(f, join, out, tc.i32);
+  ir_emit(f, exit, IR_RET);
+
+  opt_build_cfg(f);
+  u32 before_blocks = f->nblocks;
+  opt_build_ssa(f);
+  EXPECT(count_op(f, IR_PHI) == 1, "critical-edge test should build a phi");
+  opt_make_conventional_ssa(f);
+  opt_verify(f, "test-ssa-critical-edge");
+  EXPECT(f->nblocks > before_blocks,
+         "conventional SSA should split the critical edge");
+  opt_undo_ssa(f);
+  opt_verify(f, "test-ssa-critical-edge-undo");
+  EXPECT(count_op(f, IR_PHI) == 0, "critical-edge undo should remove phi");
+  tc_fini(&tc);
+}
+
 static void opt_jump_cleanup_forwards_branch_targets(void) {
   TestCtx tc;
   tc_init(&tc);
@@ -3911,6 +4088,10 @@ int main(void) {
   opt_cfg_prunes_unreachable();
   opt_cfg_preserves_scope_edges();
   opt_analysis_dominators_and_frontier();
+  opt_ssa_diamond_mem2reg_phi();
+  opt_ssa_loop_carried_phi();
+  opt_ssa_non_promotable_slots_stay_memory();
+  opt_ssa_conventional_splits_critical_edge();
   opt_jump_cleanup_forwards_branch_targets();
   opt_jump_cleanup_inverts_to_remove_jump_block();
   opt_jump_cleanup_keeps_conditional_fallthrough_block();

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	src/opt/ir.h	\|	22	++++++++++++++++++++++
M	src/opt/opt.c	\|	15	+++++++++++++--
M	src/opt/opt_internal.h	\|	3	+++
M	src/opt/pass_analysis.c	\|	150	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	src/opt/pass_ssa.c	\|	878	++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
M	test/opt/opt_test.c	\|	181	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++