commit 87eb85cdc69c09fe71ad21e4d29c0f832bf05b77
parent cfe4be02ba8c5cb7e022bb550458fbcd40ef7d14
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 21 May 2026 07:24:16 -0700
Implement production SSA for O2
Diffstat:
6 files changed, 927 insertions(+), 322 deletions(-)
diff --git a/src/opt/ir.h b/src/opt/ir.h
@@ -331,6 +331,24 @@ typedef struct OptValInfo {
u32 forbidden_hard_regs; /* bit r means Val may not allocate hard reg r. */
} OptValInfo;
+typedef enum OptUseKind {
+ OPT_USE_OPERAND,
+ OPT_USE_INDIRECT_BASE,
+ OPT_USE_PHI_INPUT,
+} OptUseKind;
+
+typedef struct OptUse {
+ Val val;
+ u32 block;
+ u32 inst;
+ u32 next_for_val;
+ u32 operand_index;
+ u32 phi_pred_index;
+ Operand* operand;
+ u8 kind; /* OptUseKind */
+ u8 pad[3];
+} OptUse;
+
typedef struct Func {
Arena* arena;
Compiler* c;
@@ -392,6 +410,10 @@ typedef struct Func {
u64 opt_dde_live_words_touched;
OptValInfo* val_info; /* indexed by Val, length nvals */
+ OptUse* opt_uses;
+ u32 opt_nuses, opt_uses_cap;
+ u32* opt_first_use_by_val; /* indexed by Val, OPT_USE_NONE if no uses */
+
Reg opt_hard_regs[OPT_REG_CLASSES][OPT_MAX_HARD_REGS];
u32 opt_hard_reg_count[OPT_REG_CLASSES];
CGPhysRegInfo opt_phys_regs[OPT_REG_CLASSES][OPT_MAX_HARD_REGS];
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -1400,8 +1400,19 @@ static void opt_run_o1_pipeline(OptImpl* o) {
}
static void opt_run_o2_pipeline(OptImpl* o) {
- /* O2 has a distinct scheduler now, but its value/SSA pre-lowering passes
- * are still disabled until their shared analysis substrate lands. */
+ metrics_scope_begin(o->c, "opt.o2.ssa");
+ opt_build_cfg(o->f);
+ opt_jump_cleanup(o->f, OPT_JUMP_CLEANUP_CFG);
+ opt_build_cfg(o->f);
+ opt_verify(o->f, "o2-pre-ssa-cfg");
+ opt_build_ssa(o->f);
+ opt_verify(o->f, "o2-ssa");
+ opt_make_conventional_ssa(o->f);
+ opt_verify(o->f, "o2-conventional-ssa");
+ opt_undo_ssa(o->f);
+ opt_build_cfg(o->f);
+ opt_verify(o->f, "o2-undo-ssa");
+ metrics_scope_end(o->c, "opt.o2.ssa");
opt_run_lowering_pipeline(o, "opt.o2.total", 1);
}
diff --git a/src/opt/opt_internal.h b/src/opt/opt_internal.h
@@ -3,6 +3,8 @@
#include "opt/opt.h"
+#define OPT_USE_NONE 0xffffffffu
+
typedef struct OptHardRegSet {
u32 cls[OPT_REG_CLASSES];
} OptHardRegSet;
@@ -49,6 +51,7 @@ void opt_analysis_build_order(Func*, OptAnalysis*);
void opt_analysis_build_dominators(Func*, OptAnalysis*);
void opt_analysis_build_dom_frontier(Func*, OptAnalysis*);
int opt_analysis_dominates(const OptAnalysis*, u32 dom, u32 node);
+void opt_rebuild_def_use(Func*);
void opt_verify(Func*, const char* stage);
int opt_val_in_inst_defs(const Inst*, Val);
diff --git a/src/opt/pass_analysis.c b/src/opt/pass_analysis.c
@@ -179,6 +179,123 @@ static int fixed_terminator_succ_count(const Inst* in, u32* count_out) {
}
}
+static void opt_use_add(Func* f, Val v, u32 b, u32 i, u8 kind, u32 op_idx,
+ u32 pred_idx, Operand* op) {
+ if (v == VAL_NONE || v >= f->nvals) return;
+ if (f->opt_nuses == f->opt_uses_cap) {
+ u32 ncap = f->opt_uses_cap ? f->opt_uses_cap * 2u : 32u;
+ OptUse* uses = arena_zarray(f->arena, OptUse, ncap);
+ if (f->opt_uses)
+ memcpy(uses, f->opt_uses, sizeof(uses[0]) * f->opt_nuses);
+ f->opt_uses = uses;
+ f->opt_uses_cap = ncap;
+ }
+ u32 id = f->opt_nuses++;
+ OptUse* u = &f->opt_uses[id];
+ u->val = v;
+ u->block = b;
+ u->inst = i;
+ u->kind = kind;
+ u->operand_index = op_idx;
+ u->phi_pred_index = pred_idx;
+ u->operand = op;
+ u->next_for_val = f->opt_first_use_by_val[v];
+ f->opt_first_use_by_val[v] = id;
+}
+
+static void opt_use_add_operand(Func* f, u32 b, u32 i, u32 op_idx,
+ Operand* op, int is_def) {
+ if (!op || is_def) return;
+ if (op->kind == OPK_REG) {
+ opt_use_add(f, (Val)op->v.reg, b, i, OPT_USE_OPERAND, op_idx,
+ OPT_USE_NONE, op);
+ } else if (op->kind == OPK_INDIRECT) {
+ opt_use_add(f, (Val)op->v.ind.base, b, i, OPT_USE_INDIRECT_BASE, op_idx,
+ OPT_USE_NONE, op);
+ }
+}
+
+static void opt_use_add_abivalue(Func* f, u32 b, u32 i, CGABIValue* v,
+ int storage_def) {
+ if (!v) return;
+ opt_use_add_operand(f, b, i, OPT_USE_NONE, &v->storage, storage_def);
+ for (u32 p = 0; p < v->nparts; ++p)
+ opt_use_add_operand(f, b, i, p, (Operand*)&v->parts[p].op, storage_def);
+}
+
+static void opt_collect_inst_uses(Func* f, u32 b, u32 i, Inst* in) {
+ for (u32 o = 0; o < in->nopnds; ++o) {
+ int is_def = 0;
+ if (in->opnds[o].kind == OPK_REG)
+ is_def = o == 0 && opt_val_in_inst_defs(in, (Val)in->opnds[o].v.reg);
+ opt_use_add_operand(f, b, i, o, &in->opnds[o], is_def);
+ }
+
+ switch ((IROp)in->op) {
+ case IR_PHI: {
+ IRPhiAux* aux = (IRPhiAux*)in->extra.aux;
+ if (!aux) break;
+ for (u32 p = 0; p < aux->npreds; ++p)
+ opt_use_add(f, aux->pred_vals[p], b, i, OPT_USE_PHI_INPUT,
+ OPT_USE_NONE, p, NULL);
+ break;
+ }
+ case IR_CALL: {
+ IRCallAux* aux = (IRCallAux*)in->extra.aux;
+ if (!aux) break;
+ if (aux->use_plan_replay) {
+ opt_use_add_operand(f, b, i, OPT_USE_NONE, &aux->plan.callee, 0);
+ for (u32 a = 0; a < aux->plan.nargs; ++a)
+ opt_use_add_operand(f, b, i, a, &aux->plan.args[a].src, 0);
+ } else {
+ opt_use_add_operand(f, b, i, OPT_USE_NONE, &aux->desc.callee, 0);
+ for (u32 a = 0; a < aux->desc.nargs; ++a)
+ opt_use_add_abivalue(f, b, i, (CGABIValue*)&aux->desc.args[a], 0);
+ }
+ break;
+ }
+ case IR_RET: {
+ IRRetAux* aux = (IRRetAux*)in->extra.aux;
+ if (aux && aux->present) opt_use_add_abivalue(f, b, i, &aux->val, 0);
+ break;
+ }
+ case IR_SCOPE_BEGIN: {
+ IRScopeAux* aux = (IRScopeAux*)in->extra.aux;
+ if (aux) opt_use_add_operand(f, b, i, OPT_USE_NONE, &aux->desc.cond, 0);
+ break;
+ }
+ case IR_ASM_BLOCK: {
+ IRAsmAux* aux = (IRAsmAux*)in->extra.aux;
+ if (!aux) break;
+ for (u32 a = 0; a < aux->nin; ++a)
+ opt_use_add_operand(f, b, i, a, &aux->in_ops[a], 0);
+ break;
+ }
+ case IR_INTRINSIC: {
+ IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux;
+ if (!aux) break;
+ for (u32 a = 0; a < aux->narg; ++a)
+ opt_use_add_operand(f, b, i, a, &aux->args[a], 0);
+ break;
+ }
+ default:
+ break;
+ }
+}
+
+void opt_rebuild_def_use(Func* f) {
+ if (!f) return;
+ f->opt_nuses = 0;
+ f->opt_first_use_by_val =
+ arena_array(f->arena, u32, f->nvals ? f->nvals : 1u);
+ for (u32 v = 0; v < f->nvals; ++v) f->opt_first_use_by_val[v] = OPT_USE_NONE;
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i)
+ opt_collect_inst_uses(f, b, i, &bl->insts[i]);
+ }
+}
+
static void verify_operand(Func* f, Inst* in, Operand* op, int is_def,
void* arg) {
(void)in;
@@ -207,6 +324,12 @@ static void verify_values(Func* f, const char* stage) {
if ((IROp)in->op == IR_PHI) {
IRPhiAux* aux = (IRPhiAux*)in->extra.aux;
if (!aux) opt_fail(f, stage, "phi missing aux", b, i);
+ if (in->def == VAL_NONE) opt_fail(f, stage, "phi missing def", b, i);
+ if (in->nopnds || in->opnds)
+ opt_fail(f, stage, "phi should not carry operands", b, i);
+ if (aux->slot_id > f->nframe_slots)
+ opt_fail(f, stage, "phi bad slot id", aux->slot_id,
+ f->nframe_slots);
if (aux->npreds != bl->npreds)
opt_fail(f, stage, "phi pred count mismatch", aux->npreds,
bl->npreds);
@@ -222,6 +345,32 @@ static void verify_values(Func* f, const char* stage) {
}
}
+static void verify_def_use(Func* f, const char* stage) {
+ if (f->opt_rewritten) return;
+ opt_rebuild_def_use(f);
+ for (u32 u = 0; u < f->opt_nuses; ++u) {
+ OptUse* use = &f->opt_uses[u];
+ if (use->val == VAL_NONE || use->val >= f->nvals)
+ opt_fail(f, stage, "def-use bad use val", use->val, f->nvals);
+ if (use->block >= f->nblocks)
+ opt_fail(f, stage, "def-use bad block", use->block, f->nblocks);
+ if (use->inst >= f->blocks[use->block].ninsts)
+ opt_fail(f, stage, "def-use bad inst", use->inst,
+ f->blocks[use->block].ninsts);
+ if (f->val_def_block[use->val] >= f->nblocks)
+ opt_fail(f, stage, "def-use bad def block", use->val,
+ f->val_def_block[use->val]);
+ }
+ for (Val v = 1; v < f->nvals; ++v) {
+ for (u32 u = f->opt_first_use_by_val[v]; u != OPT_USE_NONE;
+ u = f->opt_uses[u].next_for_val) {
+ if (u >= f->opt_nuses) opt_fail(f, stage, "def-use bad next", v, u);
+ if (f->opt_uses[u].val != v)
+ opt_fail(f, stage, "def-use wrong value list", v, u);
+ }
+ }
+}
+
void opt_verify(Func* f, const char* stage) {
if (!f) return;
if (f->nblocks && f->entry >= f->nblocks)
@@ -264,4 +413,5 @@ void opt_verify(Func* f, const char* stage) {
seen_emit[b] = 1;
}
verify_values(f, stage);
+ verify_def_use(f, stage);
}
diff --git a/src/opt/pass_ssa.c b/src/opt/pass_ssa.c
@@ -1,427 +1,665 @@
-/* pass_ssa.c — mem2reg + dominance-frontier SSA construction.
- *
- * Goal for Phase 3 (doc/OPT.md): build SSA without consuming it. The
- * output is discarded before replay, so this pass's job is shape
- * checking — no panics on the corpus.
- *
- * Algorithm (Cooper-Harvey-Kennedy iterative dominators + Cytron et al.
- * dominance-frontier phi insertion):
- *
- * 1. Postorder + reverse-postorder traversal of the CFG.
- * 2. Compute idom[] iteratively via the two-finger intersect.
- * 3. Compute DF[] from idom[] in one pass.
- * 4. For each promotable FrameSlot (no FSF_ADDR_TAKEN), compute the
- * iterated dominance frontier of its defining blocks; insert
- * IR_PHI at the start of each block in the IDF.
- * 5. Rename: DFS the dominator tree, maintain a stack per slot; on
- * store push the stored Val, on load record (load_def → top) into
- * a rename map, on each successor fill in this block's slot in
- * that successor's phis. After processing children, pop.
- *
- * The rename map is built but its uses are intentionally NOT walked
- * across other instructions in the dry-run — that is the part that
- * mutates the IR for downstream passes, and Phase 3 discards the IR.
- * The phi-insertion + slot-stack walk is the part that exercises the
- * IR shape, which is what the dry-run is checking. */
-
-#include "opt/ir.h"
-#include "opt/opt.h"
+/* pass_ssa.c - mem2reg SSA construction and phi destruction for O2. */
+
+#include "opt/opt_internal.h"
#include <string.h>
#include "core/arena.h"
#include "core/core.h"
-#define BLK_NONE 0xffffffffu
-
-/* ---- postorder ---- */
+typedef struct SlotStack {
+ Val* vals;
+ u32 n;
+ u32 cap;
+} SlotStack;
-typedef struct PostorderCtx {
+typedef struct RenameCtx {
Func* f;
- u32* po; /* po[i] = block id at postorder position i */
- u32* po_idx; /* po_idx[block] = postorder position of block */
- u8* visited;
- u32 count;
-} PostorderCtx;
-
-static void postorder_dfs(PostorderCtx* ctx, u32 b) {
- if (ctx->visited[b]) return;
- ctx->visited[b] = 1;
- Block* bl = &ctx->f->blocks[b];
- for (u32 s = 0; s < bl->nsucc; ++s) {
- u32 t = bl->succ[s];
- if (t < ctx->f->nblocks) postorder_dfs(ctx, t);
- }
- ctx->po[ctx->count] = b;
- ctx->po_idx[b] = ctx->count;
- ctx->count++;
+ OptAnalysis* analysis;
+ u8* promoted;
+ Val* repl;
+ u32 repl_cap;
+ SlotStack* stacks;
+} RenameCtx;
+
+typedef struct EdgeMove {
+ Val dst;
+ Val src;
+ CfreeCgTypeId type;
+ u8 cls;
+} EdgeMove;
+
+static u8 ssa_type_class(Func* f, CfreeCgTypeId ty) {
+ CfreeCgTypeKind kind = cfree_cg_type_kind((CfreeCompiler*)f->c, ty);
+ return kind == CFREE_CG_TYPE_FLOAT ? RC_FP : RC_INT;
+}
+
+static u32 opnd_slot_id(const Operand* op) {
+ if (!op || op->kind != OPK_LOCAL) return 0;
+ return (u32)op->v.frame_slot;
}
-/* ---- dominators (Cooper-Harvey-Kennedy) ---- */
+static int base_slot_promotable(const Func* f, u32 slot_id) {
+ if (slot_id == 0 || slot_id > f->nframe_slots) return 0;
+ const IRFrameSlot* s = &f->frame_slots[slot_id - 1u];
+ if (s->kind != FS_LOCAL) return 0;
+ if (s->flags & (FSF_ADDR_TAKEN | FSF_VOLATILE)) return 0;
+ return 1;
+}
-static u32 dom_intersect(u32 b1, u32 b2, const u32* idom, const u32* po_idx) {
- while (b1 != b2) {
- while (po_idx[b1] < po_idx[b2]) b1 = idom[b1];
- while (po_idx[b2] < po_idx[b1]) b2 = idom[b2];
- }
- return b1;
+static int operand_has_slot(const Operand* op, u32 slot_id) {
+ return op && op->kind == OPK_LOCAL && opnd_slot_id(op) == slot_id;
}
-static u32* compute_idom(Func* f, const u32* po, const u32* po_idx, u32 ncount,
- u32 entry) {
- u32* idom = arena_array(f->arena, u32, f->nblocks);
- for (u32 b = 0; b < f->nblocks; ++b) idom[b] = BLK_NONE;
- idom[entry] = entry;
-
- int changed = 1;
- while (changed) {
- changed = 0;
- /* Reverse postorder, skip the entry block. */
- for (i32 i = (i32)ncount - 1; i >= 0; --i) {
- u32 b = po[i];
- if (b == entry) continue;
- Block* bl = &f->blocks[b];
- u32 new_idom = BLK_NONE;
- for (u32 p = 0; p < bl->npreds; ++p) {
- u32 pp = bl->preds[p];
- if (idom[pp] != BLK_NONE) {
- new_idom = (new_idom == BLK_NONE)
- ? pp
- : dom_intersect(pp, new_idom, idom, po_idx);
- }
- }
- if (new_idom != BLK_NONE && idom[b] != new_idom) {
- idom[b] = new_idom;
- changed = 1;
+static int abivalue_has_slot(const CGABIValue* v, u32 slot_id) {
+ if (!v) return 0;
+ if (operand_has_slot(&v->storage, slot_id)) return 1;
+ for (u32 i = 0; i < v->nparts; ++i)
+ if (operand_has_slot(&v->parts[i].op, slot_id)) return 1;
+ return 0;
+}
+
+static int aux_has_slot(const Inst* in, u32 slot_id) {
+ switch ((IROp)in->op) {
+ case IR_CALL: {
+ IRCallAux* aux = (IRCallAux*)in->extra.aux;
+ if (!aux) return 0;
+ if (aux->use_plan_replay) {
+ if (operand_has_slot(&aux->plan.callee, slot_id)) return 1;
+ for (u32 i = 0; i < aux->plan.nargs; ++i)
+ if (operand_has_slot(&aux->plan.args[i].src, slot_id)) return 1;
+ for (u32 i = 0; i < aux->plan.nrets; ++i)
+ if (operand_has_slot(&aux->plan.rets[i].dst, slot_id)) return 1;
+ } else {
+ if (operand_has_slot(&aux->desc.callee, slot_id)) return 1;
+ for (u32 i = 0; i < aux->desc.nargs; ++i)
+ if (abivalue_has_slot(&aux->desc.args[i], slot_id)) return 1;
+ if (abivalue_has_slot(&aux->desc.ret, slot_id)) return 1;
}
+ break;
+ }
+ case IR_RET: {
+ IRRetAux* aux = (IRRetAux*)in->extra.aux;
+ return aux && aux->present && abivalue_has_slot(&aux->val, slot_id);
+ }
+ case IR_SCOPE_BEGIN: {
+ IRScopeAux* aux = (IRScopeAux*)in->extra.aux;
+ return aux && operand_has_slot(&aux->desc.cond, slot_id);
+ }
+ case IR_ASM_BLOCK: {
+ IRAsmAux* aux = (IRAsmAux*)in->extra.aux;
+ if (!aux) return 0;
+ for (u32 i = 0; i < aux->nin; ++i)
+ if (operand_has_slot(&aux->in_ops[i], slot_id)) return 1;
+ for (u32 i = 0; i < aux->nout; ++i)
+ if (operand_has_slot(&aux->out_ops[i], slot_id)) return 1;
+ break;
}
+ case IR_INTRINSIC: {
+ IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux;
+ if (!aux) return 0;
+ for (u32 i = 0; i < aux->narg; ++i)
+ if (operand_has_slot(&aux->args[i], slot_id)) return 1;
+ for (u32 i = 0; i < aux->ndst; ++i)
+ if (operand_has_slot(&aux->dsts[i], slot_id)) return 1;
+ break;
+ }
+ default:
+ break;
}
- return idom;
+ return 0;
}
-/* ---- dominance frontier ---- */
-
-typedef struct DfSet {
- u32* members;
- u32 n, cap;
-} DfSet;
-
-static void df_add(Arena* a, DfSet* s, u32 b) {
- for (u32 i = 0; i < s->n; ++i)
- if (s->members[i] == b) return;
- if (s->n == s->cap) {
- u32 ncap = s->cap ? s->cap * 2u : 4u;
- u32* nb = arena_array(a, u32, ncap);
- if (s->members) memcpy(nb, s->members, sizeof(u32) * s->n);
- s->members = nb;
- s->cap = ncap;
+static int slot_access_promotable(const Inst* in, u32 slot_id) {
+ if ((IROp)in->op == IR_LOAD) {
+ if (in->nopnds < 2 || opnd_slot_id(&in->opnds[1]) != slot_id) return 1;
+ return in->opnds[0].kind == OPK_REG && !opt_mem_observable(&in->extra.mem);
}
- s->members[s->n++] = b;
+ if ((IROp)in->op == IR_STORE) {
+ if (in->nopnds < 2 || opnd_slot_id(&in->opnds[0]) != slot_id) return 1;
+ if (opt_mem_observable(&in->extra.mem)) return 0;
+ return in->opnds[1].kind == OPK_REG || in->opnds[1].kind == OPK_IMM;
+ }
+ for (u32 i = 0; i < in->nopnds; ++i)
+ if (opnd_slot_id(&in->opnds[i]) == slot_id) return 0;
+ if (aux_has_slot(in, slot_id)) return 0;
+ return 1;
}
-static DfSet* compute_df(Func* f, const u32* idom) {
- DfSet* df = arena_zarray(f->arena, DfSet, f->nblocks);
+static u8* find_promoted_slots(Func* f) {
+ u8* promoted = arena_zarray(f->arena, u8, f->nframe_slots + 1u);
+ for (u32 sid = 1; sid <= f->nframe_slots; ++sid) {
+ if (!base_slot_promotable(f, sid)) continue;
+ promoted[sid] = 1;
+ }
for (u32 b = 0; b < f->nblocks; ++b) {
Block* bl = &f->blocks[b];
- if (bl->npreds < 2) continue;
- if (idom[b] == BLK_NONE) continue;
- for (u32 p = 0; p < bl->npreds; ++p) {
- u32 runner = bl->preds[p];
- while (runner != idom[b] && runner != BLK_NONE) {
- df_add(f->arena, &df[runner], b);
- runner = idom[runner];
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* in = &bl->insts[i];
+ for (u32 sid = 1; sid <= f->nframe_slots; ++sid) {
+ if (promoted[sid] && !slot_access_promotable(in, sid))
+ promoted[sid] = 0;
}
}
}
- return df;
+ return promoted;
}
-/* ---- promotable slots ---- */
+static void stack_push(Arena* a, SlotStack* s, Val v) {
+ if (s->n == s->cap) {
+ u32 ncap = s->cap ? s->cap * 2u : 4u;
+ Val* vals = arena_array(a, Val, ncap);
+ if (s->vals) memcpy(vals, s->vals, sizeof(vals[0]) * s->n);
+ s->vals = vals;
+ s->cap = ncap;
+ }
+ s->vals[s->n++] = v;
+}
-/* Identify the FrameSlot a load/store address operand refers to.
- * Returns 0 if the operand is not OPK_LOCAL (i.e., not a direct
- * frame-slot reference) — those addresses route through computed Val
- * pointers and cannot be promoted by a slot-keyed pass. */
-static u32 opnd_slot_id(const Operand* op) {
- if (op->kind != OPK_LOCAL) return 0;
- return (u32)op->v.frame_slot;
+static Val stack_top(const SlotStack* s) {
+ return s && s->n ? s->vals[s->n - 1u] : VAL_NONE;
}
-static int slot_promotable(const Func* f, u32 slot_id) {
- if (slot_id == 0 || slot_id > f->nframe_slots) return 0;
- const IRFrameSlot* s = &f->frame_slots[slot_id - 1];
- if (s->flags & FSF_ADDR_TAKEN) return 0;
- if (s->flags & FSF_VOLATILE) return 0;
- /* Only locals are promotable for v1; params live in slots too but
- * promoting them is a separate transform. */
- if (s->kind != FS_LOCAL) return 0;
- return 1;
+static Val resolve_repl(const RenameCtx* ctx, Val v) {
+ while (v != VAL_NONE && v < ctx->repl_cap && ctx->repl[v] != VAL_NONE &&
+ ctx->repl[v] != v) {
+ v = ctx->repl[v];
+ }
+ return v;
}
-/* ---- phi insertion ---- */
+static void replace_use(Func* f, Inst* in, Operand* op, int is_def, void* arg) {
+ (void)in;
+ RenameCtx* ctx = (RenameCtx*)arg;
+ if (is_def || op->kind != OPK_REG) return;
+ Val old = (Val)op->v.reg;
+ if (old == VAL_NONE || old >= f->nvals) return;
+ Val repl = resolve_repl(ctx, old);
+ if (repl != VAL_NONE && repl != old) {
+ op->v.reg = (Reg)repl;
+ op->type = f->val_type[repl];
+ op->cls = f->val_cls[repl];
+ }
+}
-/* Insert n_new phi instructions at the start of block b, each tagged
- * with its slot via opnds[0] = synthetic "slot tag" Val (we reuse
- * IRPhiAux to carry pred info). The stored slot id lives in
- * extra.imm. */
-static void insert_phis(Func* f, u32 b, u32 n_new, const u32* phi_slots,
- const u32* phi_blocks_for_slot) {
- if (!n_new) return;
+static void insert_phis(Func* f, u32 b, const u8* needs_phi) {
Block* bl = &f->blocks[b];
- u32 old = bl->ninsts;
- u32 nnew = old + n_new;
- Inst* nb = arena_zarray(f->arena, Inst, nnew);
- /* Phis go first. */
- for (u32 i = 0; i < n_new; ++i) {
- Inst* in = &nb[i];
- u32 slot_id = phi_slots[i];
- const IRFrameSlot* s = &f->frame_slots[slot_id - 1];
+ u32 nphi = 0;
+ for (u32 sid = 1; sid <= f->nframe_slots; ++sid)
+ if (needs_phi[sid * f->nblocks + b]) ++nphi;
+ if (!nphi) return;
+
+ u32 old_nvals = f->nvals;
+ Inst* insts = arena_zarray(f->arena, Inst, bl->ninsts + nphi);
+ u32 w = 0;
+ for (u32 sid = 1; sid <= f->nframe_slots; ++sid) {
+ if (!needs_phi[sid * f->nblocks + b]) continue;
+ const IRFrameSlot* slot = &f->frame_slots[sid - 1u];
+ Inst* in = &insts[w++];
in->op = IR_PHI;
- in->type = s->type;
- in->extra.imm = (i64)slot_id;
- /* Allocate IRPhiAux with one slot per pred, initialized to
- * VAL_NONE. The rename pass fills pred_vals later. */
+ in->type = slot->type;
+ in->def = ir_alloc_val(f, slot->type, ssa_type_class(f, slot->type));
+ f->val_def_block[in->def] = b;
+ f->val_def_inst[in->def] = w - 1u;
IRPhiAux* aux = arena_znew(f->arena, IRPhiAux);
+ aux->slot_id = sid;
aux->npreds = bl->npreds;
if (bl->npreds) {
aux->pred_blocks = arena_array(f->arena, u32, bl->npreds);
aux->pred_vals = arena_zarray(f->arena, Val, bl->npreds);
memcpy(aux->pred_blocks, bl->preds, sizeof(u32) * bl->npreds);
}
- /* Reuse extra union: imm carries slot, but we also need aux. We
- * stash the IRPhiAux in opnds: opnds[0] is a sentinel pointer cast
- * — that breaks the Val* type. Instead, we use a parallel side
- * table rooted on the inst. To keep this self-contained without
- * altering the Inst layout, we point in->opnds at a Val array of
- * length 0 and rely on the aux pointer through the extra union.
- *
- * Layout choice: extra.aux = aux (carry the IRPhiAux pointer);
- * imm-as-slot lives in a side table. */
in->extra.aux = aux;
- in->nopnds = 0;
- in->opnds = NULL;
- /* def Val: allocate a fresh value typed as the slot's type. */
- /* We can't use val_alloc directly (it's static); set up the val
- * table manually via an emit-equivalent. Simpler: piggy-back on
- * ir_emit_const_i style by appending after current state. But we're
- * mid-rebuild of the block. Defer: encode def slot via re-reading
- * the val table. */
- /* For Phase 3 dry-run, leave def = VAL_NONE on phis. The rename
- * pass uses extra.aux->pred_vals to inspect phi shape; the
- * dry-run discards before downstream passes need def. */
- in->def = VAL_NONE;
- }
- /* Existing instructions shifted right. */
- if (old) memcpy(nb + n_new, bl->insts, sizeof(Inst) * old);
- bl->insts = nb;
- bl->ninsts = nnew;
- bl->cap = nnew;
- /* val_def_inst for any val defined in this block has shifted by
- * n_new. Walk the val table and update. */
- for (u32 v = 1; v < f->nvals; ++v) {
- if (f->val_def_block[v] == b) f->val_def_inst[v] += n_new;
}
- (void)phi_blocks_for_slot;
+ if (bl->ninsts) memcpy(insts + nphi, bl->insts, sizeof(Inst) * bl->ninsts);
+ bl->insts = insts;
+ bl->ninsts += nphi;
+ bl->cap = bl->ninsts;
+ for (Val v = 1; v < old_nvals; ++v)
+ if (f->val_def_block[v] == b) f->val_def_inst[v] += nphi;
}
-/* ---- rename ---- */
+static void mark_def_blocks(Func* f, const u8* promoted, u8* def_blocks) {
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* in = &bl->insts[i];
+ if ((IROp)in->op != IR_STORE || in->nopnds < 2) continue;
+ u32 sid = opnd_slot_id(&in->opnds[0]);
+ if (sid && promoted[sid]) def_blocks[sid * f->nblocks + b] = 1;
+ }
+ }
+}
-typedef struct SlotStack {
- Val* stack;
- u32 n, cap;
-} SlotStack;
+static void compute_phi_sites(Func* f, OptAnalysis* a, const u8* promoted,
+ u8* needs_phi) {
+ u8* def_blocks = arena_zarray(f->arena, u8,
+ (f->nframe_slots + 1u) * f->nblocks);
+ mark_def_blocks(f, promoted, def_blocks);
+ u32* work = arena_array(f->arena, u32, f->nblocks ? f->nblocks : 1u);
+ u8* queued = arena_zarray(f->arena, u8, f->nblocks ? f->nblocks : 1u);
-static void slot_push(Arena* a, SlotStack* s, Val v) {
- if (s->n == s->cap) {
- u32 ncap = s->cap ? s->cap * 2u : 4u;
- Val* nb = arena_array(a, Val, ncap);
- if (s->stack) memcpy(nb, s->stack, sizeof(Val) * s->n);
- s->stack = nb;
- s->cap = ncap;
+ for (u32 sid = 1; sid <= f->nframe_slots; ++sid) {
+ if (!promoted[sid]) continue;
+ memset(queued, 0, f->nblocks);
+ u32 wn = 0;
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ if (!def_blocks[sid * f->nblocks + b]) continue;
+ queued[b] = 1;
+ work[wn++] = b;
+ }
+ while (wn) {
+ u32 x = work[--wn];
+ OptBlockList* df = &a->dom_frontier[x];
+ for (u32 i = 0; i < df->n; ++i) {
+ u32 y = df->items[i];
+ if (needs_phi[sid * f->nblocks + y]) continue;
+ needs_phi[sid * f->nblocks + y] = 1;
+ if (!queued[y]) {
+ queued[y] = 1;
+ work[wn++] = y;
+ }
+ }
+ }
}
- s->stack[s->n++] = v;
}
-static Val slot_top(const SlotStack* s) {
- return s->n ? s->stack[s->n - 1] : VAL_NONE;
+static void rewrite_store_immediate(RenameCtx* ctx, Inst* in, u32 b, u32 i,
+ u32 sid) {
+ Operand src = in->opnds[1];
+ Val v = ir_alloc_val(ctx->f, src.type, src.cls);
+ Operand* opnds = arena_array(ctx->f->arena, Operand, 1);
+ opnds[0] = src;
+ opnds[0].kind = OPK_REG;
+ opnds[0].v.reg = (Reg)v;
+ in->op = IR_LOAD_IMM;
+ in->type = src.type;
+ in->def = v;
+ in->opnds = opnds;
+ in->nopnds = 1;
+ in->extra.imm = src.v.imm;
+ ctx->f->val_def_block[v] = b;
+ ctx->f->val_def_inst[v] = i;
+ stack_push(ctx->f->arena, &ctx->stacks[sid], v);
}
-static void rename_dfs(Func* f, u32 b, const u32* idom, SlotStack* slots) {
+static void rename_block(RenameCtx* ctx, u32 b) {
+ Func* f = ctx->f;
Block* bl = &f->blocks[b];
- /* Track per-slot push count so we can pop on exit. */
- u32* pushed = arena_zarray(f->arena, u32, f->nframe_slots + 1);
+ u32* pushed = arena_zarray(f->arena, u32, f->nframe_slots + 1u);
- /* 1. Process phis in this block: each phi has extra.imm = slot id.
- * Push a synthetic phi value (VAL_NONE in dry-run; a "fresh Val"
- * once we wire renaming). */
for (u32 i = 0; i < bl->ninsts; ++i) {
Inst* in = &bl->insts[i];
- if (in->op != IR_PHI) break;
- u32 slot_id = (u32)in->extra.imm;
- if (slot_id == 0 || slot_id > f->nframe_slots) continue;
- /* For dry-run, the phi's def is VAL_NONE; we still push so the
- * stack has the right depth. Downstream passes (Phase 4+) will
- * allocate a real Val here. */
- slot_push(f->arena, &slots[slot_id], VAL_NONE);
- pushed[slot_id]++;
+ if ((IROp)in->op != IR_PHI) break;
+ IRPhiAux* aux = (IRPhiAux*)in->extra.aux;
+ if (!aux || !aux->slot_id || !ctx->promoted[aux->slot_id]) continue;
+ stack_push(f->arena, &ctx->stacks[aux->slot_id], in->def);
+ pushed[aux->slot_id]++;
}
- /* 2. Process the rest of the block. */
for (u32 i = 0; i < bl->ninsts; ++i) {
Inst* in = &bl->insts[i];
- if (in->op == IR_PHI) continue;
- if (in->op == IR_STORE) {
- /* IR_STORE opnds: [0] = addr, [1] = src. */
- if (in->nopnds < 2) continue;
+ if ((IROp)in->op == IR_PHI) continue;
+ if ((IROp)in->op == IR_STORE && in->nopnds >= 2) {
u32 sid = opnd_slot_id(&in->opnds[0]);
- if (sid && slot_promotable(f, sid)) {
- const Operand* src = &in->opnds[1];
- Val v = (src->kind == OPK_REG) ? (Val)src->v.reg : VAL_NONE;
- slot_push(f->arena, &slots[sid], v);
+ if (sid && ctx->promoted[sid]) {
+ Operand src = in->opnds[1];
+ if (src.kind == OPK_REG) {
+ stack_push(f->arena, &ctx->stacks[sid],
+ resolve_repl(ctx, (Val)src.v.reg));
+ in->op = IR_NOP;
+ in->nopnds = 0;
+ in->opnds = NULL;
+ in->def = VAL_NONE;
+ } else {
+ rewrite_store_immediate(ctx, in, b, i, sid);
+ }
pushed[sid]++;
+ continue;
}
- continue;
}
- if (in->op == IR_LOAD) {
- /* IR_LOAD opnds: [0] = dst REG, [1] = addr. */
- if (in->nopnds < 2) continue;
+ if ((IROp)in->op == IR_LOAD && in->nopnds >= 2) {
u32 sid = opnd_slot_id(&in->opnds[1]);
- if (sid && slot_promotable(f, sid)) {
- /* Touching slot_top exercises the stack invariant — that's
- * the shape check we want. The rewrite map proper would walk
- * uses; we skip that in the dry-run (output discarded). */
- (void)slot_top(&slots[sid]);
+ if (sid && ctx->promoted[sid] && in->def != VAL_NONE) {
+ Val cur = stack_top(&ctx->stacks[sid]);
+ if (cur == VAL_NONE) continue;
+ if (in->def < ctx->repl_cap)
+ ctx->repl[in->def] = resolve_repl(ctx, cur);
+ in->op = IR_NOP;
+ in->nopnds = 0;
+ in->opnds = NULL;
+ in->def = VAL_NONE;
+ continue;
}
- continue;
}
+ opt_walk_inst_operands(f, in, replace_use, ctx);
}
- /* 3. Fill our slot in each successor's phis. */
for (u32 s = 0; s < bl->nsucc; ++s) {
u32 succ = bl->succ[s];
if (succ >= f->nblocks) continue;
Block* sb = &f->blocks[succ];
- /* Find which pred index `b` is in succ. */
- u32 pred_idx = 0;
- int found = 0;
+ u32 pred_idx = OPT_USE_NONE;
for (u32 p = 0; p < sb->npreds; ++p) {
if (sb->preds[p] == b) {
pred_idx = p;
- found = 1;
break;
}
}
- if (!found) continue;
+ if (pred_idx == OPT_USE_NONE) continue;
for (u32 i = 0; i < sb->ninsts; ++i) {
- Inst* in = &sb->insts[i];
- if (in->op != IR_PHI) break;
- u32 slot_id = (u32)in->extra.imm;
- if (slot_id == 0 || slot_id > f->nframe_slots) continue;
- IRPhiAux* aux = (IRPhiAux*)in->extra.aux;
- if (!aux || pred_idx >= aux->npreds) continue;
- aux->pred_vals[pred_idx] = slot_top(&slots[slot_id]);
+ Inst* phi = &sb->insts[i];
+ if ((IROp)phi->op != IR_PHI) break;
+ IRPhiAux* aux = (IRPhiAux*)phi->extra.aux;
+ if (!aux || !aux->slot_id || !ctx->promoted[aux->slot_id]) continue;
+ aux->pred_vals[pred_idx] =
+ resolve_repl(ctx, stack_top(&ctx->stacks[aux->slot_id]));
}
}
- /* 4. Recurse into immediate dom children. */
- for (u32 c = 0; c < f->nblocks; ++c) {
- if (c == b) continue;
- if (idom[c] == b) rename_dfs(f, c, idom, slots);
- }
+ OptBlockList* children = &ctx->analysis->dom_children[b];
+ for (u32 i = 0; i < children->n; ++i) rename_block(ctx, children->items[i]);
- /* 5. Pop. */
- for (u32 sid = 0; sid <= f->nframe_slots; ++sid) {
+ for (u32 sid = 1; sid <= f->nframe_slots; ++sid) {
while (pushed[sid]--) {
- if (slots[sid].n) slots[sid].n--;
+ if (ctx->stacks[sid].n) --ctx->stacks[sid].n;
}
}
}
-/* ---- main entry ---- */
+static void replace_phi_inputs(Func* f, RenameCtx* ctx) {
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* in = &bl->insts[i];
+ if ((IROp)in->op != IR_PHI) break;
+ IRPhiAux* aux = (IRPhiAux*)in->extra.aux;
+ if (!aux) continue;
+ for (u32 p = 0; p < aux->npreds; ++p)
+ aux->pred_vals[p] = resolve_repl(ctx, aux->pred_vals[p]);
+ }
+ }
+}
void opt_build_ssa(Func* f) {
- if (f->nblocks == 0) return;
-
- /* Postorder traversal from entry. */
- PostorderCtx pctx;
- pctx.f = f;
- pctx.po = arena_array(f->arena, u32, f->nblocks);
- pctx.po_idx = arena_array(f->arena, u32, f->nblocks);
- pctx.visited = arena_zarray(f->arena, u8, f->nblocks);
- pctx.count = 0;
- for (u32 b = 0; b < f->nblocks; ++b) pctx.po_idx[b] = 0;
- postorder_dfs(&pctx, f->entry);
- /* Unreachable blocks: never visited. Skip them in the dom analysis;
- * the dry-run shouldn't crash on them. */
-
- u32* idom = compute_idom(f, pctx.po, pctx.po_idx, pctx.count, f->entry);
- DfSet* df = compute_df(f, idom);
-
- /* For each promotable slot, find defining blocks → iterated DF. */
- if (f->nframe_slots == 0) return;
- u8* needs_phi_storage =
- arena_zarray(f->arena, u8, (f->nframe_slots + 1) * f->nblocks);
-#define NEEDS_PHI(slot, blk) \
- needs_phi_storage[((slot) * f->nblocks) + (blk)]
-
- /* Worklist algo per slot. */
- u32* worklist = arena_array(f->arena, u32, f->nblocks);
- u8* on_worklist = arena_zarray(f->arena, u8, f->nblocks);
+ if (!f || f->nblocks == 0 || f->nframe_slots == 0) {
+ if (f) opt_rebuild_def_use(f);
+ return;
+ }
- for (u32 sid = 1; sid <= f->nframe_slots; ++sid) {
- if (!slot_promotable(f, sid)) continue;
- /* Reset per-slot worklist state. */
- for (u32 i = 0; i < f->nblocks; ++i) on_worklist[i] = 0;
- u32 wn = 0;
- /* Seed with blocks containing a store to this slot. */
- for (u32 b = 0; b < f->nblocks; ++b) {
- Block* bl = &f->blocks[b];
- for (u32 i = 0; i < bl->ninsts; ++i) {
- Inst* in = &bl->insts[i];
- if (in->op == IR_STORE && in->nopnds >= 1 &&
- opnd_slot_id(&in->opnds[0]) == sid) {
- if (!on_worklist[b]) {
- on_worklist[b] = 1;
- worklist[wn++] = b;
- }
- break;
+ OptAnalysis a;
+ opt_analysis_build_order(f, &a);
+ opt_analysis_build_dominators(f, &a);
+ opt_analysis_build_dom_frontier(f, &a);
+
+ u8* promoted = find_promoted_slots(f);
+ u8* needs_phi =
+ arena_zarray(f->arena, u8, (f->nframe_slots + 1u) * f->nblocks);
+ compute_phi_sites(f, &a, promoted, needs_phi);
+ for (u32 b = 0; b < f->nblocks; ++b) insert_phis(f, b, needs_phi);
+
+ RenameCtx ctx;
+ memset(&ctx, 0, sizeof ctx);
+ ctx.f = f;
+ ctx.analysis = &a;
+ ctx.promoted = promoted;
+ ctx.repl_cap = f->vals_cap ? f->vals_cap : f->nvals;
+ ctx.repl = arena_zarray(f->arena, Val, ctx.repl_cap ? ctx.repl_cap : 1u);
+ ctx.stacks = arena_zarray(f->arena, SlotStack, f->nframe_slots + 1u);
+ rename_block(&ctx, f->entry);
+
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i)
+ opt_walk_inst_operands(f, &bl->insts[i], replace_use, &ctx);
+ }
+ replace_phi_inputs(f, &ctx);
+ opt_rebuild_def_use(f);
+}
+
+static int ssa_is_terminator(const Inst* in) {
+ switch ((IROp)in->op) {
+ case IR_BR:
+ case IR_CONDBR:
+ case IR_CMP_BRANCH:
+ case IR_SWITCH:
+ case IR_INDIRECT_BRANCH:
+ case IR_RET:
+ case IR_BREAK_TO:
+ case IR_CONTINUE_TO:
+ return 1;
+ case IR_INTRINSIC: {
+ IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux;
+ return aux && (aux->kind == INTRIN_LONGJMP ||
+ aux->kind == INTRIN_TRAP ||
+ aux->kind == INTRIN_UNREACHABLE);
+ }
+ default:
+ return 0;
+ }
+}
+
+static Inst make_copy_inst(Func* f, Val dst, Val src, CfreeCgTypeId ty, u8 cls) {
+ Inst in;
+ memset(&in, 0, sizeof in);
+ in.op = IR_COPY;
+ in.type = ty;
+ in.def = dst;
+ in.nopnds = 2;
+ in.opnds = arena_array(f->arena, Operand, 2);
+ in.opnds[0].kind = OPK_REG;
+ in.opnds[0].type = ty;
+ in.opnds[0].cls = cls;
+ in.opnds[0].v.reg = (Reg)dst;
+ in.opnds[1].kind = OPK_REG;
+ in.opnds[1].type = ty;
+ in.opnds[1].cls = cls;
+ in.opnds[1].v.reg = (Reg)src;
+ return in;
+}
+
+static void insert_edge_moves(Func* f, u32 b, const EdgeMove* moves, u32 n) {
+ if (!n) return;
+ Block* bl = &f->blocks[b];
+ u32 term = bl->ninsts && ssa_is_terminator(&bl->insts[bl->ninsts - 1u]);
+ u32 insert_at = bl->ninsts - term;
+ u32 extra = n == 1 ? 1u : n * 2u;
+ Inst* insts = arena_zarray(f->arena, Inst, bl->ninsts + extra);
+ if (insert_at) memcpy(insts, bl->insts, sizeof(Inst) * insert_at);
+ u32 w = insert_at;
+ if (n == 1) {
+ insts[w++] = make_copy_inst(f, moves[0].dst, moves[0].src, moves[0].type,
+ moves[0].cls);
+ } else {
+ Val* temps = arena_array(f->arena, Val, n);
+ for (u32 i = 0; i < n; ++i) {
+ temps[i] = ir_alloc_val(f, moves[i].type, moves[i].cls);
+ f->val_def_block[temps[i]] = b;
+ f->val_def_inst[temps[i]] = w;
+ insts[w++] = make_copy_inst(f, temps[i], moves[i].src, moves[i].type,
+ moves[i].cls);
+ }
+ for (u32 i = 0; i < n; ++i) {
+ insts[w] = make_copy_inst(f, moves[i].dst, temps[i], moves[i].type,
+ moves[i].cls);
+ f->val_def_block[moves[i].dst] = b;
+ f->val_def_inst[moves[i].dst] = w;
+ ++w;
+ }
+ }
+ if (term) insts[w++] = bl->insts[bl->ninsts - 1u];
+ bl->insts = insts;
+ bl->ninsts = w;
+ bl->cap = w;
+ if (n == 1) {
+ f->val_def_block[moves[0].dst] = b;
+ f->val_def_inst[moves[0].dst] = insert_at;
+ }
+}
+
+static void replace_succ_ref(Func* f, u32 pred, u32 old_succ, u32 new_succ) {
+ Block* bl = &f->blocks[pred];
+ for (u32 s = 0; s < bl->nsucc; ++s) {
+ if (bl->succ[s] == old_succ) bl->succ[s] = new_succ;
+ }
+ if (!bl->ninsts) return;
+ Inst* term = &bl->insts[bl->ninsts - 1u];
+ if ((IROp)term->op == IR_SWITCH) {
+ IRSwitchAux* aux = (IRSwitchAux*)term->extra.aux;
+ if (!aux) return;
+ for (u32 i = 0; i < aux->ncases; ++i)
+ if (aux->cases[i].block == old_succ) aux->cases[i].block = new_succ;
+ if (aux->default_block == old_succ) aux->default_block = new_succ;
+ } else if ((IROp)term->op == IR_INDIRECT_BRANCH) {
+ IRIndirectAux* aux = (IRIndirectAux*)term->extra.aux;
+ if (!aux) return;
+ for (u32 i = 0; i < aux->ntargets; ++i)
+ if (aux->targets[i] == old_succ) aux->targets[i] = new_succ;
+ }
+}
+
+static void emit_order_insert_after(Func* f, u32 after, u32 block) {
+ for (u32 i = 0; i < f->emit_order_n; ++i)
+ if (f->emit_order[i] == block) return;
+ if (f->emit_order_n == f->emit_order_cap) {
+ u32 ncap = f->emit_order_cap ? f->emit_order_cap * 2u : 8u;
+ u32* order = arena_array(f->arena, u32, ncap);
+ if (f->emit_order)
+ memcpy(order, f->emit_order, sizeof(order[0]) * f->emit_order_n);
+ f->emit_order = order;
+ f->emit_order_cap = ncap;
+ }
+ u32 pos = f->emit_order_n;
+ for (u32 i = 0; i < f->emit_order_n; ++i) {
+ if (f->emit_order[i] == after) {
+ pos = i + 1u;
+ break;
+ }
+ }
+ for (u32 i = f->emit_order_n; i > pos; --i)
+ f->emit_order[i] = f->emit_order[i - 1u];
+ f->emit_order[pos] = block;
+ ++f->emit_order_n;
+}
+
+static int edge_is_fallthrough(Func* f, u32 pred, u32 succ) {
+ Block* bl = &f->blocks[pred];
+ if (!bl->ninsts) return 0;
+ IROp op = (IROp)bl->insts[bl->ninsts - 1u].op;
+ if (op != IR_CMP_BRANCH && op != IR_CONDBR) return 0;
+ return bl->nsucc >= 2 && bl->succ[1] == succ;
+}
+
+static u32 split_edge(Func* f, u32 pred, u32 succ) {
+ u32 edge = ir_block_new(f);
+ Block* eb = &f->blocks[edge];
+ Inst* br = ir_emit(f, edge, IR_BR);
+ (void)br;
+ eb->succ[0] = succ;
+ eb->nsucc = 1;
+ int place_after = edge_is_fallthrough(f, pred, succ);
+ replace_succ_ref(f, pred, succ, edge);
+ if (succ < f->nblocks) {
+ Block* sb = &f->blocks[succ];
+ for (u32 i = 0; i < sb->ninsts; ++i) {
+ Inst* phi = &sb->insts[i];
+ if ((IROp)phi->op != IR_PHI) break;
+ IRPhiAux* aux = (IRPhiAux*)phi->extra.aux;
+ if (!aux) continue;
+ for (u32 p = 0; p < aux->npreds; ++p) {
+ if (aux->pred_blocks[p] == pred) {
+ aux->pred_blocks[p] = edge;
+ aux->pred_vals[p] = phi->def;
}
}
}
- /* Iterated DF. */
- while (wn) {
- u32 x = worklist[--wn];
- DfSet* d = &df[x];
- for (u32 i = 0; i < d->n; ++i) {
- u32 y = d->members[i];
- if (NEEDS_PHI(sid, y)) continue;
- NEEDS_PHI(sid, y) = 1;
- if (!on_worklist[y]) {
- on_worklist[y] = 1;
- worklist[wn++] = y;
+ }
+ if (place_after) {
+ emit_order_insert_after(f, pred, edge);
+ } else {
+ ir_note_emit(f, edge);
+ }
+ return edge;
+}
+
+static void realign_phi_preds(Func* f) {
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* phi = &bl->insts[i];
+ if ((IROp)phi->op != IR_PHI) break;
+ IRPhiAux* aux = (IRPhiAux*)phi->extra.aux;
+ if (!aux || aux->npreds == bl->npreds) {
+ if (!aux) continue;
+ }
+ u32 old_n = aux->npreds;
+ u32* old_blocks = aux->pred_blocks;
+ Val* old_vals = aux->pred_vals;
+ u32* pred_blocks = bl->npreds ? arena_array(f->arena, u32, bl->npreds)
+ : NULL;
+ Val* pred_vals = bl->npreds ? arena_zarray(f->arena, Val, bl->npreds)
+ : NULL;
+ for (u32 p = 0; p < bl->npreds; ++p) {
+ pred_blocks[p] = bl->preds[p];
+ pred_vals[p] = phi->def;
+ for (u32 old = 0; old < old_n; ++old) {
+ if (old_blocks && old_blocks[old] == bl->preds[p]) {
+ pred_vals[p] = old_vals ? old_vals[old] : VAL_NONE;
+ break;
+ }
}
}
+ aux->npreds = bl->npreds;
+ aux->pred_blocks = pred_blocks;
+ aux->pred_vals = pred_vals;
}
}
+}
- /* Insert phis per block. */
+void opt_make_conventional_ssa(Func* f) {
+ if (!f) return;
for (u32 b = 0; b < f->nblocks; ++b) {
- u32 nphi = 0;
- for (u32 sid = 1; sid <= f->nframe_slots; ++sid) {
- if (NEEDS_PHI(sid, b)) nphi++;
- }
- if (!nphi) continue;
- u32* slots_arr = arena_array(f->arena, u32, nphi);
- u32 k = 0;
- for (u32 sid = 1; sid <= f->nframe_slots; ++sid) {
- if (NEEDS_PHI(sid, b)) slots_arr[k++] = sid;
+ Block* bl = &f->blocks[b];
+ if (!bl->ninsts || (IROp)bl->insts[0].op != IR_PHI) continue;
+ for (u32 p = 0; p < bl->npreds; ++p) {
+ u32 pred = bl->preds[p];
+ EdgeMove* moves = arena_array(f->arena, EdgeMove, bl->ninsts);
+ u32 n = 0;
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* phi = &bl->insts[i];
+ if ((IROp)phi->op != IR_PHI) break;
+ IRPhiAux* aux = (IRPhiAux*)phi->extra.aux;
+ if (!aux || p >= aux->npreds) continue;
+ Val src = aux->pred_vals[p];
+ if (src == VAL_NONE || src == phi->def) continue;
+ moves[n].dst = phi->def;
+ moves[n].src = src;
+ moves[n].type = phi->type;
+ moves[n].cls = f->val_cls[phi->def];
+ ++n;
+ }
+ if (!n) continue;
+ u32 target = pred;
+ if (pred >= f->nblocks) continue;
+ if (f->blocks[pred].nsucc != 1) target = split_edge(f, pred, b);
+ insert_edge_moves(f, target, moves, n);
}
- insert_phis(f, b, nphi, slots_arr, NULL);
}
+ opt_build_cfg(f);
+ realign_phi_preds(f);
+ opt_rebuild_def_use(f);
+}
- /* Rename phase: DFS from entry over the dominator tree. */
- SlotStack* slots = arena_zarray(f->arena, SlotStack, f->nframe_slots + 1);
- rename_dfs(f, f->entry, idom, slots);
-
-#undef NEEDS_PHI
+void opt_undo_ssa(Func* f) {
+ if (!f) return;
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ u32 w = 0;
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ if ((IROp)bl->insts[i].op == IR_PHI) continue;
+ bl->insts[w++] = bl->insts[i];
+ }
+ bl->ninsts = w;
+ }
+ opt_rebuild_def_use(f);
}
diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c
@@ -317,6 +317,17 @@ static Inst* emit_load_local(Func* f, u32 b, Val dst, FrameSlot fs,
return in;
}
+static Inst* emit_store_local(Func* f, u32 b, FrameSlot fs, Val src,
+ CfreeCgTypeId ty, u16 flags) {
+ Inst* in = ir_emit(f, b, IR_STORE);
+ in->opnds = arena_array(f->arena, Operand, 2);
+ in->opnds[0] = op_local_(fs, ty);
+ in->opnds[1] = op_reg_(src, ty);
+ in->nopnds = 2;
+ in->extra.mem = mem_local_(fs, ty, 4, flags);
+ return in;
+}
+
static Inst* emit_call_void(Func* f, u32 b) {
Inst* in = ir_emit(f, b, IR_CALL);
IRCallAux* aux = arena_znew(f->arena, IRCallAux);
@@ -1875,6 +1886,172 @@ static void opt_analysis_dominators_and_frontier(void) {
tc_fini(&tc);
}
+static void opt_ssa_diamond_mem2reg_phi(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ Func* f = new_func(&tc);
+ FrameSlot fs = add_frame_slot(f, tc.i32, FS_LOCAL, 4, 0);
+ u32 entry = f->entry;
+ u32 then_b = ir_block_new(f);
+ u32 else_b = ir_block_new(f);
+ u32 join = ir_block_new(f);
+ ir_note_emit(f, then_b);
+ ir_note_emit(f, else_b);
+ ir_note_emit(f, join);
+
+ emit_test_branch(f, entry, then_b, else_b, tc.i32);
+ Val tv = add_val(f, tc.i32);
+ Val ev = add_val(f, tc.i32);
+ Val out = add_val(f, tc.i32);
+ emit_load_imm(f, then_b, tv, tc.i32, 11);
+ emit_store_local(f, then_b, fs, tv, tc.i32, 0);
+ emit_br_to(f, then_b, join);
+ emit_load_imm(f, else_b, ev, tc.i32, 22);
+ emit_store_local(f, else_b, fs, ev, tc.i32, 0);
+ emit_br_to(f, else_b, join);
+ emit_load_local(f, join, out, fs, tc.i32, 0);
+ emit_ret_val(f, join, out, tc.i32);
+
+ opt_build_cfg(f);
+ opt_build_ssa(f);
+ opt_verify(f, "test-ssa-diamond");
+ EXPECT(count_op(f, IR_PHI) == 1, "diamond should insert one phi");
+ EXPECT(count_op(f, IR_STORE) == 0, "promoted stores should be removed");
+ EXPECT(count_op(f, IR_LOAD) == 0, "promoted load should be removed");
+ Inst* phi = &f->blocks[join].insts[0];
+ IRPhiAux* aux = (IRPhiAux*)phi->extra.aux;
+ EXPECT((IROp)phi->op == IR_PHI && phi->def != VAL_NONE,
+ "phi should have a real def");
+ EXPECT(aux && aux->slot_id == (u32)fs, "phi should carry slot id in aux");
+ EXPECT(aux && aux->npreds == f->blocks[join].npreds,
+ "phi predecessor count should match block preds");
+
+ opt_make_conventional_ssa(f);
+ opt_verify(f, "test-ssa-diamond-conventional");
+ opt_undo_ssa(f);
+ opt_verify(f, "test-ssa-diamond-undo");
+ EXPECT(count_op(f, IR_PHI) == 0, "undo_ssa should remove phis");
+ EXPECT(count_op(f, IR_COPY) >= 2, "phi lowering should insert edge copies");
+ tc_fini(&tc);
+}
+
+static void opt_ssa_loop_carried_phi(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ Func* f = new_func(&tc);
+ FrameSlot fs = add_frame_slot(f, tc.i32, FS_LOCAL, 4, 0);
+ u32 entry = f->entry;
+ u32 header = ir_block_new(f);
+ u32 body = ir_block_new(f);
+ u32 exit = ir_block_new(f);
+ ir_note_emit(f, header);
+ ir_note_emit(f, body);
+ ir_note_emit(f, exit);
+
+ Val zero = add_val(f, tc.i32);
+ Val cur = add_val(f, tc.i32);
+ Val one = add_val(f, tc.i32);
+ Val next = add_val(f, tc.i32);
+ Val out = add_val(f, tc.i32);
+ emit_load_imm(f, entry, zero, tc.i32, 0);
+ emit_store_local(f, entry, fs, zero, tc.i32, 0);
+ emit_br_to(f, entry, header);
+ emit_load_local(f, header, cur, fs, tc.i32, 0);
+ emit_test_branch(f, header, body, exit, tc.i32);
+ emit_load_imm(f, body, one, tc.i32, 1);
+ emit_binop(f, body, next, cur, one, tc.i32);
+ emit_store_local(f, body, fs, next, tc.i32, 0);
+ emit_br_to(f, body, header);
+ emit_load_local(f, exit, out, fs, tc.i32, 0);
+ emit_ret_val(f, exit, out, tc.i32);
+
+ opt_build_cfg(f);
+ opt_build_ssa(f);
+ opt_verify(f, "test-ssa-loop");
+ EXPECT(count_op(f, IR_PHI) == 1, "loop should insert one carried phi");
+ EXPECT((IROp)f->blocks[header].insts[0].op == IR_PHI,
+ "loop-carried phi should be in the header");
+ EXPECT(count_op(f, IR_STORE) == 0, "loop promoted stores should be removed");
+ EXPECT(count_op(f, IR_LOAD) == 0, "loop promoted loads should be removed");
+ opt_make_conventional_ssa(f);
+ opt_undo_ssa(f);
+ opt_verify(f, "test-ssa-loop-undo");
+ tc_fini(&tc);
+}
+
+static void opt_ssa_non_promotable_slots_stay_memory(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ Func* f = new_func(&tc);
+ FrameSlot fs = add_frame_slot(f, tc.i32, FS_LOCAL, 4, FSF_ADDR_TAKEN);
+ Val v = add_val(f, tc.i32);
+ Val out = add_val(f, tc.i32);
+ emit_load_imm(f, f->entry, v, tc.i32, 7);
+ emit_store_local(f, f->entry, fs, v, tc.i32, 0);
+ emit_load_local(f, f->entry, out, fs, tc.i32, 0);
+ emit_ret_val(f, f->entry, out, tc.i32);
+ opt_build_cfg(f);
+ opt_build_ssa(f);
+ opt_verify(f, "test-ssa-addr-taken");
+ EXPECT(count_op(f, IR_STORE) == 1 && count_op(f, IR_LOAD) == 1,
+ "address-taken slot should remain in memory");
+
+ Func* g = new_func(&tc);
+ fs = add_frame_slot(g, tc.i32, FS_LOCAL, 4, FSF_VOLATILE);
+ v = add_val(g, tc.i32);
+ out = add_val(g, tc.i32);
+ emit_load_imm(g, g->entry, v, tc.i32, 9);
+ emit_store_local(g, g->entry, fs, v, tc.i32, MF_VOLATILE);
+ emit_load_local(g, g->entry, out, fs, tc.i32, MF_VOLATILE);
+ emit_ret_val(g, g->entry, out, tc.i32);
+ opt_build_cfg(g);
+ opt_build_ssa(g);
+ opt_verify(g, "test-ssa-volatile");
+ EXPECT(count_op(g, IR_STORE) == 1 && count_op(g, IR_LOAD) == 1,
+ "volatile slot should remain in memory");
+ tc_fini(&tc);
+}
+
+static void opt_ssa_conventional_splits_critical_edge(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ Func* f = new_func(&tc);
+ FrameSlot fs = add_frame_slot(f, tc.i32, FS_LOCAL, 4, 0);
+ u32 entry = f->entry;
+ u32 other = ir_block_new(f);
+ u32 join = ir_block_new(f);
+ u32 exit = ir_block_new(f);
+ ir_note_emit(f, other);
+ ir_note_emit(f, join);
+ ir_note_emit(f, exit);
+
+ Val a = add_val(f, tc.i32);
+ Val b = add_val(f, tc.i32);
+ Val out = add_val(f, tc.i32);
+ emit_load_imm(f, entry, a, tc.i32, 1);
+ emit_store_local(f, entry, fs, a, tc.i32, 0);
+ emit_test_branch(f, entry, join, other, tc.i32);
+ emit_load_imm(f, other, b, tc.i32, 2);
+ emit_store_local(f, other, fs, b, tc.i32, 0);
+ emit_br_to(f, other, join);
+ emit_load_local(f, join, out, fs, tc.i32, 0);
+ emit_ret_val(f, join, out, tc.i32);
+ ir_emit(f, exit, IR_RET);
+
+ opt_build_cfg(f);
+ u32 before_blocks = f->nblocks;
+ opt_build_ssa(f);
+ EXPECT(count_op(f, IR_PHI) == 1, "critical-edge test should build a phi");
+ opt_make_conventional_ssa(f);
+ opt_verify(f, "test-ssa-critical-edge");
+ EXPECT(f->nblocks > before_blocks,
+ "conventional SSA should split the critical edge");
+ opt_undo_ssa(f);
+ opt_verify(f, "test-ssa-critical-edge-undo");
+ EXPECT(count_op(f, IR_PHI) == 0, "critical-edge undo should remove phi");
+ tc_fini(&tc);
+}
+
static void opt_jump_cleanup_forwards_branch_targets(void) {
TestCtx tc;
tc_init(&tc);
@@ -3911,6 +4088,10 @@ int main(void) {
opt_cfg_prunes_unreachable();
opt_cfg_preserves_scope_edges();
opt_analysis_dominators_and_frontier();
+ opt_ssa_diamond_mem2reg_phi();
+ opt_ssa_loop_carried_phi();
+ opt_ssa_non_promotable_slots_stay_memory();
+ opt_ssa_conventional_splits_critical_edge();
opt_jump_cleanup_forwards_branch_targets();
opt_jump_cleanup_inverts_to_remove_jump_block();
opt_jump_cleanup_keeps_conditional_fallthrough_block();