kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 87eb85cdc69c09fe71ad21e4d29c0f832bf05b77
parent cfe4be02ba8c5cb7e022bb550458fbcd40ef7d14
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 21 May 2026 07:24:16 -0700

Implement production SSA for O2

Diffstat:
Msrc/opt/ir.h | 22++++++++++++++++++++++
Msrc/opt/opt.c | 15+++++++++++++--
Msrc/opt/opt_internal.h | 3+++
Msrc/opt/pass_analysis.c | 150+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/opt/pass_ssa.c | 878++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
Mtest/opt/opt_test.c | 181+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 927 insertions(+), 322 deletions(-)

diff --git a/src/opt/ir.h b/src/opt/ir.h @@ -331,6 +331,24 @@ typedef struct OptValInfo { u32 forbidden_hard_regs; /* bit r means Val may not allocate hard reg r. */ } OptValInfo; +typedef enum OptUseKind { + OPT_USE_OPERAND, + OPT_USE_INDIRECT_BASE, + OPT_USE_PHI_INPUT, +} OptUseKind; + +typedef struct OptUse { + Val val; + u32 block; + u32 inst; + u32 next_for_val; + u32 operand_index; + u32 phi_pred_index; + Operand* operand; + u8 kind; /* OptUseKind */ + u8 pad[3]; +} OptUse; + typedef struct Func { Arena* arena; Compiler* c; @@ -392,6 +410,10 @@ typedef struct Func { u64 opt_dde_live_words_touched; OptValInfo* val_info; /* indexed by Val, length nvals */ + OptUse* opt_uses; + u32 opt_nuses, opt_uses_cap; + u32* opt_first_use_by_val; /* indexed by Val, OPT_USE_NONE if no uses */ + Reg opt_hard_regs[OPT_REG_CLASSES][OPT_MAX_HARD_REGS]; u32 opt_hard_reg_count[OPT_REG_CLASSES]; CGPhysRegInfo opt_phys_regs[OPT_REG_CLASSES][OPT_MAX_HARD_REGS]; diff --git a/src/opt/opt.c b/src/opt/opt.c @@ -1400,8 +1400,19 @@ static void opt_run_o1_pipeline(OptImpl* o) { } static void opt_run_o2_pipeline(OptImpl* o) { - /* O2 has a distinct scheduler now, but its value/SSA pre-lowering passes - * are still disabled until their shared analysis substrate lands. */ + metrics_scope_begin(o->c, "opt.o2.ssa"); + opt_build_cfg(o->f); + opt_jump_cleanup(o->f, OPT_JUMP_CLEANUP_CFG); + opt_build_cfg(o->f); + opt_verify(o->f, "o2-pre-ssa-cfg"); + opt_build_ssa(o->f); + opt_verify(o->f, "o2-ssa"); + opt_make_conventional_ssa(o->f); + opt_verify(o->f, "o2-conventional-ssa"); + opt_undo_ssa(o->f); + opt_build_cfg(o->f); + opt_verify(o->f, "o2-undo-ssa"); + metrics_scope_end(o->c, "opt.o2.ssa"); opt_run_lowering_pipeline(o, "opt.o2.total", 1); } diff --git a/src/opt/opt_internal.h b/src/opt/opt_internal.h @@ -3,6 +3,8 @@ #include "opt/opt.h" +#define OPT_USE_NONE 0xffffffffu + typedef struct OptHardRegSet { u32 cls[OPT_REG_CLASSES]; } OptHardRegSet; @@ -49,6 +51,7 @@ void opt_analysis_build_order(Func*, OptAnalysis*); void opt_analysis_build_dominators(Func*, OptAnalysis*); void opt_analysis_build_dom_frontier(Func*, OptAnalysis*); int opt_analysis_dominates(const OptAnalysis*, u32 dom, u32 node); +void opt_rebuild_def_use(Func*); void opt_verify(Func*, const char* stage); int opt_val_in_inst_defs(const Inst*, Val); diff --git a/src/opt/pass_analysis.c b/src/opt/pass_analysis.c @@ -179,6 +179,123 @@ static int fixed_terminator_succ_count(const Inst* in, u32* count_out) { } } +static void opt_use_add(Func* f, Val v, u32 b, u32 i, u8 kind, u32 op_idx, + u32 pred_idx, Operand* op) { + if (v == VAL_NONE || v >= f->nvals) return; + if (f->opt_nuses == f->opt_uses_cap) { + u32 ncap = f->opt_uses_cap ? f->opt_uses_cap * 2u : 32u; + OptUse* uses = arena_zarray(f->arena, OptUse, ncap); + if (f->opt_uses) + memcpy(uses, f->opt_uses, sizeof(uses[0]) * f->opt_nuses); + f->opt_uses = uses; + f->opt_uses_cap = ncap; + } + u32 id = f->opt_nuses++; + OptUse* u = &f->opt_uses[id]; + u->val = v; + u->block = b; + u->inst = i; + u->kind = kind; + u->operand_index = op_idx; + u->phi_pred_index = pred_idx; + u->operand = op; + u->next_for_val = f->opt_first_use_by_val[v]; + f->opt_first_use_by_val[v] = id; +} + +static void opt_use_add_operand(Func* f, u32 b, u32 i, u32 op_idx, + Operand* op, int is_def) { + if (!op || is_def) return; + if (op->kind == OPK_REG) { + opt_use_add(f, (Val)op->v.reg, b, i, OPT_USE_OPERAND, op_idx, + OPT_USE_NONE, op); + } else if (op->kind == OPK_INDIRECT) { + opt_use_add(f, (Val)op->v.ind.base, b, i, OPT_USE_INDIRECT_BASE, op_idx, + OPT_USE_NONE, op); + } +} + +static void opt_use_add_abivalue(Func* f, u32 b, u32 i, CGABIValue* v, + int storage_def) { + if (!v) return; + opt_use_add_operand(f, b, i, OPT_USE_NONE, &v->storage, storage_def); + for (u32 p = 0; p < v->nparts; ++p) + opt_use_add_operand(f, b, i, p, (Operand*)&v->parts[p].op, storage_def); +} + +static void opt_collect_inst_uses(Func* f, u32 b, u32 i, Inst* in) { + for (u32 o = 0; o < in->nopnds; ++o) { + int is_def = 0; + if (in->opnds[o].kind == OPK_REG) + is_def = o == 0 && opt_val_in_inst_defs(in, (Val)in->opnds[o].v.reg); + opt_use_add_operand(f, b, i, o, &in->opnds[o], is_def); + } + + switch ((IROp)in->op) { + case IR_PHI: { + IRPhiAux* aux = (IRPhiAux*)in->extra.aux; + if (!aux) break; + for (u32 p = 0; p < aux->npreds; ++p) + opt_use_add(f, aux->pred_vals[p], b, i, OPT_USE_PHI_INPUT, + OPT_USE_NONE, p, NULL); + break; + } + case IR_CALL: { + IRCallAux* aux = (IRCallAux*)in->extra.aux; + if (!aux) break; + if (aux->use_plan_replay) { + opt_use_add_operand(f, b, i, OPT_USE_NONE, &aux->plan.callee, 0); + for (u32 a = 0; a < aux->plan.nargs; ++a) + opt_use_add_operand(f, b, i, a, &aux->plan.args[a].src, 0); + } else { + opt_use_add_operand(f, b, i, OPT_USE_NONE, &aux->desc.callee, 0); + for (u32 a = 0; a < aux->desc.nargs; ++a) + opt_use_add_abivalue(f, b, i, (CGABIValue*)&aux->desc.args[a], 0); + } + break; + } + case IR_RET: { + IRRetAux* aux = (IRRetAux*)in->extra.aux; + if (aux && aux->present) opt_use_add_abivalue(f, b, i, &aux->val, 0); + break; + } + case IR_SCOPE_BEGIN: { + IRScopeAux* aux = (IRScopeAux*)in->extra.aux; + if (aux) opt_use_add_operand(f, b, i, OPT_USE_NONE, &aux->desc.cond, 0); + break; + } + case IR_ASM_BLOCK: { + IRAsmAux* aux = (IRAsmAux*)in->extra.aux; + if (!aux) break; + for (u32 a = 0; a < aux->nin; ++a) + opt_use_add_operand(f, b, i, a, &aux->in_ops[a], 0); + break; + } + case IR_INTRINSIC: { + IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; + if (!aux) break; + for (u32 a = 0; a < aux->narg; ++a) + opt_use_add_operand(f, b, i, a, &aux->args[a], 0); + break; + } + default: + break; + } +} + +void opt_rebuild_def_use(Func* f) { + if (!f) return; + f->opt_nuses = 0; + f->opt_first_use_by_val = + arena_array(f->arena, u32, f->nvals ? f->nvals : 1u); + for (u32 v = 0; v < f->nvals; ++v) f->opt_first_use_by_val[v] = OPT_USE_NONE; + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) + opt_collect_inst_uses(f, b, i, &bl->insts[i]); + } +} + static void verify_operand(Func* f, Inst* in, Operand* op, int is_def, void* arg) { (void)in; @@ -207,6 +324,12 @@ static void verify_values(Func* f, const char* stage) { if ((IROp)in->op == IR_PHI) { IRPhiAux* aux = (IRPhiAux*)in->extra.aux; if (!aux) opt_fail(f, stage, "phi missing aux", b, i); + if (in->def == VAL_NONE) opt_fail(f, stage, "phi missing def", b, i); + if (in->nopnds || in->opnds) + opt_fail(f, stage, "phi should not carry operands", b, i); + if (aux->slot_id > f->nframe_slots) + opt_fail(f, stage, "phi bad slot id", aux->slot_id, + f->nframe_slots); if (aux->npreds != bl->npreds) opt_fail(f, stage, "phi pred count mismatch", aux->npreds, bl->npreds); @@ -222,6 +345,32 @@ static void verify_values(Func* f, const char* stage) { } } +static void verify_def_use(Func* f, const char* stage) { + if (f->opt_rewritten) return; + opt_rebuild_def_use(f); + for (u32 u = 0; u < f->opt_nuses; ++u) { + OptUse* use = &f->opt_uses[u]; + if (use->val == VAL_NONE || use->val >= f->nvals) + opt_fail(f, stage, "def-use bad use val", use->val, f->nvals); + if (use->block >= f->nblocks) + opt_fail(f, stage, "def-use bad block", use->block, f->nblocks); + if (use->inst >= f->blocks[use->block].ninsts) + opt_fail(f, stage, "def-use bad inst", use->inst, + f->blocks[use->block].ninsts); + if (f->val_def_block[use->val] >= f->nblocks) + opt_fail(f, stage, "def-use bad def block", use->val, + f->val_def_block[use->val]); + } + for (Val v = 1; v < f->nvals; ++v) { + for (u32 u = f->opt_first_use_by_val[v]; u != OPT_USE_NONE; + u = f->opt_uses[u].next_for_val) { + if (u >= f->opt_nuses) opt_fail(f, stage, "def-use bad next", v, u); + if (f->opt_uses[u].val != v) + opt_fail(f, stage, "def-use wrong value list", v, u); + } + } +} + void opt_verify(Func* f, const char* stage) { if (!f) return; if (f->nblocks && f->entry >= f->nblocks) @@ -264,4 +413,5 @@ void opt_verify(Func* f, const char* stage) { seen_emit[b] = 1; } verify_values(f, stage); + verify_def_use(f, stage); } diff --git a/src/opt/pass_ssa.c b/src/opt/pass_ssa.c @@ -1,427 +1,665 @@ -/* pass_ssa.c — mem2reg + dominance-frontier SSA construction. - * - * Goal for Phase 3 (doc/OPT.md): build SSA without consuming it. The - * output is discarded before replay, so this pass's job is shape - * checking — no panics on the corpus. - * - * Algorithm (Cooper-Harvey-Kennedy iterative dominators + Cytron et al. - * dominance-frontier phi insertion): - * - * 1. Postorder + reverse-postorder traversal of the CFG. - * 2. Compute idom[] iteratively via the two-finger intersect. - * 3. Compute DF[] from idom[] in one pass. - * 4. For each promotable FrameSlot (no FSF_ADDR_TAKEN), compute the - * iterated dominance frontier of its defining blocks; insert - * IR_PHI at the start of each block in the IDF. - * 5. Rename: DFS the dominator tree, maintain a stack per slot; on - * store push the stored Val, on load record (load_def → top) into - * a rename map, on each successor fill in this block's slot in - * that successor's phis. After processing children, pop. - * - * The rename map is built but its uses are intentionally NOT walked - * across other instructions in the dry-run — that is the part that - * mutates the IR for downstream passes, and Phase 3 discards the IR. - * The phi-insertion + slot-stack walk is the part that exercises the - * IR shape, which is what the dry-run is checking. */ - -#include "opt/ir.h" -#include "opt/opt.h" +/* pass_ssa.c - mem2reg SSA construction and phi destruction for O2. */ + +#include "opt/opt_internal.h" #include <string.h> #include "core/arena.h" #include "core/core.h" -#define BLK_NONE 0xffffffffu - -/* ---- postorder ---- */ +typedef struct SlotStack { + Val* vals; + u32 n; + u32 cap; +} SlotStack; -typedef struct PostorderCtx { +typedef struct RenameCtx { Func* f; - u32* po; /* po[i] = block id at postorder position i */ - u32* po_idx; /* po_idx[block] = postorder position of block */ - u8* visited; - u32 count; -} PostorderCtx; - -static void postorder_dfs(PostorderCtx* ctx, u32 b) { - if (ctx->visited[b]) return; - ctx->visited[b] = 1; - Block* bl = &ctx->f->blocks[b]; - for (u32 s = 0; s < bl->nsucc; ++s) { - u32 t = bl->succ[s]; - if (t < ctx->f->nblocks) postorder_dfs(ctx, t); - } - ctx->po[ctx->count] = b; - ctx->po_idx[b] = ctx->count; - ctx->count++; + OptAnalysis* analysis; + u8* promoted; + Val* repl; + u32 repl_cap; + SlotStack* stacks; +} RenameCtx; + +typedef struct EdgeMove { + Val dst; + Val src; + CfreeCgTypeId type; + u8 cls; +} EdgeMove; + +static u8 ssa_type_class(Func* f, CfreeCgTypeId ty) { + CfreeCgTypeKind kind = cfree_cg_type_kind((CfreeCompiler*)f->c, ty); + return kind == CFREE_CG_TYPE_FLOAT ? RC_FP : RC_INT; +} + +static u32 opnd_slot_id(const Operand* op) { + if (!op || op->kind != OPK_LOCAL) return 0; + return (u32)op->v.frame_slot; } -/* ---- dominators (Cooper-Harvey-Kennedy) ---- */ +static int base_slot_promotable(const Func* f, u32 slot_id) { + if (slot_id == 0 || slot_id > f->nframe_slots) return 0; + const IRFrameSlot* s = &f->frame_slots[slot_id - 1u]; + if (s->kind != FS_LOCAL) return 0; + if (s->flags & (FSF_ADDR_TAKEN | FSF_VOLATILE)) return 0; + return 1; +} -static u32 dom_intersect(u32 b1, u32 b2, const u32* idom, const u32* po_idx) { - while (b1 != b2) { - while (po_idx[b1] < po_idx[b2]) b1 = idom[b1]; - while (po_idx[b2] < po_idx[b1]) b2 = idom[b2]; - } - return b1; +static int operand_has_slot(const Operand* op, u32 slot_id) { + return op && op->kind == OPK_LOCAL && opnd_slot_id(op) == slot_id; } -static u32* compute_idom(Func* f, const u32* po, const u32* po_idx, u32 ncount, - u32 entry) { - u32* idom = arena_array(f->arena, u32, f->nblocks); - for (u32 b = 0; b < f->nblocks; ++b) idom[b] = BLK_NONE; - idom[entry] = entry; - - int changed = 1; - while (changed) { - changed = 0; - /* Reverse postorder, skip the entry block. */ - for (i32 i = (i32)ncount - 1; i >= 0; --i) { - u32 b = po[i]; - if (b == entry) continue; - Block* bl = &f->blocks[b]; - u32 new_idom = BLK_NONE; - for (u32 p = 0; p < bl->npreds; ++p) { - u32 pp = bl->preds[p]; - if (idom[pp] != BLK_NONE) { - new_idom = (new_idom == BLK_NONE) - ? pp - : dom_intersect(pp, new_idom, idom, po_idx); - } - } - if (new_idom != BLK_NONE && idom[b] != new_idom) { - idom[b] = new_idom; - changed = 1; +static int abivalue_has_slot(const CGABIValue* v, u32 slot_id) { + if (!v) return 0; + if (operand_has_slot(&v->storage, slot_id)) return 1; + for (u32 i = 0; i < v->nparts; ++i) + if (operand_has_slot(&v->parts[i].op, slot_id)) return 1; + return 0; +} + +static int aux_has_slot(const Inst* in, u32 slot_id) { + switch ((IROp)in->op) { + case IR_CALL: { + IRCallAux* aux = (IRCallAux*)in->extra.aux; + if (!aux) return 0; + if (aux->use_plan_replay) { + if (operand_has_slot(&aux->plan.callee, slot_id)) return 1; + for (u32 i = 0; i < aux->plan.nargs; ++i) + if (operand_has_slot(&aux->plan.args[i].src, slot_id)) return 1; + for (u32 i = 0; i < aux->plan.nrets; ++i) + if (operand_has_slot(&aux->plan.rets[i].dst, slot_id)) return 1; + } else { + if (operand_has_slot(&aux->desc.callee, slot_id)) return 1; + for (u32 i = 0; i < aux->desc.nargs; ++i) + if (abivalue_has_slot(&aux->desc.args[i], slot_id)) return 1; + if (abivalue_has_slot(&aux->desc.ret, slot_id)) return 1; } + break; + } + case IR_RET: { + IRRetAux* aux = (IRRetAux*)in->extra.aux; + return aux && aux->present && abivalue_has_slot(&aux->val, slot_id); + } + case IR_SCOPE_BEGIN: { + IRScopeAux* aux = (IRScopeAux*)in->extra.aux; + return aux && operand_has_slot(&aux->desc.cond, slot_id); + } + case IR_ASM_BLOCK: { + IRAsmAux* aux = (IRAsmAux*)in->extra.aux; + if (!aux) return 0; + for (u32 i = 0; i < aux->nin; ++i) + if (operand_has_slot(&aux->in_ops[i], slot_id)) return 1; + for (u32 i = 0; i < aux->nout; ++i) + if (operand_has_slot(&aux->out_ops[i], slot_id)) return 1; + break; } + case IR_INTRINSIC: { + IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; + if (!aux) return 0; + for (u32 i = 0; i < aux->narg; ++i) + if (operand_has_slot(&aux->args[i], slot_id)) return 1; + for (u32 i = 0; i < aux->ndst; ++i) + if (operand_has_slot(&aux->dsts[i], slot_id)) return 1; + break; + } + default: + break; } - return idom; + return 0; } -/* ---- dominance frontier ---- */ - -typedef struct DfSet { - u32* members; - u32 n, cap; -} DfSet; - -static void df_add(Arena* a, DfSet* s, u32 b) { - for (u32 i = 0; i < s->n; ++i) - if (s->members[i] == b) return; - if (s->n == s->cap) { - u32 ncap = s->cap ? s->cap * 2u : 4u; - u32* nb = arena_array(a, u32, ncap); - if (s->members) memcpy(nb, s->members, sizeof(u32) * s->n); - s->members = nb; - s->cap = ncap; +static int slot_access_promotable(const Inst* in, u32 slot_id) { + if ((IROp)in->op == IR_LOAD) { + if (in->nopnds < 2 || opnd_slot_id(&in->opnds[1]) != slot_id) return 1; + return in->opnds[0].kind == OPK_REG && !opt_mem_observable(&in->extra.mem); } - s->members[s->n++] = b; + if ((IROp)in->op == IR_STORE) { + if (in->nopnds < 2 || opnd_slot_id(&in->opnds[0]) != slot_id) return 1; + if (opt_mem_observable(&in->extra.mem)) return 0; + return in->opnds[1].kind == OPK_REG || in->opnds[1].kind == OPK_IMM; + } + for (u32 i = 0; i < in->nopnds; ++i) + if (opnd_slot_id(&in->opnds[i]) == slot_id) return 0; + if (aux_has_slot(in, slot_id)) return 0; + return 1; } -static DfSet* compute_df(Func* f, const u32* idom) { - DfSet* df = arena_zarray(f->arena, DfSet, f->nblocks); +static u8* find_promoted_slots(Func* f) { + u8* promoted = arena_zarray(f->arena, u8, f->nframe_slots + 1u); + for (u32 sid = 1; sid <= f->nframe_slots; ++sid) { + if (!base_slot_promotable(f, sid)) continue; + promoted[sid] = 1; + } for (u32 b = 0; b < f->nblocks; ++b) { Block* bl = &f->blocks[b]; - if (bl->npreds < 2) continue; - if (idom[b] == BLK_NONE) continue; - for (u32 p = 0; p < bl->npreds; ++p) { - u32 runner = bl->preds[p]; - while (runner != idom[b] && runner != BLK_NONE) { - df_add(f->arena, &df[runner], b); - runner = idom[runner]; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* in = &bl->insts[i]; + for (u32 sid = 1; sid <= f->nframe_slots; ++sid) { + if (promoted[sid] && !slot_access_promotable(in, sid)) + promoted[sid] = 0; } } } - return df; + return promoted; } -/* ---- promotable slots ---- */ +static void stack_push(Arena* a, SlotStack* s, Val v) { + if (s->n == s->cap) { + u32 ncap = s->cap ? s->cap * 2u : 4u; + Val* vals = arena_array(a, Val, ncap); + if (s->vals) memcpy(vals, s->vals, sizeof(vals[0]) * s->n); + s->vals = vals; + s->cap = ncap; + } + s->vals[s->n++] = v; +} -/* Identify the FrameSlot a load/store address operand refers to. - * Returns 0 if the operand is not OPK_LOCAL (i.e., not a direct - * frame-slot reference) — those addresses route through computed Val - * pointers and cannot be promoted by a slot-keyed pass. */ -static u32 opnd_slot_id(const Operand* op) { - if (op->kind != OPK_LOCAL) return 0; - return (u32)op->v.frame_slot; +static Val stack_top(const SlotStack* s) { + return s && s->n ? s->vals[s->n - 1u] : VAL_NONE; } -static int slot_promotable(const Func* f, u32 slot_id) { - if (slot_id == 0 || slot_id > f->nframe_slots) return 0; - const IRFrameSlot* s = &f->frame_slots[slot_id - 1]; - if (s->flags & FSF_ADDR_TAKEN) return 0; - if (s->flags & FSF_VOLATILE) return 0; - /* Only locals are promotable for v1; params live in slots too but - * promoting them is a separate transform. */ - if (s->kind != FS_LOCAL) return 0; - return 1; +static Val resolve_repl(const RenameCtx* ctx, Val v) { + while (v != VAL_NONE && v < ctx->repl_cap && ctx->repl[v] != VAL_NONE && + ctx->repl[v] != v) { + v = ctx->repl[v]; + } + return v; } -/* ---- phi insertion ---- */ +static void replace_use(Func* f, Inst* in, Operand* op, int is_def, void* arg) { + (void)in; + RenameCtx* ctx = (RenameCtx*)arg; + if (is_def || op->kind != OPK_REG) return; + Val old = (Val)op->v.reg; + if (old == VAL_NONE || old >= f->nvals) return; + Val repl = resolve_repl(ctx, old); + if (repl != VAL_NONE && repl != old) { + op->v.reg = (Reg)repl; + op->type = f->val_type[repl]; + op->cls = f->val_cls[repl]; + } +} -/* Insert n_new phi instructions at the start of block b, each tagged - * with its slot via opnds[0] = synthetic "slot tag" Val (we reuse - * IRPhiAux to carry pred info). The stored slot id lives in - * extra.imm. */ -static void insert_phis(Func* f, u32 b, u32 n_new, const u32* phi_slots, - const u32* phi_blocks_for_slot) { - if (!n_new) return; +static void insert_phis(Func* f, u32 b, const u8* needs_phi) { Block* bl = &f->blocks[b]; - u32 old = bl->ninsts; - u32 nnew = old + n_new; - Inst* nb = arena_zarray(f->arena, Inst, nnew); - /* Phis go first. */ - for (u32 i = 0; i < n_new; ++i) { - Inst* in = &nb[i]; - u32 slot_id = phi_slots[i]; - const IRFrameSlot* s = &f->frame_slots[slot_id - 1]; + u32 nphi = 0; + for (u32 sid = 1; sid <= f->nframe_slots; ++sid) + if (needs_phi[sid * f->nblocks + b]) ++nphi; + if (!nphi) return; + + u32 old_nvals = f->nvals; + Inst* insts = arena_zarray(f->arena, Inst, bl->ninsts + nphi); + u32 w = 0; + for (u32 sid = 1; sid <= f->nframe_slots; ++sid) { + if (!needs_phi[sid * f->nblocks + b]) continue; + const IRFrameSlot* slot = &f->frame_slots[sid - 1u]; + Inst* in = &insts[w++]; in->op = IR_PHI; - in->type = s->type; - in->extra.imm = (i64)slot_id; - /* Allocate IRPhiAux with one slot per pred, initialized to - * VAL_NONE. The rename pass fills pred_vals later. */ + in->type = slot->type; + in->def = ir_alloc_val(f, slot->type, ssa_type_class(f, slot->type)); + f->val_def_block[in->def] = b; + f->val_def_inst[in->def] = w - 1u; IRPhiAux* aux = arena_znew(f->arena, IRPhiAux); + aux->slot_id = sid; aux->npreds = bl->npreds; if (bl->npreds) { aux->pred_blocks = arena_array(f->arena, u32, bl->npreds); aux->pred_vals = arena_zarray(f->arena, Val, bl->npreds); memcpy(aux->pred_blocks, bl->preds, sizeof(u32) * bl->npreds); } - /* Reuse extra union: imm carries slot, but we also need aux. We - * stash the IRPhiAux in opnds: opnds[0] is a sentinel pointer cast - * — that breaks the Val* type. Instead, we use a parallel side - * table rooted on the inst. To keep this self-contained without - * altering the Inst layout, we point in->opnds at a Val array of - * length 0 and rely on the aux pointer through the extra union. - * - * Layout choice: extra.aux = aux (carry the IRPhiAux pointer); - * imm-as-slot lives in a side table. */ in->extra.aux = aux; - in->nopnds = 0; - in->opnds = NULL; - /* def Val: allocate a fresh value typed as the slot's type. */ - /* We can't use val_alloc directly (it's static); set up the val - * table manually via an emit-equivalent. Simpler: piggy-back on - * ir_emit_const_i style by appending after current state. But we're - * mid-rebuild of the block. Defer: encode def slot via re-reading - * the val table. */ - /* For Phase 3 dry-run, leave def = VAL_NONE on phis. The rename - * pass uses extra.aux->pred_vals to inspect phi shape; the - * dry-run discards before downstream passes need def. */ - in->def = VAL_NONE; - } - /* Existing instructions shifted right. */ - if (old) memcpy(nb + n_new, bl->insts, sizeof(Inst) * old); - bl->insts = nb; - bl->ninsts = nnew; - bl->cap = nnew; - /* val_def_inst for any val defined in this block has shifted by - * n_new. Walk the val table and update. */ - for (u32 v = 1; v < f->nvals; ++v) { - if (f->val_def_block[v] == b) f->val_def_inst[v] += n_new; } - (void)phi_blocks_for_slot; + if (bl->ninsts) memcpy(insts + nphi, bl->insts, sizeof(Inst) * bl->ninsts); + bl->insts = insts; + bl->ninsts += nphi; + bl->cap = bl->ninsts; + for (Val v = 1; v < old_nvals; ++v) + if (f->val_def_block[v] == b) f->val_def_inst[v] += nphi; } -/* ---- rename ---- */ +static void mark_def_blocks(Func* f, const u8* promoted, u8* def_blocks) { + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* in = &bl->insts[i]; + if ((IROp)in->op != IR_STORE || in->nopnds < 2) continue; + u32 sid = opnd_slot_id(&in->opnds[0]); + if (sid && promoted[sid]) def_blocks[sid * f->nblocks + b] = 1; + } + } +} -typedef struct SlotStack { - Val* stack; - u32 n, cap; -} SlotStack; +static void compute_phi_sites(Func* f, OptAnalysis* a, const u8* promoted, + u8* needs_phi) { + u8* def_blocks = arena_zarray(f->arena, u8, + (f->nframe_slots + 1u) * f->nblocks); + mark_def_blocks(f, promoted, def_blocks); + u32* work = arena_array(f->arena, u32, f->nblocks ? f->nblocks : 1u); + u8* queued = arena_zarray(f->arena, u8, f->nblocks ? f->nblocks : 1u); -static void slot_push(Arena* a, SlotStack* s, Val v) { - if (s->n == s->cap) { - u32 ncap = s->cap ? s->cap * 2u : 4u; - Val* nb = arena_array(a, Val, ncap); - if (s->stack) memcpy(nb, s->stack, sizeof(Val) * s->n); - s->stack = nb; - s->cap = ncap; + for (u32 sid = 1; sid <= f->nframe_slots; ++sid) { + if (!promoted[sid]) continue; + memset(queued, 0, f->nblocks); + u32 wn = 0; + for (u32 b = 0; b < f->nblocks; ++b) { + if (!def_blocks[sid * f->nblocks + b]) continue; + queued[b] = 1; + work[wn++] = b; + } + while (wn) { + u32 x = work[--wn]; + OptBlockList* df = &a->dom_frontier[x]; + for (u32 i = 0; i < df->n; ++i) { + u32 y = df->items[i]; + if (needs_phi[sid * f->nblocks + y]) continue; + needs_phi[sid * f->nblocks + y] = 1; + if (!queued[y]) { + queued[y] = 1; + work[wn++] = y; + } + } + } } - s->stack[s->n++] = v; } -static Val slot_top(const SlotStack* s) { - return s->n ? s->stack[s->n - 1] : VAL_NONE; +static void rewrite_store_immediate(RenameCtx* ctx, Inst* in, u32 b, u32 i, + u32 sid) { + Operand src = in->opnds[1]; + Val v = ir_alloc_val(ctx->f, src.type, src.cls); + Operand* opnds = arena_array(ctx->f->arena, Operand, 1); + opnds[0] = src; + opnds[0].kind = OPK_REG; + opnds[0].v.reg = (Reg)v; + in->op = IR_LOAD_IMM; + in->type = src.type; + in->def = v; + in->opnds = opnds; + in->nopnds = 1; + in->extra.imm = src.v.imm; + ctx->f->val_def_block[v] = b; + ctx->f->val_def_inst[v] = i; + stack_push(ctx->f->arena, &ctx->stacks[sid], v); } -static void rename_dfs(Func* f, u32 b, const u32* idom, SlotStack* slots) { +static void rename_block(RenameCtx* ctx, u32 b) { + Func* f = ctx->f; Block* bl = &f->blocks[b]; - /* Track per-slot push count so we can pop on exit. */ - u32* pushed = arena_zarray(f->arena, u32, f->nframe_slots + 1); + u32* pushed = arena_zarray(f->arena, u32, f->nframe_slots + 1u); - /* 1. Process phis in this block: each phi has extra.imm = slot id. - * Push a synthetic phi value (VAL_NONE in dry-run; a "fresh Val" - * once we wire renaming). */ for (u32 i = 0; i < bl->ninsts; ++i) { Inst* in = &bl->insts[i]; - if (in->op != IR_PHI) break; - u32 slot_id = (u32)in->extra.imm; - if (slot_id == 0 || slot_id > f->nframe_slots) continue; - /* For dry-run, the phi's def is VAL_NONE; we still push so the - * stack has the right depth. Downstream passes (Phase 4+) will - * allocate a real Val here. */ - slot_push(f->arena, &slots[slot_id], VAL_NONE); - pushed[slot_id]++; + if ((IROp)in->op != IR_PHI) break; + IRPhiAux* aux = (IRPhiAux*)in->extra.aux; + if (!aux || !aux->slot_id || !ctx->promoted[aux->slot_id]) continue; + stack_push(f->arena, &ctx->stacks[aux->slot_id], in->def); + pushed[aux->slot_id]++; } - /* 2. Process the rest of the block. */ for (u32 i = 0; i < bl->ninsts; ++i) { Inst* in = &bl->insts[i]; - if (in->op == IR_PHI) continue; - if (in->op == IR_STORE) { - /* IR_STORE opnds: [0] = addr, [1] = src. */ - if (in->nopnds < 2) continue; + if ((IROp)in->op == IR_PHI) continue; + if ((IROp)in->op == IR_STORE && in->nopnds >= 2) { u32 sid = opnd_slot_id(&in->opnds[0]); - if (sid && slot_promotable(f, sid)) { - const Operand* src = &in->opnds[1]; - Val v = (src->kind == OPK_REG) ? (Val)src->v.reg : VAL_NONE; - slot_push(f->arena, &slots[sid], v); + if (sid && ctx->promoted[sid]) { + Operand src = in->opnds[1]; + if (src.kind == OPK_REG) { + stack_push(f->arena, &ctx->stacks[sid], + resolve_repl(ctx, (Val)src.v.reg)); + in->op = IR_NOP; + in->nopnds = 0; + in->opnds = NULL; + in->def = VAL_NONE; + } else { + rewrite_store_immediate(ctx, in, b, i, sid); + } pushed[sid]++; + continue; } - continue; } - if (in->op == IR_LOAD) { - /* IR_LOAD opnds: [0] = dst REG, [1] = addr. */ - if (in->nopnds < 2) continue; + if ((IROp)in->op == IR_LOAD && in->nopnds >= 2) { u32 sid = opnd_slot_id(&in->opnds[1]); - if (sid && slot_promotable(f, sid)) { - /* Touching slot_top exercises the stack invariant — that's - * the shape check we want. The rewrite map proper would walk - * uses; we skip that in the dry-run (output discarded). */ - (void)slot_top(&slots[sid]); + if (sid && ctx->promoted[sid] && in->def != VAL_NONE) { + Val cur = stack_top(&ctx->stacks[sid]); + if (cur == VAL_NONE) continue; + if (in->def < ctx->repl_cap) + ctx->repl[in->def] = resolve_repl(ctx, cur); + in->op = IR_NOP; + in->nopnds = 0; + in->opnds = NULL; + in->def = VAL_NONE; + continue; } - continue; } + opt_walk_inst_operands(f, in, replace_use, ctx); } - /* 3. Fill our slot in each successor's phis. */ for (u32 s = 0; s < bl->nsucc; ++s) { u32 succ = bl->succ[s]; if (succ >= f->nblocks) continue; Block* sb = &f->blocks[succ]; - /* Find which pred index `b` is in succ. */ - u32 pred_idx = 0; - int found = 0; + u32 pred_idx = OPT_USE_NONE; for (u32 p = 0; p < sb->npreds; ++p) { if (sb->preds[p] == b) { pred_idx = p; - found = 1; break; } } - if (!found) continue; + if (pred_idx == OPT_USE_NONE) continue; for (u32 i = 0; i < sb->ninsts; ++i) { - Inst* in = &sb->insts[i]; - if (in->op != IR_PHI) break; - u32 slot_id = (u32)in->extra.imm; - if (slot_id == 0 || slot_id > f->nframe_slots) continue; - IRPhiAux* aux = (IRPhiAux*)in->extra.aux; - if (!aux || pred_idx >= aux->npreds) continue; - aux->pred_vals[pred_idx] = slot_top(&slots[slot_id]); + Inst* phi = &sb->insts[i]; + if ((IROp)phi->op != IR_PHI) break; + IRPhiAux* aux = (IRPhiAux*)phi->extra.aux; + if (!aux || !aux->slot_id || !ctx->promoted[aux->slot_id]) continue; + aux->pred_vals[pred_idx] = + resolve_repl(ctx, stack_top(&ctx->stacks[aux->slot_id])); } } - /* 4. Recurse into immediate dom children. */ - for (u32 c = 0; c < f->nblocks; ++c) { - if (c == b) continue; - if (idom[c] == b) rename_dfs(f, c, idom, slots); - } + OptBlockList* children = &ctx->analysis->dom_children[b]; + for (u32 i = 0; i < children->n; ++i) rename_block(ctx, children->items[i]); - /* 5. Pop. */ - for (u32 sid = 0; sid <= f->nframe_slots; ++sid) { + for (u32 sid = 1; sid <= f->nframe_slots; ++sid) { while (pushed[sid]--) { - if (slots[sid].n) slots[sid].n--; + if (ctx->stacks[sid].n) --ctx->stacks[sid].n; } } } -/* ---- main entry ---- */ +static void replace_phi_inputs(Func* f, RenameCtx* ctx) { + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* in = &bl->insts[i]; + if ((IROp)in->op != IR_PHI) break; + IRPhiAux* aux = (IRPhiAux*)in->extra.aux; + if (!aux) continue; + for (u32 p = 0; p < aux->npreds; ++p) + aux->pred_vals[p] = resolve_repl(ctx, aux->pred_vals[p]); + } + } +} void opt_build_ssa(Func* f) { - if (f->nblocks == 0) return; - - /* Postorder traversal from entry. */ - PostorderCtx pctx; - pctx.f = f; - pctx.po = arena_array(f->arena, u32, f->nblocks); - pctx.po_idx = arena_array(f->arena, u32, f->nblocks); - pctx.visited = arena_zarray(f->arena, u8, f->nblocks); - pctx.count = 0; - for (u32 b = 0; b < f->nblocks; ++b) pctx.po_idx[b] = 0; - postorder_dfs(&pctx, f->entry); - /* Unreachable blocks: never visited. Skip them in the dom analysis; - * the dry-run shouldn't crash on them. */ - - u32* idom = compute_idom(f, pctx.po, pctx.po_idx, pctx.count, f->entry); - DfSet* df = compute_df(f, idom); - - /* For each promotable slot, find defining blocks → iterated DF. */ - if (f->nframe_slots == 0) return; - u8* needs_phi_storage = - arena_zarray(f->arena, u8, (f->nframe_slots + 1) * f->nblocks); -#define NEEDS_PHI(slot, blk) \ - needs_phi_storage[((slot) * f->nblocks) + (blk)] - - /* Worklist algo per slot. */ - u32* worklist = arena_array(f->arena, u32, f->nblocks); - u8* on_worklist = arena_zarray(f->arena, u8, f->nblocks); + if (!f || f->nblocks == 0 || f->nframe_slots == 0) { + if (f) opt_rebuild_def_use(f); + return; + } - for (u32 sid = 1; sid <= f->nframe_slots; ++sid) { - if (!slot_promotable(f, sid)) continue; - /* Reset per-slot worklist state. */ - for (u32 i = 0; i < f->nblocks; ++i) on_worklist[i] = 0; - u32 wn = 0; - /* Seed with blocks containing a store to this slot. */ - for (u32 b = 0; b < f->nblocks; ++b) { - Block* bl = &f->blocks[b]; - for (u32 i = 0; i < bl->ninsts; ++i) { - Inst* in = &bl->insts[i]; - if (in->op == IR_STORE && in->nopnds >= 1 && - opnd_slot_id(&in->opnds[0]) == sid) { - if (!on_worklist[b]) { - on_worklist[b] = 1; - worklist[wn++] = b; - } - break; + OptAnalysis a; + opt_analysis_build_order(f, &a); + opt_analysis_build_dominators(f, &a); + opt_analysis_build_dom_frontier(f, &a); + + u8* promoted = find_promoted_slots(f); + u8* needs_phi = + arena_zarray(f->arena, u8, (f->nframe_slots + 1u) * f->nblocks); + compute_phi_sites(f, &a, promoted, needs_phi); + for (u32 b = 0; b < f->nblocks; ++b) insert_phis(f, b, needs_phi); + + RenameCtx ctx; + memset(&ctx, 0, sizeof ctx); + ctx.f = f; + ctx.analysis = &a; + ctx.promoted = promoted; + ctx.repl_cap = f->vals_cap ? f->vals_cap : f->nvals; + ctx.repl = arena_zarray(f->arena, Val, ctx.repl_cap ? ctx.repl_cap : 1u); + ctx.stacks = arena_zarray(f->arena, SlotStack, f->nframe_slots + 1u); + rename_block(&ctx, f->entry); + + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) + opt_walk_inst_operands(f, &bl->insts[i], replace_use, &ctx); + } + replace_phi_inputs(f, &ctx); + opt_rebuild_def_use(f); +} + +static int ssa_is_terminator(const Inst* in) { + switch ((IROp)in->op) { + case IR_BR: + case IR_CONDBR: + case IR_CMP_BRANCH: + case IR_SWITCH: + case IR_INDIRECT_BRANCH: + case IR_RET: + case IR_BREAK_TO: + case IR_CONTINUE_TO: + return 1; + case IR_INTRINSIC: { + IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; + return aux && (aux->kind == INTRIN_LONGJMP || + aux->kind == INTRIN_TRAP || + aux->kind == INTRIN_UNREACHABLE); + } + default: + return 0; + } +} + +static Inst make_copy_inst(Func* f, Val dst, Val src, CfreeCgTypeId ty, u8 cls) { + Inst in; + memset(&in, 0, sizeof in); + in.op = IR_COPY; + in.type = ty; + in.def = dst; + in.nopnds = 2; + in.opnds = arena_array(f->arena, Operand, 2); + in.opnds[0].kind = OPK_REG; + in.opnds[0].type = ty; + in.opnds[0].cls = cls; + in.opnds[0].v.reg = (Reg)dst; + in.opnds[1].kind = OPK_REG; + in.opnds[1].type = ty; + in.opnds[1].cls = cls; + in.opnds[1].v.reg = (Reg)src; + return in; +} + +static void insert_edge_moves(Func* f, u32 b, const EdgeMove* moves, u32 n) { + if (!n) return; + Block* bl = &f->blocks[b]; + u32 term = bl->ninsts && ssa_is_terminator(&bl->insts[bl->ninsts - 1u]); + u32 insert_at = bl->ninsts - term; + u32 extra = n == 1 ? 1u : n * 2u; + Inst* insts = arena_zarray(f->arena, Inst, bl->ninsts + extra); + if (insert_at) memcpy(insts, bl->insts, sizeof(Inst) * insert_at); + u32 w = insert_at; + if (n == 1) { + insts[w++] = make_copy_inst(f, moves[0].dst, moves[0].src, moves[0].type, + moves[0].cls); + } else { + Val* temps = arena_array(f->arena, Val, n); + for (u32 i = 0; i < n; ++i) { + temps[i] = ir_alloc_val(f, moves[i].type, moves[i].cls); + f->val_def_block[temps[i]] = b; + f->val_def_inst[temps[i]] = w; + insts[w++] = make_copy_inst(f, temps[i], moves[i].src, moves[i].type, + moves[i].cls); + } + for (u32 i = 0; i < n; ++i) { + insts[w] = make_copy_inst(f, moves[i].dst, temps[i], moves[i].type, + moves[i].cls); + f->val_def_block[moves[i].dst] = b; + f->val_def_inst[moves[i].dst] = w; + ++w; + } + } + if (term) insts[w++] = bl->insts[bl->ninsts - 1u]; + bl->insts = insts; + bl->ninsts = w; + bl->cap = w; + if (n == 1) { + f->val_def_block[moves[0].dst] = b; + f->val_def_inst[moves[0].dst] = insert_at; + } +} + +static void replace_succ_ref(Func* f, u32 pred, u32 old_succ, u32 new_succ) { + Block* bl = &f->blocks[pred]; + for (u32 s = 0; s < bl->nsucc; ++s) { + if (bl->succ[s] == old_succ) bl->succ[s] = new_succ; + } + if (!bl->ninsts) return; + Inst* term = &bl->insts[bl->ninsts - 1u]; + if ((IROp)term->op == IR_SWITCH) { + IRSwitchAux* aux = (IRSwitchAux*)term->extra.aux; + if (!aux) return; + for (u32 i = 0; i < aux->ncases; ++i) + if (aux->cases[i].block == old_succ) aux->cases[i].block = new_succ; + if (aux->default_block == old_succ) aux->default_block = new_succ; + } else if ((IROp)term->op == IR_INDIRECT_BRANCH) { + IRIndirectAux* aux = (IRIndirectAux*)term->extra.aux; + if (!aux) return; + for (u32 i = 0; i < aux->ntargets; ++i) + if (aux->targets[i] == old_succ) aux->targets[i] = new_succ; + } +} + +static void emit_order_insert_after(Func* f, u32 after, u32 block) { + for (u32 i = 0; i < f->emit_order_n; ++i) + if (f->emit_order[i] == block) return; + if (f->emit_order_n == f->emit_order_cap) { + u32 ncap = f->emit_order_cap ? f->emit_order_cap * 2u : 8u; + u32* order = arena_array(f->arena, u32, ncap); + if (f->emit_order) + memcpy(order, f->emit_order, sizeof(order[0]) * f->emit_order_n); + f->emit_order = order; + f->emit_order_cap = ncap; + } + u32 pos = f->emit_order_n; + for (u32 i = 0; i < f->emit_order_n; ++i) { + if (f->emit_order[i] == after) { + pos = i + 1u; + break; + } + } + for (u32 i = f->emit_order_n; i > pos; --i) + f->emit_order[i] = f->emit_order[i - 1u]; + f->emit_order[pos] = block; + ++f->emit_order_n; +} + +static int edge_is_fallthrough(Func* f, u32 pred, u32 succ) { + Block* bl = &f->blocks[pred]; + if (!bl->ninsts) return 0; + IROp op = (IROp)bl->insts[bl->ninsts - 1u].op; + if (op != IR_CMP_BRANCH && op != IR_CONDBR) return 0; + return bl->nsucc >= 2 && bl->succ[1] == succ; +} + +static u32 split_edge(Func* f, u32 pred, u32 succ) { + u32 edge = ir_block_new(f); + Block* eb = &f->blocks[edge]; + Inst* br = ir_emit(f, edge, IR_BR); + (void)br; + eb->succ[0] = succ; + eb->nsucc = 1; + int place_after = edge_is_fallthrough(f, pred, succ); + replace_succ_ref(f, pred, succ, edge); + if (succ < f->nblocks) { + Block* sb = &f->blocks[succ]; + for (u32 i = 0; i < sb->ninsts; ++i) { + Inst* phi = &sb->insts[i]; + if ((IROp)phi->op != IR_PHI) break; + IRPhiAux* aux = (IRPhiAux*)phi->extra.aux; + if (!aux) continue; + for (u32 p = 0; p < aux->npreds; ++p) { + if (aux->pred_blocks[p] == pred) { + aux->pred_blocks[p] = edge; + aux->pred_vals[p] = phi->def; } } } - /* Iterated DF. */ - while (wn) { - u32 x = worklist[--wn]; - DfSet* d = &df[x]; - for (u32 i = 0; i < d->n; ++i) { - u32 y = d->members[i]; - if (NEEDS_PHI(sid, y)) continue; - NEEDS_PHI(sid, y) = 1; - if (!on_worklist[y]) { - on_worklist[y] = 1; - worklist[wn++] = y; + } + if (place_after) { + emit_order_insert_after(f, pred, edge); + } else { + ir_note_emit(f, edge); + } + return edge; +} + +static void realign_phi_preds(Func* f) { + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* phi = &bl->insts[i]; + if ((IROp)phi->op != IR_PHI) break; + IRPhiAux* aux = (IRPhiAux*)phi->extra.aux; + if (!aux || aux->npreds == bl->npreds) { + if (!aux) continue; + } + u32 old_n = aux->npreds; + u32* old_blocks = aux->pred_blocks; + Val* old_vals = aux->pred_vals; + u32* pred_blocks = bl->npreds ? arena_array(f->arena, u32, bl->npreds) + : NULL; + Val* pred_vals = bl->npreds ? arena_zarray(f->arena, Val, bl->npreds) + : NULL; + for (u32 p = 0; p < bl->npreds; ++p) { + pred_blocks[p] = bl->preds[p]; + pred_vals[p] = phi->def; + for (u32 old = 0; old < old_n; ++old) { + if (old_blocks && old_blocks[old] == bl->preds[p]) { + pred_vals[p] = old_vals ? old_vals[old] : VAL_NONE; + break; + } } } + aux->npreds = bl->npreds; + aux->pred_blocks = pred_blocks; + aux->pred_vals = pred_vals; } } +} - /* Insert phis per block. */ +void opt_make_conventional_ssa(Func* f) { + if (!f) return; for (u32 b = 0; b < f->nblocks; ++b) { - u32 nphi = 0; - for (u32 sid = 1; sid <= f->nframe_slots; ++sid) { - if (NEEDS_PHI(sid, b)) nphi++; - } - if (!nphi) continue; - u32* slots_arr = arena_array(f->arena, u32, nphi); - u32 k = 0; - for (u32 sid = 1; sid <= f->nframe_slots; ++sid) { - if (NEEDS_PHI(sid, b)) slots_arr[k++] = sid; + Block* bl = &f->blocks[b]; + if (!bl->ninsts || (IROp)bl->insts[0].op != IR_PHI) continue; + for (u32 p = 0; p < bl->npreds; ++p) { + u32 pred = bl->preds[p]; + EdgeMove* moves = arena_array(f->arena, EdgeMove, bl->ninsts); + u32 n = 0; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* phi = &bl->insts[i]; + if ((IROp)phi->op != IR_PHI) break; + IRPhiAux* aux = (IRPhiAux*)phi->extra.aux; + if (!aux || p >= aux->npreds) continue; + Val src = aux->pred_vals[p]; + if (src == VAL_NONE || src == phi->def) continue; + moves[n].dst = phi->def; + moves[n].src = src; + moves[n].type = phi->type; + moves[n].cls = f->val_cls[phi->def]; + ++n; + } + if (!n) continue; + u32 target = pred; + if (pred >= f->nblocks) continue; + if (f->blocks[pred].nsucc != 1) target = split_edge(f, pred, b); + insert_edge_moves(f, target, moves, n); } - insert_phis(f, b, nphi, slots_arr, NULL); } + opt_build_cfg(f); + realign_phi_preds(f); + opt_rebuild_def_use(f); +} - /* Rename phase: DFS from entry over the dominator tree. */ - SlotStack* slots = arena_zarray(f->arena, SlotStack, f->nframe_slots + 1); - rename_dfs(f, f->entry, idom, slots); - -#undef NEEDS_PHI +void opt_undo_ssa(Func* f) { + if (!f) return; + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + u32 w = 0; + for (u32 i = 0; i < bl->ninsts; ++i) { + if ((IROp)bl->insts[i].op == IR_PHI) continue; + bl->insts[w++] = bl->insts[i]; + } + bl->ninsts = w; + } + opt_rebuild_def_use(f); } diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c @@ -317,6 +317,17 @@ static Inst* emit_load_local(Func* f, u32 b, Val dst, FrameSlot fs, return in; } +static Inst* emit_store_local(Func* f, u32 b, FrameSlot fs, Val src, + CfreeCgTypeId ty, u16 flags) { + Inst* in = ir_emit(f, b, IR_STORE); + in->opnds = arena_array(f->arena, Operand, 2); + in->opnds[0] = op_local_(fs, ty); + in->opnds[1] = op_reg_(src, ty); + in->nopnds = 2; + in->extra.mem = mem_local_(fs, ty, 4, flags); + return in; +} + static Inst* emit_call_void(Func* f, u32 b) { Inst* in = ir_emit(f, b, IR_CALL); IRCallAux* aux = arena_znew(f->arena, IRCallAux); @@ -1875,6 +1886,172 @@ static void opt_analysis_dominators_and_frontier(void) { tc_fini(&tc); } +static void opt_ssa_diamond_mem2reg_phi(void) { + TestCtx tc; + tc_init(&tc); + Func* f = new_func(&tc); + FrameSlot fs = add_frame_slot(f, tc.i32, FS_LOCAL, 4, 0); + u32 entry = f->entry; + u32 then_b = ir_block_new(f); + u32 else_b = ir_block_new(f); + u32 join = ir_block_new(f); + ir_note_emit(f, then_b); + ir_note_emit(f, else_b); + ir_note_emit(f, join); + + emit_test_branch(f, entry, then_b, else_b, tc.i32); + Val tv = add_val(f, tc.i32); + Val ev = add_val(f, tc.i32); + Val out = add_val(f, tc.i32); + emit_load_imm(f, then_b, tv, tc.i32, 11); + emit_store_local(f, then_b, fs, tv, tc.i32, 0); + emit_br_to(f, then_b, join); + emit_load_imm(f, else_b, ev, tc.i32, 22); + emit_store_local(f, else_b, fs, ev, tc.i32, 0); + emit_br_to(f, else_b, join); + emit_load_local(f, join, out, fs, tc.i32, 0); + emit_ret_val(f, join, out, tc.i32); + + opt_build_cfg(f); + opt_build_ssa(f); + opt_verify(f, "test-ssa-diamond"); + EXPECT(count_op(f, IR_PHI) == 1, "diamond should insert one phi"); + EXPECT(count_op(f, IR_STORE) == 0, "promoted stores should be removed"); + EXPECT(count_op(f, IR_LOAD) == 0, "promoted load should be removed"); + Inst* phi = &f->blocks[join].insts[0]; + IRPhiAux* aux = (IRPhiAux*)phi->extra.aux; + EXPECT((IROp)phi->op == IR_PHI && phi->def != VAL_NONE, + "phi should have a real def"); + EXPECT(aux && aux->slot_id == (u32)fs, "phi should carry slot id in aux"); + EXPECT(aux && aux->npreds == f->blocks[join].npreds, + "phi predecessor count should match block preds"); + + opt_make_conventional_ssa(f); + opt_verify(f, "test-ssa-diamond-conventional"); + opt_undo_ssa(f); + opt_verify(f, "test-ssa-diamond-undo"); + EXPECT(count_op(f, IR_PHI) == 0, "undo_ssa should remove phis"); + EXPECT(count_op(f, IR_COPY) >= 2, "phi lowering should insert edge copies"); + tc_fini(&tc); +} + +static void opt_ssa_loop_carried_phi(void) { + TestCtx tc; + tc_init(&tc); + Func* f = new_func(&tc); + FrameSlot fs = add_frame_slot(f, tc.i32, FS_LOCAL, 4, 0); + u32 entry = f->entry; + u32 header = ir_block_new(f); + u32 body = ir_block_new(f); + u32 exit = ir_block_new(f); + ir_note_emit(f, header); + ir_note_emit(f, body); + ir_note_emit(f, exit); + + Val zero = add_val(f, tc.i32); + Val cur = add_val(f, tc.i32); + Val one = add_val(f, tc.i32); + Val next = add_val(f, tc.i32); + Val out = add_val(f, tc.i32); + emit_load_imm(f, entry, zero, tc.i32, 0); + emit_store_local(f, entry, fs, zero, tc.i32, 0); + emit_br_to(f, entry, header); + emit_load_local(f, header, cur, fs, tc.i32, 0); + emit_test_branch(f, header, body, exit, tc.i32); + emit_load_imm(f, body, one, tc.i32, 1); + emit_binop(f, body, next, cur, one, tc.i32); + emit_store_local(f, body, fs, next, tc.i32, 0); + emit_br_to(f, body, header); + emit_load_local(f, exit, out, fs, tc.i32, 0); + emit_ret_val(f, exit, out, tc.i32); + + opt_build_cfg(f); + opt_build_ssa(f); + opt_verify(f, "test-ssa-loop"); + EXPECT(count_op(f, IR_PHI) == 1, "loop should insert one carried phi"); + EXPECT((IROp)f->blocks[header].insts[0].op == IR_PHI, + "loop-carried phi should be in the header"); + EXPECT(count_op(f, IR_STORE) == 0, "loop promoted stores should be removed"); + EXPECT(count_op(f, IR_LOAD) == 0, "loop promoted loads should be removed"); + opt_make_conventional_ssa(f); + opt_undo_ssa(f); + opt_verify(f, "test-ssa-loop-undo"); + tc_fini(&tc); +} + +static void opt_ssa_non_promotable_slots_stay_memory(void) { + TestCtx tc; + tc_init(&tc); + Func* f = new_func(&tc); + FrameSlot fs = add_frame_slot(f, tc.i32, FS_LOCAL, 4, FSF_ADDR_TAKEN); + Val v = add_val(f, tc.i32); + Val out = add_val(f, tc.i32); + emit_load_imm(f, f->entry, v, tc.i32, 7); + emit_store_local(f, f->entry, fs, v, tc.i32, 0); + emit_load_local(f, f->entry, out, fs, tc.i32, 0); + emit_ret_val(f, f->entry, out, tc.i32); + opt_build_cfg(f); + opt_build_ssa(f); + opt_verify(f, "test-ssa-addr-taken"); + EXPECT(count_op(f, IR_STORE) == 1 && count_op(f, IR_LOAD) == 1, + "address-taken slot should remain in memory"); + + Func* g = new_func(&tc); + fs = add_frame_slot(g, tc.i32, FS_LOCAL, 4, FSF_VOLATILE); + v = add_val(g, tc.i32); + out = add_val(g, tc.i32); + emit_load_imm(g, g->entry, v, tc.i32, 9); + emit_store_local(g, g->entry, fs, v, tc.i32, MF_VOLATILE); + emit_load_local(g, g->entry, out, fs, tc.i32, MF_VOLATILE); + emit_ret_val(g, g->entry, out, tc.i32); + opt_build_cfg(g); + opt_build_ssa(g); + opt_verify(g, "test-ssa-volatile"); + EXPECT(count_op(g, IR_STORE) == 1 && count_op(g, IR_LOAD) == 1, + "volatile slot should remain in memory"); + tc_fini(&tc); +} + +static void opt_ssa_conventional_splits_critical_edge(void) { + TestCtx tc; + tc_init(&tc); + Func* f = new_func(&tc); + FrameSlot fs = add_frame_slot(f, tc.i32, FS_LOCAL, 4, 0); + u32 entry = f->entry; + u32 other = ir_block_new(f); + u32 join = ir_block_new(f); + u32 exit = ir_block_new(f); + ir_note_emit(f, other); + ir_note_emit(f, join); + ir_note_emit(f, exit); + + Val a = add_val(f, tc.i32); + Val b = add_val(f, tc.i32); + Val out = add_val(f, tc.i32); + emit_load_imm(f, entry, a, tc.i32, 1); + emit_store_local(f, entry, fs, a, tc.i32, 0); + emit_test_branch(f, entry, join, other, tc.i32); + emit_load_imm(f, other, b, tc.i32, 2); + emit_store_local(f, other, fs, b, tc.i32, 0); + emit_br_to(f, other, join); + emit_load_local(f, join, out, fs, tc.i32, 0); + emit_ret_val(f, join, out, tc.i32); + ir_emit(f, exit, IR_RET); + + opt_build_cfg(f); + u32 before_blocks = f->nblocks; + opt_build_ssa(f); + EXPECT(count_op(f, IR_PHI) == 1, "critical-edge test should build a phi"); + opt_make_conventional_ssa(f); + opt_verify(f, "test-ssa-critical-edge"); + EXPECT(f->nblocks > before_blocks, + "conventional SSA should split the critical edge"); + opt_undo_ssa(f); + opt_verify(f, "test-ssa-critical-edge-undo"); + EXPECT(count_op(f, IR_PHI) == 0, "critical-edge undo should remove phi"); + tc_fini(&tc); +} + static void opt_jump_cleanup_forwards_branch_targets(void) { TestCtx tc; tc_init(&tc); @@ -3911,6 +4088,10 @@ int main(void) { opt_cfg_prunes_unreachable(); opt_cfg_preserves_scope_edges(); opt_analysis_dominators_and_frontier(); + opt_ssa_diamond_mem2reg_phi(); + opt_ssa_loop_carried_phi(); + opt_ssa_non_promotable_slots_stay_memory(); + opt_ssa_conventional_splits_critical_edge(); opt_jump_cleanup_forwards_branch_targets(); opt_jump_cleanup_inverts_to_remove_jump_block(); opt_jump_cleanup_keeps_conditional_fallthrough_block();