commit 7df0fb70a3b9b8cf9a49ccd9a5dd590590f772e9
parent a4a1280359aa38802daecbd1dba497e191655118
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 21 May 2026 11:49:44 -0700
Make optimizer pseudo-registers explicit
Diffstat:
14 files changed, 725 insertions(+), 84 deletions(-)
diff --git a/doc/OPT.md b/doc/OPT.md
@@ -454,8 +454,9 @@ The optimizer is no longer just a stub:
- `src/opt/opt.c` implements the `CGTarget` wrapper.
- `src/opt/ir.c` and `src/opt/ir.h` implement the recorded IR container.
- `src/opt/pass_cfg.c` implements CFG construction.
-- `src/opt/pass_ssa.c` implements mem2reg SSA construction, conventional SSA
- edge-copy lowering, SSA undo, and stable SSA dumps for pass tests.
+- `src/opt/pass_ssa.c` implements pseudo-register SSA construction, mem2reg
+ SSA construction, conventional SSA edge-copy lowering, SSA undo, and stable
+ SSA dumps for pass tests.
- `src/opt/pass_analysis.c` owns shared CFG order, dominators, dominance
frontiers, dominator children, def-use rebuilding, and verifier guardrails.
- `cfree cc` forwards `-O0`/`-O1`/`-O2` into `CfreeCodeOptions`, so native
@@ -469,10 +470,11 @@ Current behavior:
computes liveness, runs the current priority allocator/rewrite path,
combines, runs post-RA DCE, cleans jumps/layout, and emits through the
wrapped target.
-- Level `2` has its own scheduler entry point. It now builds real mem2reg SSA,
- verifies phi/use-def invariants, runs the first small O2 transforms, lowers
- phis through conventional SSA, undoes SSA, and then routes through the same
- proven lowering path as `-O1`.
+- Level `2` has its own scheduler entry point. It now converts mutable
+ pseudo-registers to SSA values, builds real mem2reg SSA, verifies phi/use-def
+ invariants, runs the first small O2 transforms, lowers phis through
+ conventional SSA, undoes SSA, and then routes through the same proven
+ lowering path as `-O1`.
- `opt_regalloc(..., allow_live_range_split)` accepts the O2 policy bit, but
live-range splitting is not implemented yet.
- `opt_build_ssa` is no longer just a shape check: promoted loads/stores are
@@ -497,6 +499,27 @@ Current behavior:
cases remain materialized. Both passes rebuild/invalidate analysis state and
are bracketed by `opt_ssa_dce`, `opt_copy_cleanup`, and verifier checkpoints.
+Temporary defensive/pessimizing checks to lift:
+
+- [ ] O2 register-backed locals/params are forced to frame storage before
+ mem2reg. Lift this when mutable pseudo-register SSA represents source-local
+ storage identity directly instead of relying on frame-slot promotion.
+- [ ] O2 functions containing `IR_ASM_BLOCK` route through the non-SSA lowering
+ path. Lift this when inline-asm tied operands, inout operands,
+ early-clobbers, fixed-register constraints, and clobber liveness are modeled
+ across pseudo-register SSA construction and destruction.
+- [ ] O2 functions containing `IR_LOAD_LABEL_ADDR` or `IR_INDIRECT_BRANCH`
+ route through the non-SSA lowering path. Lift this when label-address and
+ computed-goto targets have stable block/MCLabel semantics across O2 CFG
+ cleanup, block cloning, edge splitting, SSA destruction, and jump cleanup.
+- [ ] O2 switch table promotion is disabled for gapped dense ranges
+ (`span != ncases`). Lift this when jump-table default slots remain correctly
+ bound after O2 layout and jump cleanup, including interior default entries.
+- [ ] Jump forwarding refuses to forward through single-jump blocks that already
+ own an `MCLabel`. Lift or narrow this once data relocations and label tables
+ are retargeted when CFG cleanup forwards or removes labeled trampoline
+ blocks.
+
The next implementation work should continue extending this small-pass path
incrementally before attempting GVN/DSE/LICM.
diff --git a/doc/VIRTUAL_REGS.md b/doc/VIRTUAL_REGS.md
@@ -2,8 +2,8 @@
## Issue
-`opt_cgtarget` currently records `OPK_REG` operands as if the virtual register
-number is also the IR value number. The contract is effectively:
+`opt_cgtarget` used to record `OPK_REG` operands as if the virtual register
+number was also the IR value number. The old contract was effectively:
```text
virtual Reg id == SSA-ish Val id
@@ -28,9 +28,9 @@ opt interpreted that register name as one value identity, the table scale could
use a stale source instead of the checked index. O0/direct x64 mechanics were
not the problem; the broken piece was the optimized IR recording model.
-Register-backed CG locals do not have the same failure mode because they carry
-separate local storage identity through `IRLocal`/`source_local`. Plain virtual
-temporaries currently do not; they are only `Val`s.
+Register-backed CG locals did not have the same failure mode because they
+carried separate local storage identity through `IRLocal`/`source_local`. Plain
+virtual temporaries did not; they were only `Val`s.
## Desired Model
@@ -88,9 +88,26 @@ semantics.
- Do not add new CG restrictions that make virtual registers single-assignment.
- Do not fix individual stale-register cases by assuming virtual registers are
immutable values.
-- Short term, assertions may catch destructive source/destination aliasing in
- paths that still rely on `Reg == Val`, but those assertions are temporary
- guard rails.
- Register-backed locals and params need careful handling because they already
have storage identity. The final model should make that identity explicit
instead of relying on the virtual register number also being a value number.
+
+## Current Implementation
+
+The optimizer now has an explicit pseudo-register namespace:
+
+- `PReg` names mutable CG virtual-register storage.
+- `Val` names SSA values produced by SSA construction and optimization.
+- `Func.preg_type` / `Func.preg_cls` are dense metadata tables indexed by
+ `PReg`; `Func.val_type` / `Func.val_cls` remain indexed by `Val`.
+- During initial recording and O1 lowering, `OPK_REG` operands carry `PReg`
+ ids. O2 calls `opt_build_reg_ssa` to insert register phis and rewrite
+ register operands/defs to `Val` ids before mem2reg SSA.
+
+There is one transitional compatibility layer: `ir_ensure_preg` reserves
+matching shadow `Val` slots for old O1 liveness/allocation arrays that are
+still indexed by the numeric operand id. Those slots are not the ownership
+model for CG virtual registers; the pseudo-register table is.
+
+Remaining pessimizing guardrails are tracked in `doc/OPT.md` so they can be
+lifted after the dependent O2 modeling work lands.
diff --git a/src/cg/control.c b/src/cg/control.c
@@ -163,6 +163,7 @@ static CGSwitchPlan cg_plan_switch(CfreeCg* g, const CGSwitchDesc* d) {
}
/* TARGET_DEFAULT: O0 keeps the chain; O1+ runs the density check. */
if (d->opt_level == 0) return plan;
+ if (d->opt_level >= 2 && plan.span != d->ncases) return plan;
if (d->ncases < CG_SWITCH_TABLE_MIN_CASES_O1) return plan;
if (plan.span > CG_SWITCH_TABLE_MAX_SPAN_O1) return plan;
if (plan.span > (u64)d->ncases * CG_SWITCH_TABLE_DENSITY_RECIP_O1) return plan;
diff --git a/src/cg/value.c b/src/cg/value.c
@@ -689,12 +689,10 @@ void api_ensure_reg(CfreeCg* g, ApiSValue* sv) {
if (sv->kind == SV_CMP) {
CfreeCgTypeId ty = api_sv_type(sv);
Operand dst;
- if (!g->target->virtual_regs &&
- sv->delayed.cmp.a_owned && sv->delayed.cmp.a.kind == OPK_REG &&
+ if (sv->delayed.cmp.a_owned && sv->delayed.cmp.a.kind == OPK_REG &&
sv->delayed.cmp.a.cls == RC_INT) {
dst = api_op_reg(sv->delayed.cmp.a.v.reg, ty);
- } else if (!g->target->virtual_regs &&
- sv->delayed.cmp.b_owned && sv->delayed.cmp.b.kind == OPK_REG &&
+ } else if (sv->delayed.cmp.b_owned && sv->delayed.cmp.b.kind == OPK_REG &&
sv->delayed.cmp.b.cls == RC_INT) {
dst = api_op_reg(sv->delayed.cmp.b.v.reg, ty);
} else {
@@ -708,12 +706,10 @@ void api_ensure_reg(CfreeCg* g, ApiSValue* sv) {
if (sv->kind == SV_ARITH) {
CfreeCgTypeId ty = api_sv_type(sv);
Operand dst;
- if (!g->target->virtual_regs &&
- sv->delayed.arith.a_owned && sv->delayed.arith.a.kind == OPK_REG &&
+ if (sv->delayed.arith.a_owned && sv->delayed.arith.a.kind == OPK_REG &&
sv->delayed.arith.a.cls == RC_INT) {
dst = api_op_reg(sv->delayed.arith.a.v.reg, ty);
- } else if (!g->target->virtual_regs &&
- api_arith_rhs_reusable(sv) && sv->delayed.arith.b_owned &&
+ } else if (api_arith_rhs_reusable(sv) && sv->delayed.arith.b_owned &&
sv->delayed.arith.b.kind == OPK_REG &&
sv->delayed.arith.b.cls == RC_INT) {
dst = api_op_reg(sv->delayed.arith.b.v.reg, ty);
diff --git a/src/opt/ir.c b/src/opt/ir.c
@@ -1,4 +1,4 @@
-/* ir.c — Func/Block/Inst plumbing for the SSA IR (doc/OPT.md §1).
+/* ir.c — Func/Block/Inst plumbing for the optimizer IR (doc/OPT.md §1).
*
* Each CGTarget call recorded by opt_cgtarget produces exactly one Inst.
* Storage is per-
@@ -9,11 +9,11 @@
* - VAL_NONE (= 0) is reserved; first allocated Val is 1.
* - val_def_block / val_def_inst / val_type / val_cls are parallel
* arrays indexed by Val.
- * - Inst.opnds is Operand[] (not Val[]): virtual Reg/Val are collapsed
- * (doc/OPT.md §5.1) and OPK_REG operands' v.reg field IS the Val used
- * at this site. CG mints those virtual regs through the shared simple
- * allocator when driving opt_cgtarget. Other OpKinds
- * (IMM/LOCAL/GLOBAL/INDIRECT) are not Val uses for SSA dataflow.
+ * - preg_type / preg_cls are parallel arrays indexed by mutable CG virtual
+ * Reg id. Before opt_build_reg_ssa, OPK_REG operands name these pseudos.
+ * - O1/non-SSA lowering still uses shadow Val slots for allocation bitsets
+ * until the whole allocator is renamed from Val to PReg. Those slots are
+ * reserved by ir_ensure_preg; SSA values allocated later remain distinct.
*/
#include "opt/ir.h"
@@ -79,6 +79,56 @@ void ir_ensure_val(Func* f, Val v, CfreeCgTypeId t, u8 cls) {
f->val_cls[v] = cls;
}
+/* ---- mutable pseudo-register table ---- */
+
+static void preg_table_grow(Func* f, u32 needed) {
+ if (needed <= f->pregs_cap) return;
+ u32 ncap = f->pregs_cap ? f->pregs_cap : 16u;
+ while (ncap < needed) ncap *= 2u;
+ CfreeCgTypeId* nb_ty = arena_zarray(f->arena, CfreeCgTypeId, ncap);
+ u8* nb_cls = arena_zarray(f->arena, u8, ncap);
+ if (f->npregs) {
+ memcpy(nb_ty, f->preg_type, sizeof(CfreeCgTypeId) * f->npregs);
+ memcpy(nb_cls, f->preg_cls, sizeof(u8) * f->npregs);
+ }
+ f->preg_type = nb_ty;
+ f->preg_cls = nb_cls;
+ f->pregs_cap = ncap;
+}
+
+void ir_ensure_preg(Func* f, PReg r, CfreeCgTypeId t, u8 cls) {
+ if (r == PREG_NONE || r == 0) return;
+ if (f->npregs == 0) {
+ preg_table_grow(f, 16);
+ f->npregs = 1;
+ }
+ if (r >= f->pregs_cap) preg_table_grow(f, r + 1u);
+ while (f->npregs <= r) {
+ f->preg_type[f->npregs] = 0;
+ f->preg_cls[f->npregs] = RC_INT;
+ f->npregs++;
+ }
+ if (!f->preg_type[r]) f->preg_type[r] = t;
+ f->preg_cls[r] = cls;
+
+ /* Transitional compatibility: O1 allocation/liveness arrays are still
+ * indexed by the numeric operand id. Reserve matching Val slots so those
+ * passes keep their existing bounds and metadata while the source of truth
+ * is the pseudo-register table above. */
+ ir_ensure_val(f, (Val)r, t, cls);
+}
+
+PReg ir_alloc_preg(Func* f, CfreeCgTypeId t, u8 cls) {
+ if (f->npregs == 0) {
+ preg_table_grow(f, 16);
+ f->npregs = 1;
+ }
+ if (f->npregs == f->pregs_cap) preg_table_grow(f, f->npregs + 1u);
+ PReg r = (PReg)f->npregs;
+ ir_ensure_preg(f, r, t, cls);
+ return r;
+}
+
/* ---- blocks ---- */
u32 ir_block_new(Func* f) {
@@ -238,6 +288,8 @@ Func* ir_func_new(Compiler* c, const CGFuncDesc* desc) {
* returns Val=1. */
val_table_grow(f, 16);
f->nvals = 1;
+ preg_table_grow(f, 16);
+ f->npregs = 1;
/* Caller is expected to ir_block_new(f) for entry, then assign
* f->entry. */
return f;
diff --git a/src/opt/ir.h b/src/opt/ir.h
@@ -5,12 +5,17 @@
#include "core/arena.h"
#include "core/core.h"
-/* SSA value id. Identical to the Reg space (doc/OPT.md §5.1): when an
- * Operand has kind OPK_REG, v.reg names the Val it carries. VAL_NONE=0
- * is reserved as a sentinel. */
+/* SSA value id. VAL_NONE=0 is reserved as a sentinel. Recorded CG virtual
+ * registers live in Func's pseudo-register table; before pseudo-reg SSA,
+ * OPK_REG operands carry those mutable Reg ids. After pseudo-reg SSA, OPK_REG
+ * operands carry Val ids. The mode is explicit on Func, not encoded in the
+ * numeric id. */
typedef u32 Val;
#define VAL_NONE 0u
+typedef Reg PReg;
+#define PREG_NONE ((PReg)REG_NONE)
+
typedef u32 InstId;
#define INST_ID_NONE 0u
@@ -50,7 +55,8 @@ typedef enum IROp {
IR_CMP,
IR_CONVERT,
- /* Calls. extra.aux = IRCallAux (see below). defs = result Vals. */
+ /* Calls. extra.aux = IRCallAux (see below). defs = result OPK_REG ids:
+ * PReg before opt_build_reg_ssa, Val after it. */
IR_CALL,
/* Phis. extra.aux = IRPhiAux. */
@@ -68,7 +74,7 @@ typedef enum IROp {
succ[0..nvalid) = the valid target blocks. */
IR_LOAD_LABEL_ADDR, /* opnds[0] dst REG; extra.imm = target block id. */
IR_RET, /* extra.aux = IRRetAux* (NULL for void). */
- IR_SCOPE_BEGIN, /* extra.aux = IRScopeAux. defs[0] = scope id Val. */
+ IR_SCOPE_BEGIN, /* extra.aux = IRScopeAux. */
IR_SCOPE_ELSE, /* extra.imm = scope id (Val). */
IR_SCOPE_END, /* extra.imm = scope id (Val). */
IR_BREAK_TO, /* extra.imm = scope id (Val). */
@@ -85,7 +91,7 @@ typedef enum IROp {
IR_ATOMIC_LOAD, /* opnds = [dst, addr]; extra.aux = IRAtomicAux */
IR_ATOMIC_STORE, /* opnds = [addr, src]; extra.aux = IRAtomicAux */
IR_ATOMIC_RMW, /* opnds = [dst, addr, val]; extra.aux */
- IR_ATOMIC_CAS, /* defs = [prior, ok]; extra.aux = IRCasAux */
+ IR_ATOMIC_CAS, /* defs = [prior, ok] OPK_REG ids; extra.aux = IRCasAux */
IR_FENCE, /* extra.imm = MemOrder */
/* Inline asm. extra.aux = IRAsmAux. */
@@ -148,6 +154,7 @@ typedef struct IRPhiAux {
u32* pred_blocks;
Val* pred_vals;
u32 slot_id; /* 0 if not from mem2reg; else 1-based FrameSlot id */
+ u32 reg_id; /* 0 if not from mutable-pseudo SSA; else original Reg id */
} IRPhiAux;
/* IR_CALL aux. The CGTarget interface is rich enough that we keep the
@@ -271,13 +278,16 @@ typedef struct Inst {
InstId id;
SrcLoc loc; /* sticky from CGTarget.set_loc at recording */
CfreeCgTypeId type;
- Val def; /* primary SSA def, or VAL_NONE */
- u32 ndefs; /* multi-result */
+ /* Primary and multi-result OPK_REG definitions. Before opt_build_reg_ssa,
+ * these hold PReg ids matching destination operands. After opt_build_reg_ssa,
+ * they hold Val ids. */
+ Val def;
+ u32 ndefs;
Val* defs;
/* Operands. We use Operand instead of Val so that the original
* CGTarget call shape (IMM / LOCAL / GLOBAL / INDIRECT in addition to
- * REG) round-trips at replay. SSA passes treat OPK_REG operands as
- * Val uses (via .v.reg). */
+ * REG) round-trips at replay. Before opt_build_reg_ssa, OPK_REG operands
+ * are mutable PReg uses/defs; after it, they are Val uses/defs. */
u32 nopnds;
Operand* opnds;
union {
@@ -380,6 +390,14 @@ typedef struct Func {
*/
u32 nvals, vals_cap;
+ /* Mutable pseudo-register table. Index 0 is unused so CG virtual Reg ids can
+ * be used directly. During initial recording and O1, OPK_REG operands name
+ * these persistent storage locations. O2 converts them to Val ids with
+ * opt_build_reg_ssa before running SSA passes. */
+ CfreeCgTypeId* preg_type;
+ u8* preg_cls;
+ u32 npregs, pregs_cap;
+
/* Scope id table. Indexed by scope_id (1-based). Values map to
* IRScopeAux entries (via the IR_SCOPE_BEGIN inst). Stored as a flat
* pointer table for O(1) lookup during scope_else/end/break/continue
@@ -400,6 +418,8 @@ typedef struct Func {
Target opt_target;
u8 opt_has_target;
u8 opt_rewritten;
+ u8 opt_reg_ssa;
+ u8 pad0;
u16 opt_live_words;
u32 opt_used_loc_words;
u32 opt_alloc_hard_loc_words;
@@ -448,6 +468,8 @@ u32 ir_local_add(Func*, const CGLocalDesc*, CGLocalStorage);
Val ir_alloc_val(Func*, CfreeCgTypeId, u8 cls);
void ir_ensure_val(Func*, Val, CfreeCgTypeId, u8 cls);
+PReg ir_alloc_preg(Func*, CfreeCgTypeId, u8 cls);
+void ir_ensure_preg(Func*, PReg, CfreeCgTypeId, u8 cls);
Inst* ir_emit(Func*, u32 block, IROp);
InstId ir_inst_id_new(Func*);
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -1,8 +1,9 @@
/* opt.c — CGTarget wrapper that records each function as IR (doc/OPT.md
* §1). Each CGTarget call lands as exactly one Inst in the current
- * Func's current block; the wrapper's vreg/vlabel/vslot/vscope ids
- * coincide with their IR counterparts (Reg ↔ Val, label ↔ block id,
- * vslot ↔ IR FrameSlot, vscope ↔ scope_aux index — doc/OPT.md §5.1).
+ * Func's current block. CG virtual registers are recorded as mutable
+ * pseudo-register ids (PReg); O2 turns them into Val ids with
+ * opt_build_reg_ssa. Labels, frame slots, and scopes keep their direct IR id
+ * mappings (label ↔ block id, vslot ↔ IR FrameSlot, vscope ↔ scope_aux index).
*
* OPT1: level 1 records the CGTarget stream, runs the minimal backend
* lowering schedule, rewrites virtual regs to hard regs/spill slots,
@@ -54,12 +55,13 @@ static Inst* rec(OptImpl* o, IROp op) {
return in;
}
-static void set_def(Func* f, Inst* in, u32 block, Val v, CfreeCgTypeId t) {
- in->def = v;
+static void set_preg_def(Func* f, Inst* in, u32 block, PReg r,
+ CfreeCgTypeId t) {
+ in->def = (Val)r;
in->type = t;
- if (v != VAL_NONE && v < f->nvals) {
- f->val_def_block[v] = block;
- f->val_def_inst[v] = f->blocks[block].ninsts - 1u;
+ if (r != PREG_NONE && (Val)r < f->nvals) {
+ f->val_def_block[(Val)r] = block;
+ f->val_def_inst[(Val)r] = f->blocks[block].ninsts - 1u;
}
}
@@ -71,9 +73,9 @@ static int intrinsic_terminates(IntrinKind kind) {
static void ensure_operand(Func* f, const Operand* op) {
if (!op) return;
if (op->kind == OPK_REG) {
- ir_ensure_val(f, (Val)op->v.reg, op->type, op->cls);
+ ir_ensure_preg(f, (PReg)op->v.reg, op->type, op->cls);
} else if (op->kind == OPK_INDIRECT) {
- ir_ensure_val(f, (Val)op->v.ind.base, 0, RC_INT);
+ ir_ensure_preg(f, (PReg)op->v.ind.base, 0, RC_INT);
}
}
@@ -175,8 +177,9 @@ static CGLocalStorage w_local(CGTarget* t, const CGLocalDesc* d) {
OptImpl* o = impl_of(t);
CGLocalStorage st;
memset(&st, 0, sizeof st);
- if ((d->flags & (CG_LOCAL_ADDR_TAKEN | CG_LOCAL_MEMORY_REQUIRED)) == 0) {
- Val v = ir_alloc_val(o->f, d->type, opt_local_reg_class(o, d->type));
+ if (o->level < 2 &&
+ (d->flags & (CG_LOCAL_ADDR_TAKEN | CG_LOCAL_MEMORY_REQUIRED)) == 0) {
+ PReg v = ir_alloc_preg(o->f, d->type, opt_local_reg_class(o, d->type));
st.kind = CG_LOCAL_STORAGE_REG;
st.v.reg = (Reg)v;
} else {
@@ -407,8 +410,9 @@ static CGLocalStorage w_param(CGTarget* t, const CGParamDesc* d) {
local_desc.align = d->align;
local_desc.flags = d->flags;
if (st.kind == CG_LOCAL_STORAGE_FRAME && st.v.frame_slot == FRAME_SLOT_NONE) {
- if ((d->flags & (CG_LOCAL_ADDR_TAKEN | CG_LOCAL_MEMORY_REQUIRED)) == 0) {
- Val v = ir_alloc_val(o->f, d->type, opt_local_reg_class(o, d->type));
+ if (o->level < 2 &&
+ (d->flags & (CG_LOCAL_ADDR_TAKEN | CG_LOCAL_MEMORY_REQUIRED)) == 0) {
+ PReg v = ir_alloc_preg(o->f, d->type, opt_local_reg_class(o, d->type));
st.kind = CG_LOCAL_STORAGE_REG;
st.v.reg = (Reg)v;
} else {
@@ -427,6 +431,8 @@ static CGLocalStorage w_param(CGTarget* t, const CGParamDesc* d) {
ir_param_add(o->f, ©);
ir_local_add(o->f, &local_desc, st);
if (st.kind == CG_LOCAL_STORAGE_REG) {
+ ir_ensure_preg(o->f, (PReg)st.v.reg, d->type,
+ opt_local_reg_class(o, d->type));
Inst* in = rec(o, IR_PARAM_DECL);
in->def = (Val)st.v.reg;
in->type = d->type;
@@ -673,7 +679,7 @@ static void w_load_label_addr(CGTarget* t, Operand dst, Label l) {
in->opnds = dup_opnds(o->f, &dst, 1);
in->nopnds = 1;
in->extra.imm = (i64)(u32)l;
- set_def(o->f, in, o->cur, (Val)dst.v.reg, dst.type);
+ set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type);
}
/* ---- structured scopes ---- */
@@ -802,7 +808,7 @@ static void w_load_imm(CGTarget* t, Operand dst, i64 imm) {
in->opnds = dup_opnds(o->f, ops, 1);
in->nopnds = 1;
in->extra.imm = imm;
- if (dst.kind == OPK_REG) set_def(o->f, in, o->cur, (Val)dst.v.reg, dst.type);
+ if (dst.kind == OPK_REG) set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type);
}
static void w_load_const(CGTarget* t, Operand dst, ConstBytes cb) {
@@ -817,7 +823,7 @@ static void w_load_const(CGTarget* t, Operand dst, ConstBytes cb) {
memcpy(bytes, cb.bytes, cb.size);
in->extra.cbytes.bytes = bytes;
}
- if (dst.kind == OPK_REG) set_def(o->f, in, o->cur, (Val)dst.v.reg, dst.type);
+ if (dst.kind == OPK_REG) set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type);
}
static void w_copy(CGTarget* t, Operand dst, Operand src) {
@@ -826,7 +832,7 @@ static void w_copy(CGTarget* t, Operand dst, Operand src) {
Operand ops[2] = {dst, src};
in->opnds = dup_opnds(o->f, ops, 2);
in->nopnds = 2;
- if (dst.kind == OPK_REG) set_def(o->f, in, o->cur, (Val)dst.v.reg, dst.type);
+ if (dst.kind == OPK_REG) set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type);
}
static void w_load(CGTarget* t, Operand dst, Operand addr, MemAccess m) {
@@ -836,7 +842,7 @@ static void w_load(CGTarget* t, Operand dst, Operand addr, MemAccess m) {
in->opnds = dup_opnds(o->f, ops, 2);
in->nopnds = 2;
in->extra.mem = m;
- if (dst.kind == OPK_REG) set_def(o->f, in, o->cur, (Val)dst.v.reg, dst.type);
+ if (dst.kind == OPK_REG) set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type);
}
static void w_store(CGTarget* t, Operand addr, Operand src, MemAccess m) {
@@ -854,7 +860,7 @@ static void w_addr_of(CGTarget* t, Operand dst, Operand lv) {
Operand ops[2] = {dst, lv};
in->opnds = dup_opnds(o->f, ops, 2);
in->nopnds = 2;
- if (dst.kind == OPK_REG) set_def(o->f, in, o->cur, (Val)dst.v.reg, dst.type);
+ if (dst.kind == OPK_REG) set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type);
}
static void w_tls_addr_of(CGTarget* t, Operand dst, ObjSymId sym, i64 addend) {
@@ -867,7 +873,7 @@ static void w_tls_addr_of(CGTarget* t, Operand dst, ObjSymId sym, i64 addend) {
aux->sym = sym;
aux->addend = addend;
in->extra.aux = aux;
- if (dst.kind == OPK_REG) set_def(o->f, in, o->cur, (Val)dst.v.reg, dst.type);
+ if (dst.kind == OPK_REG) set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type);
}
static void w_copy_bytes(CGTarget* t, Operand dst, Operand src,
@@ -904,7 +910,7 @@ static void w_bitfield_load(CGTarget* t, Operand dst, Operand record,
IRBitFieldAux* aux = arena_znew(o->f->arena, IRBitFieldAux);
aux->access = bf;
in->extra.aux = aux;
- if (dst.kind == OPK_REG) set_def(o->f, in, o->cur, (Val)dst.v.reg, dst.type);
+ if (dst.kind == OPK_REG) set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type);
}
static void w_bitfield_store(CGTarget* t, Operand record, Operand src,
@@ -928,7 +934,7 @@ static void w_binop(CGTarget* t, BinOp op, Operand dst, Operand a, Operand b) {
in->opnds = dup_opnds(o->f, ops, 3);
in->nopnds = 3;
in->extra.imm = (i64)op;
- if (dst.kind == OPK_REG) set_def(o->f, in, o->cur, (Val)dst.v.reg, dst.type);
+ if (dst.kind == OPK_REG) set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type);
}
static void w_unop(CGTarget* t, UnOp op, Operand dst, Operand a) {
@@ -938,7 +944,7 @@ static void w_unop(CGTarget* t, UnOp op, Operand dst, Operand a) {
in->opnds = dup_opnds(o->f, ops, 2);
in->nopnds = 2;
in->extra.imm = (i64)op;
- if (dst.kind == OPK_REG) set_def(o->f, in, o->cur, (Val)dst.v.reg, dst.type);
+ if (dst.kind == OPK_REG) set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type);
}
static void w_cmp(CGTarget* t, CmpOp op, Operand dst, Operand a, Operand b) {
@@ -948,7 +954,7 @@ static void w_cmp(CGTarget* t, CmpOp op, Operand dst, Operand a, Operand b) {
in->opnds = dup_opnds(o->f, ops, 3);
in->nopnds = 3;
in->extra.imm = (i64)op;
- if (dst.kind == OPK_REG) set_def(o->f, in, o->cur, (Val)dst.v.reg, dst.type);
+ if (dst.kind == OPK_REG) set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type);
}
static void w_convert(CGTarget* t, ConvKind k, Operand dst, Operand src) {
@@ -958,7 +964,7 @@ static void w_convert(CGTarget* t, ConvKind k, Operand dst, Operand src) {
in->opnds = dup_opnds(o->f, ops, 2);
in->nopnds = 2;
in->extra.imm = (i64)k;
- if (dst.kind == OPK_REG) set_def(o->f, in, o->cur, (Val)dst.v.reg, dst.type);
+ if (dst.kind == OPK_REG) set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type);
}
/* ---- calls / return ---- */
@@ -992,7 +998,7 @@ static void w_call(CGTarget* t, const CGCallDesc* d) {
in->extra.aux = aux;
in->type = d->fn_type;
if (d->ret.storage.kind == OPK_REG) {
- set_def(o->f, in, o->cur, (Val)d->ret.storage.v.reg, d->ret.type);
+ set_preg_def(o->f, in, o->cur, (PReg)d->ret.storage.v.reg, d->ret.type);
}
}
@@ -1050,7 +1056,7 @@ static void w_alloca_(CGTarget* t, Operand dst, Operand size, u32 align) {
in->opnds = dup_opnds(o->f, ops, 2);
in->nopnds = 2;
in->extra.imm = (i64)align;
- if (dst.kind == OPK_REG) set_def(o->f, in, o->cur, (Val)dst.v.reg, dst.type);
+ if (dst.kind == OPK_REG) set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type);
}
static void w_va_start_(CGTarget* t, Operand ap) {
@@ -1068,7 +1074,7 @@ static void w_va_arg_(CGTarget* t, Operand dst, Operand ap, CfreeCgTypeId ty) {
in->opnds = dup_opnds(o->f, ops, 2);
in->nopnds = 2;
in->extra.aux = (void*)(uintptr_t)ty;
- if (dst.kind == OPK_REG) set_def(o->f, in, o->cur, (Val)dst.v.reg, dst.type);
+ if (dst.kind == OPK_REG) set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type);
}
static void w_va_end_(CGTarget* t, Operand ap) {
@@ -1098,7 +1104,7 @@ static void w_atomic_load(CGTarget* t, Operand dst, Operand addr, MemAccess m,
aux->mem = m;
aux->mo = mo;
in->extra.aux = aux;
- if (dst.kind == OPK_REG) set_def(o->f, in, o->cur, (Val)dst.v.reg, dst.type);
+ if (dst.kind == OPK_REG) set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type);
}
static void w_atomic_store(CGTarget* t, Operand addr, Operand src, MemAccess m,
@@ -1126,7 +1132,7 @@ static void w_atomic_rmw(CGTarget* t, AtomicOp op, Operand dst, Operand addr,
aux->mo = mo;
aux->op = (u8)op;
in->extra.aux = aux;
- if (dst.kind == OPK_REG) set_def(o->f, in, o->cur, (Val)dst.v.reg, dst.type);
+ if (dst.kind == OPK_REG) set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type);
}
static void w_atomic_cas(CGTarget* t, Operand prior, Operand ok, Operand addr,
@@ -1143,7 +1149,7 @@ static void w_atomic_cas(CGTarget* t, Operand prior, Operand ok, Operand addr,
aux->failure = f;
in->extra.aux = aux;
if (prior.kind == OPK_REG)
- set_def(o->f, in, o->cur, (Val)prior.v.reg, prior.type);
+ set_preg_def(o->f, in, o->cur, (PReg)prior.v.reg, prior.type);
if (ok.kind == OPK_REG) {
in->ndefs = 2;
in->defs = arena_array(o->f->arena, Val, 2);
@@ -1182,7 +1188,7 @@ static void w_intrinsic(CGTarget* t, IntrinKind kind, Operand* dsts, u32 nd,
}
in->extra.aux = aux;
if (nd == 1 && dsts[0].kind == OPK_REG) {
- set_def(o->f, in, o->cur, (Val)dsts[0].v.reg, dsts[0].type);
+ set_preg_def(o->f, in, o->cur, (PReg)dsts[0].v.reg, dsts[0].type);
} else if (nd > 1) {
in->ndefs = nd;
in->defs = arena_array(o->f->arena, Val, nd);
@@ -1407,11 +1413,10 @@ static void opt_run_o2_pipeline(OptImpl* o) {
opt_jump_cleanup(o->f, OPT_JUMP_CLEANUP_CFG);
opt_build_cfg(o->f);
opt_verify(o->f, "o2-pre-ssa-cfg");
+ opt_build_reg_ssa(o->f);
+ opt_verify(o->f, "o2-reg-ssa");
opt_block_cloning(o->f);
opt_verify(o->f, "o2-block-clone-cfg");
- opt_ssa_dce(o->f);
- opt_copy_cleanup(o->f);
- opt_verify(o->f, "o2-block-clone-cleanup");
opt_build_ssa(o->f);
opt_verify(o->f, "o2-ssa");
opt_ssa_dce(o->f);
@@ -1436,6 +1441,24 @@ static void opt_run_o2_pipeline(OptImpl* o) {
opt_run_lowering_pipeline(o, "opt.o2.total", 1);
}
+static int func_requires_non_ssa_o2(Func* f) {
+ if (!f) return 0;
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ switch ((IROp)bl->insts[i].op) {
+ case IR_ASM_BLOCK:
+ case IR_LOAD_LABEL_ADDR:
+ case IR_INDIRECT_BRANCH:
+ return 1;
+ default:
+ break;
+ }
+ }
+ }
+ return 0;
+}
+
/* ---- func_end: optionally run dry-run passes; replay; reset ---- */
static void w_func_end(CGTarget* t) {
@@ -1443,7 +1466,7 @@ static void w_func_end(CGTarget* t) {
if (!o->f) return;
opt_frame_home_addr_taken_locals(o->f);
- if (o->level >= 2) {
+ if (o->level >= 2 && !func_requires_non_ssa_o2(o->f)) {
opt_run_o2_pipeline(o);
} else if (o->level >= 1) {
opt_run_o1_pipeline(o);
diff --git a/src/opt/opt.h b/src/opt/opt.h
@@ -35,6 +35,7 @@ typedef enum OptJumpCleanupStage {
} OptJumpCleanupStage;
void opt_jump_cleanup(Func*, OptJumpCleanupStage);
void opt_block_cloning(Func*);
+void opt_build_reg_ssa(Func*);
void opt_build_ssa(Func*);
void opt_addr_xform(Func*);
void opt_gvn(Func*); /* incl. constprop, redundant-load elim */
diff --git a/src/opt/opt_util.c b/src/opt/opt_util.c
@@ -8,6 +8,17 @@ int opt_val_in_inst_defs(const Inst* in, Val v) {
return 0;
}
+static int opt_operand_index_is_def(const Inst* in, u32 i) {
+ if (!in || i >= in->nopnds || in->opnds[i].kind != OPK_REG) return 0;
+ if (!opt_val_in_inst_defs(in, (Val)in->opnds[i].v.reg)) return 0;
+ switch ((IROp)in->op) {
+ case IR_ATOMIC_CAS:
+ return i == 0 || i == 1;
+ default:
+ return i == 0;
+ }
+}
+
void opt_walk_operand(Func* f, Inst* in, Operand* op, int is_def,
OptOperandWalkFn fn, void* ctx) {
if (!op || !fn) return;
@@ -36,15 +47,11 @@ void opt_walk_abivalue(Func* f, Inst* in, CGABIValue* v, int storage_def,
void opt_walk_inst_operands(Func* f, Inst* in, OptOperandWalkFn fn, void* ctx) {
if (!in || !fn) return;
for (u32 i = 0; i < in->nopnds; ++i) {
- int is_def = 0;
- if (in->opnds[i].kind == OPK_REG)
- is_def = i == 0 && opt_val_in_inst_defs(in, (Val)in->opnds[i].v.reg);
+ int is_def = opt_operand_index_is_def(in, i);
if (!is_def) opt_walk_operand(f, in, &in->opnds[i], 0, fn, ctx);
}
for (u32 i = 0; i < in->nopnds; ++i) {
- int is_def = 0;
- if (in->opnds[i].kind == OPK_REG)
- is_def = i == 0 && opt_val_in_inst_defs(in, (Val)in->opnds[i].v.reg);
+ int is_def = opt_operand_index_is_def(in, i);
if (is_def) opt_walk_operand(f, in, &in->opnds[i], 1, fn, ctx);
}
diff --git a/src/opt/pass_analysis.c b/src/opt/pass_analysis.c
@@ -246,11 +246,20 @@ static void opt_use_add_abivalue(Func* f, u32 b, u32 i, CGABIValue* v,
opt_use_add_operand(f, b, i, p, (Operand*)&v->parts[p].op, storage_def);
}
+static int collect_operand_index_is_def(const Inst* in, u32 i) {
+ if (!in || i >= in->nopnds || in->opnds[i].kind != OPK_REG) return 0;
+ if (!opt_val_in_inst_defs(in, (Val)in->opnds[i].v.reg)) return 0;
+ switch ((IROp)in->op) {
+ case IR_ATOMIC_CAS:
+ return i == 0 || i == 1;
+ default:
+ return i == 0;
+ }
+}
+
static void opt_collect_inst_uses(Func* f, u32 b, u32 i, Inst* in) {
for (u32 o = 0; o < in->nopnds; ++o) {
- int is_def = 0;
- if (in->opnds[o].kind == OPK_REG)
- is_def = o == 0 && opt_val_in_inst_defs(in, (Val)in->opnds[o].v.reg);
+ int is_def = collect_operand_index_is_def(in, o);
opt_use_add_operand(f, b, i, o, &in->opnds[o], is_def);
}
@@ -366,6 +375,8 @@ static void verify_values(Func* f, const char* stage) {
opt_fail(f, stage, "phi should not carry operands", b, i);
if (aux->slot_id > f->nframe_slots)
opt_fail(f, stage, "phi bad slot id", aux->slot_id, f->nframe_slots);
+ if (aux->reg_id != 0 && aux->reg_id >= f->npregs)
+ opt_fail(f, stage, "phi bad reg id", aux->reg_id, f->npregs);
if (aux->npreds != bl->npreds)
opt_fail(f, stage, "phi pred count mismatch", aux->npreds,
bl->npreds);
diff --git a/src/opt/pass_jump.c b/src/opt/pass_jump.c
@@ -45,6 +45,7 @@ static int single_jump_block(const JumpCleanupCtx* c, u32 block,
Func* f = c->f;
if (block >= f->nblocks) return 0;
Block* bl = &f->blocks[block];
+ if (bl->mc_label != MC_LABEL_NONE) return 0;
if (bl->ninsts != 1 || bl->nsucc != 1) return 0;
if ((IROp)bl->insts[0].op != IR_BR) return 0;
if (target_out) *target_out = bl->succ[0];
@@ -184,6 +185,26 @@ static void cleanup_invert_jump_fallthrough(JumpCleanupCtx* c) {
}
}
+static void cleanup_invert_taken_fallthrough(JumpCleanupCtx* c) {
+ Func* f = c->f;
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ if (!bl->ninsts || bl->nsucc != 2) continue;
+ Inst* last = &bl->insts[bl->ninsts - 1u];
+ if ((IROp)last->op != IR_CMP_BRANCH) continue;
+
+ u32 taken = bl->succ[0];
+ u32 fallthrough = bl->succ[1];
+ if (next_emit_block(c, b) != taken) continue;
+ CmpOp inverted;
+ if (!invert_cmp((CmpOp)last->extra.imm, &inverted)) continue;
+
+ last->extra.imm = inverted;
+ bl->succ[0] = fallthrough;
+ bl->succ[1] = taken;
+ }
+}
+
static void cleanup_layout_fallthrough_branches(const JumpCleanupCtx* c) {
Func* f = c->f;
for (u32 b = 0; b < f->nblocks; ++b) {
@@ -206,6 +227,7 @@ void opt_jump_cleanup(Func* f, OptJumpCleanupStage stage) {
cleanup_invert_jump_fallthrough(&c);
cleanup_branch_targets(&c);
} else if (stage == OPT_JUMP_CLEANUP_LAYOUT) {
+ cleanup_invert_taken_fallthrough(&c);
cleanup_layout_fallthrough_branches(&c);
}
}
diff --git a/src/opt/pass_o2.c b/src/opt/pass_o2.c
@@ -259,6 +259,10 @@ static u32 clone_block_for_pred(Func* f, u32 block, u32 pred) {
void opt_block_cloning(Func* f) {
if (!f || f->opt_rewritten) return;
+ /* This pass remaps OPK_REG defs as Val ids. In the normal O2 pipeline it
+ * must run after opt_build_reg_ssa; standalone unit tests that build Val IR
+ * directly have no pseudo-register table beyond the sentinel. */
+ if (!f->opt_reg_ssa && f->npregs > 1) return;
opt_build_loop_tree(f);
OptAnalysis a;
memset(&a, 0, sizeof a);
diff --git a/src/opt/pass_ssa.c b/src/opt/pass_ssa.c
@@ -29,6 +29,13 @@ typedef struct EdgeMove {
u8 cls;
} EdgeMove;
+typedef struct RegRenameCtx {
+ Func* f;
+ OptAnalysis* analysis;
+ u32 old_nregs;
+ SlotStack* stacks;
+} RegRenameCtx;
+
static u8 ssa_type_class(Func* f, CfreeCgTypeId ty) {
CfreeCgTypeKind kind = cfree_cg_type_kind((CfreeCompiler*)f->c, ty);
return kind == CFREE_CG_TYPE_FLOAT ? RC_FP : RC_INT;
@@ -160,6 +167,368 @@ static Val stack_top(const SlotStack* s) {
return s && s->n ? s->vals[s->n - 1u] : VAL_NONE;
}
+static int reg_in_defs(const Inst* in, Reg r) {
+ if (!in || r == (Reg)REG_NONE) return 0;
+ if (in->def == (Val)r) return 1;
+ for (u32 i = 0; i < in->ndefs; ++i)
+ if (in->defs[i] == (Val)r) return 1;
+ return 0;
+}
+
+static u8* find_reg_def_blocks(Func* f, u32 old_nregs) {
+ u8* def_blocks = arena_zarray(f->arena, u8, old_nregs * f->nblocks);
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* in = &bl->insts[i];
+ if (in->def != VAL_NONE && in->def < old_nregs)
+ def_blocks[in->def * f->nblocks + b] = 1;
+ for (u32 d = 0; d < in->ndefs; ++d) {
+ Val v = in->defs[d];
+ if (v != VAL_NONE && v < old_nregs)
+ def_blocks[v * f->nblocks + b] = 1;
+ }
+ }
+ }
+ return def_blocks;
+}
+
+static void compute_reg_phi_sites(Func* f, OptAnalysis* a,
+ const OptLiveInfo* live, u32 old_nregs,
+ u8* needs_phi) {
+ u8* def_blocks = find_reg_def_blocks(f, old_nregs);
+ u32* work = arena_array(f->arena, u32, f->nblocks ? f->nblocks : 1u);
+ u8* queued = arena_zarray(f->arena, u8, f->nblocks ? f->nblocks : 1u);
+
+ for (u32 r = 1; r < old_nregs; ++r) {
+ if (!f->preg_type[r]) continue;
+ memset(queued, 0, f->nblocks);
+ u32 wn = 0;
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ if (!def_blocks[r * f->nblocks + b]) continue;
+ queued[b] = 1;
+ work[wn++] = b;
+ }
+ while (wn) {
+ u32 x = work[--wn];
+ OptBlockList* df = &a->dom_frontier[x];
+ for (u32 i = 0; i < df->n; ++i) {
+ u32 y = df->items[i];
+ if (needs_phi[r * f->nblocks + y]) continue;
+ if (live && !opt_bitset_has(&live->blocks[y].live_in, (Val)r))
+ continue;
+ needs_phi[r * f->nblocks + y] = 1;
+ if (!queued[y]) {
+ queued[y] = 1;
+ work[wn++] = y;
+ }
+ }
+ }
+ }
+}
+
+static void insert_reg_phis(Func* f, u32 b, u32 old_nregs,
+ const u8* needs_phi) {
+ Block* bl = &f->blocks[b];
+ u32 nphi = 0;
+ for (u32 r = 1; r < old_nregs; ++r)
+ if (needs_phi[r * f->nblocks + b]) ++nphi;
+ if (!nphi) return;
+
+ u32 old_nvals = f->nvals;
+ Inst* insts = arena_zarray(f->arena, Inst, bl->ninsts + nphi);
+ u32 w = 0;
+ for (u32 r = 1; r < old_nregs; ++r) {
+ if (!needs_phi[r * f->nblocks + b]) continue;
+ Inst* in = &insts[w++];
+ in->op = IR_PHI;
+ ir_assign_inst_id(f, in);
+ in->type = f->preg_type[r];
+ in->def = ir_alloc_val(f, f->preg_type[r], f->preg_cls[r]);
+ f->val_def_block[in->def] = b;
+ f->val_def_inst[in->def] = w - 1u;
+ IRPhiAux* aux = arena_znew(f->arena, IRPhiAux);
+ aux->reg_id = r;
+ aux->npreds = bl->npreds;
+ if (bl->npreds) {
+ aux->pred_blocks = arena_array(f->arena, u32, bl->npreds);
+ aux->pred_vals = arena_zarray(f->arena, Val, bl->npreds);
+ memcpy(aux->pred_blocks, bl->preds, sizeof(u32) * bl->npreds);
+ }
+ in->extra.aux = aux;
+ }
+ if (bl->ninsts) memcpy(insts + nphi, bl->insts, sizeof(Inst) * bl->ninsts);
+ bl->insts = insts;
+ bl->ninsts += nphi;
+ bl->cap = bl->ninsts;
+ for (Val v = 1; v < old_nvals; ++v)
+ if (f->val_def_block[v] == b) f->val_def_inst[v] += nphi;
+}
+
+static Val reg_stack_top(RegRenameCtx* ctx, Reg r) {
+ if (r == (Reg)REG_NONE || (u32)r >= ctx->old_nregs) return VAL_NONE;
+ return stack_top(&ctx->stacks[(u32)r]);
+}
+
+static void reg_replace_use(RegRenameCtx* ctx, Operand* op) {
+ if (!op) return;
+ if (op->kind == OPK_REG) {
+ Reg r = op->v.reg;
+ Val v = reg_stack_top(ctx, r);
+ if (v == VAL_NONE) return;
+ op->v.reg = (Reg)v;
+ op->type = ctx->f->val_type[v];
+ op->cls = ctx->f->val_cls[v];
+ } else if (op->kind == OPK_INDIRECT) {
+ Reg r = op->v.ind.base;
+ Val v = reg_stack_top(ctx, r);
+ if (v != VAL_NONE) op->v.ind.base = (Reg)v;
+ }
+}
+
+static void reg_replace_abivalue_uses(RegRenameCtx* ctx, CGABIValue* v) {
+ if (!v) return;
+ reg_replace_use(ctx, &v->storage);
+ for (u32 i = 0; i < v->nparts; ++i)
+ reg_replace_use(ctx, (Operand*)&v->parts[i].op);
+}
+
+static Val reg_define_operand(RegRenameCtx* ctx, u32 b, u32 i, Inst* in,
+ Operand* op, u32 def_index, u32* pushed) {
+ if (!op || op->kind != OPK_REG) return VAL_NONE;
+ Reg r = op->v.reg;
+ if (r == (Reg)REG_NONE || (u32)r >= ctx->old_nregs) return VAL_NONE;
+ CfreeCgTypeId ty = op->type ? op->type : ctx->f->preg_type[(u32)r];
+ u8 cls = op->cls;
+ Val v = ir_alloc_val(ctx->f, ty, cls);
+ op->v.reg = (Reg)v;
+ op->type = ty;
+ op->cls = cls;
+ if (def_index == 0 && in->def == (Val)r) in->def = v;
+ if (def_index < in->ndefs && in->defs[def_index] == (Val)r)
+ in->defs[def_index] = v;
+ ctx->f->val_def_block[v] = b;
+ ctx->f->val_def_inst[v] = i;
+ stack_push(ctx->f->arena, &ctx->stacks[(u32)r], v);
+ pushed[(u32)r]++;
+ return v;
+}
+
+static u32 reg_def_index(const Inst* in, Reg r, u32 ordinal) {
+ if (!in || !in->ndefs) return 0;
+ u32 seen = 0;
+ for (u32 i = 0; i < in->ndefs; ++i) {
+ if (in->defs[i] != (Val)r) continue;
+ if (seen == ordinal) return i;
+ ++seen;
+ }
+ return 0;
+}
+
+static void reg_replace_inst_uses(RegRenameCtx* ctx, Inst* in) {
+ for (u32 o = 0; o < in->nopnds; ++o) {
+ Operand* op = &in->opnds[o];
+ int is_def = 0;
+ if (op->kind == OPK_REG) {
+ if ((IROp)in->op == IR_ATOMIC_CAS)
+ is_def = (o == 0 || o == 1) && reg_in_defs(in, op->v.reg);
+ else
+ is_def = o == 0 && reg_in_defs(in, op->v.reg);
+ }
+ if (!is_def) reg_replace_use(ctx, op);
+ }
+
+ switch ((IROp)in->op) {
+ case IR_CALL: {
+ IRCallAux* aux = (IRCallAux*)in->extra.aux;
+ if (!aux) break;
+ if (aux->use_plan_replay) {
+ reg_replace_use(ctx, &aux->plan.callee);
+ for (u32 i = 0; i < aux->plan.nargs; ++i)
+ reg_replace_use(ctx, &aux->plan.args[i].src);
+ } else {
+ reg_replace_use(ctx, &aux->desc.callee);
+ for (u32 i = 0; i < aux->desc.nargs; ++i)
+ reg_replace_abivalue_uses(ctx, (CGABIValue*)&aux->desc.args[i]);
+ }
+ break;
+ }
+ case IR_RET: {
+ IRRetAux* aux = (IRRetAux*)in->extra.aux;
+ if (aux && aux->present) reg_replace_abivalue_uses(ctx, &aux->val);
+ break;
+ }
+ case IR_SCOPE_BEGIN: {
+ IRScopeAux* aux = (IRScopeAux*)in->extra.aux;
+ if (aux) reg_replace_use(ctx, &aux->desc.cond);
+ break;
+ }
+ case IR_ASM_BLOCK: {
+ IRAsmAux* aux = (IRAsmAux*)in->extra.aux;
+ if (!aux) break;
+ for (u32 i = 0; i < aux->nin; ++i) reg_replace_use(ctx, &aux->in_ops[i]);
+ break;
+ }
+ case IR_INTRINSIC: {
+ IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux;
+ if (!aux) break;
+ for (u32 i = 0; i < aux->narg; ++i) reg_replace_use(ctx, &aux->args[i]);
+ break;
+ }
+ default:
+ break;
+ }
+}
+
+static void reg_define_abivalue(RegRenameCtx* ctx, u32 b, u32 i, Inst* in,
+ CGABIValue* v, u32* pushed) {
+ if (!v) return;
+ if (v->storage.kind == OPK_REG && reg_in_defs(in, v->storage.v.reg))
+ reg_define_operand(ctx, b, i, in, &v->storage, 0, pushed);
+ for (u32 p = 0; p < v->nparts; ++p) {
+ Operand* op = (Operand*)&v->parts[p].op;
+ if (op->kind == OPK_REG && reg_in_defs(in, op->v.reg))
+ reg_define_operand(ctx, b, i, in, op, p, pushed);
+ }
+}
+
+static void reg_define_inst_defs(RegRenameCtx* ctx, u32 b, u32 i, Inst* in,
+ u32* pushed) {
+ switch ((IROp)in->op) {
+ case IR_CALL: {
+ IRCallAux* aux = (IRCallAux*)in->extra.aux;
+ if (!aux) break;
+ if (aux->use_plan_replay) {
+ for (u32 r = 0; r < aux->plan.nrets; ++r) {
+ if (aux->plan.rets[r].dst.kind == OPK_REG &&
+ reg_in_defs(in, aux->plan.rets[r].dst.v.reg))
+ reg_define_operand(ctx, b, i, in, &aux->plan.rets[r].dst, r,
+ pushed);
+ }
+ } else {
+ reg_define_abivalue(ctx, b, i, in, &aux->desc.ret, pushed);
+ }
+ break;
+ }
+ case IR_ATOMIC_CAS:
+ if (in->nopnds >= 1 && in->opnds[0].kind == OPK_REG)
+ reg_define_operand(ctx, b, i, in, &in->opnds[0], 0, pushed);
+ if (in->nopnds >= 2 && in->opnds[1].kind == OPK_REG)
+ reg_define_operand(ctx, b, i, in, &in->opnds[1], 1, pushed);
+ break;
+ case IR_ASM_BLOCK: {
+ IRAsmAux* aux = (IRAsmAux*)in->extra.aux;
+ if (!aux) break;
+ for (u32 o = 0; o < aux->nout; ++o) {
+ if (aux->out_ops[o].kind != OPK_REG) continue;
+ reg_define_operand(ctx, b, i, in, &aux->out_ops[o],
+ reg_def_index(in, aux->out_ops[o].v.reg, o),
+ pushed);
+ }
+ break;
+ }
+ case IR_INTRINSIC: {
+ IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux;
+ if (!aux) break;
+ for (u32 d = 0; d < aux->ndst; ++d) {
+ if (aux->dsts[d].kind != OPK_REG) continue;
+ reg_define_operand(ctx, b, i, in, &aux->dsts[d], d, pushed);
+ if (aux->result_vals) aux->result_vals[d] = (Val)aux->dsts[d].v.reg;
+ }
+ break;
+ }
+ default:
+ if (in->nopnds && in->opnds[0].kind == OPK_REG &&
+ reg_in_defs(in, in->opnds[0].v.reg))
+ reg_define_operand(ctx, b, i, in, &in->opnds[0], 0, pushed);
+ break;
+ }
+}
+
+static void reg_rename_block(RegRenameCtx* ctx, u32 b) {
+ Func* f = ctx->f;
+ Block* bl = &f->blocks[b];
+ u32* pushed = arena_zarray(f->arena, u32, ctx->old_nregs);
+
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* in = &bl->insts[i];
+ if ((IROp)in->op != IR_PHI) break;
+ IRPhiAux* aux = (IRPhiAux*)in->extra.aux;
+ if (!aux || !aux->reg_id || aux->reg_id >= ctx->old_nregs) continue;
+ stack_push(f->arena, &ctx->stacks[aux->reg_id], in->def);
+ pushed[aux->reg_id]++;
+ }
+
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* in = &bl->insts[i];
+ if ((IROp)in->op == IR_PHI) continue;
+ reg_replace_inst_uses(ctx, in);
+ reg_define_inst_defs(ctx, b, i, in, pushed);
+ }
+
+ for (u32 s = 0; s < bl->nsucc; ++s) {
+ u32 succ = bl->succ[s];
+ if (succ >= f->nblocks) continue;
+ Block* sb = &f->blocks[succ];
+ u32 pred_idx = OPT_USE_NONE;
+ for (u32 p = 0; p < sb->npreds; ++p) {
+ if (sb->preds[p] == b) {
+ pred_idx = p;
+ break;
+ }
+ }
+ if (pred_idx == OPT_USE_NONE) continue;
+ for (u32 i = 0; i < sb->ninsts; ++i) {
+ Inst* phi = &sb->insts[i];
+ if ((IROp)phi->op != IR_PHI) break;
+ IRPhiAux* aux = (IRPhiAux*)phi->extra.aux;
+ if (!aux || !aux->reg_id || aux->reg_id >= ctx->old_nregs) continue;
+ aux->pred_vals[pred_idx] = reg_stack_top(ctx, (Reg)aux->reg_id);
+ }
+ }
+
+ OptBlockList* children = &ctx->analysis->dom_children[b];
+ for (u32 i = 0; i < children->n; ++i)
+ reg_rename_block(ctx, children->items[i]);
+
+ for (u32 r = 1; r < ctx->old_nregs; ++r) {
+ while (pushed[r]--) {
+ if (ctx->stacks[r].n) --ctx->stacks[r].n;
+ }
+ }
+}
+
+void opt_build_reg_ssa(Func* f) {
+ if (!f || f->nblocks == 0 || f->npregs <= 1) {
+ if (f) opt_rebuild_def_use(f);
+ return;
+ }
+ opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE);
+
+ u32 old_nregs = f->npregs;
+ OptAnalysis a;
+ opt_analysis_build_order(f, &a);
+ opt_analysis_build_dominators(f, &a);
+ opt_analysis_build_dom_frontier(f, &a);
+
+ OptLiveInfo live;
+ opt_live_blocks(f, &live);
+ u8* needs_phi = arena_zarray(f->arena, u8, old_nregs * f->nblocks);
+ compute_reg_phi_sites(f, &a, &live, old_nregs, needs_phi);
+ for (u32 b = 0; b < f->nblocks; ++b)
+ insert_reg_phis(f, b, old_nregs, needs_phi);
+
+ RegRenameCtx ctx;
+ memset(&ctx, 0, sizeof ctx);
+ ctx.f = f;
+ ctx.analysis = &a;
+ ctx.old_nregs = old_nregs;
+ ctx.stacks = arena_zarray(f->arena, SlotStack, old_nregs);
+ reg_rename_block(&ctx, f->entry);
+ f->opt_reg_ssa = 1;
+ opt_rebuild_def_use(f);
+}
+
static Val resolve_repl(const RenameCtx* ctx, Val v) {
while (v != VAL_NONE && v < ctx->repl_cap && ctx->repl[v] != VAL_NONE &&
ctx->repl[v] != v) {
diff --git a/test/opt/phase0_guardrails.sh b/test/opt/phase0_guardrails.sh
@@ -131,6 +131,82 @@ write_many_stack_args_o1() {
} >"$TMP/many_stack_args_o1.c"
}
+write_mutable_vreg_o2() {
+ cat >"$TMP/mutable_vreg_o2.c" <<'SRC'
+int main() {
+ int x = 40;
+ x = x + 1;
+ x = x + 1;
+ return x == 42 ? 0 : 1;
+}
+SRC
+}
+
+write_loop_phi_o2() {
+ cat >"$TMP/loop_phi_o2.c" <<'SRC'
+int main() {
+ int x = 0;
+ int i = 0;
+ while (i < 7) {
+ x = x + i;
+ i = i + 1;
+ }
+ return x == 21 ? 0 : 1;
+}
+SRC
+}
+
+write_call_across_o2() {
+ cat >"$TMP/call_across_o2.c" <<'SRC'
+static int bump(int x) { return x + 1; }
+int main() {
+ int x = 10;
+ int y = bump(x);
+ x = x + y;
+ return x == 21 ? 0 : 1;
+}
+SRC
+}
+
+write_dense_switch_o2() {
+ cat >"$TMP/dense_switch_o2.c" <<'SRC'
+int main() {
+ int x = 14;
+ switch (x) {
+ case 10: return 1;
+ case 11: return 2;
+ case 12: return 3;
+ case 13: return 4;
+ case 14: return 0;
+ case 15: return 6;
+ default: return 7;
+ }
+}
+SRC
+}
+
+write_switch_join_add_o2() {
+ cat >"$TMP/switch_join_add_o2.c" <<'SRC'
+int main() {
+ int tag = 2;
+ int value = 0;
+ switch (tag) {
+ case 0:
+ value = 10;
+ break;
+ case 1:
+ case 2:
+ value = 40;
+ break;
+ default:
+ value = 0;
+ break;
+ }
+ return value + 2 == 42 ? 0 : 1;
+}
+SRC
+}
+
run_case() {
local name="$1"
local src="$2"
@@ -147,6 +223,13 @@ run_o1_case() {
printf 'phase0 %-24s O1 OK\n' "$name"
}
+run_o2_case() {
+ local name="$1"
+ local src="$2"
+ "$BIN" run -O2 "$src" >/dev/null
+ printf 'phase0 %-24s O2 OK\n' "$name"
+}
+
check_metrics() {
local src="$TMP/branch_liveness.c"
local err="$TMP/metrics.err"
@@ -199,6 +282,11 @@ write_spills
write_many_small_functions
write_large_straight_line
write_many_stack_args_o1
+write_mutable_vreg_o2
+write_loop_phi_o2
+write_call_across_o2
+write_dense_switch_o2
+write_switch_join_add_o2
run_case branch_liveness "$TMP/branch_liveness.c"
run_case call_clobber "$TMP/call_clobber.c"
@@ -208,6 +296,11 @@ run_case spills "$TMP/spills.c"
run_case many_small_functions "$TMP/many_small_functions.c"
run_case large_straight_line "$TMP/large_straight_line.c"
run_o1_case many_stack_args "$TMP/many_stack_args_o1.c"
+run_o2_case mutable_vreg "$TMP/mutable_vreg_o2.c"
+run_o2_case loop_phi "$TMP/loop_phi_o2.c"
+run_o2_case call_across "$TMP/call_across_o2.c"
+run_o2_case dense_switch "$TMP/dense_switch_o2.c"
+run_o2_case switch_join_add "$TMP/switch_join_add_o2.c"
check_metrics
printf 'phase0 identified inline-asm stress: test/parse/cases/asm_01_grammar.c\n'