kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit d43cbf6fc02269ac31d3fb6859ce514a86d48b5d
parent 8ae3ca5059ac45fe43b64e8e770842d3775522b1
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 21 May 2026 08:39:12 -0700

opt: add SSA DCE and copy cleanup

Diffstat:
Mdoc/OPT.md | 28++++++++++++++++++----------
Msrc/opt/ir.h | 84+++++++++++++++++++++++++++++++++++++++++--------------------------------------
Msrc/opt/opt.c | 8++++++++
Msrc/opt/opt.h | 3++-
Msrc/opt/opt_internal.h | 10++++++++++
Msrc/opt/pass_analysis.c | 73+++++++++++++++++++++++++++++++++++++++++++++++++------------------------
Msrc/opt/pass_cfg.c | 11++++++-----
Msrc/opt/pass_combine.c | 2++
Asrc/opt/pass_copy.c | 116+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/opt/pass_dce.c | 99+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/opt/pass_jump.c | 4+++-
Msrc/opt/pass_loop.c | 13+++++++++++--
Msrc/opt/pass_ssa.c | 32+++++++++++++++++---------------
Mtest/opt/opt_test.c | 287+++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
14 files changed, 571 insertions(+), 199 deletions(-)

diff --git a/doc/OPT.md b/doc/OPT.md @@ -443,10 +443,16 @@ Current behavior: dumps no longer depend on raw `Inst*` addresses or positional references. - Loop tree construction uses shared `OptAnalysis` dominators instead of a pass-local dominator implementation. +- `Func.opt_valid_analyses` now tracks CFG, def-use, dominator, and loop-info + validity. Mutating passes invalidate the affected bits, rebuild passes mark + their products valid, and `opt_verify` checks cached def-use before rebuilding + it so stale use-site assumptions are caught. +- The first SSA-backed value passes are in the O2 path: `opt_ssa_dce` removes + unused pure SSA defs/phis, and `opt_copy_cleanup` rewrites safe SSA copy users + through def-use chains while avoiding multi-def conventional-SSA edge copies. -The next implementation work should finish the remaining analysis invalidation -model and then land the first small SSA-backed transformation, preferably SSA -DCE plus copy cleanup, before attempting GVN/DSE/LICM. +The next implementation work should extend this small-pass path incrementally, +before attempting GVN/DSE/LICM. --- @@ -492,12 +498,14 @@ Exit criteria: ### Phase B - O2 Analysis Substrate -Status: mostly implemented. `OptPassCtx`, the shared operand/reference walker, +Status: implemented for the first O2 value-pass slice. `OptPassCtx`, the +shared operand/reference walker, `OptAnalysis` CFG order/dominator/frontier data, stable instruction ids, production mem2reg SSA, def-use rebuilding, SSA dumps, and stronger `opt_verify` guardrails are in place. Loop tree construction now uses shared -dominators. Remaining work is a formal pass invalidation model and any -additional shared analysis fields required by the first SSA value passes. +dominators. Passes now have explicit CFG, def-use, dominator, and loop-info +validity bits, and the first SSA value passes did not require additional shared +analysis fields. Goal: create the durable data structures and invalidation model that all O2 passes will share before implementing transformations. @@ -519,9 +527,9 @@ Checklist: - [x] SSA dump coverage for phis and rewritten uses. - [x] `opt_verify(Func*, stage)` gates for CFG consistency, instruction id consistency, phi arity, def-use consistency, and SSA value class checks. -- [ ] Formal analysis invalidation flags for passes that preserve, rebuild, or +- [x] Formal analysis invalidation flags for passes that preserve, rebuild, or invalidate CFG/value/use-def state. -- [ ] Additional shared analysis fields demanded by first value passes. +- [x] Additional shared analysis fields demanded by first value passes. Exit criteria: @@ -560,8 +568,8 @@ memory optimizations. Implement in order: -1. `opt_ssa_dce` -2. `opt_copy_cleanup` +1. [x] `opt_ssa_dce` +2. [x] `opt_copy_cleanup` 3. `opt_block_cloning` 4. `opt_addr_xform` 5. `opt_gvn` diff --git a/src/opt/ir.h b/src/opt/ir.h @@ -31,15 +31,15 @@ typedef enum IROp { IR_PARAM_DECL, /* Address-bearing data movement. */ - IR_LOAD_IMM, /* opnds[0] dst REG; extra.imm = imm */ - IR_LOAD_CONST, /* opnds[0] dst REG; extra.cbytes */ - IR_COPY, /* opnds[0] dst REG, opnds[1] src REG */ - IR_LOAD, /* opnds[0] dst REG, opnds[1] addr; extra.mem */ - IR_STORE, /* opnds[0] addr, opnds[1] src REG|IMM; extra.mem */ - IR_ADDR_OF, /* opnds[0] dst REG, opnds[1] lv (LOCAL|GLOBAL|INDIRECT) */ - IR_TLS_ADDR_OF, /* opnds[0] dst REG; extra.aux = IRTlsAux */ - IR_AGG_COPY, /* opnds[0] dst, opnds[1] src; extra.aux = IRAggAux */ - IR_AGG_SET, /* opnds[0] dst, opnds[1] byte; extra.aux = IRAggAux */ + IR_LOAD_IMM, /* opnds[0] dst REG; extra.imm = imm */ + IR_LOAD_CONST, /* opnds[0] dst REG; extra.cbytes */ + IR_COPY, /* opnds[0] dst REG, opnds[1] src REG */ + IR_LOAD, /* opnds[0] dst REG, opnds[1] addr; extra.mem */ + IR_STORE, /* opnds[0] addr, opnds[1] src REG|IMM; extra.mem */ + IR_ADDR_OF, /* opnds[0] dst REG, opnds[1] lv (LOCAL|GLOBAL|INDIRECT) */ + IR_TLS_ADDR_OF, /* opnds[0] dst REG; extra.aux = IRTlsAux */ + IR_AGG_COPY, /* opnds[0] dst, opnds[1] src; extra.aux = IRAggAux */ + IR_AGG_SET, /* opnds[0] dst, opnds[1] byte; extra.aux = IRAggAux */ IR_BITFIELD_LOAD, /* opnds[0] dst REG, opnds[1] record; extra.aux */ IR_BITFIELD_STORE, /* opnds[0] record, opnds[1] src; extra.aux */ @@ -57,36 +57,36 @@ typedef enum IROp { IR_PHI, /* Control flow / scopes. */ - IR_BR, /* unconditional. block.succ[0] = target block id. */ - IR_CONDBR, /* opnds[0] cond REG; succ[0] = true, succ[1] = false. */ - IR_CMP_BRANCH, /* fused. opnds = [a, b]; extra.imm = CmpOp; - succ[0] = taken, succ[1] = fallthrough. */ - IR_SWITCH, /* multi-target structured switch. opnds[0] = selector; - extra.aux = IRSwitchAux; succ[0..ncases) = case - blocks, succ[ncases] = default block. */ + IR_BR, /* unconditional. block.succ[0] = target block id. */ + IR_CONDBR, /* opnds[0] cond REG; succ[0] = true, succ[1] = false. */ + IR_CMP_BRANCH, /* fused. opnds = [a, b]; extra.imm = CmpOp; + succ[0] = taken, succ[1] = fallthrough. */ + IR_SWITCH, /* multi-target structured switch. opnds[0] = selector; + extra.aux = IRSwitchAux; succ[0..ncases) = case + blocks, succ[ncases] = default block. */ IR_INDIRECT_BRANCH, /* opnds[0] = addr REG; extra.aux = IRIndirectAux. succ[0..nvalid) = the valid target blocks. */ IR_LOAD_LABEL_ADDR, /* opnds[0] dst REG; extra.imm = target block id. */ - IR_RET, /* extra.aux = IRRetAux* (NULL for void). */ - IR_SCOPE_BEGIN, /* extra.aux = IRScopeAux. defs[0] = scope id Val. */ - IR_SCOPE_ELSE, /* extra.imm = scope id (Val). */ - IR_SCOPE_END, /* extra.imm = scope id (Val). */ - IR_BREAK_TO, /* extra.imm = scope id (Val). */ - IR_CONTINUE_TO, /* extra.imm = scope id (Val). */ + IR_RET, /* extra.aux = IRRetAux* (NULL for void). */ + IR_SCOPE_BEGIN, /* extra.aux = IRScopeAux. defs[0] = scope id Val. */ + IR_SCOPE_ELSE, /* extra.imm = scope id (Val). */ + IR_SCOPE_END, /* extra.imm = scope id (Val). */ + IR_BREAK_TO, /* extra.imm = scope id (Val). */ + IR_CONTINUE_TO, /* extra.imm = scope id (Val). */ /* alloca / variadics. */ - IR_ALLOCA, /* opnds = [dst REG, size]; extra.imm = align */ - IR_VA_START, /* opnds = [ap] */ - IR_VA_ARG, /* opnds = [dst REG, ap]; extra.aux = CfreeCgTypeId*/ - IR_VA_END, /* opnds = [ap] */ - IR_VA_COPY, /* opnds = [dst, src] */ + IR_ALLOCA, /* opnds = [dst REG, size]; extra.imm = align */ + IR_VA_START, /* opnds = [ap] */ + IR_VA_ARG, /* opnds = [dst REG, ap]; extra.aux = CfreeCgTypeId*/ + IR_VA_END, /* opnds = [ap] */ + IR_VA_COPY, /* opnds = [dst, src] */ /* Atomics. */ - IR_ATOMIC_LOAD, /* opnds = [dst, addr]; extra.aux = IRAtomicAux */ - IR_ATOMIC_STORE, /* opnds = [addr, src]; extra.aux = IRAtomicAux */ - IR_ATOMIC_RMW, /* opnds = [dst, addr, val]; extra.aux */ - IR_ATOMIC_CAS, /* defs = [prior, ok]; extra.aux = IRCasAux */ - IR_FENCE, /* extra.imm = MemOrder */ + IR_ATOMIC_LOAD, /* opnds = [dst, addr]; extra.aux = IRAtomicAux */ + IR_ATOMIC_STORE, /* opnds = [addr, src]; extra.aux = IRAtomicAux */ + IR_ATOMIC_RMW, /* opnds = [dst, addr, val]; extra.aux */ + IR_ATOMIC_CAS, /* defs = [prior, ok]; extra.aux = IRCasAux */ + IR_FENCE, /* extra.imm = MemOrder */ /* Inline asm. extra.aux = IRAsmAux. */ IR_ASM_BLOCK, @@ -204,8 +204,9 @@ typedef struct IRAsmAux { AsmConstraint* outs; AsmConstraint* ins; Sym* clobbers; - Operand* out_ops; /* nout slots; the wrapped target may fill in REG location */ - Operand* in_ops; /* nin slots; recorded by w_asm_block, xlat'd at replay */ + Operand* + out_ops; /* nout slots; the wrapped target may fill in REG location */ + Operand* in_ops; /* nin slots; recorded by w_asm_block, xlat'd at replay */ u32 nout, nin, nclob; /* Filled by opt_machinize from backend register-name resolution. */ u32 clobber_mask[OPT_REG_CLASSES]; @@ -217,8 +218,8 @@ typedef struct IRAsmAux { typedef struct IRIntrinAux { IntrinKind kind; - Operand* dsts; /* ndst */ - Operand* args; /* narg */ + Operand* dsts; /* ndst */ + Operand* args; /* narg */ Val* result_vals; /* one per dst that's a REG, parallel to dsts */ u32 ndst, narg; } IRIntrinAux; @@ -270,8 +271,8 @@ typedef struct Inst { InstId id; SrcLoc loc; /* sticky from CGTarget.set_loc at recording */ CfreeCgTypeId type; - Val def; /* primary SSA def, or VAL_NONE */ - u32 ndefs; /* multi-result */ + Val def; /* primary SSA def, or VAL_NONE */ + u32 ndefs; /* multi-result */ Val* defs; /* Operands. We use Operand instead of Val so that the original * CGTarget call shape (IMM / LOCAL / GLOBAL / INDIRECT in addition to @@ -375,7 +376,8 @@ typedef struct Func { u32* val_def_block; u32* val_def_inst; CfreeCgTypeId* val_type; - u8* val_cls; /* RegClass per Val, used by replay to reconstruct REG operands */ + u8* val_cls; /* RegClass per Val, used by replay to reconstruct REG operands + */ u32 nvals, vals_cap; /* Scope id table. Indexed by scope_id (1-based). Values map to @@ -419,6 +421,7 @@ typedef struct Func { OptUse* opt_uses; u32 opt_nuses, opt_uses_cap; u32* opt_first_use_by_val; /* indexed by Val, OPT_USE_NONE if no uses */ + u32 opt_valid_analyses; Reg opt_hard_regs[OPT_REG_CLASSES][OPT_MAX_HARD_REGS]; u32 opt_hard_reg_count[OPT_REG_CLASSES]; @@ -426,7 +429,8 @@ typedef struct Func { u32 opt_phys_reg_count[OPT_REG_CLASSES]; Reg opt_scratch_regs[OPT_REG_CLASSES][OPT_MAX_SCRATCH_REGS]; u32 opt_scratch_reg_count[OPT_REG_CLASSES]; - u32 opt_caller_saved[OPT_REG_CLASSES]; /* bit r set if hard reg r is caller-saved */ + u32 opt_caller_saved[OPT_REG_CLASSES]; /* bit r set if hard reg r is + caller-saved */ u32 opt_callee_saved[OPT_REG_CLASSES]; u32 opt_reserved_regs[OPT_REG_CLASSES]; u32 opt_arg_regs[OPT_REG_CLASSES]; diff --git a/src/opt/opt.c b/src/opt/opt.c @@ -1409,9 +1409,17 @@ static void opt_run_o2_pipeline(OptImpl* o) { opt_verify(o->f, "o2-pre-ssa-cfg"); opt_build_ssa(o->f); opt_verify(o->f, "o2-ssa"); + opt_ssa_dce(o->f); + opt_verify(o->f, "o2-ssa-dce"); + opt_copy_cleanup(o->f); + opt_verify(o->f, "o2-copy-cleanup"); + opt_ssa_dce(o->f); + opt_verify(o->f, "o2-copy-dce"); opt_make_conventional_ssa(o->f); opt_verify(o->f, "o2-conventional-ssa"); opt_undo_ssa(o->f); + opt_copy_cleanup(o->f); + opt_verify(o->f, "o2-undo-copy-cleanup"); opt_build_cfg(o->f); opt_verify(o->f, "o2-undo-ssa"); metrics_scope_end(o->c, "opt.o2.ssa"); diff --git a/src/opt/opt.h b/src/opt/opt.h @@ -37,7 +37,8 @@ void opt_jump_cleanup(Func*, OptJumpCleanupStage); void opt_block_cloning(Func*); void opt_build_ssa(Func*); void opt_addr_xform(Func*); -void opt_gvn(Func*); /* incl. constprop, redundant-load elim */ +void opt_gvn(Func*); /* incl. constprop, redundant-load elim */ +void opt_copy_cleanup(Func*); void opt_copy_prop(Func*); /* incl. redundant-extension elim */ void opt_dse(Func*); /* dead store elimination */ void opt_ssa_dce(Func*); diff --git a/src/opt/opt_internal.h b/src/opt/opt_internal.h @@ -45,8 +45,18 @@ typedef struct OptAnalysis { OptBlockList* dom_frontier; } OptAnalysis; +typedef enum OptAnalysisFlag { + OPT_ANALYSIS_CFG = 1u << 0, + OPT_ANALYSIS_DEF_USE = 1u << 1, + OPT_ANALYSIS_DOM = 1u << 2, + OPT_ANALYSIS_LOOP = 1u << 3, +} OptAnalysisFlag; + typedef void (*OptOperandWalkFn)(Func*, Inst*, Operand*, int is_def, void*); +void opt_analysis_mark_valid(Func*, u32 flags); +void opt_analysis_invalidate(Func*, u32 flags); +int opt_analysis_has(Func*, u32 flags); void opt_analysis_build_order(Func*, OptAnalysis*); void opt_analysis_build_dominators(Func*, OptAnalysis*); void opt_analysis_build_dom_frontier(Func*, OptAnalysis*); diff --git a/src/opt/pass_analysis.c b/src/opt/pass_analysis.c @@ -22,6 +22,20 @@ static int verify_stage_is_ssa(const char* stage) { strstr(stage, "pre-ssa") == NULL; } +void opt_analysis_mark_valid(Func* f, u32 flags) { + if (!f) return; + f->opt_valid_analyses |= flags; +} + +void opt_analysis_invalidate(Func* f, u32 flags) { + if (!f) return; + f->opt_valid_analyses &= ~flags; +} + +int opt_analysis_has(Func* f, u32 flags) { + return f && (f->opt_valid_analyses & flags) == flags; +} + static void block_list_add(Arena* arena, OptBlockList* list, u32 block) { if (list->n == list->cap) { u32 ncap = list->cap ? list->cap * 2u : 4u; @@ -80,7 +94,10 @@ void opt_analysis_build_dominators(Func* f, OptAnalysis* a) { a->idom = arena_array(f->arena, u32, n); a->dom_children = arena_zarray(f->arena, OptBlockList, n); for (u32 i = 0; i < n; ++i) a->idom[i] = OPT_BLK_NONE; - if (f->entry >= f->nblocks || !a->reachable[f->entry]) return; + if (f->entry >= f->nblocks || !a->reachable[f->entry]) { + opt_analysis_mark_valid(f, OPT_ANALYSIS_DOM); + return; + } a->idom[f->entry] = f->entry; int changed = 1; @@ -112,6 +129,7 @@ void opt_analysis_build_dominators(Func* f, OptAnalysis* a) { if (idom == OPT_BLK_NONE || idom == b) continue; block_list_add(f->arena, &a->dom_children[idom], b); } + opt_analysis_mark_valid(f, OPT_ANALYSIS_DOM); } void opt_analysis_build_dom_frontier(Func* f, OptAnalysis* a) { @@ -190,8 +208,7 @@ static void opt_use_add(Func* f, Val v, u32 b, u32 i, u8 kind, u32 op_idx, if (f->opt_nuses == f->opt_uses_cap) { u32 ncap = f->opt_uses_cap ? f->opt_uses_cap * 2u : 32u; OptUse* uses = arena_zarray(f->arena, OptUse, ncap); - if (f->opt_uses) - memcpy(uses, f->opt_uses, sizeof(uses[0]) * f->opt_nuses); + if (f->opt_uses) memcpy(uses, f->opt_uses, sizeof(uses[0]) * f->opt_nuses); f->opt_uses = uses; f->opt_uses_cap = ncap; } @@ -209,12 +226,12 @@ static void opt_use_add(Func* f, Val v, u32 b, u32 i, u8 kind, u32 op_idx, f->opt_first_use_by_val[v] = id; } -static void opt_use_add_operand(Func* f, u32 b, u32 i, u32 op_idx, - Operand* op, int is_def) { +static void opt_use_add_operand(Func* f, u32 b, u32 i, u32 op_idx, Operand* op, + int is_def) { if (!op || is_def) return; if (op->kind == OPK_REG) { - opt_use_add(f, (Val)op->v.reg, b, i, OPT_USE_OPERAND, op_idx, - OPT_USE_NONE, op); + opt_use_add(f, (Val)op->v.reg, b, i, OPT_USE_OPERAND, op_idx, OPT_USE_NONE, + op); } else if (op->kind == OPK_INDIRECT) { opt_use_add(f, (Val)op->v.ind.base, b, i, OPT_USE_INDIRECT_BASE, op_idx, OPT_USE_NONE, op); @@ -242,8 +259,8 @@ static void opt_collect_inst_uses(Func* f, u32 b, u32 i, Inst* in) { IRPhiAux* aux = (IRPhiAux*)in->extra.aux; if (!aux) break; for (u32 p = 0; p < aux->npreds; ++p) - opt_use_add(f, aux->pred_vals[p], b, i, OPT_USE_PHI_INPUT, - OPT_USE_NONE, p, NULL); + opt_use_add(f, aux->pred_vals[p], b, i, OPT_USE_PHI_INPUT, OPT_USE_NONE, + p, NULL); break; } case IR_CALL: { @@ -300,6 +317,7 @@ void opt_rebuild_def_use(Func* f) { for (u32 i = 0; i < bl->ninsts; ++i) opt_collect_inst_uses(f, b, i, &bl->insts[i]); } + opt_analysis_mark_valid(f, OPT_ANALYSIS_DEF_USE); } static void verify_operand(Func* f, Inst* in, Operand* op, int is_def, @@ -312,8 +330,8 @@ static void verify_operand(Func* f, Inst* in, Operand* op, int is_def, opt_fail(f, stage, is_def ? "bad def val" : "bad use val", v, f->nvals); if (!verify_stage_is_ssa(stage)) return; if (op->cls != f->val_cls[v]) - opt_fail(f, stage, is_def ? "def class mismatch" : "use class mismatch", - v, op->cls); + opt_fail(f, stage, is_def ? "def class mismatch" : "use class mismatch", v, + op->cls); } static void verify_values(Func* f, const char* stage) { @@ -326,14 +344,12 @@ static void verify_values(Func* f, const char* stage) { Inst* in = &bl->insts[i]; if (in->id == INST_ID_NONE || in->id >= f->next_inst_id) opt_fail(f, stage, "bad inst id", b, i); - if (seen_inst[in->id]) - opt_fail(f, stage, "duplicate inst id", in->id, b); + if (seen_inst[in->id]) opt_fail(f, stage, "duplicate inst id", in->id, b); seen_inst[in->id] = 1; if (in->def != VAL_NONE) { if (in->def >= f->nvals) opt_fail(f, stage, "bad inst def", b, i); - if (verify_stage_is_ssa(stage) && (IROp)in->op == IR_PHI && - in->type && f->val_type[in->def] && - in->type != f->val_type[in->def]) + if (verify_stage_is_ssa(stage) && (IROp)in->op == IR_PHI && in->type && + f->val_type[in->def] && in->type != f->val_type[in->def]) opt_fail(f, stage, "inst def type mismatch", in->def, b); } for (u32 d = 0; d < in->ndefs; ++d) { @@ -349,8 +365,7 @@ static void verify_values(Func* f, const char* stage) { if (in->nopnds || in->opnds) opt_fail(f, stage, "phi should not carry operands", b, i); if (aux->slot_id > f->nframe_slots) - opt_fail(f, stage, "phi bad slot id", aux->slot_id, - f->nframe_slots); + opt_fail(f, stage, "phi bad slot id", aux->slot_id, f->nframe_slots); if (aux->npreds != bl->npreds) opt_fail(f, stage, "phi pred count mismatch", aux->npreds, bl->npreds); @@ -360,8 +375,7 @@ static void verify_values(Func* f, const char* stage) { if (aux->pred_vals[p] != VAL_NONE && aux->pred_vals[p] >= f->nvals) opt_fail(f, stage, "phi bad pred value", aux->pred_vals[p], f->nvals); - if (aux->pred_vals[p] != VAL_NONE && - f->val_type[aux->pred_vals[p]] && + if (aux->pred_vals[p] != VAL_NONE && f->val_type[aux->pred_vals[p]] && f->val_type[aux->pred_vals[p]] != in->type) opt_fail(f, stage, "phi input type mismatch", b, p); } @@ -378,8 +392,7 @@ static void verify_use_site(Func* f, const char* stage, const OptUse* use) { case OPT_USE_OPERAND: if (!use->operand) opt_fail(f, stage, "def-use missing operand", use->block, use->inst); - if (use->operand->kind != OPK_REG || - (Val)use->operand->v.reg != use->val) + if (use->operand->kind != OPK_REG || (Val)use->operand->v.reg != use->val) opt_fail(f, stage, "def-use operand mismatch", use->val, use->kind); break; case OPT_USE_INDIRECT_BASE: @@ -389,8 +402,7 @@ static void verify_use_site(Func* f, const char* stage, const OptUse* use) { break; case OPT_USE_PHI_INPUT: { if ((IROp)in->op != IR_PHI) - opt_fail(f, stage, "def-use phi site mismatch", use->block, - use->inst); + opt_fail(f, stage, "def-use phi site mismatch", use->block, use->inst); IRPhiAux* aux = (IRPhiAux*)in->extra.aux; if (!aux || use->phi_pred_index >= aux->npreds) opt_fail(f, stage, "def-use phi pred mismatch", use->val, @@ -407,6 +419,19 @@ static void verify_use_site(Func* f, const char* stage, const OptUse* use) { static void verify_def_use(Func* f, const char* stage) { if (f->opt_rewritten) return; + if (opt_analysis_has(f, OPT_ANALYSIS_DEF_USE)) { + for (u32 u = 0; u < f->opt_nuses; ++u) { + OptUse* use = &f->opt_uses[u]; + if (use->val == VAL_NONE || use->val >= f->nvals) + opt_fail(f, stage, "def-use bad cached val", use->val, f->nvals); + if (use->block >= f->nblocks) + opt_fail(f, stage, "def-use bad cached block", use->block, f->nblocks); + if (use->inst >= f->blocks[use->block].ninsts) + opt_fail(f, stage, "def-use bad cached inst", use->inst, + f->blocks[use->block].ninsts); + verify_use_site(f, stage, use); + } + } opt_rebuild_def_use(f); for (u32 u = 0; u < f->opt_nuses; ++u) { OptUse* use = &f->opt_uses[u]; diff --git a/src/opt/pass_cfg.c b/src/opt/pass_cfg.c @@ -20,13 +20,11 @@ * and only repopulates from the trailing terminator inst when one is * present. */ -#include "opt/ir.h" -#include "opt/opt.h" - #include <string.h> #include "core/arena.h" #include "core/core.h" +#include "opt/opt_internal.h" static int is_terminator(const Inst* in) { switch ((IROp)in->op) { @@ -41,8 +39,7 @@ static int is_terminator(const Inst* in) { return 1; case IR_INTRINSIC: { IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; - return aux && (aux->kind == INTRIN_LONGJMP || - aux->kind == INTRIN_TRAP || + return aux && (aux->kind == INTRIN_LONGJMP || aux->kind == INTRIN_TRAP || aux->kind == INTRIN_UNREACHABLE); } default: @@ -117,6 +114,9 @@ static void prune_unreachable(Func* f, const u8* reachable) { } void opt_build_cfg(Func* f) { + if (!f) return; + opt_analysis_invalidate( + f, OPT_ANALYSIS_DEF_USE | OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP); for (u32 b = 0; b < f->nblocks; ++b) { f->blocks[b].preds = NULL; f->blocks[b].npreds = 0; @@ -192,4 +192,5 @@ void opt_build_cfg(Func* f) { f->blocks[t].preds[f->blocks[t].npreds++] = b; } } + opt_analysis_mark_valid(f, OPT_ANALYSIS_CFG); } diff --git a/src/opt/pass_combine.c b/src/opt/pass_combine.c @@ -512,6 +512,8 @@ static void opt_combine_fold_block(Func* f, Block* bl, } void opt_combine(Func* f) { + if (!f) return; + opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE); OptHardBlockLive* hard_live = opt_maybe_build_hard_live(f); for (u32 b = 0; b < f->nblocks; ++b) { Block* bl = &f->blocks[b]; diff --git a/src/opt/pass_copy.c b/src/opt/pass_copy.c @@ -0,0 +1,116 @@ +#include "opt/opt_internal.h" + +static int same_val_shape(Func* f, Val a, Val b) { + if (a == VAL_NONE || b == VAL_NONE || a >= f->nvals || b >= f->nvals) + return 0; + return f->val_cls[a] == f->val_cls[b] && f->val_type[a] == f->val_type[b]; +} + +static int copy_values(const Inst* in, Val* dst, Val* src) { + if (!in || (IROp)in->op != IR_COPY || in->nopnds < 2) return 0; + if (in->opnds[0].kind != OPK_REG || in->opnds[1].kind != OPK_REG) return 0; + *dst = (Val)in->opnds[0].v.reg; + *src = (Val)in->opnds[1].v.reg; + if (in->def != VAL_NONE && in->def != *dst) return 0; + return 1; +} + +static u32 def_count(Func* f, Val v) { + u32 n = 0; + if (v == VAL_NONE) return 0; + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* in = &bl->insts[i]; + if (in->def == v) ++n; + for (u32 d = 0; d < in->ndefs; ++d) + if (in->defs[d] == v) ++n; + } + } + return n; +} + +static void replace_one_use(Func* f, const OptUse* use, Val src) { + Inst* in = &f->blocks[use->block].insts[use->inst]; + switch ((OptUseKind)use->kind) { + case OPT_USE_OPERAND: + use->operand->v.reg = (Reg)src; + use->operand->type = f->val_type[src]; + use->operand->cls = f->val_cls[src]; + break; + case OPT_USE_INDIRECT_BASE: + use->operand->v.ind.base = (Reg)src; + break; + case OPT_USE_PHI_INPUT: { + IRPhiAux* aux = (IRPhiAux*)in->extra.aux; + if (aux && use->phi_pred_index < aux->npreds) + aux->pred_vals[use->phi_pred_index] = src; + break; + } + default: + break; + } +} + +static void remove_copy_inst(Inst* in) { + in->op = IR_NOP; + in->def = VAL_NONE; + in->ndefs = 0; + in->defs = NULL; + in->nopnds = 0; + in->opnds = NULL; +} + +static void compact_copies(Func* f) { + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + u32 w = 0; + for (u32 i = 0; i < bl->ninsts; ++i) { + if ((IROp)bl->insts[i].op == IR_NOP) continue; + bl->insts[w] = bl->insts[i]; + if (bl->insts[w].def != VAL_NONE && bl->insts[w].def < f->nvals) { + f->val_def_block[bl->insts[w].def] = b; + f->val_def_inst[bl->insts[w].def] = w; + } + for (u32 d = 0; d < bl->insts[w].ndefs; ++d) { + Val v = bl->insts[w].defs[d]; + if (v != VAL_NONE && v < f->nvals) { + f->val_def_block[v] = b; + f->val_def_inst[v] = w; + } + } + ++w; + } + bl->ninsts = w; + } +} + +static int cleanup_one_copy(Func* f) { + opt_rebuild_def_use(f); + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* in = &bl->insts[i]; + Val dst = VAL_NONE; + Val src = VAL_NONE; + if (!copy_values(in, &dst, &src)) continue; + if (dst != src && !same_val_shape(f, dst, src)) continue; + if (dst != src && def_count(f, dst) != 1) continue; + for (u32 u = f->opt_first_use_by_val[dst]; u != OPT_USE_NONE; + u = f->opt_uses[u].next_for_val) + replace_one_use(f, &f->opt_uses[u], src); + remove_copy_inst(in); + opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE); + compact_copies(f); + return 1; + } + } + return 0; +} + +void opt_copy_cleanup(Func* f) { + if (!f || f->opt_rewritten) return; + while (cleanup_one_copy(f)) { + } + opt_rebuild_def_use(f); +} diff --git a/src/opt/pass_dce.c b/src/opt/pass_dce.c @@ -44,7 +44,105 @@ int opt_inst_has_side_effect(Func* f, const Inst* in) { return 0; } } + +static int val_has_uses(Func* f, Val v) { + return v != VAL_NONE && v < f->nvals && f->opt_first_use_by_val && + f->opt_first_use_by_val[v] != OPT_USE_NONE; +} + +static int ssa_dce_candidate(const Inst* in) { + switch ((IROp)in->op) { + case IR_CONST_I: + case IR_CONST_BYTES: + case IR_LOAD_IMM: + case IR_LOAD_CONST: + case IR_LOAD_LABEL_ADDR: + case IR_COPY: + case IR_BINOP: + case IR_UNOP: + case IR_CMP: + case IR_CONVERT: + case IR_PHI: + return 1; + default: + return 0; + } +} + +static int inst_all_defs_unused(Func* f, const Inst* in) { + if (in->def != VAL_NONE && val_has_uses(f, in->def)) return 0; + for (u32 i = 0; i < in->ndefs; ++i) + if (val_has_uses(f, in->defs[i])) return 0; + return in->def != VAL_NONE || in->ndefs != 0; +} + +static void refresh_def_locations(Func* f) { + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* in = &bl->insts[i]; + if (in->def != VAL_NONE && in->def < f->nvals) { + f->val_def_block[in->def] = b; + f->val_def_inst[in->def] = i; + } + for (u32 d = 0; d < in->ndefs; ++d) { + Val v = in->defs[d]; + if (v == VAL_NONE || v >= f->nvals) continue; + f->val_def_block[v] = b; + f->val_def_inst[v] = i; + } + } + } +} + +static void compact_nops(Func* f) { + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + u32 w = 0; + for (u32 i = 0; i < bl->ninsts; ++i) { + if ((IROp)bl->insts[i].op == IR_NOP) continue; + bl->insts[w++] = bl->insts[i]; + } + bl->ninsts = w; + } + refresh_def_locations(f); +} + +void opt_ssa_dce(Func* f) { + if (!f || f->opt_rewritten) return; + int changed = 0; + int again = 1; + opt_rebuild_def_use(f); + while (again) { + again = 0; + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* in = &bl->insts[i]; + if (!ssa_dce_candidate(in)) continue; + if (opt_inst_has_side_effect(f, in)) continue; + if (!inst_all_defs_unused(f, in)) continue; + in->op = IR_NOP; + in->def = VAL_NONE; + in->ndefs = 0; + in->defs = NULL; + in->nopnds = 0; + in->opnds = NULL; + changed = 1; + again = 1; + } + } + if (again) opt_rebuild_def_use(f); + } + if (changed) { + opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE); + compact_nops(f); + } + opt_rebuild_def_use(f); +} + void opt_dce(Func* f) { + opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE); OptHardBlockLive* hard_live = opt_maybe_build_hard_live(f); for (u32 b = 0; b < f->nblocks; ++b) { Block* bl = &f->blocks[b]; @@ -92,4 +190,5 @@ void opt_dce(Func* f) { } bl->ninsts = w; } + refresh_def_locations(f); } diff --git a/src/opt/pass_jump.c b/src/opt/pass_jump.c @@ -1,6 +1,6 @@ #include <string.h> -#include "opt/opt.h" +#include "opt/opt_internal.h" #define BLOCK_NONE ((u32)~0u) @@ -199,6 +199,8 @@ static void cleanup_layout_fallthrough_branches(const JumpCleanupCtx* c) { void opt_jump_cleanup(Func* f, OptJumpCleanupStage stage) { if (!f) return; + opt_analysis_invalidate(f, OPT_ANALYSIS_CFG | OPT_ANALYSIS_DEF_USE | + OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP); JumpCleanupCtx c = jump_cleanup_ctx(f); if (stage == OPT_JUMP_CLEANUP_CFG) { cleanup_invert_jump_fallthrough(&c); diff --git a/src/opt/pass_loop.c b/src/opt/pass_loop.c @@ -31,16 +31,24 @@ static u32 loop_frequency(u8 depth) { } void opt_build_loop_tree(Func* f) { + if (!f) return; + opt_analysis_invalidate(f, OPT_ANALYSIS_LOOP); for (u32 b = 0; b < f->nblocks; ++b) { f->blocks[b].loop_depth = 0; f->blocks[b].frequency = 1; } - if (f->nblocks == 0 || f->entry >= f->nblocks) return; + if (f->nblocks == 0 || f->entry >= f->nblocks) { + opt_analysis_mark_valid(f, OPT_ANALYSIS_LOOP); + return; + } OptAnalysis a; memset(&a, 0, sizeof a); opt_analysis_build_dominators(f, &a); - if (a.npo == 0) return; + if (a.npo == 0) { + opt_analysis_mark_valid(f, OPT_ANALYSIS_LOOP); + return; + } u8* body = arena_zarray(f->arena, u8, f->nblocks); u32* stack = arena_array(f->arena, u32, f->nblocks); @@ -68,4 +76,5 @@ void opt_build_loop_tree(Func* f) { for (u32 b = 0; b < f->nblocks; ++b) f->blocks[b].frequency = loop_frequency(f->blocks[b].loop_depth); + opt_analysis_mark_valid(f, OPT_ANALYSIS_LOOP); } diff --git a/src/opt/pass_ssa.c b/src/opt/pass_ssa.c @@ -1,12 +1,11 @@ /* pass_ssa.c - mem2reg SSA construction and phi destruction for O2. */ -#include "opt/opt_internal.h" - #include <stdio.h> #include <string.h> #include "core/arena.h" #include "core/core.h" +#include "opt/opt_internal.h" typedef struct SlotStack { Val* vals; @@ -235,8 +234,8 @@ static void mark_def_blocks(Func* f, const u8* promoted, u8* def_blocks) { static void compute_phi_sites(Func* f, OptAnalysis* a, const u8* promoted, u8* needs_phi) { - u8* def_blocks = arena_zarray(f->arena, u8, - (f->nframe_slots + 1u) * f->nblocks); + u8* def_blocks = + arena_zarray(f->arena, u8, (f->nframe_slots + 1u) * f->nblocks); mark_def_blocks(f, promoted, def_blocks); u32* work = arena_array(f->arena, u32, f->nblocks ? f->nblocks : 1u); u8* queued = arena_zarray(f->arena, u8, f->nblocks ? f->nblocks : 1u); @@ -384,6 +383,7 @@ static void replace_phi_inputs(Func* f, RenameCtx* ctx) { } void opt_build_ssa(Func* f) { + if (f) opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE); if (!f || f->nblocks == 0 || f->nframe_slots == 0) { if (f) opt_rebuild_def_use(f); return; @@ -432,8 +432,7 @@ static int ssa_is_terminator(const Inst* in) { return 1; case IR_INTRINSIC: { IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; - return aux && (aux->kind == INTRIN_LONGJMP || - aux->kind == INTRIN_TRAP || + return aux && (aux->kind == INTRIN_LONGJMP || aux->kind == INTRIN_TRAP || aux->kind == INTRIN_UNREACHABLE); } default: @@ -441,7 +440,8 @@ static int ssa_is_terminator(const Inst* in) { } } -static Inst make_copy_inst(Func* f, Val dst, Val src, CfreeCgTypeId ty, u8 cls) { +static Inst make_copy_inst(Func* f, Val dst, Val src, CfreeCgTypeId ty, + u8 cls) { Inst in; memset(&in, 0, sizeof in); in.op = IR_COPY; @@ -598,10 +598,10 @@ static void realign_phi_preds(Func* f) { u32 old_n = aux->npreds; u32* old_blocks = aux->pred_blocks; Val* old_vals = aux->pred_vals; - u32* pred_blocks = bl->npreds ? arena_array(f->arena, u32, bl->npreds) - : NULL; - Val* pred_vals = bl->npreds ? arena_zarray(f->arena, Val, bl->npreds) - : NULL; + u32* pred_blocks = + bl->npreds ? arena_array(f->arena, u32, bl->npreds) : NULL; + Val* pred_vals = + bl->npreds ? arena_zarray(f->arena, Val, bl->npreds) : NULL; for (u32 p = 0; p < bl->npreds; ++p) { pred_blocks[p] = bl->preds[p]; pred_vals[p] = phi->def; @@ -621,6 +621,8 @@ static void realign_phi_preds(Func* f) { void opt_make_conventional_ssa(Func* f) { if (!f) return; + opt_analysis_invalidate(f, OPT_ANALYSIS_CFG | OPT_ANALYSIS_DEF_USE | + OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP); for (u32 b = 0; b < f->nblocks; ++b) { Block* bl = &f->blocks[b]; if (!bl->ninsts || (IROp)bl->insts[0].op != IR_PHI) continue; @@ -655,6 +657,7 @@ void opt_make_conventional_ssa(Func* f) { void opt_undo_ssa(Func* f) { if (!f) return; + opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE); for (u32 b = 0; b < f->nblocks; ++b) { Block* bl = &f->blocks[b]; u32 w = 0; @@ -693,8 +696,7 @@ void opt_ssa_dump(Func* f, Writer* w) { opt_rebuild_def_use(f); char buf[160]; snprintf(buf, sizeof buf, "ssa blocks=%u vals=%u uses=%u\n", - (unsigned)f->nblocks, (unsigned)f->nvals, - (unsigned)f->opt_nuses); + (unsigned)f->nblocks, (unsigned)f->nvals, (unsigned)f->opt_nuses); ssa_dump_write(w, buf); for (u32 b = 0; b < f->nblocks; ++b) { Block* bl = &f->blocks[b]; @@ -712,8 +714,8 @@ void opt_ssa_dump(Func* f, Writer* w) { } if ((IROp)in->op == IR_PHI) { IRPhiAux* aux = (IRPhiAux*)in->extra.aux; - snprintf(buf, sizeof buf, " phi slot=%u preds=", - aux ? (unsigned)aux->slot_id : 0u); + snprintf(buf, sizeof buf, + " phi slot=%u preds=", aux ? (unsigned)aux->slot_id : 0u); ssa_dump_write(w, buf); if (aux) { for (u32 p = 0; p < aux->npreds; ++p) { diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c @@ -568,8 +568,7 @@ static u32 mock_return_reg_mask(CGTarget* t, const ABIFuncInfo* abi, return 0; } -static void mock_plan_call(CGTarget* t, const CGCallDesc* d, - CGCallPlan* out) { +static void mock_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) { MockCGTarget* m = (MockCGTarget*)t; memset(out, 0, sizeof *out); out->callee = d->callee; @@ -583,7 +582,8 @@ static void mock_plan_call(CGTarget* t, const CGCallDesc* d, pm->src = d->args[i].storage; pm->dst_kind = m->planned_stack_arg ? CG_CALL_PLAN_STACK : CG_CALL_PLAN_REG; pm->cls = RC_INT; - pm->dst_reg = m->planned_arg_regs[i] ? m->planned_arg_regs[i] : (Reg)(i + 1u); + pm->dst_reg = + m->planned_arg_regs[i] ? m->planned_arg_regs[i] : (Reg)(i + 1u); pm->stack_offset = i * 8u; pm->mem.type = d->args[i].type; pm->mem.size = 8; @@ -613,8 +613,7 @@ static void mock_call(CGTarget* t, const CGCallDesc* d) { static void mock_emit_call_plan(CGTarget* t, const CGCallPlan* p) { MockCGTarget* m = (MockCGTarget*)t; ++m->emit_call_plan_calls; - if (m->event_count < (int)sizeof m->events) - m->events[m->event_count++] = 'p'; + if (m->event_count < (int)sizeof m->events) m->events[m->event_count++] = 'p'; m->last_plan_callee = p->callee; } @@ -671,8 +670,7 @@ static void mock_load_const(CGTarget* t, Operand dst, ConstBytes cb) { static void mock_copy(CGTarget* t, Operand dst, Operand src) { MockCGTarget* m = (MockCGTarget*)t; - if (m->event_count < (int)sizeof m->events) - m->events[m->event_count++] = 'c'; + if (m->event_count < (int)sizeof m->events) m->events[m->event_count++] = 'c'; if (m->copy_calls < (int)(sizeof m->copy_dst / sizeof m->copy_dst[0])) { m->copy_dst[m->copy_calls] = dst; m->copy_src[m->copy_calls] = src; @@ -698,8 +696,7 @@ static void mock_store(CGTarget* t, Operand addr, Operand src, MemAccess macc) { static void mock_store_call_arg(CGTarget* t, const CGCallPlanMove* move) { MockCGTarget* m = (MockCGTarget*)t; - if (m->event_count < (int)sizeof m->events) - m->events[m->event_count++] = 's'; + if (m->event_count < (int)sizeof m->events) m->events[m->event_count++] = 's'; ++m->store_call_arg_calls; m->last_stack_arg = *move; } @@ -710,8 +707,10 @@ static void mock_load_call_arg(CGTarget* t, Operand dst, (void)dst; ++m->load_call_arg_calls; m->last_load_arg = *move; - if (move->src_kind == CG_CALL_PLAN_SRC_ADDR) ++m->addr_of_calls; - else ++m->load_calls; + if (move->src_kind == CG_CALL_PLAN_SRC_ADDR) + ++m->addr_of_calls; + else + ++m->load_calls; } static void mock_store_call_ret(CGTarget* t, const CGCallPlanRet* ret, @@ -879,22 +878,40 @@ typedef struct RealArchExpect { } RealArchExpect; static const RealArchExpect g_real_arch[] = { - {"x64", CFREE_ARCH_X86_64, + {"x64", + CFREE_ARCH_X86_64, {X64_RDI, X64_RSI, X64_RDX, X64_RCX, X64_R8, X64_R9, 0, 0}, - {X64_XMM0, X64_XMM1, X64_XMM2, X64_XMM3, X64_XMM4, X64_XMM5, - X64_XMM6, X64_XMM7}, - {X64_RAX, X64_RDX}, {X64_XMM0, X64_XMM1}, X64_RDI, 6, 1, 0}, - {"aa64", CFREE_ARCH_ARM_64, - {0, 1, 2, 3, 4, 5, 6, 7}, {0, 1, 2, 3, 4, 5, 6, 7}, - {0, 1}, {0, 1}, 8, 8, 0, 0}, - {"rv64", CFREE_ARCH_RV64, + {X64_XMM0, X64_XMM1, X64_XMM2, X64_XMM3, X64_XMM4, X64_XMM5, X64_XMM6, + X64_XMM7}, + {X64_RAX, X64_RDX}, + {X64_XMM0, X64_XMM1}, + X64_RDI, + 6, + 1, + 0}, + {"aa64", + CFREE_ARCH_ARM_64, + {0, 1, 2, 3, 4, 5, 6, 7}, + {0, 1, 2, 3, 4, 5, 6, 7}, + {0, 1}, + {0, 1}, + 8, + 8, + 0, + 0}, + {"rv64", + CFREE_ARCH_RV64, {RV_A0, RV_A1, RV_A2, RV_A3, RV_A4, RV_A5, RV_A6, RV_A7}, - {10, 11, 12, 13, 14, 15, 16, 17}, {RV_A0, RV_A1}, {10, 11}, - RV_A0, 8, 0, 1}, + {10, 11, 12, 13, 14, 15, 16, 17}, + {RV_A0, RV_A1}, + {10, 11}, + RV_A0, + 8, + 0, + 1}, }; -static CfreeCgTypeId record_of_i64s(TestCtx* tc, const char* name, - u64 count) { +static CfreeCgTypeId record_of_i64s(TestCtx* tc, const char* name, u64 count) { CfreeCgTypeId arr = cfree_cg_type_array(tc->c, tc->i64, count); CfreeCgField field; memset(&field, 0, sizeof field); @@ -948,9 +965,8 @@ static void expect_plan_arg(const CGCallPlan* p, u32 i, u8 dst_kind, u8 cls, Reg reg, u32 stack_offset, const char* ctx) { EXPECT(i < p->nargs, "%s missing arg %u", ctx, (unsigned)i); if (i >= p->nargs) return; - EXPECT(p->args[i].dst_kind == dst_kind, - "%s arg %u dst kind got %u want %u", ctx, (unsigned)i, - (unsigned)p->args[i].dst_kind, (unsigned)dst_kind); + EXPECT(p->args[i].dst_kind == dst_kind, "%s arg %u dst kind got %u want %u", + ctx, (unsigned)i, (unsigned)p->args[i].dst_kind, (unsigned)dst_kind); EXPECT(p->args[i].cls == cls, "%s arg %u class got %u want %u", ctx, (unsigned)i, (unsigned)p->args[i].cls, (unsigned)cls); if (dst_kind == CG_CALL_PLAN_REG) { @@ -987,10 +1003,10 @@ static void opt_machinize_uses_phys_reg_metadata(void) { static const Reg legacy_pool[] = {7}; static const Reg scratch[] = {9, 10}; static const CGPhysRegInfo phys[] = { - {13, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | - CG_REG_ARG, 0, 1}, - {19, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED | - CG_REG_RET, 50, 4}, + {13, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG, 0, + 1}, + {19, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED | CG_REG_RET, + 50, 4}, {29, RC_INT, 0xff, CG_REG_RESERVED, 0, 0}, }; mock_set_pool(&mock, RC_INT, legacy_pool, 1, scratch, 2, 0); @@ -1000,8 +1016,7 @@ static void opt_machinize_uses_phys_reg_metadata(void) { opt_machinize(f, &mock.base); EXPECT(f->opt_hard_reg_count[RC_INT] == 2, "phys metadata should replace legacy allocable pool"); - EXPECT(f->opt_hard_regs[RC_INT][0] == 13 && - f->opt_hard_regs[RC_INT][1] == 19, + EXPECT(f->opt_hard_regs[RC_INT][0] == 13 && f->opt_hard_regs[RC_INT][1] == 19, "phys allocable order should be preserved"); EXPECT((f->opt_caller_saved[RC_INT] & (1u << 13)) != 0, "caller-saved phys flag should be recorded"); @@ -1023,10 +1038,8 @@ static void opt_machinize_keeps_abi_regs_without_legacy_call_fallback(void) { mock_init(&mock, tc.c); static const Reg scratch[] = {9, 10}; static const CGPhysRegInfo phys[] = { - {2, RC_INT, 0, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | - CG_REG_ARG, 0, 0}, - {3, RC_INT, 0, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | - CG_REG_RET, 0, 0}, + {2, RC_INT, 0, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG, 0, 0}, + {3, RC_INT, 0, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_RET, 0, 0}, {12, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0}, {19, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4}, }; @@ -1049,12 +1062,10 @@ static void opt_machinize_keeps_abi_regs_without_legacy_call_fallback(void) { opt_machinize(f, &mock.base); EXPECT(mock.plan_call_count == 1, "call should be planned before filtering"); - EXPECT(aux->use_plan_replay, - "stack-arg call should use planned replay"); + EXPECT(aux->use_plan_replay, "stack-arg call should use planned replay"); EXPECT(f->opt_hard_reg_count[RC_INT] == 4, "planned stack calls should keep ABI arg/ret regs allocable"); - EXPECT(f->opt_hard_regs[RC_INT][0] == 2 && - f->opt_hard_regs[RC_INT][1] == 3 && + EXPECT(f->opt_hard_regs[RC_INT][0] == 2 && f->opt_hard_regs[RC_INT][1] == 3 && f->opt_hard_regs[RC_INT][2] == 12 && f->opt_hard_regs[RC_INT][3] == 19, "all non-reserved regs should remain allocable under planned calls"); @@ -1073,10 +1084,8 @@ static void opt_machinize_keeps_abi_regs_for_incoming_params(void) { mock_init(&mock, tc.c); static const Reg scratch[] = {9, 10}; static const CGPhysRegInfo phys[] = { - {2, RC_INT, 0, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | - CG_REG_ARG, 0, 0}, - {3, RC_INT, 0, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | - CG_REG_RET, 0, 0}, + {2, RC_INT, 0, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG, 0, 0}, + {3, RC_INT, 0, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_RET, 0, 0}, {12, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED, 0, 0}, {19, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4}, }; @@ -1088,8 +1097,7 @@ static void opt_machinize_keeps_abi_regs_for_incoming_params(void) { opt_machinize(f, &mock.base); EXPECT(f->opt_hard_reg_count[RC_INT] == 4, "incoming params should not suppress ABI arg/ret regs"); - EXPECT(f->opt_hard_regs[RC_INT][0] == 2 && - f->opt_hard_regs[RC_INT][1] == 3 && + EXPECT(f->opt_hard_regs[RC_INT][0] == 2 && f->opt_hard_regs[RC_INT][1] == 3 && f->opt_hard_regs[RC_INT][2] == 12 && f->opt_hard_regs[RC_INT][3] == 19, "param functions should keep all non-reserved regs allocable"); @@ -1113,8 +1121,7 @@ static void real_arch_call_plan_layout_one(const RealArchExpect* ex) { CGCallDesc d = make_call_desc(&tc, fn, args, 2, ret); CGCallPlan p; target->plan_call(target, &d, &p); - EXPECT(p.nargs == 2, "%s scalar nargs got %u", ex->name, - (unsigned)p.nargs); + EXPECT(p.nargs == 2, "%s scalar nargs got %u", ex->name, (unsigned)p.nargs); expect_plan_arg(&p, 0, CG_CALL_PLAN_REG, RC_INT, ex->int_arg[0], 0, ex->name); expect_plan_arg(&p, 1, CG_CALL_PLAN_REG, RC_INT, ex->int_arg[1], 0, @@ -1131,22 +1138,17 @@ static void real_arch_call_plan_layout_one(const RealArchExpect* ex) { const ABIFuncInfo* abi = abi_cg_func_info(tc.c->abi, fn); CGABIValue args[2]; args[0] = - call_arg_value(tc.f64, &abi->params[0], - op_reg_cls_(4, tc.f64, RC_FP)); + call_arg_value(tc.f64, &abi->params[0], op_reg_cls_(4, tc.f64, RC_FP)); args[1] = - call_arg_value(tc.f64, &abi->params[1], - op_reg_cls_(5, tc.f64, RC_FP)); + call_arg_value(tc.f64, &abi->params[1], op_reg_cls_(5, tc.f64, RC_FP)); CGABIValue ret = call_arg_value(tc.f64, &abi->ret, op_reg_cls_(6, tc.f64, RC_FP)); CGCallDesc d = make_call_desc(&tc, fn, args, 2, ret); CGCallPlan p; target->plan_call(target, &d, &p); - EXPECT(p.nargs == 2, "%s fp nargs got %u", ex->name, - (unsigned)p.nargs); - expect_plan_arg(&p, 0, CG_CALL_PLAN_REG, RC_FP, ex->fp_arg[0], 0, - ex->name); - expect_plan_arg(&p, 1, CG_CALL_PLAN_REG, RC_FP, ex->fp_arg[1], 0, - ex->name); + EXPECT(p.nargs == 2, "%s fp nargs got %u", ex->name, (unsigned)p.nargs); + expect_plan_arg(&p, 0, CG_CALL_PLAN_REG, RC_FP, ex->fp_arg[0], 0, ex->name); + expect_plan_arg(&p, 1, CG_CALL_PLAN_REG, RC_FP, ex->fp_arg[1], 0, ex->name); expect_plan_ret(&p, 0, RC_FP, ex->fp_ret[0], ex->name); } @@ -1157,24 +1159,20 @@ static void real_arch_call_plan_layout_one(const RealArchExpect* ex) { CGABIValue args[4]; args[0] = call_arg_value(tc.i64, &abi->params[0], op_reg_(7, tc.i64)); args[1] = - call_arg_value(tc.f64, &abi->params[1], - op_reg_cls_(8, tc.f64, RC_FP)); + call_arg_value(tc.f64, &abi->params[1], op_reg_cls_(8, tc.f64, RC_FP)); args[2] = call_arg_value(tc.i64, &abi->params[2], op_reg_(9, tc.i64)); args[3] = - call_arg_value(tc.f64, &abi->params[3], - op_reg_cls_(10, tc.f64, RC_FP)); + call_arg_value(tc.f64, &abi->params[3], op_reg_cls_(10, tc.f64, RC_FP)); CGABIValue ret = call_arg_value(tc.i64, &abi->ret, op_reg_(11, tc.i64)); CGCallDesc d = make_call_desc(&tc, fn, args, 4, ret); CGCallPlan p; target->plan_call(target, &d, &p); expect_plan_arg(&p, 0, CG_CALL_PLAN_REG, RC_INT, ex->int_arg[0], 0, ex->name); - expect_plan_arg(&p, 1, CG_CALL_PLAN_REG, RC_FP, ex->fp_arg[0], 0, - ex->name); + expect_plan_arg(&p, 1, CG_CALL_PLAN_REG, RC_FP, ex->fp_arg[0], 0, ex->name); expect_plan_arg(&p, 2, CG_CALL_PLAN_REG, RC_INT, ex->int_arg[1], 0, ex->name); - expect_plan_arg(&p, 3, CG_CALL_PLAN_REG, RC_FP, ex->fp_arg[1], 0, - ex->name); + expect_plan_arg(&p, 3, CG_CALL_PLAN_REG, RC_FP, ex->fp_arg[1], 0, ex->name); } { @@ -1188,14 +1186,13 @@ static void real_arch_call_plan_layout_one(const RealArchExpect* ex) { CGCallPlan p; target->plan_call(target, &d, &p); EXPECT(p.has_sret, "%s sret plan should be marked", ex->name); - expect_plan_arg(&p, 0, CG_CALL_PLAN_REG, RC_INT, ex->sret_reg, 0, - ex->name); + expect_plan_arg(&p, 0, CG_CALL_PLAN_REG, RC_INT, ex->sret_reg, 0, ex->name); EXPECT(p.args[0].src_kind == CG_CALL_PLAN_SRC_ADDR, "%s sret hidden pointer should materialize an address", ex->name); - expect_plan_arg(&p, 1, CG_CALL_PLAN_REG, RC_INT, - ex->arch == CFREE_ARCH_ARM_64 ? ex->int_arg[0] - : ex->int_arg[1], - 0, ex->name); + expect_plan_arg( + &p, 1, CG_CALL_PLAN_REG, RC_INT, + ex->arch == CFREE_ARCH_ARM_64 ? ex->int_arg[0] : ex->int_arg[1], 0, + ex->name); } { @@ -1225,9 +1222,8 @@ static void real_arch_call_plan_layout_one(const RealArchExpect* ex) { ex->name); } if (ex->variadic_fp_counted) - EXPECT(p.variadic_fp_count == 1, - "%s variadic FP count got %u want 1", ex->name, - (unsigned)p.variadic_fp_count); + EXPECT(p.variadic_fp_count == 1, "%s variadic FP count got %u want 1", + ex->name, (unsigned)p.variadic_fp_count); } { @@ -1258,13 +1254,11 @@ static void real_arch_call_plan_layout_one(const RealArchExpect* ex) { for (u32 i = 0; i < 10; ++i) args[i] = call_arg_value(tc.f128, &abi->params[i], op_local_((FrameSlot)(20 + i), tc.f128)); - CGABIValue ret = call_arg_value(tc.f128, &abi->ret, - op_local_(30, tc.f128)); + CGABIValue ret = call_arg_value(tc.f128, &abi->ret, op_local_(30, tc.f128)); CGCallDesc d = make_call_desc(&tc, fn, args, 10, ret); CGCallPlan p; target->plan_call(target, &d, &p); - EXPECT(p.stack_arg_size == 32, - "aa64 f128 stack-arg size got %u want 32", + EXPECT(p.stack_arg_size == 32, "aa64 f128 stack-arg size got %u want 32", (unsigned)p.stack_arg_size); expect_plan_arg(&p, 8, CG_CALL_PLAN_STACK, RC_FP, 0, 0, "aa64 f128"); expect_plan_arg(&p, 9, CG_CALL_PLAN_STACK, RC_FP, 0, 16, "aa64 f128"); @@ -1285,8 +1279,7 @@ static void opt_regalloc_prefers_caller_saved_for_non_call_value(void) { mock_init(&mock, tc.c); static const Reg scratch[] = {9, 10}; static const CGPhysRegInfo phys[] = { - {2, RC_INT, 0, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | - CG_REG_ARG, 0, 0}, + {2, RC_INT, 0, CG_REG_ALLOCABLE | CG_REG_CALLER_SAVED | CG_REG_ARG, 0, 0}, {19, RC_INT, 0xff, CG_REG_ALLOCABLE | CG_REG_CALLEE_SAVED, 50, 4}, }; mock_set_pool(&mock, RC_INT, NULL, 0, scratch, 2, 0); @@ -1380,10 +1373,10 @@ static void opt_liveness_branch(void) { EXPECT(opt_bitset_has(&live.blocks[b0].live_out, v), "v%u live_out of branch block", v); - EXPECT(opt_bitset_has(&live.blocks[b1].live_in, v), - "v%u live_in true block", v); - EXPECT(opt_bitset_has(&live.blocks[b2].live_in, v), - "v%u live_in false block", v); + EXPECT(opt_bitset_has(&live.blocks[b1].live_in, v), "v%u live_in true block", + v); + EXPECT(opt_bitset_has(&live.blocks[b2].live_in, v), "v%u live_in false block", + v); EXPECT(!opt_bitset_has(&live.blocks[b1].live_out, v), "v%u dies in true block", v); tc_fini(&tc); @@ -1675,8 +1668,7 @@ static void opt_loop_frequency_weights_ranges(void) { EXPECT(ranges.use_freq_by_val[loop_v] > ranges.use_freq_by_val[exit_v], "loop-used value should have higher weighted use frequency"); - EXPECT(ranges.spill_cost_by_val[loop_v] > - ranges.spill_cost_by_val[exit_v], + EXPECT(ranges.spill_cost_by_val[loop_v] > ranges.spill_cost_by_val[exit_v], "loop-used value should have higher spill cost"); tc_fini(&tc); } @@ -1869,10 +1861,14 @@ static void opt_analysis_dominators_and_frontier(void) { emit_ret_val(f, dead, v, tc.i32); opt_build_cfg(f); + EXPECT(opt_analysis_has(f, OPT_ANALYSIS_CFG), + "build_cfg should mark CFG analysis valid"); opt_verify(f, "analysis-test"); OptAnalysis a; memset(&a, 0, sizeof a); opt_analysis_build_dom_frontier(f, &a); + EXPECT(opt_analysis_has(f, OPT_ANALYSIS_DOM), + "dominator build should mark dominators valid"); EXPECT(a.npo == 4, "analysis should visit four reachable blocks, got %u", (unsigned)a.npo); @@ -1884,8 +1880,7 @@ static void opt_analysis_dominators_and_frontier(void) { EXPECT(a.idom[then_b] == entry, "then idom should be entry"); EXPECT(a.idom[else_b] == entry, "else idom should be entry"); EXPECT(a.idom[join] == entry, "join idom should be entry"); - EXPECT(opt_analysis_dominates(&a, entry, join), - "entry should dominate join"); + EXPECT(opt_analysis_dominates(&a, entry, join), "entry should dominate join"); EXPECT(!opt_analysis_dominates(&a, then_b, else_b), "then should not dominate else"); EXPECT(block_list_has(&a.dom_frontier[then_b], join), @@ -2087,6 +2082,92 @@ static void opt_ssa_conventional_splits_critical_edge(void) { tc_fini(&tc); } +static void stale_verify_arg(void* arg) { + opt_verify((Func*)arg, "stale-def-use-test"); +} + +static void opt_verify_catches_stale_def_use(void) { + TestCtx tc; + tc_init(&tc); + Func* f = new_func(&tc); + Val a = add_val(f, tc.i32); + Val b = add_val(f, tc.i32); + emit_load_imm(f, f->entry, a, tc.i32, 1); + emit_load_imm(f, f->entry, b, tc.i32, 2); + emit_ret_val(f, f->entry, a, tc.i32); + opt_build_cfg(f); + opt_rebuild_def_use(f); + Inst* ret = &f->blocks[f->entry].insts[f->blocks[f->entry].ninsts - 1u]; + IRRetAux* aux = (IRRetAux*)ret->extra.aux; + aux->val.storage.v.reg = (Reg)b; + EXPECT(expect_panic(tc.c, stale_verify_arg, f), + "verifier should catch stale cached def-use after mutation"); + tc_fini(&tc); +} + +static void opt_ssa_dce_removes_dead_defs_and_phi(void) { + TestCtx tc; + tc_init(&tc); + Func* f = new_func(&tc); + FrameSlot fs = add_frame_slot(f, tc.i32, FS_LOCAL, 4, 0); + u32 entry = f->entry; + u32 then_b = ir_block_new(f); + u32 else_b = ir_block_new(f); + u32 join = ir_block_new(f); + ir_note_emit(f, then_b); + ir_note_emit(f, else_b); + ir_note_emit(f, join); + + Val keep = add_val(f, tc.i32); + Val dead = add_val(f, tc.i32); + Val tv = add_val(f, tc.i32); + Val ev = add_val(f, tc.i32); + emit_load_imm(f, entry, keep, tc.i32, 42); + emit_load_imm(f, entry, dead, tc.i32, 99); + emit_test_branch(f, entry, then_b, else_b, tc.i32); + emit_load_imm(f, then_b, tv, tc.i32, 11); + emit_store_local(f, then_b, fs, tv, tc.i32, 0); + emit_br_to(f, then_b, join); + emit_load_imm(f, else_b, ev, tc.i32, 22); + emit_store_local(f, else_b, fs, ev, tc.i32, 0); + emit_br_to(f, else_b, join); + emit_ret_val(f, join, keep, tc.i32); + + opt_build_cfg(f); + opt_build_ssa(f); + EXPECT(count_op(f, IR_PHI) == 1, "test should build an unused phi"); + opt_ssa_dce(f); + opt_verify(f, "test-ssa-dce"); + EXPECT(count_op(f, IR_PHI) == 0, "SSA DCE should remove unused phi"); + EXPECT(count_uses_of(f, dead) == 0, "dead value should have no users"); + EXPECT(count_op(f, IR_LOAD_IMM) == 1, + "SSA DCE should remove dead pure load_imm defs"); + tc_fini(&tc); +} + +static void opt_copy_cleanup_rewrites_users(void) { + TestCtx tc; + tc_init(&tc); + Func* f = new_func(&tc); + Val a = add_val(f, tc.i32); + Val b = add_val(f, tc.i32); + Val c = add_val(f, tc.i32); + emit_load_imm(f, f->entry, a, tc.i32, 42); + emit_copy(f, f->entry, b, a, tc.i32); + emit_copy(f, f->entry, c, b, tc.i32); + emit_ret_val(f, f->entry, c, tc.i32); + opt_build_cfg(f); + opt_rebuild_def_use(f); + opt_copy_cleanup(f); + opt_verify(f, "test-copy-cleanup"); + Inst* ret = &f->blocks[f->entry].insts[f->blocks[f->entry].ninsts - 1u]; + IRRetAux* aux = (IRRetAux*)ret->extra.aux; + EXPECT(count_op(f, IR_COPY) == 0, "copy cleanup should remove copy chain"); + EXPECT(aux && aux->val.storage.v.reg == (Reg)a, + "copy cleanup should rewrite final user to original value"); + tc_fini(&tc); +} + static void opt_jump_cleanup_forwards_branch_targets(void) { TestCtx tc; tc_init(&tc); @@ -3432,9 +3513,9 @@ static void opt_planned_call_replay_preserves_indirect_callee_arg_reg(void) { "callee-in-arg-register hazard should copy callee plus arg"); EXPECT(mock.copy_dst[0].v.reg == 9 && mock.copy_src[0].v.reg == 1, "callee should be saved before arg setup overwrites its register"); - EXPECT(mock.last_plan_callee.kind == OPK_REG && - mock.last_plan_callee.v.reg == 9, - "emit_call_plan should receive scratch callee register"); + EXPECT( + mock.last_plan_callee.kind == OPK_REG && mock.last_plan_callee.v.reg == 9, + "emit_call_plan should receive scratch callee register"); tc_fini(&tc); } @@ -3506,8 +3587,7 @@ static void opt_planned_call_replay_materializes_address_args(void) { "sret-shaped plans should use planned replay"); EXPECT(mock.addr_of_calls == 1, "register address arg should materialize with addr_of"); - EXPECT(mock.load_calls == 0, - "address args should not be loaded as values"); + EXPECT(mock.load_calls == 0, "address args should not be loaded as values"); EXPECT(mock.store_call_arg_calls == 1 && mock.last_stack_arg.src_kind == CG_CALL_PLAN_SRC_ADDR, "stack address arg should stay marked as an address"); @@ -3548,9 +3628,9 @@ static void opt_planned_call_replay_resolves_return_reg_collision(void) { EXPECT(mock.copy_calls == 3, "two-register return collision should need three copies, got %d", mock.copy_calls); - EXPECT(mock.event_count >= 4 && mock.events[0] == 'p' && - mock.events[1] == 'c', - "return copies should occur after the planned call branch"); + EXPECT( + mock.event_count >= 4 && mock.events[0] == 'p' && mock.events[1] == 'c', + "return copies should occur after the planned call branch"); EXPECT(mock.copy_dst[0].v.reg == 9 && mock.copy_src[0].v.reg == 1, "return cycle should save first return register to scratch"); EXPECT(mock.copy_dst[1].v.reg == 1 && mock.copy_src[1].v.reg == 2, @@ -3592,9 +3672,10 @@ static void opt_planned_call_replay_stores_stack_sources_before_clobber(void) { "stack-source hazard call should use emit_call_plan"); EXPECT(mock.store_call_arg_calls == 1 && mock.copy_calls == 1, "expected one stack store and one register arg copy"); - EXPECT(mock.event_count >= 3 && mock.events[0] == 's' && - mock.events[1] == 'c' && mock.events[2] == 'p', - "stack arg source should be stored before its source reg is clobbered"); + EXPECT( + mock.event_count >= 3 && mock.events[0] == 's' && mock.events[1] == 'c' && + mock.events[2] == 'p', + "stack arg source should be stored before its source reg is clobbered"); EXPECT(mock.last_stack_arg.src.kind == OPK_REG && mock.last_stack_arg.src.v.reg == 1, "stack arg should use the original source register"); @@ -3889,7 +3970,8 @@ static void opt_known_frame_keeps_frame_for_slot_call_and_alloca(void) { mock_set_pool(&alloca_mock, RC_INT, pool, 3, scratch, 2, 0x4007FFFFu); CGTarget* alloca_opt = opt_cgtarget_new(tc.c, &alloca_mock.base, 1); begin_mock_opt_func(&tc, alloca_opt, tc.i32); - alloca_opt->alloca_(alloca_opt, op_reg_(1, cfree_cg_type_ptr(tc.c, tc.i32, 0)), + alloca_opt->alloca_(alloca_opt, + op_reg_(1, cfree_cg_type_ptr(tc.c, tc.i32, 0)), op_imm_(16, tc.i32), 16); alloca_opt->load_imm(alloca_opt, op_reg_(2, tc.i32), 5); CGABIValue aret = {0}; @@ -4127,6 +4209,9 @@ int main(void) { opt_ssa_loop_carried_phi(); opt_ssa_non_promotable_slots_stay_memory(); opt_ssa_conventional_splits_critical_edge(); + opt_verify_catches_stale_def_use(); + opt_ssa_dce_removes_dead_defs_and_phi(); + opt_copy_cleanup_rewrites_users(); opt_jump_cleanup_forwards_branch_targets(); opt_jump_cleanup_inverts_to_remove_jump_block(); opt_jump_cleanup_keeps_conditional_fallthrough_block();