kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

pass_o2.c (79713B)


      1 #include <kit/cg.h>
      2 #include <string.h>
      3 
      4 #include "cg/ir_eval.h"
      5 #include "opt/opt_internal.h"
      6 
      7 #define OPT_BLOCK_NONE 0xffffffffu
      8 #define OPT_CLONE_MAX_BLOCK_INSTS 4u
      9 #define OPT_CLONE_MAX_PREDS 4u
     10 #define OPT_CLONE_ABS_GROWTH_LIMIT 32u
     11 #define GVN_ENTRY_NONE 0xffffffffu
     12 #define DSE_KEY_NONE 0xffffffffu
     13 
     14 void opt_cleanup(Func* f) {
     15   if (!f) return;
     16   opt_build_cfg(f);
     17   opt_jump_cleanup(f, OPT_JUMP_CLEANUP_CFG);
     18   opt_build_cfg(f);
     19   opt_verify(f, "o2-pre-ssa-cfg");
     20   opt_build_reg_ssa(f);
     21   opt_verify(f, "o2-reg-ssa");
     22   opt_block_cloning(f);
     23   opt_verify(f, "o2-block-clone-cfg");
     24   opt_build_ssa(f);
     25   opt_verify(f, "o2-ssa");
     26   opt_ssa_dce(f);
     27   opt_copy_cleanup(f);
     28   opt_verify(f, "o2-pre-addr-cleanup");
     29   opt_addr_xform(f);
     30   opt_verify(f, "o2-addr-xform-ssa");
     31   opt_ssa_dce(f);
     32   opt_verify(f, "o2-addr-xform-dce");
     33   opt_copy_cleanup(f);
     34   opt_verify(f, "o2-addr-xform-copy-cleanup");
     35   opt_simplify(f);
     36   opt_verify(f, "o2-simplify-ssa");
     37   opt_ssa_dce(f);
     38   opt_copy_cleanup(f);
     39   opt_verify(f, "o2-simplify-cleanup");
     40   opt_gvn(f);
     41   opt_verify(f, "o2-gvn-ssa");
     42   opt_copy_prop(f);
     43   opt_verify(f, "o2-copy-prop-ssa");
     44   opt_ssa_dce(f);
     45   opt_verify(f, "o2-copy-dce");
     46   opt_dse(f);
     47   opt_verify(f, "o2-dse-ssa");
     48   opt_ssa_dce(f);
     49   opt_verify(f, "o2-dse-dce");
     50   opt_build_loop_tree(f);
     51   opt_licm(f);
     52   opt_verify(f, "o2-licm-ssa");
     53   opt_pressure_relief(f);
     54   opt_verify(f, "o2-pressure-relief-ssa");
     55   opt_make_conventional_ssa(f);
     56   opt_verify(f, "o2-conventional-ssa");
     57   opt_ssa_combine(f);
     58   opt_verify(f, "o2-ssa-combine");
     59   opt_undo_ssa(f);
     60   opt_copy_cleanup(f);
     61   opt_verify(f, "o2-undo-copy-cleanup");
     62   opt_jump_opt(f);
     63   opt_verify(f, "o2-jump-opt");
     64 }
     65 
     66 typedef struct CloneValMap {
     67   Val old_val;
     68   Val new_val;
     69 } CloneValMap;
     70 
     71 typedef struct GvnConst {
     72   i64 value;
     73   u8 valid;
     74 } GvnConst;
     75 
     76 typedef struct GvnOperandKey {
     77   u8 kind;
     78   u8 cls;
     79   u16 pad;
     80   KitCgTypeId type;
     81   union {
     82     Val reg;
     83     i64 imm;
     84     struct {
     85       Val base;
     86       Val index;
     87       i32 ofs;
     88       u8 log2_scale;
     89       u8 pad[3];
     90     } ind;
     91   } v;
     92 } GvnOperandKey;
     93 
     94 typedef struct GvnMemKey {
     95   u8 valid;
     96   u8 root_kind;
     97   u8 has_addr;
     98   u8 pad0;
     99   u16 addr_space;
    100   u16 flags;
    101   KitCgTypeId mem_type;
    102   u32 size;
    103   u32 align;
    104   i64 root_id;
    105   i64 offset;
    106   u32 root_version;
    107   u32 unknown_version;
    108 } GvnMemKey;
    109 
    110 typedef struct GvnKey {
    111   u16 op;
    112   u16 nops;
    113   KitCgTypeId type;
    114   u8 cls;
    115   u8 pad[3];
    116   i64 tag;
    117   GvnMemKey mem;
    118   GvnOperandKey ops[2];
    119 } GvnKey;
    120 
    121 typedef struct GvnEntry {
    122   GvnKey key;
    123   Val val;
    124   u32 block;
    125   u32 inst;
    126   u32 next;
    127 } GvnEntry;
    128 
    129 typedef struct GvnTable {
    130   Func* f;
    131   u32* buckets;
    132   u32 nbuckets;
    133   GvnEntry* entries;
    134   u32 nentries;
    135   u32 cap;
    136 } GvnTable;
    137 
    138 typedef struct GvnMemVersion {
    139   u8 root_kind;
    140   u8 pad0;
    141   u16 addr_space;
    142   i64 root_id;
    143   u32 version;
    144 } GvnMemVersion;
    145 
    146 typedef struct GvnMemAvail {
    147   GvnKey key;
    148   Val val;
    149 } GvnMemAvail;
    150 
    151 typedef struct GvnMemState {
    152   GvnMemVersion* roots;
    153   u32 nroots;
    154   u32 cap;
    155   GvnMemAvail* avail;
    156   u32 navail;
    157   u32 cap_avail;
    158   u32 unknown_version;
    159 } GvnMemState;
    160 
    161 typedef struct GvnBlockMemState {
    162   GvnMemState in;
    163   GvnMemState out;
    164   u8 out_valid;
    165 } GvnBlockMemState;
    166 
    167 typedef struct GvnCtx {
    168   Func* f;
    169   OptAnalysis* analysis;
    170   Val* parent;
    171   GvnConst* constants;
    172   GvnTable table;
    173   GvnMemState mem;
    174   GvnBlockMemState* block_mem;
    175   u8* local_escaped;
    176   int changed;
    177   int cfg_changed;
    178 } GvnCtx;
    179 
    180 typedef struct DseKey {
    181   u8 root_kind;
    182   u8 pad0;
    183   u16 addr_space;
    184   u16 flags;
    185   u16 pad1;
    186   KitCgTypeId mem_type;
    187   u32 size;
    188   u32 align;
    189   i64 root_id;
    190   i64 offset;
    191 } DseKey;
    192 
    193 typedef struct DseBitset {
    194   u64* words;
    195 } DseBitset;
    196 
    197 typedef struct DseBlockState {
    198   DseBitset in;
    199   DseBitset out;
    200 } DseBlockState;
    201 
    202 typedef struct DseCtx {
    203   Func* f;
    204   GvnCtx gvn;
    205   DseKey* keys;
    206   u32 nkeys;
    207   u32 cap_keys;
    208   u32** store_key_by_inst;
    209   DseBlockState* block;
    210   DseBitset scratch_in;
    211   DseBitset scratch_out;
    212   u8* local_escaped;
    213   u32 words;
    214   int changed;
    215 } DseCtx;
    216 
    217 typedef struct LicmLoop {
    218   u32 header;
    219   u32 preheader;
    220   u8* body;
    221 } LicmLoop;
    222 
    223 typedef struct PressureBlockPlan {
    224   u8* move_src;
    225   u32* first_before;
    226   u32* last_before;
    227   u32* next_move;
    228 } PressureBlockPlan;
    229 
    230 typedef struct PressurePlan {
    231   PressureBlockPlan* blocks;
    232   u32 nmoves;
    233 } PressurePlan;
    234 
    235 static u32 opt_inst_count(Func* f) {
    236   u32 n = 0;
    237   for (u32 b = 0; b < f->nblocks; ++b) n += f->blocks[b].ninsts;
    238   return n;
    239 }
    240 
    241 static int o2_is_terminator(const Inst* in) {
    242   switch ((IROp)in->op) {
    243     case IR_BR:
    244     case IR_CONDBR:
    245     case IR_CMP_BRANCH:
    246     case IR_SWITCH:
    247     case IR_INDIRECT_BRANCH:
    248     case IR_RET:
    249     case IR_UNREACHABLE:
    250     case IR_BREAK_TO:
    251     case IR_CONTINUE_TO:
    252       return 1;
    253     case IR_INTRINSIC: {
    254       IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux;
    255       return aux && (aux->kind == INTRIN_LONGJMP || aux->kind == INTRIN_TRAP);
    256     }
    257     default:
    258       return 0;
    259   }
    260 }
    261 
    262 static int o2_cloneable_inst(const Inst* in) {
    263   if (in->ndefs) return 0;
    264   switch ((IROp)in->op) {
    265     case IR_NOP:
    266     case IR_LOAD_IMM:
    267     case IR_LOAD_CONST:
    268     case IR_COPY:
    269     case IR_LOAD:
    270     case IR_STORE:
    271     case IR_ADDR_OF:
    272     case IR_BINOP:
    273     case IR_UNOP:
    274     case IR_CMP:
    275     case IR_CONVERT:
    276     case IR_BR:
    277     case IR_RET:
    278       return 1;
    279     default:
    280       return 0;
    281   }
    282 }
    283 
    284 static int block_has_phi(Func* f, u32 b) {
    285   if (b >= f->nblocks) return 0;
    286   Block* bl = &f->blocks[b];
    287   return bl->ninsts && (IROp)bl->insts[0].op == IR_PHI;
    288 }
    289 
    290 static int block_defs_are_local(Func* f, u32 b) {
    291   Block* bl = &f->blocks[b];
    292   opt_rebuild_def_use(f);
    293   for (u32 i = 0; i < bl->ninsts; ++i) {
    294     Inst* in = &bl->insts[i];
    295     if (in->def == VAL_NONE) continue;
    296     for (u32 u = f->opt_first_use_by_val[in->def]; u != OPT_USE_NONE;
    297          u = f->opt_uses[u].next_for_val) {
    298       if (f->opt_uses[u].block != b) return 0;
    299     }
    300   }
    301   return 1;
    302 }
    303 
    304 static int block_defines_val(const Block* bl, Val v) {
    305   for (u32 i = 0; i < bl->ninsts; ++i) {
    306     const Inst* in = &bl->insts[i];
    307     if (in->def == v) return 1;
    308     for (u32 d = 0; d < in->ndefs; ++d)
    309       if (in->defs[d] == v) return 1;
    310   }
    311   return 0;
    312 }
    313 
    314 static int block_external_uses_dominated(Func* f, const OptAnalysis* a, u32 b) {
    315   Block* bl = &f->blocks[b];
    316   opt_rebuild_def_use(f);
    317   for (u32 i = 0; i < bl->ninsts; ++i) {
    318     for (u32 u = 0; u < f->opt_nuses; ++u) {
    319       OptUse* use = &f->opt_uses[u];
    320       if (use->block != b || use->inst != i) continue;
    321       Val v = use->val;
    322       if (block_defines_val(bl, v)) continue;
    323       if (v == VAL_NONE || v >= f->nvals) return 0;
    324       if (!opt_analysis_dominates(a, f->val_def_block[v], b)) return 0;
    325     }
    326   }
    327   return 1;
    328 }
    329 
    330 static int clone_candidate(Func* f, const OptAnalysis* a, u32 block) {
    331   if (block >= f->nblocks || block == f->entry) return 0;
    332   Block* bl = &f->blocks[block];
    333   if (!a->reachable || !a->reachable[block]) return 0;
    334   if (bl->npreds < 2 || bl->npreds > OPT_CLONE_MAX_PREDS) return 0;
    335   if (bl->ninsts == 0 || bl->ninsts > OPT_CLONE_MAX_BLOCK_INSTS) return 0;
    336   if (bl->nsucc > 1) return 0;
    337   if (bl->loop_depth) return 0;
    338   if (block_has_phi(f, block)) return 0;
    339   if (bl->nsucc == 1 && block_has_phi(f, bl->succ[0])) return 0;
    340   for (u32 i = 0; i < bl->ninsts; ++i) {
    341     if (!o2_cloneable_inst(&bl->insts[i])) return 0;
    342     if (i + 1u != bl->ninsts && o2_is_terminator(&bl->insts[i])) return 0;
    343   }
    344   for (u32 p = 0; p < bl->npreds; ++p) {
    345     u32 pred = bl->preds[p];
    346     if (pred >= f->nblocks) return 0;
    347     if (f->blocks[pred].loop_depth) return 0;
    348     if (opt_analysis_dominates(a, block, pred)) return 0;
    349   }
    350   return block_defs_are_local(f, block) &&
    351          block_external_uses_dominated(f, a, block);
    352 }
    353 
    354 static Val map_lookup(const CloneValMap* map, u32 nmap, Val v) {
    355   for (u32 i = 0; i < nmap; ++i)
    356     if (map[i].old_val == v) return map[i].new_val;
    357   return VAL_NONE;
    358 }
    359 
    360 static void remap_operand(Func* f, Inst* in, Operand* op, int is_def,
    361                           void* arg) {
    362   (void)in;
    363   (void)is_def;
    364   CloneValMap* map = (CloneValMap*)arg;
    365   u32 nmap = map[0].old_val;
    366   if (op->kind != OPK_REG) return;
    367   Val repl = map_lookup(map + 1, nmap, (Val)op->v.reg);
    368   if (repl == VAL_NONE) return;
    369   op->v.reg = (Reg)repl;
    370   op->type = f->val_type[repl];
    371   op->cls = f->val_cls[repl];
    372 }
    373 
    374 static IRRetAux* clone_ret_aux(Func* f, const IRRetAux* src) {
    375   if (!src) return NULL;
    376   IRRetAux* dst = arena_znew(f->arena, IRRetAux);
    377   *dst = *src;
    378   if (src->val.nparts) {
    379     CGABIPart* parts = arena_array(f->arena, CGABIPart, src->val.nparts);
    380     memcpy(parts, src->val.parts, sizeof(parts[0]) * src->val.nparts);
    381     dst->val.parts = parts;
    382   }
    383   return dst;
    384 }
    385 
    386 static void clone_inst_aux(Func* f, Inst* dst, const Inst* src) {
    387   if ((IROp)src->op == IR_RET) {
    388     dst->extra.aux = clone_ret_aux(f, (const IRRetAux*)src->extra.aux);
    389   }
    390 }
    391 
    392 static u32 emit_order_pos(Func* f, u32 block) {
    393   for (u32 i = 0; i < f->emit_order_n; ++i)
    394     if (f->emit_order[i] == block) return i;
    395   return OPT_BLOCK_NONE;
    396 }
    397 
    398 static void emit_order_insert_after(Func* f, u32 after, u32 block) {
    399   if (emit_order_pos(f, block) != OPT_BLOCK_NONE) return;
    400   if (f->emit_order_n == f->emit_order_cap) {
    401     u32 ncap = f->emit_order_cap ? f->emit_order_cap * 2u : 8u;
    402     u32* order = arena_array(f->arena, u32, ncap);
    403     if (f->emit_order)
    404       memcpy(order, f->emit_order, sizeof(order[0]) * f->emit_order_n);
    405     f->emit_order = order;
    406     f->emit_order_cap = ncap;
    407   }
    408   u32 pos = f->emit_order_n;
    409   u32 after_pos = emit_order_pos(f, after);
    410   if (after_pos != OPT_BLOCK_NONE) pos = after_pos + 1u;
    411   for (u32 i = f->emit_order_n; i > pos; --i)
    412     f->emit_order[i] = f->emit_order[i - 1u];
    413   f->emit_order[pos] = block;
    414   ++f->emit_order_n;
    415 }
    416 
    417 static void replace_succ_ref(Func* f, u32 pred, u32 old_succ, u32 new_succ) {
    418   Block* bl = &f->blocks[pred];
    419   for (u32 s = 0; s < bl->nsucc; ++s)
    420     if (bl->succ[s] == old_succ) bl->succ[s] = new_succ;
    421   if (!bl->ninsts) return;
    422   Inst* term = &bl->insts[bl->ninsts - 1u];
    423   if ((IROp)term->op == IR_SWITCH) {
    424     IRSwitchAux* aux = (IRSwitchAux*)term->extra.aux;
    425     if (!aux) return;
    426     for (u32 i = 0; i < aux->ncases; ++i)
    427       if (aux->cases[i].block == old_succ) aux->cases[i].block = new_succ;
    428     if (aux->default_block == old_succ) aux->default_block = new_succ;
    429   } else if ((IROp)term->op == IR_INDIRECT_BRANCH) {
    430     IRIndirectAux* aux = (IRIndirectAux*)term->extra.aux;
    431     if (!aux) return;
    432     for (u32 i = 0; i < aux->ntargets; ++i)
    433       if (aux->targets[i] == old_succ) aux->targets[i] = new_succ;
    434   }
    435 }
    436 
    437 static u32 clone_block_for_pred(Func* f, u32 block, u32 pred) {
    438   Block* src = &f->blocks[block];
    439   u32 nb = ir_block_new(f);
    440   Block* dst = &f->blocks[nb];
    441   ir_block_set_nsucc(f, nb, src->nsucc);
    442   for (u32 s = 0; s < src->nsucc; ++s) dst->succ[s] = src->succ[s];
    443   dst->loop_depth = src->loop_depth;
    444   dst->frequency = src->frequency;
    445 
    446   CloneValMap* map = arena_zarray(f->arena, CloneValMap, src->ninsts + 1u);
    447   u32 nmap = 0;
    448   for (u32 i = 0; i < src->ninsts; ++i) {
    449     Val old = src->insts[i].def;
    450     if (old == VAL_NONE) continue;
    451     map[++nmap].old_val = old;
    452     map[nmap].new_val = ir_alloc_val(f, f->val_type[old], f->val_cls[old]);
    453   }
    454   map[0].old_val = nmap;
    455 
    456   for (u32 i = 0; i < src->ninsts; ++i) {
    457     Inst* in = ir_emit(f, nb, (IROp)src->insts[i].op);
    458     InstId id = in->id;
    459     *in = src->insts[i];
    460     in->id = id;
    461     if (in->nopnds) {
    462       Operand* opnds = arena_array(f->arena, Operand, in->nopnds);
    463       memcpy(opnds, src->insts[i].opnds, sizeof(opnds[0]) * in->nopnds);
    464       in->opnds = opnds;
    465     }
    466     clone_inst_aux(f, in, &src->insts[i]);
    467     if (in->def != VAL_NONE) in->def = map_lookup(map + 1, nmap, in->def);
    468     opt_walk_inst_operands(f, in, remap_operand, map);
    469     if (in->def != VAL_NONE && in->def < f->nvals) {
    470       f->val_def_block[in->def] = nb;
    471       f->val_def_inst[in->def] = dst->ninsts - 1u;
    472     }
    473   }
    474 
    475   replace_succ_ref(f, pred, block, nb);
    476   emit_order_insert_after(f, pred, nb);
    477   return nb;
    478 }
    479 
    480 void opt_block_cloning(Func* f) {
    481   if (!f || f->opt_rewritten) return;
    482   /* This pass remaps OPK_REG defs as Val ids. In the normal O2 pipeline it
    483    * must run after opt_build_reg_ssa; standalone unit tests that build Val IR
    484    * directly have no pseudo-register table beyond the sentinel. */
    485   if (!f->opt_reg_ssa && f->npregs > 1) return;
    486   opt_build_loop_tree(f);
    487   OptAnalysis a;
    488   memset(&a, 0, sizeof a);
    489   opt_analysis_build_dominators(f, &a);
    490 
    491   u32 base_insts = opt_inst_count(f);
    492   u32 max_extra = base_insts / 4u + 8u;
    493   if (max_extra > OPT_CLONE_ABS_GROWTH_LIMIT)
    494     max_extra = OPT_CLONE_ABS_GROWTH_LIMIT;
    495   u32 extra = 0;
    496   int changed = 0;
    497 
    498   u32 original_blocks = f->nblocks;
    499   for (u32 b = 0; b < original_blocks; ++b) {
    500     if (!clone_candidate(f, &a, b)) continue;
    501     Block* bl = &f->blocks[b];
    502     u32 cost = bl->ninsts;
    503     if (cost == 0 || cost * bl->npreds + extra > max_extra) continue;
    504     u32 npreds = bl->npreds;
    505     u32* preds = arena_array(f->arena, u32, npreds);
    506     memcpy(preds, bl->preds, sizeof(preds[0]) * npreds);
    507     for (u32 p = 0; p < npreds; ++p) {
    508       clone_block_for_pred(f, b, preds[p]);
    509       extra += cost;
    510     }
    511     changed = 1;
    512   }
    513 
    514   if (changed) {
    515     opt_analysis_invalidate(f, OPT_ANALYSIS_CFG | OPT_ANALYSIS_DEF_USE |
    516                                    OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP);
    517     opt_build_cfg(f);
    518   }
    519   opt_rebuild_def_use(f);
    520 }
    521 
    522 static int addr_def_inst(Func* f, Val v, Inst** out) {
    523   if (v == VAL_NONE || v >= f->nvals) return 0;
    524   u32 b = f->val_def_block[v];
    525   u32 i = f->val_def_inst[v];
    526   if (b >= f->nblocks || i >= f->blocks[b].ninsts) return 0;
    527   Inst* in = &f->blocks[b].insts[i];
    528   if ((IROp)in->op != IR_ADDR_OF || in->def != v || in->nopnds < 2) return 0;
    529   *out = in;
    530   return 1;
    531 }
    532 
    533 static int val_def_inst(Func* f, Val v, Inst** out) {
    534   if (v == VAL_NONE || v >= f->nvals) return 0;
    535   u32 b = f->val_def_block[v];
    536   u32 i = f->val_def_inst[v];
    537   if (b >= f->nblocks || i >= f->blocks[b].ninsts) return 0;
    538   Inst* in = &f->blocks[b].insts[i];
    539   if (in->def != v) return 0;
    540   *out = in;
    541   return 1;
    542 }
    543 
    544 /* Use classification for the SSA-namespace addr-xform; mirrors the PReg
    545  * variant. Returns 0 for escapes, 1 for zero-EA folds (rewrite to
    546  * OPK_LOCAL), 2 for EA-shaped folds (leave the OPK_INDIRECT alone so the
    547  * EA stays on the load/store). */
    548 static int addr_use_foldable_kind(Func* f, const OptUse* use) {
    549   if (!use || use->kind != OPT_USE_INDIRECT_BASE) return 0;
    550   if (use->block >= f->nblocks || use->inst >= f->blocks[use->block].ninsts)
    551     return 0;
    552   Inst* in = &f->blocks[use->block].insts[use->inst];
    553   if ((IROp)in->op != IR_LOAD && (IROp)in->op != IR_STORE) return 0;
    554   if (opt_mem_observable(&in->extra.mem)) return 0;
    555   if (!use->operand || use->operand->kind != OPK_INDIRECT) return 0;
    556   if ((IROp)in->op == IR_LOAD && use->operand_index != 1u) return 0;
    557   if ((IROp)in->op == IR_STORE && use->operand_index != 0u) return 0;
    558   if (use->operand->v.ind.ofs == 0 &&
    559       use->operand->v.ind.index == (Reg)REG_NONE)
    560     return 1;
    561   return 2;
    562 }
    563 
    564 static int addr_all_uses_foldable(Func* f, Val v, int* out_has_ea) {
    565   u32 nuses = 0;
    566   int has_ea = 0;
    567   for (u32 u = f->opt_first_use_by_val[v]; u != OPT_USE_NONE;
    568        u = f->opt_uses[u].next_for_val) {
    569     ++nuses;
    570     int k = addr_use_foldable_kind(f, &f->opt_uses[u]);
    571     if (!k) return 0;
    572     if (k == 2) has_ea = 1;
    573   }
    574   if (out_has_ea) *out_has_ea = has_ea;
    575   return nuses != 0;
    576 }
    577 
    578 static void addr_inst_remove(Inst* in) {
    579   in->op = IR_NOP;
    580   in->def = VAL_NONE;
    581   in->ndefs = 0;
    582   in->defs = NULL;
    583   in->nopnds = 0;
    584   in->opnds = NULL;
    585 }
    586 
    587 void opt_addr_xform(Func* f) {
    588   if (!f || f->opt_rewritten) return;
    589   opt_rebuild_def_use(f);
    590   int changed = 0;
    591   u32 nvals = f->nvals;
    592   for (Val v = 1; v < nvals; ++v) {
    593     Inst* def = NULL;
    594     if (!addr_def_inst(f, v, &def)) continue;
    595     Operand lv = def->opnds[1];
    596     if (lv.kind != OPK_LOCAL) continue;
    597     int has_ea = 0;
    598     if (!addr_all_uses_foldable(f, v, &has_ea)) continue;
    599 
    600     /* Rewrite zero-EA uses to OPK_LOCAL; leave EA-shaped uses as
    601      * OPK_INDIRECT(p, ofs, index, log2_scale). When any EA-shaped use
    602      * remains, the IR_ADDR_OF def must stay alive to feed its base. */
    603     for (u32 u = f->opt_first_use_by_val[v]; u != OPT_USE_NONE;
    604          u = f->opt_uses[u].next_for_val) {
    605       OptUse* use = &f->opt_uses[u];
    606       Operand* op = use->operand;
    607       if (!op || op->kind != OPK_INDIRECT) continue;
    608       if (op->v.ind.ofs != 0 || op->v.ind.index != (Reg)REG_NONE) continue;
    609       Inst* mem = &f->blocks[use->block].insts[use->inst];
    610       Operand folded = lv;
    611       folded.type = mem->extra.mem.type ? mem->extra.mem.type : lv.type;
    612       *op = folded;
    613     }
    614     if (!has_ea) addr_inst_remove(def);
    615     changed = 1;
    616   }
    617   if (changed)
    618     opt_analysis_invalidate(
    619         f, OPT_ANALYSIS_DEF_USE | OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP);
    620   opt_rebuild_def_use(f);
    621 }
    622 
    623 /* gvn's TYPE policy is int-LIKE width (no-PTR); the value arithmetic is the
    624  * shared cg/ir_eval core (see src/cg/ir_eval.h). */
    625 static int gvn_int_like_width(Func* f, KitCgTypeId ty, u32* width_out) {
    626   u32 width = kit_cg_type_int_width((KitCompiler*)f->c, ty);
    627   if (!width || width > 64u) return 0;
    628   *width_out = width;
    629   return 1;
    630 }
    631 
    632 static int gvn_fold_binop(Func* f, BinOp op, KitCgTypeId ty, i64 a, i64 b,
    633                           i64* out) {
    634   u32 width;
    635   if (!gvn_int_like_width(f, ty, &width)) return 0;
    636   return kit_ir_eval_binop(op, width, a, b, out);
    637 }
    638 
    639 static int gvn_fold_unop(Func* f, UnOp op, KitCgTypeId ty, i64 a, i64* out) {
    640   u32 width;
    641   if (!gvn_int_like_width(f, ty, &width)) return 0;
    642   return kit_ir_eval_unop(op, width, a, out);
    643 }
    644 
    645 static int gvn_fold_cmp(Func* f, CmpOp op, KitCgTypeId ty, i64 a, i64 b,
    646                         i64* out) {
    647   u32 width;
    648   if (!gvn_int_like_width(f, ty, &width)) return 0;
    649   return kit_ir_eval_cmp(op, width, a, b, out);
    650 }
    651 
    652 static int gvn_fold_convert(Func* f, ConvKind k, KitCgTypeId dst_ty,
    653                             KitCgTypeId src_ty, i64 src, i64* out) {
    654   u32 sw, dw;
    655   if (!gvn_int_like_width(f, src_ty, &sw) ||
    656       !gvn_int_like_width(f, dst_ty, &dw))
    657     return 0;
    658   return kit_ir_eval_convert(k, sw, dw, src, out);
    659 }
    660 
    661 static Val gvn_find(GvnCtx* ctx, Val v) {
    662   if (v == VAL_NONE || v >= ctx->f->nvals) return VAL_NONE;
    663   Val p = ctx->parent[v];
    664   if (p == VAL_NONE || p == v) return v;
    665   ctx->parent[v] = gvn_find(ctx, p);
    666   return ctx->parent[v];
    667 }
    668 
    669 static int gvn_same_shape(Func* f, Val a, Val b) {
    670   if (a == VAL_NONE || b == VAL_NONE || a >= f->nvals || b >= f->nvals)
    671     return 0;
    672   return f->val_type[a] == f->val_type[b] && f->val_cls[a] == f->val_cls[b];
    673 }
    674 
    675 static int gvn_const_for_operand(GvnCtx* ctx, const Operand* op, i64* out) {
    676   if (!op) return 0;
    677   if (op->kind == OPK_IMM) {
    678     *out = op->v.imm;
    679     return 1;
    680   }
    681   if (op->kind != OPK_REG) return 0;
    682   Val v = gvn_find(ctx, (Val)op->v.reg);
    683   if (v == VAL_NONE || v >= ctx->f->nvals || !ctx->constants[v].valid) return 0;
    684   *out = ctx->constants[v].value;
    685   return 1;
    686 }
    687 
    688 static void gvn_make_load_imm(Func* f, Inst* in, i64 value) {
    689   Val dst = in->def;
    690   KitCgTypeId ty =
    691       dst != VAL_NONE && dst < f->nvals ? f->val_type[dst] : in->type;
    692   u8 cls = dst != VAL_NONE && dst < f->nvals ? f->val_cls[dst] : RC_INT;
    693   Operand* opnds = in->opnds;
    694   if (!opnds) opnds = arena_array(f->arena, Operand, 1);
    695   memset(&opnds[0], 0, sizeof opnds[0]);
    696   opnds[0].kind = OPK_REG;
    697   opnds[0].type = ty;
    698   opnds[0].cls = cls;
    699   opnds[0].v.reg = (Reg)dst;
    700   in->op = IR_LOAD_IMM;
    701   in->type = ty;
    702   in->opnds = opnds;
    703   in->nopnds = 1;
    704   in->extra.imm = value;
    705 }
    706 
    707 static int gvn_try_fold_inst(GvnCtx* ctx, Inst* in) {
    708   i64 a, b, out;
    709   switch ((IROp)in->op) {
    710     case IR_BINOP:
    711       if (in->nopnds < 3) return 0;
    712       if (!gvn_const_for_operand(ctx, &in->opnds[1], &a) ||
    713           !gvn_const_for_operand(ctx, &in->opnds[2], &b))
    714         return 0;
    715       if (!gvn_fold_binop(ctx->f, (BinOp)in->extra.imm, in->type, a, b, &out))
    716         return 0;
    717       gvn_make_load_imm(ctx->f, in, out);
    718       return 1;
    719     case IR_UNOP:
    720       if (in->nopnds < 2) return 0;
    721       if (!gvn_const_for_operand(ctx, &in->opnds[1], &a)) return 0;
    722       if (!gvn_fold_unop(ctx->f, (UnOp)in->extra.imm, in->type, a, &out))
    723         return 0;
    724       gvn_make_load_imm(ctx->f, in, out);
    725       return 1;
    726     case IR_CMP:
    727       if (in->nopnds < 3) return 0;
    728       if (!gvn_const_for_operand(ctx, &in->opnds[1], &a) ||
    729           !gvn_const_for_operand(ctx, &in->opnds[2], &b))
    730         return 0;
    731       if (!gvn_fold_cmp(ctx->f, (CmpOp)in->extra.imm, in->opnds[1].type, a, b,
    732                         &out))
    733         return 0;
    734       gvn_make_load_imm(ctx->f, in, out);
    735       return 1;
    736     case IR_CONVERT:
    737       if (in->nopnds < 2) return 0;
    738       if (!gvn_const_for_operand(ctx, &in->opnds[1], &a)) return 0;
    739       if (!gvn_fold_convert(ctx->f, (ConvKind)in->extra.imm, in->opnds[0].type,
    740                             in->opnds[1].type, a, &out))
    741         return 0;
    742       gvn_make_load_imm(ctx->f, in, out);
    743       return 1;
    744     default:
    745       return 0;
    746   }
    747 }
    748 
    749 static u64 gvn_mix_u64(u64 h, u64 v) {
    750   h ^= v + 0x9e3779b97f4a7c15ull + (h << 6) + (h >> 2);
    751   return h;
    752 }
    753 
    754 static u64 gvn_key_hash(const GvnKey* k) {
    755   u64 h = 1469598103934665603ull;
    756   h = gvn_mix_u64(h, k->op);
    757   h = gvn_mix_u64(h, k->nops);
    758   h = gvn_mix_u64(h, k->type);
    759   h = gvn_mix_u64(h, k->cls);
    760   h = gvn_mix_u64(h, (u64)k->tag);
    761   h = gvn_mix_u64(h, k->mem.valid);
    762   if (k->mem.valid) {
    763     h = gvn_mix_u64(h, k->mem.root_kind);
    764     h = gvn_mix_u64(h, k->mem.has_addr);
    765     h = gvn_mix_u64(h, k->mem.addr_space);
    766     h = gvn_mix_u64(h, k->mem.flags);
    767     h = gvn_mix_u64(h, k->mem.mem_type);
    768     h = gvn_mix_u64(h, k->mem.size);
    769     h = gvn_mix_u64(h, k->mem.align);
    770     h = gvn_mix_u64(h, (u64)k->mem.root_id);
    771     h = gvn_mix_u64(h, (u64)k->mem.offset);
    772     h = gvn_mix_u64(h, k->mem.root_version);
    773     h = gvn_mix_u64(h, k->mem.unknown_version);
    774   }
    775   for (u32 i = 0; i < k->nops; ++i) {
    776     h = gvn_mix_u64(h, k->ops[i].kind);
    777     h = gvn_mix_u64(h, k->ops[i].cls);
    778     h = gvn_mix_u64(h, k->ops[i].type);
    779     if (k->ops[i].kind == OPK_INDIRECT) {
    780       h = gvn_mix_u64(h, k->ops[i].v.ind.base);
    781       h = gvn_mix_u64(h, k->ops[i].v.ind.index);
    782       h = gvn_mix_u64(h, (u64)(i64)k->ops[i].v.ind.ofs);
    783       h = gvn_mix_u64(h, k->ops[i].v.ind.log2_scale);
    784     } else {
    785       h = gvn_mix_u64(h, k->ops[i].kind == OPK_REG ? k->ops[i].v.reg
    786                                                    : (u64)k->ops[i].v.imm);
    787     }
    788   }
    789   return h;
    790 }
    791 
    792 static int gvn_operand_key_equal(const GvnOperandKey* a,
    793                                  const GvnOperandKey* b) {
    794   if (a->kind != b->kind || a->cls != b->cls || a->type != b->type) return 0;
    795   if (a->kind == OPK_INDIRECT)
    796     return a->v.ind.base == b->v.ind.base && a->v.ind.index == b->v.ind.index &&
    797            a->v.ind.ofs == b->v.ind.ofs &&
    798            a->v.ind.log2_scale == b->v.ind.log2_scale;
    799   if (a->kind == OPK_REG) return a->v.reg == b->v.reg;
    800   return a->v.imm == b->v.imm;
    801 }
    802 
    803 static int gvn_key_equal(const GvnKey* a, const GvnKey* b) {
    804   if (a->op != b->op || a->nops != b->nops || a->type != b->type ||
    805       a->cls != b->cls || a->tag != b->tag)
    806     return 0;
    807   if (a->mem.valid != b->mem.valid) return 0;
    808   if (a->mem.valid &&
    809       (a->mem.root_kind != b->mem.root_kind ||
    810        a->mem.has_addr != b->mem.has_addr ||
    811        a->mem.addr_space != b->mem.addr_space || a->mem.flags != b->mem.flags ||
    812        a->mem.mem_type != b->mem.mem_type || a->mem.size != b->mem.size ||
    813        a->mem.align != b->mem.align || a->mem.root_id != b->mem.root_id ||
    814        a->mem.offset != b->mem.offset ||
    815        a->mem.root_version != b->mem.root_version ||
    816        a->mem.unknown_version != b->mem.unknown_version))
    817     return 0;
    818   for (u32 i = 0; i < a->nops; ++i)
    819     if (!gvn_operand_key_equal(&a->ops[i], &b->ops[i])) return 0;
    820   return 1;
    821 }
    822 
    823 static void gvn_table_init(GvnTable* t, Func* f) {
    824   u32 ninsts = opt_inst_count(f);
    825   u32 nbuckets = 16u;
    826   while (nbuckets < ninsts * 2u + 1u) nbuckets *= 2u;
    827   t->f = f;
    828   t->nbuckets = nbuckets;
    829   t->buckets = arena_array(f->arena, u32, nbuckets);
    830   for (u32 i = 0; i < nbuckets; ++i) t->buckets[i] = GVN_ENTRY_NONE;
    831   t->entries = NULL;
    832   t->nentries = 0;
    833   t->cap = 0;
    834 }
    835 
    836 static void gvn_table_add(GvnTable* t, const GvnKey* key, Val v, u32 b, u32 i) {
    837   if (t->nentries == t->cap) {
    838     u32 ncap = t->cap ? t->cap * 2u : 32u;
    839     GvnEntry* entries = arena_array(t->f->arena, GvnEntry, ncap);
    840     if (t->entries)
    841       memcpy(entries, t->entries, sizeof(entries[0]) * t->nentries);
    842     t->entries = entries;
    843     t->cap = ncap;
    844   }
    845   u32 bucket = (u32)(gvn_key_hash(key) & (u64)(t->nbuckets - 1u));
    846   GvnEntry* e = &t->entries[t->nentries];
    847   e->key = *key;
    848   e->val = v;
    849   e->block = b;
    850   e->inst = i;
    851   e->next = t->buckets[bucket];
    852   t->buckets[bucket] = t->nentries++;
    853 }
    854 
    855 static int gvn_leader_dominates_site(const OptAnalysis* a, u32 leader_b,
    856                                      u32 leader_i, u32 use_b, u32 use_i) {
    857   if (leader_b == use_b) return leader_i < use_i;
    858   return opt_analysis_dominates(a, leader_b, use_b);
    859 }
    860 
    861 static int gvn_leader_dominates_use(GvnCtx* ctx, u32 leader_b, u32 leader_i,
    862                                     const OptUse* use) {
    863   if (use->kind == OPT_USE_PHI_INPUT) {
    864     Inst* in = &ctx->f->blocks[use->block].insts[use->inst];
    865     IRPhiAux* aux = (IRPhiAux*)in->extra.aux;
    866     if (!aux || use->phi_pred_index >= aux->npreds) return 0;
    867     u32 pred = aux->pred_blocks[use->phi_pred_index];
    868     if (leader_b == pred) return 1;
    869     return opt_analysis_dominates(ctx->analysis, leader_b, pred);
    870   }
    871   return gvn_leader_dominates_site(ctx->analysis, leader_b, leader_i,
    872                                    use->block, use->inst);
    873 }
    874 
    875 static int gvn_table_find(GvnCtx* ctx, const GvnKey* key, u32 b, u32 i,
    876                           Val* out) {
    877   GvnTable* t = &ctx->table;
    878   u32 bucket = (u32)(gvn_key_hash(key) & (u64)(t->nbuckets - 1u));
    879   for (u32 e = t->buckets[bucket]; e != GVN_ENTRY_NONE;
    880        e = t->entries[e].next) {
    881     GvnEntry* ent = &t->entries[e];
    882     if (!gvn_key_equal(&ent->key, key)) continue;
    883     if (!gvn_leader_dominates_site(ctx->analysis, ent->block, ent->inst, b, i))
    884       continue;
    885     *out = ent->val;
    886     return 1;
    887   }
    888   return 0;
    889 }
    890 
    891 static int gvn_make_operand_key(GvnCtx* ctx, const Operand* op,
    892                                 GvnOperandKey* out) {
    893   memset(out, 0, sizeof *out);
    894   if (!op || (op->kind != OPK_REG && op->kind != OPK_IMM)) return 0;
    895   out->kind = op->kind;
    896   out->cls = op->cls;
    897   out->type = op->type;
    898   if (op->kind == OPK_REG) {
    899     Val v = gvn_find(ctx, (Val)op->v.reg);
    900     if (v == VAL_NONE || v >= ctx->f->nvals) return 0;
    901     out->v.reg = v;
    902   } else {
    903     out->v.imm = op->v.imm;
    904   }
    905   return 1;
    906 }
    907 
    908 static int gvn_make_addr_operand_key(GvnCtx* ctx, const Operand* op,
    909                                      GvnOperandKey* out) {
    910   memset(out, 0, sizeof *out);
    911   if (!op) return 0;
    912   out->kind = op->kind;
    913   out->cls = op->cls;
    914   out->type = op->type;
    915   switch ((OpKind)op->kind) {
    916     case OPK_REG: {
    917       Val v = gvn_find(ctx, (Val)op->v.reg);
    918       if (v == VAL_NONE || v >= ctx->f->nvals) return 0;
    919       out->v.reg = v;
    920       return 1;
    921     }
    922     case OPK_INDIRECT: {
    923       Val base = gvn_find(ctx, (Val)op->v.ind.base);
    924       if (base == VAL_NONE || base >= ctx->f->nvals) return 0;
    925       out->v.ind.base = base;
    926       out->v.ind.index = VAL_NONE;
    927       if (op->v.ind.index != REG_NONE) {
    928         Val index = gvn_find(ctx, (Val)op->v.ind.index);
    929         if (index == VAL_NONE || index >= ctx->f->nvals) return 0;
    930         out->v.ind.index = index;
    931         out->v.ind.log2_scale = op->v.ind.log2_scale;
    932       }
    933       out->v.ind.ofs = op->v.ind.ofs;
    934       return 1;
    935     }
    936     case OPK_LOCAL:
    937       out->v.imm = (i64)op->v.frame_slot;
    938       return 1;
    939     case OPK_GLOBAL:
    940       out->v.imm = (i64)op->v.global.sym;
    941       return 1;
    942     default:
    943       return 0;
    944   }
    945 }
    946 
    947 static int gvn_mem_root_from_addr_val(GvnCtx* ctx, Val addr_val, u32 depth,
    948                                       u8* kind_out, i64* id_out,
    949                                       i64* offset_out, int* singleton_out) {
    950   Inst* def = NULL;
    951   if (!ctx || depth > 4u) return 0;
    952   addr_val = gvn_find(ctx, addr_val);
    953   if (!val_def_inst(ctx->f, addr_val, &def)) return 0;
    954 
    955   if ((IROp)def->op == IR_ADDR_OF && def->nopnds >= 2) {
    956     Operand* lv = &def->opnds[1];
    957     if (lv->kind == OPK_LOCAL) {
    958       *kind_out = ALIAS_LOCAL;
    959       *id_out = (i64)lv->v.frame_slot;
    960       *offset_out = 0;
    961       *singleton_out = 1;
    962       return 1;
    963     }
    964     if (lv->kind == OPK_GLOBAL) {
    965       *kind_out = ALIAS_GLOBAL;
    966       *id_out = (i64)lv->v.global.sym;
    967       *offset_out = lv->v.global.addend;
    968       *singleton_out = 1;
    969       return 1;
    970     }
    971     return 0;
    972   }
    973 
    974   if ((IROp)def->op == IR_COPY && def->nopnds >= 2 &&
    975       def->opnds[1].kind == OPK_REG) {
    976     return gvn_mem_root_from_addr_val(ctx, (Val)def->opnds[1].v.reg, depth + 1u,
    977                                       kind_out, id_out, offset_out,
    978                                       singleton_out);
    979   }
    980 
    981   if ((IROp)def->op == IR_BINOP && def->nopnds >= 3 &&
    982       (BinOp)def->extra.imm == BO_IADD) {
    983     i64 c = 0;
    984     if (def->opnds[1].kind == OPK_REG &&
    985         gvn_const_for_operand(ctx, &def->opnds[2], &c) && c == 0) {
    986       return gvn_mem_root_from_addr_val(ctx, (Val)def->opnds[1].v.reg,
    987                                         depth + 1u, kind_out, id_out,
    988                                         offset_out, singleton_out);
    989     }
    990     if (def->opnds[2].kind == OPK_REG &&
    991         gvn_const_for_operand(ctx, &def->opnds[1], &c) && c == 0) {
    992       return gvn_mem_root_from_addr_val(ctx, (Val)def->opnds[2].v.reg,
    993                                         depth + 1u, kind_out, id_out,
    994                                         offset_out, singleton_out);
    995     }
    996   }
    997 
    998   return 0;
    999 }
   1000 
   1001 static int gvn_mem_root_from_access(GvnCtx* ctx, const Operand* addr,
   1002                                     const MemAccess* mem, u8* kind_out,
   1003                                     i64* id_out, i64* offset_out,
   1004                                     int* singleton_out) {
   1005   u8 kind = ALIAS_UNKNOWN;
   1006   i64 id = 0;
   1007   i64 offset = 0;
   1008   int singleton = 0;
   1009 
   1010   if (addr) {
   1011     switch ((OpKind)addr->kind) {
   1012       case OPK_LOCAL:
   1013         kind = ALIAS_LOCAL;
   1014         id = (i64)addr->v.frame_slot;
   1015         singleton = 1;
   1016         break;
   1017       case OPK_GLOBAL:
   1018         kind = ALIAS_GLOBAL;
   1019         id = (i64)addr->v.global.sym;
   1020         offset = addr->v.global.addend;
   1021         singleton = 1;
   1022         break;
   1023       case OPK_INDIRECT:
   1024         offset = addr->v.ind.ofs;
   1025         if (addr->v.ind.index != REG_NONE) singleton = 0;
   1026         if (ctx) {
   1027           Val base = gvn_find(ctx, (Val)addr->v.ind.base);
   1028           u8 akind;
   1029           i64 aid;
   1030           i64 aofs;
   1031           int asing;
   1032           if (gvn_mem_root_from_addr_val(ctx, base, 0, &akind, &aid, &aofs,
   1033                                          &asing)) {
   1034             kind = akind;
   1035             id = aid;
   1036             offset += aofs;
   1037             singleton = asing && addr->v.ind.index == REG_NONE;
   1038           }
   1039         }
   1040         break;
   1041       default:
   1042         break;
   1043     }
   1044   }
   1045 
   1046   if (kind == ALIAS_UNKNOWN && mem) {
   1047     kind = mem->alias.kind;
   1048     switch ((AliasKind)kind) {
   1049       case ALIAS_LOCAL:
   1050         id = mem->alias.v.local_id;
   1051         break;
   1052       case ALIAS_GLOBAL:
   1053         id = (i64)mem->alias.v.global;
   1054         break;
   1055       case ALIAS_PARAM:
   1056         id = (i64)mem->alias.v.param_idx;
   1057         break;
   1058       case ALIAS_STRING:
   1059         id = (i64)mem->alias.v.string_id;
   1060         break;
   1061       case ALIAS_HEAP:
   1062         id = 0;
   1063         break;
   1064       default:
   1065         kind = ALIAS_UNKNOWN;
   1066         id = 0;
   1067         break;
   1068     }
   1069   }
   1070 
   1071   *kind_out = kind;
   1072   *id_out = id;
   1073   *offset_out = offset;
   1074   *singleton_out = singleton;
   1075   return kind != ALIAS_UNKNOWN;
   1076 }
   1077 
   1078 static GvnMemVersion* gvn_mem_version(GvnCtx* ctx, u8 kind, u16 addr_space,
   1079                                       i64 root_id) {
   1080   GvnMemState* s = &ctx->mem;
   1081   for (u32 i = 0; i < s->nroots; ++i) {
   1082     GvnMemVersion* r = &s->roots[i];
   1083     if (r->root_kind == kind && r->addr_space == addr_space &&
   1084         r->root_id == root_id)
   1085       return r;
   1086   }
   1087   if (s->nroots == s->cap) {
   1088     u32 ncap = s->cap ? s->cap * 2u : 16u;
   1089     GvnMemVersion* roots = arena_array(ctx->f->arena, GvnMemVersion, ncap);
   1090     if (s->roots) memcpy(roots, s->roots, sizeof(roots[0]) * s->nroots);
   1091     s->roots = roots;
   1092     s->cap = ncap;
   1093   }
   1094   GvnMemVersion* r = &s->roots[s->nroots++];
   1095   memset(r, 0, sizeof *r);
   1096   r->root_kind = kind;
   1097   r->addr_space = addr_space;
   1098   r->root_id = root_id;
   1099   return r;
   1100 }
   1101 
   1102 static void gvn_mem_state_copy(GvnCtx* ctx, GvnMemState* dst,
   1103                                const GvnMemState* src) {
   1104   memset(dst, 0, sizeof *dst);
   1105   dst->unknown_version = src->unknown_version;
   1106   if (src->nroots) {
   1107     dst->roots = arena_array(ctx->f->arena, GvnMemVersion, src->nroots);
   1108     memcpy(dst->roots, src->roots, sizeof(dst->roots[0]) * src->nroots);
   1109     dst->nroots = src->nroots;
   1110     dst->cap = src->nroots;
   1111   }
   1112   if (src->navail) {
   1113     dst->avail = arena_array(ctx->f->arena, GvnMemAvail, src->navail);
   1114     memcpy(dst->avail, src->avail, sizeof(dst->avail[0]) * src->navail);
   1115     dst->navail = src->navail;
   1116     dst->cap_avail = src->navail;
   1117   }
   1118 }
   1119 
   1120 static int gvn_mem_key_tracks_unknown(GvnCtx* ctx, u8 kind, i64 root_id) {
   1121   if (kind == ALIAS_LOCAL && root_id > 0 &&
   1122       (u32)root_id <= ctx->f->nframe_slots)
   1123     return ctx->local_escaped && ctx->local_escaped[root_id];
   1124   return kind != ALIAS_STRING;
   1125 }
   1126 
   1127 static int gvn_mem_call_clobbers_root(GvnCtx* ctx, u8 kind, i64 root_id) {
   1128   if (kind == ALIAS_LOCAL && root_id > 0 &&
   1129       (u32)root_id <= ctx->f->nframe_slots)
   1130     return !ctx->local_escaped || ctx->local_escaped[root_id];
   1131   return kind != ALIAS_STRING;
   1132 }
   1133 
   1134 static void gvn_mem_avail_remove_at(GvnMemState* s, u32 i) {
   1135   if (i + 1u < s->navail)
   1136     memmove(&s->avail[i], &s->avail[i + 1u],
   1137             sizeof(s->avail[0]) * (s->navail - i - 1u));
   1138   --s->navail;
   1139 }
   1140 
   1141 static void gvn_mem_avail_add(GvnCtx* ctx, const GvnKey* key, Val val) {
   1142   GvnMemState* s = &ctx->mem;
   1143   for (u32 i = 0; i < s->navail; ++i) {
   1144     if (gvn_key_equal(&s->avail[i].key, key)) {
   1145       s->avail[i].val = val;
   1146       return;
   1147     }
   1148   }
   1149   if (s->navail == s->cap_avail) {
   1150     u32 ncap = s->cap_avail ? s->cap_avail * 2u : 16u;
   1151     GvnMemAvail* avail = arena_array(ctx->f->arena, GvnMemAvail, ncap);
   1152     if (s->avail) memcpy(avail, s->avail, sizeof(avail[0]) * s->navail);
   1153     s->avail = avail;
   1154     s->cap_avail = ncap;
   1155   }
   1156   s->avail[s->navail].key = *key;
   1157   s->avail[s->navail].val = val;
   1158   ++s->navail;
   1159 }
   1160 
   1161 static int gvn_mem_avail_find(GvnCtx* ctx, const GvnKey* key, u32 b, u32 i,
   1162                               Val* out) {
   1163   for (u32 a = 0; a < ctx->mem.navail; ++a) {
   1164     Val v = ctx->mem.avail[a].val;
   1165     if (!gvn_key_equal(&ctx->mem.avail[a].key, key)) continue;
   1166     if (v == VAL_NONE || v >= ctx->f->nvals) continue;
   1167     if (!gvn_leader_dominates_site(ctx->analysis, ctx->f->val_def_block[v],
   1168                                    ctx->f->val_def_inst[v], b, i))
   1169       continue;
   1170     *out = v;
   1171     return 1;
   1172   }
   1173   return 0;
   1174 }
   1175 
   1176 static void gvn_mem_avail_remove_call_clobbered(GvnCtx* ctx) {
   1177   GvnMemState* s = &ctx->mem;
   1178   for (u32 i = 0; i < s->navail;) {
   1179     GvnMemKey* m = &s->avail[i].key.mem;
   1180     if (!m->valid || m->root_kind == ALIAS_UNKNOWN ||
   1181         gvn_mem_call_clobbers_root(ctx, m->root_kind, m->root_id))
   1182       gvn_mem_avail_remove_at(s, i);
   1183     else
   1184       ++i;
   1185   }
   1186 }
   1187 
   1188 static void gvn_mem_barrier(GvnCtx* ctx) {
   1189   ++ctx->mem.unknown_version;
   1190   for (u32 i = 0; i < ctx->mem.nroots; ++i) ++ctx->mem.roots[i].version;
   1191   ctx->mem.navail = 0;
   1192 }
   1193 
   1194 static void gvn_mem_call_barrier(GvnCtx* ctx) {
   1195   ++ctx->mem.unknown_version;
   1196   for (u32 i = 0; i < ctx->mem.nroots; ++i) {
   1197     GvnMemVersion* r = &ctx->mem.roots[i];
   1198     if (gvn_mem_call_clobbers_root(ctx, r->root_kind, r->root_id)) ++r->version;
   1199   }
   1200   gvn_mem_avail_remove_call_clobbered(ctx);
   1201 }
   1202 
   1203 static void gvn_mem_bump_store(GvnCtx* ctx, u8 kind, u16 addr_space,
   1204                                i64 root_id) {
   1205   if (kind == ALIAS_UNKNOWN) {
   1206     gvn_mem_barrier(ctx);
   1207     return;
   1208   }
   1209   ++gvn_mem_version(ctx, kind, addr_space, root_id)->version;
   1210 }
   1211 
   1212 static int gvn_make_memory_key(GvnCtx* ctx, const Inst* in, u32 addr_idx,
   1213                                KitCgTypeId val_ty, u8 val_cls, GvnKey* key) {
   1214   const MemAccess* mem;
   1215   const Operand* addr;
   1216   u8 root_kind;
   1217   i64 root_id;
   1218   i64 offset;
   1219   int singleton;
   1220 
   1221   if (!in || addr_idx >= in->nopnds) return 0;
   1222   mem = &in->extra.mem;
   1223   addr = &in->opnds[addr_idx];
   1224   if (opt_mem_observable(mem)) return 0;
   1225 
   1226   memset(key, 0, sizeof *key);
   1227   key->op = IR_LOAD;
   1228   key->type = val_ty;
   1229   key->cls = val_cls;
   1230   key->mem.valid = 1;
   1231   key->mem.addr_space = mem->addr_space;
   1232   key->mem.flags = mem->flags;
   1233   key->mem.mem_type = mem->type;
   1234   key->mem.size = mem->size;
   1235   key->mem.align = mem->align;
   1236   (void)gvn_mem_root_from_access(ctx, addr, mem, &root_kind, &root_id, &offset,
   1237                                  &singleton);
   1238   key->mem.root_kind = root_kind;
   1239   key->mem.root_id = root_id;
   1240   key->mem.offset = offset;
   1241   if (gvn_mem_key_tracks_unknown(ctx, root_kind, root_id))
   1242     key->mem.unknown_version = ctx->mem.unknown_version;
   1243   if (root_kind != ALIAS_UNKNOWN) {
   1244     key->mem.root_version =
   1245         gvn_mem_version(ctx, root_kind, mem->addr_space, root_id)->version;
   1246   }
   1247 
   1248   if (!singleton) {
   1249     key->mem.has_addr = 1;
   1250     key->nops = 1;
   1251     if (!gvn_make_addr_operand_key(ctx, addr, &key->ops[0])) return 0;
   1252   }
   1253   return 1;
   1254 }
   1255 
   1256 static int gvn_operand_key_less(const GvnOperandKey* a,
   1257                                 const GvnOperandKey* b) {
   1258   if (a->kind != b->kind) return a->kind < b->kind;
   1259   if (a->type != b->type) return a->type < b->type;
   1260   if (a->cls != b->cls) return a->cls < b->cls;
   1261   if (a->kind == OPK_INDIRECT) {
   1262     if (a->v.ind.base != b->v.ind.base) return a->v.ind.base < b->v.ind.base;
   1263     if (a->v.ind.index != b->v.ind.index)
   1264       return a->v.ind.index < b->v.ind.index;
   1265     if (a->v.ind.ofs != b->v.ind.ofs) return a->v.ind.ofs < b->v.ind.ofs;
   1266     return a->v.ind.log2_scale < b->v.ind.log2_scale;
   1267   }
   1268   if (a->kind == OPK_REG) return a->v.reg < b->v.reg;
   1269   return a->v.imm < b->v.imm;
   1270 }
   1271 
   1272 static int gvn_make_key(GvnCtx* ctx, const Inst* in, GvnKey* key) {
   1273   memset(key, 0, sizeof *key);
   1274   if (in->def == VAL_NONE || in->def >= ctx->f->nvals) return 0;
   1275   key->op = in->op;
   1276   key->type = ctx->f->val_type[in->def];
   1277   key->cls = ctx->f->val_cls[in->def];
   1278   key->tag = in->extra.imm;
   1279 
   1280   switch ((IROp)in->op) {
   1281     case IR_LOAD_IMM:
   1282     case IR_CONST_I:
   1283       key->nops = 0;
   1284       key->tag = in->extra.imm;
   1285       return 1;
   1286     case IR_COPY:
   1287       if (in->nopnds < 2) return 0;
   1288       key->nops = 1;
   1289       return gvn_make_operand_key(ctx, &in->opnds[1], &key->ops[0]);
   1290     case IR_UNOP:
   1291     case IR_CONVERT:
   1292       if (in->nopnds < 2) return 0;
   1293       key->nops = 1;
   1294       return gvn_make_operand_key(ctx, &in->opnds[1], &key->ops[0]);
   1295     case IR_BINOP:
   1296     case IR_CMP:
   1297       if (in->nopnds < 3) return 0;
   1298       key->nops = 2;
   1299       if (!gvn_make_operand_key(ctx, &in->opnds[1], &key->ops[0]) ||
   1300           !gvn_make_operand_key(ctx, &in->opnds[2], &key->ops[1]))
   1301         return 0;
   1302       if (((IROp)in->op == IR_BINOP &&
   1303            kit_ir_binop_is_commutative_int((BinOp)in->extra.imm)) ||
   1304           ((IROp)in->op == IR_CMP &&
   1305            kit_ir_cmp_is_commutative_int((CmpOp)in->extra.imm))) {
   1306         if (gvn_operand_key_less(&key->ops[1], &key->ops[0])) {
   1307           GvnOperandKey tmp = key->ops[0];
   1308           key->ops[0] = key->ops[1];
   1309           key->ops[1] = tmp;
   1310         }
   1311       }
   1312       return 1;
   1313     default:
   1314       return 0;
   1315   }
   1316 }
   1317 
   1318 static void gvn_replace_one_use(Func* f, const OptUse* use, Val repl) {
   1319   Inst* in = &f->blocks[use->block].insts[use->inst];
   1320   switch ((OptUseKind)use->kind) {
   1321     case OPT_USE_OPERAND:
   1322       use->operand->v.reg = (Reg)repl;
   1323       use->operand->type = f->val_type[repl];
   1324       use->operand->cls = f->val_cls[repl];
   1325       break;
   1326     case OPT_USE_INDIRECT_BASE:
   1327       use->operand->v.ind.base = (Reg)repl;
   1328       break;
   1329     case OPT_USE_INDIRECT_INDEX:
   1330       use->operand->v.ind.index = (Reg)repl;
   1331       break;
   1332     case OPT_USE_PHI_INPUT: {
   1333       IRPhiAux* aux = (IRPhiAux*)in->extra.aux;
   1334       if (aux && use->phi_pred_index < aux->npreds)
   1335         aux->pred_vals[use->phi_pred_index] = repl;
   1336       break;
   1337     }
   1338     default:
   1339       break;
   1340   }
   1341 }
   1342 
   1343 static int gvn_replace_dominated_uses(GvnCtx* ctx, Val old, Val repl, u32 rb,
   1344                                       u32 ri) {
   1345   Func* f = ctx->f;
   1346   int changed = 0;
   1347   if (old == VAL_NONE || old >= f->nvals || repl == VAL_NONE ||
   1348       repl >= f->nvals || old == repl || !gvn_same_shape(f, old, repl))
   1349     return 0;
   1350   for (u32 u = f->opt_first_use_by_val[old]; u != OPT_USE_NONE;
   1351        u = f->opt_uses[u].next_for_val) {
   1352     OptUse* use = &f->opt_uses[u];
   1353     if (!gvn_leader_dominates_use(ctx, rb, ri, use)) continue;
   1354     gvn_replace_one_use(f, use, repl);
   1355     changed = 1;
   1356   }
   1357   if (changed) {
   1358     ctx->parent[old] = repl;
   1359     ctx->constants[old] = ctx->constants[gvn_find(ctx, repl)];
   1360   }
   1361   return changed;
   1362 }
   1363 
   1364 static void gvn_note_const(GvnCtx* ctx, Inst* in) {
   1365   if (in->def == VAL_NONE || in->def >= ctx->f->nvals) return;
   1366   if ((IROp)in->op == IR_LOAD_IMM || (IROp)in->op == IR_CONST_I) {
   1367     ctx->constants[in->def].valid = 1;
   1368     ctx->constants[in->def].value = in->extra.imm;
   1369   }
   1370 }
   1371 
   1372 static int gvn_scalar_candidate(const Inst* in) {
   1373   if (!in || in->def == VAL_NONE || in->ndefs) return 0;
   1374   switch ((IROp)in->op) {
   1375     case IR_CONST_I:
   1376     case IR_LOAD_IMM:
   1377     case IR_COPY:
   1378     case IR_BINOP:
   1379     case IR_UNOP:
   1380     case IR_CMP:
   1381     case IR_CONVERT:
   1382       return 1;
   1383     default:
   1384       return 0;
   1385   }
   1386 }
   1387 
   1388 static int gvn_fold_branch(GvnCtx* ctx, u32 b, Inst* in) {
   1389   Block* bl = &ctx->f->blocks[b];
   1390   i64 a, c, taken;
   1391   u32 target;
   1392   switch ((IROp)in->op) {
   1393     case IR_CONDBR:
   1394       if (in->nopnds < 1 || bl->nsucc < 2) return 0;
   1395       if (in->opnds[0].kind != OPK_REG) return 0;
   1396       if (!gvn_const_for_operand(ctx, &in->opnds[0], &a)) return 0;
   1397       target = bl->succ[a != 0 ? 0u : 1u];
   1398       break;
   1399     case IR_CMP_BRANCH:
   1400       if (in->nopnds < 2 || bl->nsucc < 2) return 0;
   1401       if (in->opnds[0].kind != OPK_REG && in->opnds[1].kind != OPK_REG)
   1402         return 0;
   1403       if (!gvn_const_for_operand(ctx, &in->opnds[0], &a) ||
   1404           !gvn_const_for_operand(ctx, &in->opnds[1], &c))
   1405         return 0;
   1406       if (!gvn_fold_cmp(ctx->f, (CmpOp)in->extra.imm, in->opnds[0].type, a, c,
   1407                         &taken))
   1408         return 0;
   1409       target = bl->succ[taken ? 0u : 1u];
   1410       break;
   1411     default:
   1412       return 0;
   1413   }
   1414 
   1415   in->op = IR_BR;
   1416   in->def = VAL_NONE;
   1417   in->ndefs = 0;
   1418   in->defs = NULL;
   1419   in->nopnds = 0;
   1420   in->opnds = NULL;
   1421   bl->succ[0] = target;
   1422   bl->nsucc = 1;
   1423   return 1;
   1424 }
   1425 
   1426 static int gvn_inst_memory_barrier(const Inst* in) {
   1427   switch ((IROp)in->op) {
   1428     case IR_CALL:
   1429     case IR_AGG_COPY:
   1430     case IR_AGG_SET:
   1431     case IR_BITFIELD_LOAD:
   1432     case IR_BITFIELD_STORE:
   1433     case IR_VA_START:
   1434     case IR_VA_ARG:
   1435     case IR_VA_END:
   1436     case IR_VA_COPY:
   1437     case IR_ATOMIC_LOAD:
   1438     case IR_ATOMIC_STORE:
   1439     case IR_ATOMIC_RMW:
   1440     case IR_ATOMIC_CAS:
   1441     case IR_FENCE:
   1442     case IR_ASM_BLOCK:
   1443     case IR_INTRINSIC:
   1444       return 1;
   1445     default:
   1446       return 0;
   1447   }
   1448 }
   1449 
   1450 static int gvn_visit_memory_inst(GvnCtx* ctx, u32 b, u32 i, Inst* in) {
   1451   if ((IROp)in->op == IR_LOAD) {
   1452     if (opt_mem_observable(&in->extra.mem)) {
   1453       gvn_mem_barrier(ctx);
   1454       return 1;
   1455     }
   1456     if (in->def == VAL_NONE || in->def >= ctx->f->nvals) return 1;
   1457     GvnKey key;
   1458     if (!gvn_make_memory_key(ctx, in, 1u, ctx->f->val_type[in->def],
   1459                              ctx->f->val_cls[in->def], &key))
   1460       return 1;
   1461     Val leader = VAL_NONE;
   1462     if (gvn_mem_avail_find(ctx, &key, b, i, &leader)) {
   1463       if (gvn_replace_dominated_uses(ctx, in->def, leader,
   1464                                      ctx->f->val_def_block[leader],
   1465                                      ctx->f->val_def_inst[leader]))
   1466         ctx->changed = 1;
   1467       return 1;
   1468     }
   1469     gvn_mem_avail_add(ctx, &key, in->def);
   1470     return 1;
   1471   }
   1472 
   1473   if ((IROp)in->op == IR_STORE) {
   1474     u8 root_kind;
   1475     i64 root_id;
   1476     i64 offset;
   1477     int singleton;
   1478     Val src;
   1479     if (opt_mem_observable(&in->extra.mem)) {
   1480       gvn_mem_barrier(ctx);
   1481       return 1;
   1482     }
   1483     if (in->nopnds < 2) {
   1484       gvn_mem_barrier(ctx);
   1485       return 1;
   1486     }
   1487     (void)gvn_mem_root_from_access(ctx, &in->opnds[0], &in->extra.mem,
   1488                                    &root_kind, &root_id, &offset, &singleton);
   1489     (void)offset;
   1490     (void)singleton;
   1491     gvn_mem_bump_store(ctx, root_kind, in->extra.mem.addr_space, root_id);
   1492     if (in->opnds[1].kind != OPK_REG) return 1;
   1493     src = gvn_find(ctx, (Val)in->opnds[1].v.reg);
   1494     if (src == VAL_NONE || src >= ctx->f->nvals) return 1;
   1495     GvnKey key;
   1496     if (!gvn_make_memory_key(ctx, in, 0u, ctx->f->val_type[src],
   1497                              ctx->f->val_cls[src], &key))
   1498       return 1;
   1499     gvn_mem_avail_add(ctx, &key, src);
   1500     return 1;
   1501   }
   1502 
   1503   if ((IROp)in->op == IR_CALL) {
   1504     gvn_mem_call_barrier(ctx);
   1505     return 1;
   1506   }
   1507 
   1508   if (gvn_inst_memory_barrier(in)) {
   1509     gvn_mem_barrier(ctx);
   1510     return 1;
   1511   }
   1512   return 0;
   1513 }
   1514 
   1515 static void gvn_visit_inst(GvnCtx* ctx, u32 b, u32 i) {
   1516   Inst* in = &ctx->f->blocks[b].insts[i];
   1517   if (gvn_fold_branch(ctx, b, in)) {
   1518     ctx->changed = 1;
   1519     ctx->cfg_changed = 1;
   1520     return;
   1521   }
   1522   if (gvn_visit_memory_inst(ctx, b, i, in)) return;
   1523   if (!gvn_scalar_candidate(in)) return;
   1524 
   1525   if (gvn_try_fold_inst(ctx, in)) ctx->changed = 1;
   1526   gvn_note_const(ctx, in);
   1527 
   1528   GvnKey key;
   1529   if (!gvn_make_key(ctx, in, &key)) return;
   1530 
   1531   Val leader = VAL_NONE;
   1532   if (gvn_table_find(ctx, &key, b, i, &leader)) {
   1533     if (gvn_replace_dominated_uses(ctx, in->def, leader,
   1534                                    ctx->f->val_def_block[leader],
   1535                                    ctx->f->val_def_inst[leader]))
   1536       ctx->changed = 1;
   1537     return;
   1538   }
   1539   gvn_table_add(&ctx->table, &key, in->def, b, i);
   1540 }
   1541 
   1542 static int gvn_raw_const_zero(Func* f, Val v) {
   1543   Inst* def = NULL;
   1544   return val_def_inst(f, v, &def) && def && (IROp)def->op == IR_LOAD_IMM &&
   1545          def->extra.imm == 0;
   1546 }
   1547 
   1548 static int gvn_raw_operand_zero(Func* f, const Operand* op) {
   1549   if (!op) return 0;
   1550   if (op->kind == OPK_IMM) return op->v.imm == 0;
   1551   if (op->kind == OPK_REG) return gvn_raw_const_zero(f, (Val)op->v.reg);
   1552   return 0;
   1553 }
   1554 
   1555 static int gvn_raw_local_addr_root(Func* f, Val v, u32 depth, FrameSlot* out) {
   1556   Inst* def = NULL;
   1557   if (!f || depth > 4u || v == VAL_NONE) return 0;
   1558   if (!val_def_inst(f, v, &def) || !def) return 0;
   1559   if ((IROp)def->op == IR_ADDR_OF && def->nopnds >= 2 &&
   1560       def->opnds[1].kind == OPK_LOCAL) {
   1561     *out = def->opnds[1].v.frame_slot;
   1562     return 1;
   1563   }
   1564   if ((IROp)def->op == IR_COPY && def->nopnds >= 2 &&
   1565       def->opnds[1].kind == OPK_REG)
   1566     return gvn_raw_local_addr_root(f, (Val)def->opnds[1].v.reg, depth + 1u,
   1567                                    out);
   1568   if ((IROp)def->op == IR_BINOP && def->nopnds >= 3) {
   1569     if ((BinOp)def->extra.imm == BO_IADD) {
   1570       if (def->opnds[1].kind == OPK_REG &&
   1571           gvn_raw_operand_zero(f, &def->opnds[2]))
   1572         return gvn_raw_local_addr_root(f, (Val)def->opnds[1].v.reg, depth + 1u,
   1573                                        out);
   1574       if (def->opnds[2].kind == OPK_REG &&
   1575           gvn_raw_operand_zero(f, &def->opnds[1]))
   1576         return gvn_raw_local_addr_root(f, (Val)def->opnds[2].v.reg, depth + 1u,
   1577                                        out);
   1578     }
   1579     if ((BinOp)def->extra.imm == BO_ISUB && def->opnds[1].kind == OPK_REG &&
   1580         gvn_raw_operand_zero(f, &def->opnds[2]))
   1581       return gvn_raw_local_addr_root(f, (Val)def->opnds[1].v.reg, depth + 1u,
   1582                                      out);
   1583   }
   1584   return 0;
   1585 }
   1586 
   1587 static void gvn_mark_escaped_operand(GvnCtx* ctx, const Operand* op) {
   1588   FrameSlot fs = FRAME_SLOT_NONE;
   1589   if (!op) return;
   1590   if (op->kind == OPK_LOCAL) {
   1591     fs = op->v.frame_slot;
   1592     if (fs > 0 && (u32)fs <= ctx->f->nframe_slots) ctx->local_escaped[fs] = 1;
   1593   } else if (op->kind == OPK_REG) {
   1594     if (gvn_raw_local_addr_root(ctx->f, (Val)op->v.reg, 0, &fs) && fs > 0 &&
   1595         (u32)fs <= ctx->f->nframe_slots)
   1596       ctx->local_escaped[fs] = 1;
   1597   } else if (op->kind == OPK_INDIRECT) {
   1598     if (gvn_raw_local_addr_root(ctx->f, (Val)op->v.ind.base, 0, &fs) &&
   1599         fs > 0 && (u32)fs <= ctx->f->nframe_slots)
   1600       ctx->local_escaped[fs] = 1;
   1601   }
   1602 }
   1603 
   1604 static void gvn_mark_escaped_abivalue(GvnCtx* ctx, const CGABIValue* v) {
   1605   if (!v) return;
   1606   gvn_mark_escaped_operand(ctx, &v->storage);
   1607   for (u32 i = 0; i < v->nparts; ++i)
   1608     gvn_mark_escaped_operand(ctx, &v->parts[i].op);
   1609 }
   1610 
   1611 static void gvn_collect_local_escapes(GvnCtx* ctx) {
   1612   Func* f = ctx->f;
   1613   for (u32 b = 0; b < f->nblocks; ++b) {
   1614     Block* bl = &f->blocks[b];
   1615     for (u32 i = 0; i < bl->ninsts; ++i) {
   1616       Inst* in = &bl->insts[i];
   1617       switch ((IROp)in->op) {
   1618         case IR_LOAD:
   1619         case IR_ADDR_OF:
   1620         case IR_COPY:
   1621           break;
   1622         case IR_STORE:
   1623           if (in->nopnds >= 2) gvn_mark_escaped_operand(ctx, &in->opnds[1]);
   1624           break;
   1625         case IR_BINOP:
   1626           if (in->nopnds >= 3 && (BinOp)in->extra.imm == BO_IADD &&
   1627               (gvn_raw_operand_zero(f, &in->opnds[1]) ||
   1628                gvn_raw_operand_zero(f, &in->opnds[2])))
   1629             break;
   1630           if (in->nopnds >= 3 && (BinOp)in->extra.imm == BO_ISUB &&
   1631               gvn_raw_operand_zero(f, &in->opnds[2]))
   1632             break;
   1633           for (u32 o = 1; o < in->nopnds; ++o)
   1634             gvn_mark_escaped_operand(ctx, &in->opnds[o]);
   1635           break;
   1636         case IR_CALL: {
   1637           IRCallAux* aux = (IRCallAux*)in->extra.aux;
   1638           if (!aux) break;
   1639           if (aux->use_plan_replay) {
   1640             gvn_mark_escaped_operand(ctx, &aux->plan.callee);
   1641             for (u32 a = 0; a < aux->plan.nargs; ++a)
   1642               gvn_mark_escaped_operand(ctx, &aux->plan.args[a].src);
   1643             for (u32 r = 0; r < aux->plan.nrets; ++r)
   1644               gvn_mark_escaped_operand(ctx, &aux->plan.rets[r].dst);
   1645           } else {
   1646             gvn_mark_escaped_operand(ctx, &aux->desc.callee);
   1647             for (u32 a = 0; a < aux->desc.nargs; ++a)
   1648               gvn_mark_escaped_abivalue(ctx, &aux->desc.args[a]);
   1649             gvn_mark_escaped_abivalue(ctx, &aux->desc.ret);
   1650           }
   1651           break;
   1652         }
   1653         case IR_RET: {
   1654           IRRetAux* aux = (IRRetAux*)in->extra.aux;
   1655           if (aux && aux->present) gvn_mark_escaped_abivalue(ctx, &aux->val);
   1656           break;
   1657         }
   1658         default:
   1659           if (gvn_inst_memory_barrier(in))
   1660             for (u32 o = 0; o < in->nopnds; ++o)
   1661               gvn_mark_escaped_operand(ctx, &in->opnds[o]);
   1662           break;
   1663       }
   1664     }
   1665   }
   1666 }
   1667 
   1668 static int gvn_mem_state_has_avail(const GvnMemState* s, const GvnKey* key,
   1669                                    Val val) {
   1670   for (u32 i = 0; i < s->navail; ++i)
   1671     if (s->avail[i].val == val && gvn_key_equal(&s->avail[i].key, key))
   1672       return 1;
   1673   return 0;
   1674 }
   1675 
   1676 static void gvn_mem_merge_block_in(GvnCtx* ctx, u32 b) {
   1677   Func* f = ctx->f;
   1678   Block* bl = &f->blocks[b];
   1679   GvnBlockMemState* bs = &ctx->block_mem[b];
   1680   memset(&bs->in, 0, sizeof bs->in);
   1681   if (!bl->npreds) return;
   1682   const GvnMemState* first = NULL;
   1683   for (u32 p = 0; p < bl->npreds; ++p) {
   1684     u32 pred = bl->preds[p];
   1685     if (pred >= f->nblocks || !ctx->analysis->reachable[pred]) continue;
   1686     if (!ctx->block_mem[pred].out_valid) return;
   1687     if (!first) first = &ctx->block_mem[pred].out;
   1688   }
   1689   if (!first) return;
   1690   gvn_mem_state_copy(ctx, &bs->in, first);
   1691   for (u32 i = 0; i < bs->in.navail;) {
   1692     GvnMemAvail* a = &bs->in.avail[i];
   1693     int keep = 1;
   1694     for (u32 p = 0; p < bl->npreds; ++p) {
   1695       u32 pred = bl->preds[p];
   1696       if (pred >= f->nblocks || !ctx->analysis->reachable[pred]) continue;
   1697       if (!gvn_mem_state_has_avail(&ctx->block_mem[pred].out, &a->key,
   1698                                    a->val)) {
   1699         keep = 0;
   1700         break;
   1701       }
   1702     }
   1703     if (keep)
   1704       ++i;
   1705     else
   1706       gvn_mem_avail_remove_at(&bs->in, i);
   1707   }
   1708 }
   1709 
   1710 static void gvn_realign_phi_preds(Func* f) {
   1711   for (u32 b = 0; b < f->nblocks; ++b) {
   1712     Block* bl = &f->blocks[b];
   1713     for (u32 i = 0; i < bl->ninsts; ++i) {
   1714       Inst* phi = &bl->insts[i];
   1715       if ((IROp)phi->op != IR_PHI) break;
   1716       IRPhiAux* aux = (IRPhiAux*)phi->extra.aux;
   1717       if (!aux) continue;
   1718       u32 old_n = aux->npreds;
   1719       u32* old_blocks = aux->pred_blocks;
   1720       Val* old_vals = aux->pred_vals;
   1721       u32* pred_blocks =
   1722           bl->npreds ? arena_array(f->arena, u32, bl->npreds) : NULL;
   1723       Val* pred_vals =
   1724           bl->npreds ? arena_zarray(f->arena, Val, bl->npreds) : NULL;
   1725       for (u32 p = 0; p < bl->npreds; ++p) {
   1726         pred_blocks[p] = bl->preds[p];
   1727         pred_vals[p] = phi->def;
   1728         for (u32 old = 0; old < old_n; ++old) {
   1729           if (old_blocks && old_blocks[old] == bl->preds[p]) {
   1730             pred_vals[p] = old_vals ? old_vals[old] : VAL_NONE;
   1731             break;
   1732           }
   1733         }
   1734       }
   1735       aux->npreds = bl->npreds;
   1736       aux->pred_blocks = pred_blocks;
   1737       aux->pred_vals = pred_vals;
   1738     }
   1739   }
   1740 }
   1741 
   1742 void opt_gvn(Func* f) {
   1743   if (!f || f->opt_rewritten) return;
   1744   if (!f->opt_reg_ssa && f->npregs > 1) {
   1745     opt_rebuild_def_use(f);
   1746     return;
   1747   }
   1748 
   1749   opt_rebuild_def_use(f);
   1750   OptAnalysis a;
   1751   memset(&a, 0, sizeof a);
   1752   opt_analysis_build_dominators(f, &a);
   1753 
   1754   GvnCtx ctx;
   1755   memset(&ctx, 0, sizeof ctx);
   1756   ctx.f = f;
   1757   ctx.analysis = &a;
   1758   ctx.parent = arena_array(f->arena, Val, f->nvals ? f->nvals : 1u);
   1759   ctx.constants = arena_zarray(f->arena, GvnConst, f->nvals ? f->nvals : 1u);
   1760   ctx.block_mem =
   1761       arena_zarray(f->arena, GvnBlockMemState, f->nblocks ? f->nblocks : 1u);
   1762   ctx.local_escaped =
   1763       arena_zarray(f->arena, u8, f->nframe_slots ? f->nframe_slots + 1u : 1u);
   1764   for (Val v = 0; v < f->nvals; ++v) ctx.parent[v] = v;
   1765   gvn_table_init(&ctx.table, f);
   1766   gvn_collect_local_escapes(&ctx);
   1767 
   1768   for (u32 ri = 0; ri < a.nrpo; ++ri) {
   1769     u32 b = a.rpo[ri];
   1770     if (b >= f->nblocks || !a.reachable[b]) continue;
   1771     gvn_mem_merge_block_in(&ctx, b);
   1772     gvn_mem_state_copy(&ctx, &ctx.mem, &ctx.block_mem[b].in);
   1773     for (u32 i = 0; i < f->blocks[b].ninsts; ++i) gvn_visit_inst(&ctx, b, i);
   1774     gvn_mem_state_copy(&ctx, &ctx.block_mem[b].out, &ctx.mem);
   1775     ctx.block_mem[b].out_valid = 1;
   1776   }
   1777 
   1778   if (ctx.cfg_changed) {
   1779     opt_analysis_invalidate(f, OPT_ANALYSIS_CFG | OPT_ANALYSIS_DEF_USE |
   1780                                    OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP);
   1781     opt_build_cfg(f);
   1782     gvn_realign_phi_preds(f);
   1783   } else if (ctx.changed) {
   1784     opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE);
   1785   }
   1786   opt_rebuild_def_use(f);
   1787 }
   1788 
   1789 static int dse_key_equal(const DseKey* a, const DseKey* b) {
   1790   return a->root_kind == b->root_kind && a->addr_space == b->addr_space &&
   1791          a->flags == b->flags && a->mem_type == b->mem_type &&
   1792          a->size == b->size && a->align == b->align &&
   1793          a->root_id == b->root_id && a->offset == b->offset;
   1794 }
   1795 
   1796 static int dse_key_same_root(const DseKey* a, const DseKey* b) {
   1797   return a->root_kind == b->root_kind && a->addr_space == b->addr_space &&
   1798          a->root_id == b->root_id;
   1799 }
   1800 
   1801 static int dse_ranges_overlap(const DseKey* a, const DseKey* b) {
   1802   i64 ae = a->offset + (i64)a->size;
   1803   i64 be = b->offset + (i64)b->size;
   1804   return a->offset < be && b->offset < ae;
   1805 }
   1806 
   1807 static int dse_range_covers(const DseKey* cover, const DseKey* covered) {
   1808   i64 ce = cover->offset + (i64)cover->size;
   1809   i64 de = covered->offset + (i64)covered->size;
   1810   return cover->offset <= covered->offset && ce >= de;
   1811 }
   1812 
   1813 static int dse_root_call_clobbered(DseCtx* ctx, const DseKey* key) {
   1814   if (key->root_kind == ALIAS_LOCAL && key->root_id > 0 &&
   1815       (u32)key->root_id <= ctx->f->nframe_slots)
   1816     return ctx->local_escaped && ctx->local_escaped[key->root_id];
   1817   return key->root_kind != ALIAS_STRING;
   1818 }
   1819 
   1820 static int dse_root_observable_at_exit(DseCtx* ctx, const DseKey* key) {
   1821   if (key->root_kind == ALIAS_LOCAL && key->root_id > 0 &&
   1822       (u32)key->root_id <= ctx->f->nframe_slots)
   1823     return ctx->local_escaped && ctx->local_escaped[key->root_id];
   1824   return key->root_kind != ALIAS_UNKNOWN;
   1825 }
   1826 
   1827 static int dse_make_key(DseCtx* ctx, const Inst* in, u32 addr_idx,
   1828                         DseKey* out) {
   1829   u8 root_kind;
   1830   i64 root_id;
   1831   i64 offset;
   1832   int singleton;
   1833 
   1834   if (!ctx || !in || addr_idx >= in->nopnds ||
   1835       opt_mem_observable(&in->extra.mem))
   1836     return 0;
   1837   if (in->extra.mem.size == 0) return 0;
   1838   if (!gvn_mem_root_from_access(&ctx->gvn, &in->opnds[addr_idx], &in->extra.mem,
   1839                                 &root_kind, &root_id, &offset, &singleton))
   1840     return 0;
   1841   if (!singleton || root_kind == ALIAS_UNKNOWN) return 0;
   1842 
   1843   memset(out, 0, sizeof *out);
   1844   out->root_kind = root_kind;
   1845   out->addr_space = in->extra.mem.addr_space;
   1846   out->flags = in->extra.mem.flags;
   1847   out->mem_type = in->extra.mem.type;
   1848   out->size = in->extra.mem.size;
   1849   out->align = in->extra.mem.align;
   1850   out->root_id = root_id;
   1851   out->offset = offset;
   1852   return 1;
   1853 }
   1854 
   1855 static u32 dse_key_intern(DseCtx* ctx, const DseKey* key) {
   1856   for (u32 i = 0; i < ctx->nkeys; ++i)
   1857     if (dse_key_equal(&ctx->keys[i], key)) return i;
   1858   if (ctx->nkeys == ctx->cap_keys) {
   1859     u32 ncap = ctx->cap_keys ? ctx->cap_keys * 2u : 16u;
   1860     DseKey* keys = arena_array(ctx->f->arena, DseKey, ncap);
   1861     if (ctx->keys) memcpy(keys, ctx->keys, sizeof(keys[0]) * ctx->nkeys);
   1862     ctx->keys = keys;
   1863     ctx->cap_keys = ncap;
   1864   }
   1865   ctx->keys[ctx->nkeys] = *key;
   1866   return ctx->nkeys++;
   1867 }
   1868 
   1869 static void dse_bitset_clear(DseCtx* ctx, DseBitset* bs) {
   1870   memset(bs->words, 0, sizeof(bs->words[0]) * ctx->words);
   1871 }
   1872 
   1873 static int dse_bitset_copy(DseCtx* ctx, DseBitset* dst, const DseBitset* src) {
   1874   int changed =
   1875       memcmp(dst->words, src->words, sizeof(dst->words[0]) * ctx->words) != 0;
   1876   if (changed)
   1877     memcpy(dst->words, src->words, sizeof(dst->words[0]) * ctx->words);
   1878   return changed;
   1879 }
   1880 
   1881 static void dse_bitset_or(DseCtx* ctx, DseBitset* dst, const DseBitset* src) {
   1882   for (u32 w = 0; w < ctx->words; ++w) dst->words[w] |= src->words[w];
   1883 }
   1884 
   1885 static int dse_bitset_test(const DseBitset* bs, u32 bit) {
   1886   return (bs->words[bit / 64u] & (1ull << (bit % 64u))) != 0;
   1887 }
   1888 
   1889 static void dse_bitset_set(DseBitset* bs, u32 bit) {
   1890   bs->words[bit / 64u] |= 1ull << (bit % 64u);
   1891 }
   1892 
   1893 static void dse_bitset_reset(DseBitset* bs, u32 bit) {
   1894   bs->words[bit / 64u] &= ~(1ull << (bit % 64u));
   1895 }
   1896 
   1897 static int dse_live_overlaps(DseCtx* ctx, const DseBitset* live,
   1898                              const DseKey* key) {
   1899   for (u32 i = 0; i < ctx->nkeys; ++i) {
   1900     if (!dse_bitset_test(live, i)) continue;
   1901     if (dse_key_same_root(&ctx->keys[i], key) &&
   1902         dse_ranges_overlap(&ctx->keys[i], key))
   1903       return 1;
   1904   }
   1905   return 0;
   1906 }
   1907 
   1908 static void dse_live_mark_overlaps(DseCtx* ctx, DseBitset* live,
   1909                                    const DseKey* key) {
   1910   for (u32 i = 0; i < ctx->nkeys; ++i) {
   1911     if (dse_key_same_root(&ctx->keys[i], key) &&
   1912         dse_ranges_overlap(&ctx->keys[i], key))
   1913       dse_bitset_set(live, i);
   1914   }
   1915 }
   1916 
   1917 static void dse_live_clear_covered(DseCtx* ctx, DseBitset* live,
   1918                                    const DseKey* key) {
   1919   for (u32 i = 0; i < ctx->nkeys; ++i) {
   1920     if (dse_key_same_root(&ctx->keys[i], key) &&
   1921         dse_range_covers(key, &ctx->keys[i]))
   1922       dse_bitset_reset(live, i);
   1923   }
   1924 }
   1925 
   1926 static void dse_live_mark_all(DseCtx* ctx, DseBitset* live) {
   1927   for (u32 i = 0; i < ctx->nkeys; ++i) dse_bitset_set(live, i);
   1928 }
   1929 
   1930 static void dse_live_mark_call_clobbered(DseCtx* ctx, DseBitset* live) {
   1931   for (u32 i = 0; i < ctx->nkeys; ++i)
   1932     if (dse_root_call_clobbered(ctx, &ctx->keys[i])) dse_bitset_set(live, i);
   1933 }
   1934 
   1935 static void dse_live_mark_exit_observable(DseCtx* ctx, DseBitset* live) {
   1936   for (u32 i = 0; i < ctx->nkeys; ++i)
   1937     if (dse_root_observable_at_exit(ctx, &ctx->keys[i]))
   1938       dse_bitset_set(live, i);
   1939 }
   1940 
   1941 static int dse_store_site_key(DseCtx* ctx, u32 block, u32 inst, u32* out) {
   1942   if (!ctx->store_key_by_inst || block >= ctx->f->nblocks ||
   1943       inst >= ctx->f->blocks[block].ninsts || !ctx->store_key_by_inst[block])
   1944     return 0;
   1945   *out = ctx->store_key_by_inst[block][inst];
   1946   if (*out != DSE_KEY_NONE) return 1;
   1947   return 0;
   1948 }
   1949 
   1950 static int dse_inst_full_barrier(const Inst* in) {
   1951   return gvn_inst_memory_barrier(in);
   1952 }
   1953 
   1954 static void dse_transfer_inst(DseCtx* ctx, DseBitset* live, u32 block, u32 inst,
   1955                               int mark_dead) {
   1956   Inst* in = &ctx->f->blocks[block].insts[inst];
   1957   DseKey key;
   1958   u32 key_id;
   1959 
   1960   switch ((IROp)in->op) {
   1961     case IR_LOAD:
   1962       if (opt_mem_observable(&in->extra.mem)) {
   1963         dse_live_mark_all(ctx, live);
   1964       } else if (dse_make_key(ctx, in, 1u, &key)) {
   1965         dse_live_mark_overlaps(ctx, live, &key);
   1966       } else {
   1967         dse_live_mark_all(ctx, live);
   1968       }
   1969       break;
   1970     case IR_STORE:
   1971       if (!dse_store_site_key(ctx, block, inst, &key_id)) {
   1972         dse_live_mark_all(ctx, live);
   1973         break;
   1974       }
   1975       key = ctx->keys[key_id];
   1976       if (mark_dead && !dse_live_overlaps(ctx, live, &key)) {
   1977         in->op = IR_NOP;
   1978         in->nopnds = 0;
   1979         in->opnds = NULL;
   1980         ctx->changed = 1;
   1981       }
   1982       dse_live_clear_covered(ctx, live, &key);
   1983       break;
   1984     case IR_CALL:
   1985       dse_live_mark_call_clobbered(ctx, live);
   1986       break;
   1987     default:
   1988       if (dse_inst_full_barrier(in)) dse_live_mark_all(ctx, live);
   1989       break;
   1990   }
   1991 }
   1992 
   1993 static void dse_collect_constants(GvnCtx* ctx) {
   1994   for (u32 b = 0; b < ctx->f->nblocks; ++b) {
   1995     Block* bl = &ctx->f->blocks[b];
   1996     for (u32 i = 0; i < bl->ninsts; ++i) gvn_note_const(ctx, &bl->insts[i]);
   1997   }
   1998 }
   1999 
   2000 static void dse_collect_stores(DseCtx* ctx) {
   2001   Func* f = ctx->f;
   2002   ctx->store_key_by_inst =
   2003       arena_zarray(f->arena, u32*, f->nblocks ? f->nblocks : 1u);
   2004   for (u32 b = 0; b < f->nblocks; ++b) {
   2005     Block* bl = &f->blocks[b];
   2006     ctx->store_key_by_inst[b] =
   2007         arena_array(f->arena, u32, bl->ninsts ? bl->ninsts : 1u);
   2008     for (u32 i = 0; i < bl->ninsts; ++i)
   2009       ctx->store_key_by_inst[b][i] = DSE_KEY_NONE;
   2010     for (u32 i = 0; i < bl->ninsts; ++i) {
   2011       Inst* in = &bl->insts[i];
   2012       DseKey key;
   2013       if ((IROp)in->op != IR_STORE) continue;
   2014       if (!dse_make_key(ctx, in, 0u, &key)) continue;
   2015       ctx->store_key_by_inst[b][i] = dse_key_intern(ctx, &key);
   2016     }
   2017   }
   2018 }
   2019 
   2020 static DseBitset dse_bitset_new(DseCtx* ctx) {
   2021   DseBitset bs;
   2022   bs.words = arena_zarray(ctx->f->arena, u64, ctx->words ? ctx->words : 1u);
   2023   return bs;
   2024 }
   2025 
   2026 static void dse_init_block_sets(DseCtx* ctx) {
   2027   Func* f = ctx->f;
   2028   ctx->words = (ctx->nkeys + 63u) / 64u;
   2029   if (!ctx->words) ctx->words = 1u;
   2030   ctx->block =
   2031       arena_zarray(f->arena, DseBlockState, f->nblocks ? f->nblocks : 1u);
   2032   for (u32 b = 0; b < f->nblocks; ++b) {
   2033     ctx->block[b].in = dse_bitset_new(ctx);
   2034     ctx->block[b].out = dse_bitset_new(ctx);
   2035   }
   2036   ctx->scratch_in = dse_bitset_new(ctx);
   2037   ctx->scratch_out = dse_bitset_new(ctx);
   2038 }
   2039 
   2040 static void dse_compute_block_out(DseCtx* ctx, u32 b, DseBitset* out) {
   2041   Func* f = ctx->f;
   2042   Block* bl = &f->blocks[b];
   2043   dse_bitset_clear(ctx, out);
   2044   if (!bl->nsucc) {
   2045     dse_live_mark_exit_observable(ctx, out);
   2046     return;
   2047   }
   2048   for (u32 s = 0; s < bl->nsucc; ++s) {
   2049     u32 succ = bl->succ[s];
   2050     if (succ < f->nblocks) dse_bitset_or(ctx, out, &ctx->block[succ].in);
   2051   }
   2052 }
   2053 
   2054 static void dse_transfer_block(DseCtx* ctx, u32 b, const DseBitset* out,
   2055                                DseBitset* in, int mark_dead) {
   2056   Block* bl = &ctx->f->blocks[b];
   2057   dse_bitset_copy(ctx, in, out);
   2058   for (u32 ri = bl->ninsts; ri > 0; --ri)
   2059     dse_transfer_inst(ctx, in, b, ri - 1u, mark_dead);
   2060 }
   2061 
   2062 static void dse_refresh_def_locations(Func* f) {
   2063   for (u32 b = 0; b < f->nblocks; ++b) {
   2064     Block* bl = &f->blocks[b];
   2065     for (u32 i = 0; i < bl->ninsts; ++i) {
   2066       Inst* in = &bl->insts[i];
   2067       if (in->def != VAL_NONE && in->def < f->nvals) {
   2068         f->val_def_block[in->def] = b;
   2069         f->val_def_inst[in->def] = i;
   2070       }
   2071       for (u32 d = 0; d < in->ndefs; ++d) {
   2072         Val v = in->defs[d];
   2073         if (v != VAL_NONE && v < f->nvals) {
   2074           f->val_def_block[v] = b;
   2075           f->val_def_inst[v] = i;
   2076         }
   2077       }
   2078     }
   2079   }
   2080 }
   2081 
   2082 static void dse_compact_nops(Func* f) {
   2083   for (u32 b = 0; b < f->nblocks; ++b) {
   2084     Block* bl = &f->blocks[b];
   2085     u32 w = 0;
   2086     for (u32 i = 0; i < bl->ninsts; ++i) {
   2087       if ((IROp)bl->insts[i].op == IR_NOP) continue;
   2088       bl->insts[w++] = bl->insts[i];
   2089     }
   2090     bl->ninsts = w;
   2091   }
   2092   dse_refresh_def_locations(f);
   2093 }
   2094 
   2095 void opt_dse(Func* f) {
   2096   if (!f || f->opt_rewritten) return;
   2097   if (!f->opt_reg_ssa && f->npregs > 1) {
   2098     opt_rebuild_def_use(f);
   2099     return;
   2100   }
   2101 
   2102   opt_rebuild_def_use(f);
   2103   DseCtx ctx;
   2104   memset(&ctx, 0, sizeof ctx);
   2105   ctx.f = f;
   2106   ctx.local_escaped =
   2107       arena_zarray(f->arena, u8, f->nframe_slots ? f->nframe_slots + 1u : 1u);
   2108 
   2109   memset(&ctx.gvn, 0, sizeof ctx.gvn);
   2110   ctx.gvn.f = f;
   2111   ctx.gvn.parent = arena_array(f->arena, Val, f->nvals ? f->nvals : 1u);
   2112   ctx.gvn.constants =
   2113       arena_zarray(f->arena, GvnConst, f->nvals ? f->nvals : 1u);
   2114   ctx.gvn.local_escaped = ctx.local_escaped;
   2115   for (Val v = 0; v < f->nvals; ++v) ctx.gvn.parent[v] = v;
   2116   gvn_collect_local_escapes(&ctx.gvn);
   2117   dse_collect_constants(&ctx.gvn);
   2118   dse_collect_stores(&ctx);
   2119   if (!ctx.nkeys) {
   2120     opt_rebuild_def_use(f);
   2121     return;
   2122   }
   2123 
   2124   dse_init_block_sets(&ctx);
   2125   int changed = 1;
   2126   while (changed) {
   2127     changed = 0;
   2128     for (u32 rb = f->nblocks; rb > 0; --rb) {
   2129       u32 b = rb - 1u;
   2130       dse_compute_block_out(&ctx, b, &ctx.scratch_out);
   2131       dse_transfer_block(&ctx, b, &ctx.scratch_out, &ctx.scratch_in, 0);
   2132       changed |= dse_bitset_copy(&ctx, &ctx.block[b].out, &ctx.scratch_out);
   2133       changed |= dse_bitset_copy(&ctx, &ctx.block[b].in, &ctx.scratch_in);
   2134     }
   2135   }
   2136 
   2137   for (u32 b = 0; b < f->nblocks; ++b) {
   2138     dse_transfer_block(&ctx, b, &ctx.block[b].out, &ctx.scratch_in, 1);
   2139   }
   2140 
   2141   if (ctx.changed) {
   2142     dse_compact_nops(f);
   2143     opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE);
   2144   }
   2145   opt_rebuild_def_use(f);
   2146 }
   2147 
   2148 static void licm_mark_body(Func* f, const u8* reachable, u32 header, u32 latch,
   2149                            u8* body, u32* stack) {
   2150   u32 sp = 0;
   2151   if (!body[header]) body[header] = 1;
   2152   if (!body[latch]) {
   2153     body[latch] = 1;
   2154     stack[sp++] = latch;
   2155   }
   2156 
   2157   while (sp) {
   2158     u32 b = stack[--sp];
   2159     if (b == header) continue;
   2160     Block* bl = &f->blocks[b];
   2161     for (u32 p = 0; p < bl->npreds; ++p) {
   2162       u32 pred = bl->preds[p];
   2163       if (pred >= f->nblocks || !reachable[pred] || body[pred]) continue;
   2164       body[pred] = 1;
   2165       stack[sp++] = pred;
   2166     }
   2167   }
   2168 }
   2169 
   2170 static u32 licm_collect_loops(Func* f, OptAnalysis* a, LicmLoop** out) {
   2171   u32 nloops = 0;
   2172   LicmLoop* loops =
   2173       arena_zarray(f->arena, LicmLoop, f->nblocks ? f->nblocks : 1u);
   2174   u8* scratch = arena_zarray(f->arena, u8, f->nblocks ? f->nblocks : 1u);
   2175   u32* stack = arena_array(f->arena, u32, f->nblocks ? f->nblocks : 1u);
   2176 
   2177   for (u32 header = 0; header < f->nblocks; ++header) {
   2178     if (!a->reachable || !a->reachable[header]) continue;
   2179     memset(scratch, 0, f->nblocks * sizeof scratch[0]);
   2180     int has_loop = 0;
   2181 
   2182     for (u32 latch = 0; latch < f->nblocks; ++latch) {
   2183       if (!a->reachable[latch]) continue;
   2184       Block* lb = &f->blocks[latch];
   2185       for (u32 s = 0; s < lb->nsucc; ++s) {
   2186         if (lb->succ[s] != header) continue;
   2187         if (!opt_analysis_dominates(a, header, latch)) continue;
   2188         has_loop = 1;
   2189         licm_mark_body(f, a->reachable, header, latch, scratch, stack);
   2190       }
   2191     }
   2192     if (!has_loop) continue;
   2193 
   2194     u32 preheader = OPT_BLOCK_NONE;
   2195     Block* hb = &f->blocks[header];
   2196     for (u32 p = 0; p < hb->npreds; ++p) {
   2197       u32 pred = hb->preds[p];
   2198       if (pred >= f->nblocks || scratch[pred]) continue;
   2199       if (preheader != OPT_BLOCK_NONE) {
   2200         preheader = OPT_BLOCK_NONE;
   2201         break;
   2202       }
   2203       preheader = pred;
   2204     }
   2205     if (preheader == OPT_BLOCK_NONE) continue;
   2206     if (f->blocks[preheader].nsucc != 1 ||
   2207         f->blocks[preheader].succ[0] != header)
   2208       continue;
   2209 
   2210     u8* body = arena_array(f->arena, u8, f->nblocks ? f->nblocks : 1u);
   2211     memcpy(body, scratch, f->nblocks * sizeof body[0]);
   2212     loops[nloops].header = header;
   2213     loops[nloops].preheader = preheader;
   2214     loops[nloops].body = body;
   2215     ++nloops;
   2216   }
   2217 
   2218   *out = loops;
   2219   return nloops;
   2220 }
   2221 
   2222 static int licm_safe_binop(BinOp op) {
   2223   switch (op) {
   2224     case BO_SDIV:
   2225     case BO_UDIV:
   2226     case BO_SREM:
   2227     case BO_UREM:
   2228     case BO_FDIV:
   2229       return 0;
   2230     default:
   2231       return 1;
   2232   }
   2233 }
   2234 
   2235 static int licm_candidate(Func* f, const Inst* in) {
   2236   if (!in || in->def == VAL_NONE || in->ndefs || in->flags) return 0;
   2237   if (in->def >= f->nvals || opt_inst_has_side_effect(f, in)) return 0;
   2238   switch ((IROp)in->op) {
   2239     case IR_CONST_I:
   2240     case IR_CONST_BYTES:
   2241     case IR_LOAD_IMM:
   2242     case IR_LOAD_CONST:
   2243     case IR_LOAD_LABEL_ADDR:
   2244     case IR_COPY:
   2245     case IR_ADDR_OF:
   2246     case IR_UNOP:
   2247     case IR_CMP:
   2248     case IR_CONVERT:
   2249       return 1;
   2250     case IR_BINOP:
   2251       return licm_safe_binop((BinOp)in->extra.imm);
   2252     default:
   2253       return 0;
   2254   }
   2255 }
   2256 
   2257 static int licm_operand_invariant(Func* f, const u8* body, const Operand* op) {
   2258   if (!op) return 1;
   2259   if (op->kind == OPK_REG) {
   2260     Val v = (Val)op->v.reg;
   2261     if (v == VAL_NONE || v >= f->nvals) return 0;
   2262     u32 def_block = f->val_def_block[v];
   2263     return def_block < f->nblocks && !body[def_block];
   2264   }
   2265   if (op->kind == OPK_INDIRECT) {
   2266     Val v = (Val)op->v.ind.base;
   2267     if (v == VAL_NONE || v >= f->nvals) return 0;
   2268     u32 def_block = f->val_def_block[v];
   2269     return def_block < f->nblocks && !body[def_block];
   2270   }
   2271   return 1;
   2272 }
   2273 
   2274 static int licm_inst_invariant(Func* f, const LicmLoop* loop, const Inst* in) {
   2275   if (!licm_candidate(f, in)) return 0;
   2276   for (u32 i = 0; i < in->nopnds; ++i) {
   2277     const Operand* op = &in->opnds[i];
   2278     if (op->kind == OPK_REG && opt_val_in_inst_defs(in, (Val)op->v.reg))
   2279       continue;
   2280     if (!licm_operand_invariant(f, loop->body, op)) return 0;
   2281   }
   2282   return 1;
   2283 }
   2284 
   2285 static void licm_insert_inst(Func* f, u32 block, u32 at, Inst in) {
   2286   Block* bl = &f->blocks[block];
   2287   if (at > bl->ninsts) at = bl->ninsts;
   2288   if (bl->ninsts == bl->cap) {
   2289     u32 ncap = bl->cap ? bl->cap * 2u : 8u;
   2290     while (ncap < bl->ninsts + 1u) ncap *= 2u;
   2291     Inst* insts = arena_zarray(f->arena, Inst, ncap);
   2292     if (at) memcpy(insts, bl->insts, sizeof(insts[0]) * at);
   2293     if (bl->ninsts > at)
   2294       memcpy(&insts[at + 1u], &bl->insts[at],
   2295              sizeof(insts[0]) * (bl->ninsts - at));
   2296     bl->insts = insts;
   2297     bl->cap = ncap;
   2298   } else {
   2299     for (u32 i = bl->ninsts; i > at; --i) bl->insts[i] = bl->insts[i - 1u];
   2300   }
   2301   bl->insts[at] = in;
   2302   ++bl->ninsts;
   2303 }
   2304 
   2305 static void licm_remove_inst(Func* f, u32 block, u32 at) {
   2306   Block* bl = &f->blocks[block];
   2307   if (at >= bl->ninsts) return;
   2308   for (u32 i = at + 1u; i < bl->ninsts; ++i) bl->insts[i - 1u] = bl->insts[i];
   2309   --bl->ninsts;
   2310 }
   2311 
   2312 static u32 licm_preheader_insert_pos(Func* f, u32 preheader) {
   2313   Block* bl = &f->blocks[preheader];
   2314   if (bl->ninsts && o2_is_terminator(&bl->insts[bl->ninsts - 1u]))
   2315     return bl->ninsts - 1u;
   2316   return bl->ninsts;
   2317 }
   2318 
   2319 static void licm_hoist_inst(Func* f, const LicmLoop* loop, u32 block,
   2320                             u32 inst) {
   2321   Inst moved = f->blocks[block].insts[inst];
   2322   u32 at = licm_preheader_insert_pos(f, loop->preheader);
   2323   licm_insert_inst(f, loop->preheader, at, moved);
   2324   licm_remove_inst(f, block, inst);
   2325   if (moved.def != VAL_NONE && moved.def < f->nvals) {
   2326     f->val_def_block[moved.def] = loop->preheader;
   2327     f->val_def_inst[moved.def] = at;
   2328   }
   2329 }
   2330 
   2331 void opt_licm(Func* f) {
   2332   if (!f || f->opt_rewritten) return;
   2333   if (!f->opt_reg_ssa && f->npregs > 1) {
   2334     opt_rebuild_def_use(f);
   2335     return;
   2336   }
   2337 
   2338   OptAnalysis a;
   2339   memset(&a, 0, sizeof a);
   2340   opt_analysis_build_dominators(f, &a);
   2341   LicmLoop* loops = NULL;
   2342   u32 nloops = licm_collect_loops(f, &a, &loops);
   2343   int changed = 0;
   2344 
   2345   for (u32 l = 0; l < nloops; ++l) {
   2346     LicmLoop* loop = &loops[l];
   2347     int again = 1;
   2348     while (again) {
   2349       again = 0;
   2350       for (u32 b = 0; b < f->nblocks; ++b) {
   2351         if (!loop->body[b] || b == loop->preheader) continue;
   2352         Block* bl = &f->blocks[b];
   2353         for (u32 i = 0; i < bl->ninsts;) {
   2354           Inst* in = &bl->insts[i];
   2355           if (!licm_inst_invariant(f, loop, in)) {
   2356             ++i;
   2357             continue;
   2358           }
   2359           licm_hoist_inst(f, loop, b, i);
   2360           changed = 1;
   2361           again = 1;
   2362           bl = &f->blocks[b];
   2363         }
   2364       }
   2365     }
   2366   }
   2367 
   2368   if (changed) {
   2369     dse_refresh_def_locations(f);
   2370     opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE);
   2371   }
   2372   opt_rebuild_def_use(f);
   2373 }
   2374 
   2375 static int pressure_candidate(const Inst* in) {
   2376   if (!in || in->def == VAL_NONE || in->ndefs) return 0;
   2377   switch ((IROp)in->op) {
   2378     case IR_LOAD_IMM:
   2379     case IR_CONST_I:
   2380       return 1;
   2381     default:
   2382       return 0;
   2383   }
   2384 }
   2385 
   2386 static int pressure_barrier(const Inst* in) {
   2387   if (!in) return 1;
   2388   if (o2_is_terminator(in)) return 1;
   2389   switch ((IROp)in->op) {
   2390     case IR_LOAD:
   2391     case IR_STORE:
   2392     case IR_LOAD_CONST:
   2393     case IR_AGG_COPY:
   2394     case IR_AGG_SET:
   2395     case IR_BITFIELD_LOAD:
   2396     case IR_BITFIELD_STORE:
   2397     case IR_CALL:
   2398     case IR_SCOPE_BEGIN:
   2399     case IR_SCOPE_END:
   2400     case IR_BREAK_TO:
   2401     case IR_CONTINUE_TO:
   2402     case IR_ALLOCA:
   2403     case IR_VA_START:
   2404     case IR_VA_ARG:
   2405     case IR_VA_END:
   2406     case IR_VA_COPY:
   2407     case IR_ATOMIC_LOAD:
   2408     case IR_ATOMIC_STORE:
   2409     case IR_ATOMIC_RMW:
   2410     case IR_ATOMIC_CAS:
   2411     case IR_FENCE:
   2412     case IR_ASM_BLOCK:
   2413     case IR_INTRINSIC:
   2414       return 1;
   2415     default:
   2416       return 0;
   2417   }
   2418 }
   2419 
   2420 static int pressure_single_use(Func* f, Val v, OptUse* out) {
   2421   u32 n = 0;
   2422   OptUse one;
   2423   memset(&one, 0, sizeof one);
   2424   for (u32 u = f->opt_first_use_by_val[v]; u != OPT_USE_NONE;
   2425        u = f->opt_uses[u].next_for_val) {
   2426     one = f->opt_uses[u];
   2427     ++n;
   2428     if (n > 1u) return 0;
   2429   }
   2430   if (n != 1u) return 0;
   2431   *out = one;
   2432   return 1;
   2433 }
   2434 
   2435 static int pressure_can_move_within_block(Func* f, u32 b, u32 from, u32 to) {
   2436   if (b >= f->nblocks || from >= to) return 0;
   2437   Block* bl = &f->blocks[b];
   2438   if (to > bl->ninsts) return 0;
   2439   for (u32 i = from + 1u; i < to; ++i)
   2440     if (pressure_barrier(&bl->insts[i])) return 0;
   2441   return 1;
   2442 }
   2443 
   2444 static PressureBlockPlan* pressure_block_plan(Func* f, PressurePlan* plan,
   2445                                               u32 b) {
   2446   if (!plan->blocks)
   2447     plan->blocks =
   2448         arena_zarray(f->arena, PressureBlockPlan, f->nblocks ? f->nblocks : 1u);
   2449   PressureBlockPlan* bp = &plan->blocks[b];
   2450   if (!bp->move_src) {
   2451     Block* bl = &f->blocks[b];
   2452     u32 n = bl->ninsts ? bl->ninsts : 1u;
   2453     bp->move_src = arena_zarray(f->arena, u8, n);
   2454     bp->first_before = arena_array(f->arena, u32, n);
   2455     bp->last_before = arena_array(f->arena, u32, n);
   2456     bp->next_move = arena_array(f->arena, u32, n);
   2457     for (u32 i = 0; i < n; ++i) {
   2458       bp->first_before[i] = OPT_BLOCK_NONE;
   2459       bp->last_before[i] = OPT_BLOCK_NONE;
   2460       bp->next_move[i] = OPT_BLOCK_NONE;
   2461     }
   2462   }
   2463   return bp;
   2464 }
   2465 
   2466 static void pressure_plan_move(Func* f, PressurePlan* plan, u32 b, u32 from,
   2467                                u32 to) {
   2468   PressureBlockPlan* bp = pressure_block_plan(f, plan, b);
   2469   if (bp->move_src[from]) return;
   2470   bp->move_src[from] = 1;
   2471   bp->next_move[from] = OPT_BLOCK_NONE;
   2472   if (bp->first_before[to] == OPT_BLOCK_NONE) {
   2473     bp->first_before[to] = from;
   2474   } else {
   2475     bp->next_move[bp->last_before[to]] = from;
   2476   }
   2477   bp->last_before[to] = from;
   2478   plan->nmoves++;
   2479 }
   2480 
   2481 static int pressure_plan(Func* f, PressurePlan* plan) {
   2482   memset(plan, 0, sizeof *plan);
   2483   for (Val v = 1; v < f->nvals; ++v) {
   2484     u32 db = f->val_def_block[v];
   2485     u32 di = f->val_def_inst[v];
   2486     if (db >= f->nblocks || di >= f->blocks[db].ninsts) continue;
   2487     Inst* def = &f->blocks[db].insts[di];
   2488     if (def->def != v || !pressure_candidate(def)) continue;
   2489 
   2490     OptUse use;
   2491     if (!pressure_single_use(f, v, &use)) continue;
   2492     if (use.kind != OPT_USE_OPERAND) continue;
   2493     if (use.block != db) continue;
   2494     if (use.inst <= di + 1u) continue;
   2495     if (f->blocks[use.block].loop_depth > f->blocks[db].loop_depth) continue;
   2496     if (!pressure_can_move_within_block(f, db, di, use.inst)) continue;
   2497 
   2498     pressure_plan_move(f, plan, db, di, use.inst);
   2499   }
   2500   return plan->nmoves != 0;
   2501 }
   2502 
   2503 static void pressure_apply_block(Func* f, u32 b, PressureBlockPlan* bp) {
   2504   Block* bl = &f->blocks[b];
   2505   Inst* old = bl->insts;
   2506   Inst* insts = arena_array(f->arena, Inst, bl->ninsts ? bl->ninsts : 1u);
   2507   u32 w = 0;
   2508   for (u32 i = 0; i < bl->ninsts; ++i) {
   2509     for (u32 m = bp->first_before[i]; m != OPT_BLOCK_NONE;
   2510          m = bp->next_move[m]) {
   2511       insts[w++] = old[m];
   2512     }
   2513     if (!bp->move_src[i]) insts[w++] = old[i];
   2514   }
   2515   bl->insts = insts;
   2516   bl->cap = bl->ninsts;
   2517   if (w != bl->ninsts) {
   2518     SrcLoc loc = {0, 0, 0};
   2519     compiler_panic(f->c, loc, "opt pressure-relief: bad rewrite (%u, %u)",
   2520                    (unsigned)w, (unsigned)bl->ninsts);
   2521   }
   2522 }
   2523 
   2524 static void pressure_apply(Func* f, PressurePlan* plan) {
   2525   if (!plan->blocks || !plan->nmoves) return;
   2526   for (u32 b = 0; b < f->nblocks; ++b) {
   2527     PressureBlockPlan* bp = &plan->blocks[b];
   2528     if (bp->move_src) pressure_apply_block(f, b, bp);
   2529   }
   2530   dse_refresh_def_locations(f);
   2531   opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE);
   2532 }
   2533 
   2534 void opt_pressure_relief(Func* f) {
   2535   if (!f || f->opt_rewritten) return;
   2536   if (!f->opt_reg_ssa && f->npregs > 1) {
   2537     opt_rebuild_def_use(f);
   2538     return;
   2539   }
   2540 
   2541   PressurePlan plan;
   2542   opt_rebuild_def_use(f);
   2543   if (pressure_plan(f, &plan)) pressure_apply(f, &plan);
   2544   opt_rebuild_def_use(f);
   2545 }
   2546 
   2547 static int ssa_combine_cmp_def(Func* f, Val v, Inst** out) {
   2548   Inst* def = NULL;
   2549   if (!val_def_inst(f, v, &def)) return 0;
   2550   if ((IROp)def->op != IR_CMP || def->nopnds < 3) return 0;
   2551   *out = def;
   2552   return 1;
   2553 }
   2554 
   2555 static int ssa_combine_fold_cmp_branch(Func* f) {
   2556   int changed = 0;
   2557   for (u32 b = 0; b < f->nblocks; ++b) {
   2558     Block* bl = &f->blocks[b];
   2559     if (!bl->ninsts || bl->nsucc < 2) continue;
   2560     Inst* br = &bl->insts[bl->ninsts - 1u];
   2561     if ((IROp)br->op != IR_CONDBR || br->nopnds < 1) continue;
   2562     if (br->opnds[0].kind != OPK_REG) continue;
   2563     Inst* cmp = NULL;
   2564     if (!ssa_combine_cmp_def(f, (Val)br->opnds[0].v.reg, &cmp)) continue;
   2565 
   2566     Operand* opnds = arena_array(f->arena, Operand, 2);
   2567     opnds[0] = cmp->opnds[1];
   2568     opnds[1] = cmp->opnds[2];
   2569     br->op = IR_CMP_BRANCH;
   2570     br->opnds = opnds;
   2571     br->nopnds = 2;
   2572     br->extra.imm = cmp->extra.imm;
   2573     changed = 1;
   2574   }
   2575   return changed;
   2576 }
   2577 
   2578 static int ssa_combine_fold_addr_uses(Func* f) {
   2579   int changed = 0;
   2580   opt_rebuild_def_use(f);
   2581   u32 nvals = f->nvals;
   2582   for (Val v = 1; v < nvals; ++v) {
   2583     Inst* def = NULL;
   2584     if (!addr_def_inst(f, v, &def)) continue;
   2585     Operand addr = def->opnds[1];
   2586     if (addr.kind != OPK_LOCAL) continue;
   2587 
   2588     for (u32 u = f->opt_first_use_by_val[v]; u != OPT_USE_NONE;
   2589          u = f->opt_uses[u].next_for_val) {
   2590       OptUse* use = &f->opt_uses[u];
   2591       /* Only fold zero-EA uses to OPK_LOCAL. EA-shaped uses keep the EA
   2592        * on the load/store; OPK_LOCAL cannot carry the offset/index. */
   2593       if (addr_use_foldable_kind(f, use) != 1) continue;
   2594       Inst* mem = &f->blocks[use->block].insts[use->inst];
   2595       Operand folded = addr;
   2596       folded.type = mem->extra.mem.type ? mem->extra.mem.type : addr.type;
   2597       *use->operand = folded;
   2598       changed = 1;
   2599     }
   2600   }
   2601   return changed;
   2602 }
   2603 
   2604 void opt_ssa_combine(Func* f) {
   2605   if (!f || f->opt_rewritten) return;
   2606   if (!f->opt_reg_ssa && f->npregs > 1) {
   2607     opt_rebuild_def_use(f);
   2608     return;
   2609   }
   2610 
   2611   opt_rebuild_def_use(f);
   2612   int changed = ssa_combine_fold_cmp_branch(f);
   2613   changed |= ssa_combine_fold_addr_uses(f);
   2614   if (changed) opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE);
   2615   opt_rebuild_def_use(f);
   2616 }