pass_o2.c (79713B)
1 #include <kit/cg.h> 2 #include <string.h> 3 4 #include "cg/ir_eval.h" 5 #include "opt/opt_internal.h" 6 7 #define OPT_BLOCK_NONE 0xffffffffu 8 #define OPT_CLONE_MAX_BLOCK_INSTS 4u 9 #define OPT_CLONE_MAX_PREDS 4u 10 #define OPT_CLONE_ABS_GROWTH_LIMIT 32u 11 #define GVN_ENTRY_NONE 0xffffffffu 12 #define DSE_KEY_NONE 0xffffffffu 13 14 void opt_cleanup(Func* f) { 15 if (!f) return; 16 opt_build_cfg(f); 17 opt_jump_cleanup(f, OPT_JUMP_CLEANUP_CFG); 18 opt_build_cfg(f); 19 opt_verify(f, "o2-pre-ssa-cfg"); 20 opt_build_reg_ssa(f); 21 opt_verify(f, "o2-reg-ssa"); 22 opt_block_cloning(f); 23 opt_verify(f, "o2-block-clone-cfg"); 24 opt_build_ssa(f); 25 opt_verify(f, "o2-ssa"); 26 opt_ssa_dce(f); 27 opt_copy_cleanup(f); 28 opt_verify(f, "o2-pre-addr-cleanup"); 29 opt_addr_xform(f); 30 opt_verify(f, "o2-addr-xform-ssa"); 31 opt_ssa_dce(f); 32 opt_verify(f, "o2-addr-xform-dce"); 33 opt_copy_cleanup(f); 34 opt_verify(f, "o2-addr-xform-copy-cleanup"); 35 opt_simplify(f); 36 opt_verify(f, "o2-simplify-ssa"); 37 opt_ssa_dce(f); 38 opt_copy_cleanup(f); 39 opt_verify(f, "o2-simplify-cleanup"); 40 opt_gvn(f); 41 opt_verify(f, "o2-gvn-ssa"); 42 opt_copy_prop(f); 43 opt_verify(f, "o2-copy-prop-ssa"); 44 opt_ssa_dce(f); 45 opt_verify(f, "o2-copy-dce"); 46 opt_dse(f); 47 opt_verify(f, "o2-dse-ssa"); 48 opt_ssa_dce(f); 49 opt_verify(f, "o2-dse-dce"); 50 opt_build_loop_tree(f); 51 opt_licm(f); 52 opt_verify(f, "o2-licm-ssa"); 53 opt_pressure_relief(f); 54 opt_verify(f, "o2-pressure-relief-ssa"); 55 opt_make_conventional_ssa(f); 56 opt_verify(f, "o2-conventional-ssa"); 57 opt_ssa_combine(f); 58 opt_verify(f, "o2-ssa-combine"); 59 opt_undo_ssa(f); 60 opt_copy_cleanup(f); 61 opt_verify(f, "o2-undo-copy-cleanup"); 62 opt_jump_opt(f); 63 opt_verify(f, "o2-jump-opt"); 64 } 65 66 typedef struct CloneValMap { 67 Val old_val; 68 Val new_val; 69 } CloneValMap; 70 71 typedef struct GvnConst { 72 i64 value; 73 u8 valid; 74 } GvnConst; 75 76 typedef struct GvnOperandKey { 77 u8 kind; 78 u8 cls; 79 u16 pad; 80 KitCgTypeId type; 81 union { 82 Val reg; 83 i64 imm; 84 struct { 85 Val base; 86 Val index; 87 i32 ofs; 88 u8 log2_scale; 89 u8 pad[3]; 90 } ind; 91 } v; 92 } GvnOperandKey; 93 94 typedef struct GvnMemKey { 95 u8 valid; 96 u8 root_kind; 97 u8 has_addr; 98 u8 pad0; 99 u16 addr_space; 100 u16 flags; 101 KitCgTypeId mem_type; 102 u32 size; 103 u32 align; 104 i64 root_id; 105 i64 offset; 106 u32 root_version; 107 u32 unknown_version; 108 } GvnMemKey; 109 110 typedef struct GvnKey { 111 u16 op; 112 u16 nops; 113 KitCgTypeId type; 114 u8 cls; 115 u8 pad[3]; 116 i64 tag; 117 GvnMemKey mem; 118 GvnOperandKey ops[2]; 119 } GvnKey; 120 121 typedef struct GvnEntry { 122 GvnKey key; 123 Val val; 124 u32 block; 125 u32 inst; 126 u32 next; 127 } GvnEntry; 128 129 typedef struct GvnTable { 130 Func* f; 131 u32* buckets; 132 u32 nbuckets; 133 GvnEntry* entries; 134 u32 nentries; 135 u32 cap; 136 } GvnTable; 137 138 typedef struct GvnMemVersion { 139 u8 root_kind; 140 u8 pad0; 141 u16 addr_space; 142 i64 root_id; 143 u32 version; 144 } GvnMemVersion; 145 146 typedef struct GvnMemAvail { 147 GvnKey key; 148 Val val; 149 } GvnMemAvail; 150 151 typedef struct GvnMemState { 152 GvnMemVersion* roots; 153 u32 nroots; 154 u32 cap; 155 GvnMemAvail* avail; 156 u32 navail; 157 u32 cap_avail; 158 u32 unknown_version; 159 } GvnMemState; 160 161 typedef struct GvnBlockMemState { 162 GvnMemState in; 163 GvnMemState out; 164 u8 out_valid; 165 } GvnBlockMemState; 166 167 typedef struct GvnCtx { 168 Func* f; 169 OptAnalysis* analysis; 170 Val* parent; 171 GvnConst* constants; 172 GvnTable table; 173 GvnMemState mem; 174 GvnBlockMemState* block_mem; 175 u8* local_escaped; 176 int changed; 177 int cfg_changed; 178 } GvnCtx; 179 180 typedef struct DseKey { 181 u8 root_kind; 182 u8 pad0; 183 u16 addr_space; 184 u16 flags; 185 u16 pad1; 186 KitCgTypeId mem_type; 187 u32 size; 188 u32 align; 189 i64 root_id; 190 i64 offset; 191 } DseKey; 192 193 typedef struct DseBitset { 194 u64* words; 195 } DseBitset; 196 197 typedef struct DseBlockState { 198 DseBitset in; 199 DseBitset out; 200 } DseBlockState; 201 202 typedef struct DseCtx { 203 Func* f; 204 GvnCtx gvn; 205 DseKey* keys; 206 u32 nkeys; 207 u32 cap_keys; 208 u32** store_key_by_inst; 209 DseBlockState* block; 210 DseBitset scratch_in; 211 DseBitset scratch_out; 212 u8* local_escaped; 213 u32 words; 214 int changed; 215 } DseCtx; 216 217 typedef struct LicmLoop { 218 u32 header; 219 u32 preheader; 220 u8* body; 221 } LicmLoop; 222 223 typedef struct PressureBlockPlan { 224 u8* move_src; 225 u32* first_before; 226 u32* last_before; 227 u32* next_move; 228 } PressureBlockPlan; 229 230 typedef struct PressurePlan { 231 PressureBlockPlan* blocks; 232 u32 nmoves; 233 } PressurePlan; 234 235 static u32 opt_inst_count(Func* f) { 236 u32 n = 0; 237 for (u32 b = 0; b < f->nblocks; ++b) n += f->blocks[b].ninsts; 238 return n; 239 } 240 241 static int o2_is_terminator(const Inst* in) { 242 switch ((IROp)in->op) { 243 case IR_BR: 244 case IR_CONDBR: 245 case IR_CMP_BRANCH: 246 case IR_SWITCH: 247 case IR_INDIRECT_BRANCH: 248 case IR_RET: 249 case IR_UNREACHABLE: 250 case IR_BREAK_TO: 251 case IR_CONTINUE_TO: 252 return 1; 253 case IR_INTRINSIC: { 254 IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; 255 return aux && (aux->kind == INTRIN_LONGJMP || aux->kind == INTRIN_TRAP); 256 } 257 default: 258 return 0; 259 } 260 } 261 262 static int o2_cloneable_inst(const Inst* in) { 263 if (in->ndefs) return 0; 264 switch ((IROp)in->op) { 265 case IR_NOP: 266 case IR_LOAD_IMM: 267 case IR_LOAD_CONST: 268 case IR_COPY: 269 case IR_LOAD: 270 case IR_STORE: 271 case IR_ADDR_OF: 272 case IR_BINOP: 273 case IR_UNOP: 274 case IR_CMP: 275 case IR_CONVERT: 276 case IR_BR: 277 case IR_RET: 278 return 1; 279 default: 280 return 0; 281 } 282 } 283 284 static int block_has_phi(Func* f, u32 b) { 285 if (b >= f->nblocks) return 0; 286 Block* bl = &f->blocks[b]; 287 return bl->ninsts && (IROp)bl->insts[0].op == IR_PHI; 288 } 289 290 static int block_defs_are_local(Func* f, u32 b) { 291 Block* bl = &f->blocks[b]; 292 opt_rebuild_def_use(f); 293 for (u32 i = 0; i < bl->ninsts; ++i) { 294 Inst* in = &bl->insts[i]; 295 if (in->def == VAL_NONE) continue; 296 for (u32 u = f->opt_first_use_by_val[in->def]; u != OPT_USE_NONE; 297 u = f->opt_uses[u].next_for_val) { 298 if (f->opt_uses[u].block != b) return 0; 299 } 300 } 301 return 1; 302 } 303 304 static int block_defines_val(const Block* bl, Val v) { 305 for (u32 i = 0; i < bl->ninsts; ++i) { 306 const Inst* in = &bl->insts[i]; 307 if (in->def == v) return 1; 308 for (u32 d = 0; d < in->ndefs; ++d) 309 if (in->defs[d] == v) return 1; 310 } 311 return 0; 312 } 313 314 static int block_external_uses_dominated(Func* f, const OptAnalysis* a, u32 b) { 315 Block* bl = &f->blocks[b]; 316 opt_rebuild_def_use(f); 317 for (u32 i = 0; i < bl->ninsts; ++i) { 318 for (u32 u = 0; u < f->opt_nuses; ++u) { 319 OptUse* use = &f->opt_uses[u]; 320 if (use->block != b || use->inst != i) continue; 321 Val v = use->val; 322 if (block_defines_val(bl, v)) continue; 323 if (v == VAL_NONE || v >= f->nvals) return 0; 324 if (!opt_analysis_dominates(a, f->val_def_block[v], b)) return 0; 325 } 326 } 327 return 1; 328 } 329 330 static int clone_candidate(Func* f, const OptAnalysis* a, u32 block) { 331 if (block >= f->nblocks || block == f->entry) return 0; 332 Block* bl = &f->blocks[block]; 333 if (!a->reachable || !a->reachable[block]) return 0; 334 if (bl->npreds < 2 || bl->npreds > OPT_CLONE_MAX_PREDS) return 0; 335 if (bl->ninsts == 0 || bl->ninsts > OPT_CLONE_MAX_BLOCK_INSTS) return 0; 336 if (bl->nsucc > 1) return 0; 337 if (bl->loop_depth) return 0; 338 if (block_has_phi(f, block)) return 0; 339 if (bl->nsucc == 1 && block_has_phi(f, bl->succ[0])) return 0; 340 for (u32 i = 0; i < bl->ninsts; ++i) { 341 if (!o2_cloneable_inst(&bl->insts[i])) return 0; 342 if (i + 1u != bl->ninsts && o2_is_terminator(&bl->insts[i])) return 0; 343 } 344 for (u32 p = 0; p < bl->npreds; ++p) { 345 u32 pred = bl->preds[p]; 346 if (pred >= f->nblocks) return 0; 347 if (f->blocks[pred].loop_depth) return 0; 348 if (opt_analysis_dominates(a, block, pred)) return 0; 349 } 350 return block_defs_are_local(f, block) && 351 block_external_uses_dominated(f, a, block); 352 } 353 354 static Val map_lookup(const CloneValMap* map, u32 nmap, Val v) { 355 for (u32 i = 0; i < nmap; ++i) 356 if (map[i].old_val == v) return map[i].new_val; 357 return VAL_NONE; 358 } 359 360 static void remap_operand(Func* f, Inst* in, Operand* op, int is_def, 361 void* arg) { 362 (void)in; 363 (void)is_def; 364 CloneValMap* map = (CloneValMap*)arg; 365 u32 nmap = map[0].old_val; 366 if (op->kind != OPK_REG) return; 367 Val repl = map_lookup(map + 1, nmap, (Val)op->v.reg); 368 if (repl == VAL_NONE) return; 369 op->v.reg = (Reg)repl; 370 op->type = f->val_type[repl]; 371 op->cls = f->val_cls[repl]; 372 } 373 374 static IRRetAux* clone_ret_aux(Func* f, const IRRetAux* src) { 375 if (!src) return NULL; 376 IRRetAux* dst = arena_znew(f->arena, IRRetAux); 377 *dst = *src; 378 if (src->val.nparts) { 379 CGABIPart* parts = arena_array(f->arena, CGABIPart, src->val.nparts); 380 memcpy(parts, src->val.parts, sizeof(parts[0]) * src->val.nparts); 381 dst->val.parts = parts; 382 } 383 return dst; 384 } 385 386 static void clone_inst_aux(Func* f, Inst* dst, const Inst* src) { 387 if ((IROp)src->op == IR_RET) { 388 dst->extra.aux = clone_ret_aux(f, (const IRRetAux*)src->extra.aux); 389 } 390 } 391 392 static u32 emit_order_pos(Func* f, u32 block) { 393 for (u32 i = 0; i < f->emit_order_n; ++i) 394 if (f->emit_order[i] == block) return i; 395 return OPT_BLOCK_NONE; 396 } 397 398 static void emit_order_insert_after(Func* f, u32 after, u32 block) { 399 if (emit_order_pos(f, block) != OPT_BLOCK_NONE) return; 400 if (f->emit_order_n == f->emit_order_cap) { 401 u32 ncap = f->emit_order_cap ? f->emit_order_cap * 2u : 8u; 402 u32* order = arena_array(f->arena, u32, ncap); 403 if (f->emit_order) 404 memcpy(order, f->emit_order, sizeof(order[0]) * f->emit_order_n); 405 f->emit_order = order; 406 f->emit_order_cap = ncap; 407 } 408 u32 pos = f->emit_order_n; 409 u32 after_pos = emit_order_pos(f, after); 410 if (after_pos != OPT_BLOCK_NONE) pos = after_pos + 1u; 411 for (u32 i = f->emit_order_n; i > pos; --i) 412 f->emit_order[i] = f->emit_order[i - 1u]; 413 f->emit_order[pos] = block; 414 ++f->emit_order_n; 415 } 416 417 static void replace_succ_ref(Func* f, u32 pred, u32 old_succ, u32 new_succ) { 418 Block* bl = &f->blocks[pred]; 419 for (u32 s = 0; s < bl->nsucc; ++s) 420 if (bl->succ[s] == old_succ) bl->succ[s] = new_succ; 421 if (!bl->ninsts) return; 422 Inst* term = &bl->insts[bl->ninsts - 1u]; 423 if ((IROp)term->op == IR_SWITCH) { 424 IRSwitchAux* aux = (IRSwitchAux*)term->extra.aux; 425 if (!aux) return; 426 for (u32 i = 0; i < aux->ncases; ++i) 427 if (aux->cases[i].block == old_succ) aux->cases[i].block = new_succ; 428 if (aux->default_block == old_succ) aux->default_block = new_succ; 429 } else if ((IROp)term->op == IR_INDIRECT_BRANCH) { 430 IRIndirectAux* aux = (IRIndirectAux*)term->extra.aux; 431 if (!aux) return; 432 for (u32 i = 0; i < aux->ntargets; ++i) 433 if (aux->targets[i] == old_succ) aux->targets[i] = new_succ; 434 } 435 } 436 437 static u32 clone_block_for_pred(Func* f, u32 block, u32 pred) { 438 Block* src = &f->blocks[block]; 439 u32 nb = ir_block_new(f); 440 Block* dst = &f->blocks[nb]; 441 ir_block_set_nsucc(f, nb, src->nsucc); 442 for (u32 s = 0; s < src->nsucc; ++s) dst->succ[s] = src->succ[s]; 443 dst->loop_depth = src->loop_depth; 444 dst->frequency = src->frequency; 445 446 CloneValMap* map = arena_zarray(f->arena, CloneValMap, src->ninsts + 1u); 447 u32 nmap = 0; 448 for (u32 i = 0; i < src->ninsts; ++i) { 449 Val old = src->insts[i].def; 450 if (old == VAL_NONE) continue; 451 map[++nmap].old_val = old; 452 map[nmap].new_val = ir_alloc_val(f, f->val_type[old], f->val_cls[old]); 453 } 454 map[0].old_val = nmap; 455 456 for (u32 i = 0; i < src->ninsts; ++i) { 457 Inst* in = ir_emit(f, nb, (IROp)src->insts[i].op); 458 InstId id = in->id; 459 *in = src->insts[i]; 460 in->id = id; 461 if (in->nopnds) { 462 Operand* opnds = arena_array(f->arena, Operand, in->nopnds); 463 memcpy(opnds, src->insts[i].opnds, sizeof(opnds[0]) * in->nopnds); 464 in->opnds = opnds; 465 } 466 clone_inst_aux(f, in, &src->insts[i]); 467 if (in->def != VAL_NONE) in->def = map_lookup(map + 1, nmap, in->def); 468 opt_walk_inst_operands(f, in, remap_operand, map); 469 if (in->def != VAL_NONE && in->def < f->nvals) { 470 f->val_def_block[in->def] = nb; 471 f->val_def_inst[in->def] = dst->ninsts - 1u; 472 } 473 } 474 475 replace_succ_ref(f, pred, block, nb); 476 emit_order_insert_after(f, pred, nb); 477 return nb; 478 } 479 480 void opt_block_cloning(Func* f) { 481 if (!f || f->opt_rewritten) return; 482 /* This pass remaps OPK_REG defs as Val ids. In the normal O2 pipeline it 483 * must run after opt_build_reg_ssa; standalone unit tests that build Val IR 484 * directly have no pseudo-register table beyond the sentinel. */ 485 if (!f->opt_reg_ssa && f->npregs > 1) return; 486 opt_build_loop_tree(f); 487 OptAnalysis a; 488 memset(&a, 0, sizeof a); 489 opt_analysis_build_dominators(f, &a); 490 491 u32 base_insts = opt_inst_count(f); 492 u32 max_extra = base_insts / 4u + 8u; 493 if (max_extra > OPT_CLONE_ABS_GROWTH_LIMIT) 494 max_extra = OPT_CLONE_ABS_GROWTH_LIMIT; 495 u32 extra = 0; 496 int changed = 0; 497 498 u32 original_blocks = f->nblocks; 499 for (u32 b = 0; b < original_blocks; ++b) { 500 if (!clone_candidate(f, &a, b)) continue; 501 Block* bl = &f->blocks[b]; 502 u32 cost = bl->ninsts; 503 if (cost == 0 || cost * bl->npreds + extra > max_extra) continue; 504 u32 npreds = bl->npreds; 505 u32* preds = arena_array(f->arena, u32, npreds); 506 memcpy(preds, bl->preds, sizeof(preds[0]) * npreds); 507 for (u32 p = 0; p < npreds; ++p) { 508 clone_block_for_pred(f, b, preds[p]); 509 extra += cost; 510 } 511 changed = 1; 512 } 513 514 if (changed) { 515 opt_analysis_invalidate(f, OPT_ANALYSIS_CFG | OPT_ANALYSIS_DEF_USE | 516 OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP); 517 opt_build_cfg(f); 518 } 519 opt_rebuild_def_use(f); 520 } 521 522 static int addr_def_inst(Func* f, Val v, Inst** out) { 523 if (v == VAL_NONE || v >= f->nvals) return 0; 524 u32 b = f->val_def_block[v]; 525 u32 i = f->val_def_inst[v]; 526 if (b >= f->nblocks || i >= f->blocks[b].ninsts) return 0; 527 Inst* in = &f->blocks[b].insts[i]; 528 if ((IROp)in->op != IR_ADDR_OF || in->def != v || in->nopnds < 2) return 0; 529 *out = in; 530 return 1; 531 } 532 533 static int val_def_inst(Func* f, Val v, Inst** out) { 534 if (v == VAL_NONE || v >= f->nvals) return 0; 535 u32 b = f->val_def_block[v]; 536 u32 i = f->val_def_inst[v]; 537 if (b >= f->nblocks || i >= f->blocks[b].ninsts) return 0; 538 Inst* in = &f->blocks[b].insts[i]; 539 if (in->def != v) return 0; 540 *out = in; 541 return 1; 542 } 543 544 /* Use classification for the SSA-namespace addr-xform; mirrors the PReg 545 * variant. Returns 0 for escapes, 1 for zero-EA folds (rewrite to 546 * OPK_LOCAL), 2 for EA-shaped folds (leave the OPK_INDIRECT alone so the 547 * EA stays on the load/store). */ 548 static int addr_use_foldable_kind(Func* f, const OptUse* use) { 549 if (!use || use->kind != OPT_USE_INDIRECT_BASE) return 0; 550 if (use->block >= f->nblocks || use->inst >= f->blocks[use->block].ninsts) 551 return 0; 552 Inst* in = &f->blocks[use->block].insts[use->inst]; 553 if ((IROp)in->op != IR_LOAD && (IROp)in->op != IR_STORE) return 0; 554 if (opt_mem_observable(&in->extra.mem)) return 0; 555 if (!use->operand || use->operand->kind != OPK_INDIRECT) return 0; 556 if ((IROp)in->op == IR_LOAD && use->operand_index != 1u) return 0; 557 if ((IROp)in->op == IR_STORE && use->operand_index != 0u) return 0; 558 if (use->operand->v.ind.ofs == 0 && 559 use->operand->v.ind.index == (Reg)REG_NONE) 560 return 1; 561 return 2; 562 } 563 564 static int addr_all_uses_foldable(Func* f, Val v, int* out_has_ea) { 565 u32 nuses = 0; 566 int has_ea = 0; 567 for (u32 u = f->opt_first_use_by_val[v]; u != OPT_USE_NONE; 568 u = f->opt_uses[u].next_for_val) { 569 ++nuses; 570 int k = addr_use_foldable_kind(f, &f->opt_uses[u]); 571 if (!k) return 0; 572 if (k == 2) has_ea = 1; 573 } 574 if (out_has_ea) *out_has_ea = has_ea; 575 return nuses != 0; 576 } 577 578 static void addr_inst_remove(Inst* in) { 579 in->op = IR_NOP; 580 in->def = VAL_NONE; 581 in->ndefs = 0; 582 in->defs = NULL; 583 in->nopnds = 0; 584 in->opnds = NULL; 585 } 586 587 void opt_addr_xform(Func* f) { 588 if (!f || f->opt_rewritten) return; 589 opt_rebuild_def_use(f); 590 int changed = 0; 591 u32 nvals = f->nvals; 592 for (Val v = 1; v < nvals; ++v) { 593 Inst* def = NULL; 594 if (!addr_def_inst(f, v, &def)) continue; 595 Operand lv = def->opnds[1]; 596 if (lv.kind != OPK_LOCAL) continue; 597 int has_ea = 0; 598 if (!addr_all_uses_foldable(f, v, &has_ea)) continue; 599 600 /* Rewrite zero-EA uses to OPK_LOCAL; leave EA-shaped uses as 601 * OPK_INDIRECT(p, ofs, index, log2_scale). When any EA-shaped use 602 * remains, the IR_ADDR_OF def must stay alive to feed its base. */ 603 for (u32 u = f->opt_first_use_by_val[v]; u != OPT_USE_NONE; 604 u = f->opt_uses[u].next_for_val) { 605 OptUse* use = &f->opt_uses[u]; 606 Operand* op = use->operand; 607 if (!op || op->kind != OPK_INDIRECT) continue; 608 if (op->v.ind.ofs != 0 || op->v.ind.index != (Reg)REG_NONE) continue; 609 Inst* mem = &f->blocks[use->block].insts[use->inst]; 610 Operand folded = lv; 611 folded.type = mem->extra.mem.type ? mem->extra.mem.type : lv.type; 612 *op = folded; 613 } 614 if (!has_ea) addr_inst_remove(def); 615 changed = 1; 616 } 617 if (changed) 618 opt_analysis_invalidate( 619 f, OPT_ANALYSIS_DEF_USE | OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP); 620 opt_rebuild_def_use(f); 621 } 622 623 /* gvn's TYPE policy is int-LIKE width (no-PTR); the value arithmetic is the 624 * shared cg/ir_eval core (see src/cg/ir_eval.h). */ 625 static int gvn_int_like_width(Func* f, KitCgTypeId ty, u32* width_out) { 626 u32 width = kit_cg_type_int_width((KitCompiler*)f->c, ty); 627 if (!width || width > 64u) return 0; 628 *width_out = width; 629 return 1; 630 } 631 632 static int gvn_fold_binop(Func* f, BinOp op, KitCgTypeId ty, i64 a, i64 b, 633 i64* out) { 634 u32 width; 635 if (!gvn_int_like_width(f, ty, &width)) return 0; 636 return kit_ir_eval_binop(op, width, a, b, out); 637 } 638 639 static int gvn_fold_unop(Func* f, UnOp op, KitCgTypeId ty, i64 a, i64* out) { 640 u32 width; 641 if (!gvn_int_like_width(f, ty, &width)) return 0; 642 return kit_ir_eval_unop(op, width, a, out); 643 } 644 645 static int gvn_fold_cmp(Func* f, CmpOp op, KitCgTypeId ty, i64 a, i64 b, 646 i64* out) { 647 u32 width; 648 if (!gvn_int_like_width(f, ty, &width)) return 0; 649 return kit_ir_eval_cmp(op, width, a, b, out); 650 } 651 652 static int gvn_fold_convert(Func* f, ConvKind k, KitCgTypeId dst_ty, 653 KitCgTypeId src_ty, i64 src, i64* out) { 654 u32 sw, dw; 655 if (!gvn_int_like_width(f, src_ty, &sw) || 656 !gvn_int_like_width(f, dst_ty, &dw)) 657 return 0; 658 return kit_ir_eval_convert(k, sw, dw, src, out); 659 } 660 661 static Val gvn_find(GvnCtx* ctx, Val v) { 662 if (v == VAL_NONE || v >= ctx->f->nvals) return VAL_NONE; 663 Val p = ctx->parent[v]; 664 if (p == VAL_NONE || p == v) return v; 665 ctx->parent[v] = gvn_find(ctx, p); 666 return ctx->parent[v]; 667 } 668 669 static int gvn_same_shape(Func* f, Val a, Val b) { 670 if (a == VAL_NONE || b == VAL_NONE || a >= f->nvals || b >= f->nvals) 671 return 0; 672 return f->val_type[a] == f->val_type[b] && f->val_cls[a] == f->val_cls[b]; 673 } 674 675 static int gvn_const_for_operand(GvnCtx* ctx, const Operand* op, i64* out) { 676 if (!op) return 0; 677 if (op->kind == OPK_IMM) { 678 *out = op->v.imm; 679 return 1; 680 } 681 if (op->kind != OPK_REG) return 0; 682 Val v = gvn_find(ctx, (Val)op->v.reg); 683 if (v == VAL_NONE || v >= ctx->f->nvals || !ctx->constants[v].valid) return 0; 684 *out = ctx->constants[v].value; 685 return 1; 686 } 687 688 static void gvn_make_load_imm(Func* f, Inst* in, i64 value) { 689 Val dst = in->def; 690 KitCgTypeId ty = 691 dst != VAL_NONE && dst < f->nvals ? f->val_type[dst] : in->type; 692 u8 cls = dst != VAL_NONE && dst < f->nvals ? f->val_cls[dst] : RC_INT; 693 Operand* opnds = in->opnds; 694 if (!opnds) opnds = arena_array(f->arena, Operand, 1); 695 memset(&opnds[0], 0, sizeof opnds[0]); 696 opnds[0].kind = OPK_REG; 697 opnds[0].type = ty; 698 opnds[0].cls = cls; 699 opnds[0].v.reg = (Reg)dst; 700 in->op = IR_LOAD_IMM; 701 in->type = ty; 702 in->opnds = opnds; 703 in->nopnds = 1; 704 in->extra.imm = value; 705 } 706 707 static int gvn_try_fold_inst(GvnCtx* ctx, Inst* in) { 708 i64 a, b, out; 709 switch ((IROp)in->op) { 710 case IR_BINOP: 711 if (in->nopnds < 3) return 0; 712 if (!gvn_const_for_operand(ctx, &in->opnds[1], &a) || 713 !gvn_const_for_operand(ctx, &in->opnds[2], &b)) 714 return 0; 715 if (!gvn_fold_binop(ctx->f, (BinOp)in->extra.imm, in->type, a, b, &out)) 716 return 0; 717 gvn_make_load_imm(ctx->f, in, out); 718 return 1; 719 case IR_UNOP: 720 if (in->nopnds < 2) return 0; 721 if (!gvn_const_for_operand(ctx, &in->opnds[1], &a)) return 0; 722 if (!gvn_fold_unop(ctx->f, (UnOp)in->extra.imm, in->type, a, &out)) 723 return 0; 724 gvn_make_load_imm(ctx->f, in, out); 725 return 1; 726 case IR_CMP: 727 if (in->nopnds < 3) return 0; 728 if (!gvn_const_for_operand(ctx, &in->opnds[1], &a) || 729 !gvn_const_for_operand(ctx, &in->opnds[2], &b)) 730 return 0; 731 if (!gvn_fold_cmp(ctx->f, (CmpOp)in->extra.imm, in->opnds[1].type, a, b, 732 &out)) 733 return 0; 734 gvn_make_load_imm(ctx->f, in, out); 735 return 1; 736 case IR_CONVERT: 737 if (in->nopnds < 2) return 0; 738 if (!gvn_const_for_operand(ctx, &in->opnds[1], &a)) return 0; 739 if (!gvn_fold_convert(ctx->f, (ConvKind)in->extra.imm, in->opnds[0].type, 740 in->opnds[1].type, a, &out)) 741 return 0; 742 gvn_make_load_imm(ctx->f, in, out); 743 return 1; 744 default: 745 return 0; 746 } 747 } 748 749 static u64 gvn_mix_u64(u64 h, u64 v) { 750 h ^= v + 0x9e3779b97f4a7c15ull + (h << 6) + (h >> 2); 751 return h; 752 } 753 754 static u64 gvn_key_hash(const GvnKey* k) { 755 u64 h = 1469598103934665603ull; 756 h = gvn_mix_u64(h, k->op); 757 h = gvn_mix_u64(h, k->nops); 758 h = gvn_mix_u64(h, k->type); 759 h = gvn_mix_u64(h, k->cls); 760 h = gvn_mix_u64(h, (u64)k->tag); 761 h = gvn_mix_u64(h, k->mem.valid); 762 if (k->mem.valid) { 763 h = gvn_mix_u64(h, k->mem.root_kind); 764 h = gvn_mix_u64(h, k->mem.has_addr); 765 h = gvn_mix_u64(h, k->mem.addr_space); 766 h = gvn_mix_u64(h, k->mem.flags); 767 h = gvn_mix_u64(h, k->mem.mem_type); 768 h = gvn_mix_u64(h, k->mem.size); 769 h = gvn_mix_u64(h, k->mem.align); 770 h = gvn_mix_u64(h, (u64)k->mem.root_id); 771 h = gvn_mix_u64(h, (u64)k->mem.offset); 772 h = gvn_mix_u64(h, k->mem.root_version); 773 h = gvn_mix_u64(h, k->mem.unknown_version); 774 } 775 for (u32 i = 0; i < k->nops; ++i) { 776 h = gvn_mix_u64(h, k->ops[i].kind); 777 h = gvn_mix_u64(h, k->ops[i].cls); 778 h = gvn_mix_u64(h, k->ops[i].type); 779 if (k->ops[i].kind == OPK_INDIRECT) { 780 h = gvn_mix_u64(h, k->ops[i].v.ind.base); 781 h = gvn_mix_u64(h, k->ops[i].v.ind.index); 782 h = gvn_mix_u64(h, (u64)(i64)k->ops[i].v.ind.ofs); 783 h = gvn_mix_u64(h, k->ops[i].v.ind.log2_scale); 784 } else { 785 h = gvn_mix_u64(h, k->ops[i].kind == OPK_REG ? k->ops[i].v.reg 786 : (u64)k->ops[i].v.imm); 787 } 788 } 789 return h; 790 } 791 792 static int gvn_operand_key_equal(const GvnOperandKey* a, 793 const GvnOperandKey* b) { 794 if (a->kind != b->kind || a->cls != b->cls || a->type != b->type) return 0; 795 if (a->kind == OPK_INDIRECT) 796 return a->v.ind.base == b->v.ind.base && a->v.ind.index == b->v.ind.index && 797 a->v.ind.ofs == b->v.ind.ofs && 798 a->v.ind.log2_scale == b->v.ind.log2_scale; 799 if (a->kind == OPK_REG) return a->v.reg == b->v.reg; 800 return a->v.imm == b->v.imm; 801 } 802 803 static int gvn_key_equal(const GvnKey* a, const GvnKey* b) { 804 if (a->op != b->op || a->nops != b->nops || a->type != b->type || 805 a->cls != b->cls || a->tag != b->tag) 806 return 0; 807 if (a->mem.valid != b->mem.valid) return 0; 808 if (a->mem.valid && 809 (a->mem.root_kind != b->mem.root_kind || 810 a->mem.has_addr != b->mem.has_addr || 811 a->mem.addr_space != b->mem.addr_space || a->mem.flags != b->mem.flags || 812 a->mem.mem_type != b->mem.mem_type || a->mem.size != b->mem.size || 813 a->mem.align != b->mem.align || a->mem.root_id != b->mem.root_id || 814 a->mem.offset != b->mem.offset || 815 a->mem.root_version != b->mem.root_version || 816 a->mem.unknown_version != b->mem.unknown_version)) 817 return 0; 818 for (u32 i = 0; i < a->nops; ++i) 819 if (!gvn_operand_key_equal(&a->ops[i], &b->ops[i])) return 0; 820 return 1; 821 } 822 823 static void gvn_table_init(GvnTable* t, Func* f) { 824 u32 ninsts = opt_inst_count(f); 825 u32 nbuckets = 16u; 826 while (nbuckets < ninsts * 2u + 1u) nbuckets *= 2u; 827 t->f = f; 828 t->nbuckets = nbuckets; 829 t->buckets = arena_array(f->arena, u32, nbuckets); 830 for (u32 i = 0; i < nbuckets; ++i) t->buckets[i] = GVN_ENTRY_NONE; 831 t->entries = NULL; 832 t->nentries = 0; 833 t->cap = 0; 834 } 835 836 static void gvn_table_add(GvnTable* t, const GvnKey* key, Val v, u32 b, u32 i) { 837 if (t->nentries == t->cap) { 838 u32 ncap = t->cap ? t->cap * 2u : 32u; 839 GvnEntry* entries = arena_array(t->f->arena, GvnEntry, ncap); 840 if (t->entries) 841 memcpy(entries, t->entries, sizeof(entries[0]) * t->nentries); 842 t->entries = entries; 843 t->cap = ncap; 844 } 845 u32 bucket = (u32)(gvn_key_hash(key) & (u64)(t->nbuckets - 1u)); 846 GvnEntry* e = &t->entries[t->nentries]; 847 e->key = *key; 848 e->val = v; 849 e->block = b; 850 e->inst = i; 851 e->next = t->buckets[bucket]; 852 t->buckets[bucket] = t->nentries++; 853 } 854 855 static int gvn_leader_dominates_site(const OptAnalysis* a, u32 leader_b, 856 u32 leader_i, u32 use_b, u32 use_i) { 857 if (leader_b == use_b) return leader_i < use_i; 858 return opt_analysis_dominates(a, leader_b, use_b); 859 } 860 861 static int gvn_leader_dominates_use(GvnCtx* ctx, u32 leader_b, u32 leader_i, 862 const OptUse* use) { 863 if (use->kind == OPT_USE_PHI_INPUT) { 864 Inst* in = &ctx->f->blocks[use->block].insts[use->inst]; 865 IRPhiAux* aux = (IRPhiAux*)in->extra.aux; 866 if (!aux || use->phi_pred_index >= aux->npreds) return 0; 867 u32 pred = aux->pred_blocks[use->phi_pred_index]; 868 if (leader_b == pred) return 1; 869 return opt_analysis_dominates(ctx->analysis, leader_b, pred); 870 } 871 return gvn_leader_dominates_site(ctx->analysis, leader_b, leader_i, 872 use->block, use->inst); 873 } 874 875 static int gvn_table_find(GvnCtx* ctx, const GvnKey* key, u32 b, u32 i, 876 Val* out) { 877 GvnTable* t = &ctx->table; 878 u32 bucket = (u32)(gvn_key_hash(key) & (u64)(t->nbuckets - 1u)); 879 for (u32 e = t->buckets[bucket]; e != GVN_ENTRY_NONE; 880 e = t->entries[e].next) { 881 GvnEntry* ent = &t->entries[e]; 882 if (!gvn_key_equal(&ent->key, key)) continue; 883 if (!gvn_leader_dominates_site(ctx->analysis, ent->block, ent->inst, b, i)) 884 continue; 885 *out = ent->val; 886 return 1; 887 } 888 return 0; 889 } 890 891 static int gvn_make_operand_key(GvnCtx* ctx, const Operand* op, 892 GvnOperandKey* out) { 893 memset(out, 0, sizeof *out); 894 if (!op || (op->kind != OPK_REG && op->kind != OPK_IMM)) return 0; 895 out->kind = op->kind; 896 out->cls = op->cls; 897 out->type = op->type; 898 if (op->kind == OPK_REG) { 899 Val v = gvn_find(ctx, (Val)op->v.reg); 900 if (v == VAL_NONE || v >= ctx->f->nvals) return 0; 901 out->v.reg = v; 902 } else { 903 out->v.imm = op->v.imm; 904 } 905 return 1; 906 } 907 908 static int gvn_make_addr_operand_key(GvnCtx* ctx, const Operand* op, 909 GvnOperandKey* out) { 910 memset(out, 0, sizeof *out); 911 if (!op) return 0; 912 out->kind = op->kind; 913 out->cls = op->cls; 914 out->type = op->type; 915 switch ((OpKind)op->kind) { 916 case OPK_REG: { 917 Val v = gvn_find(ctx, (Val)op->v.reg); 918 if (v == VAL_NONE || v >= ctx->f->nvals) return 0; 919 out->v.reg = v; 920 return 1; 921 } 922 case OPK_INDIRECT: { 923 Val base = gvn_find(ctx, (Val)op->v.ind.base); 924 if (base == VAL_NONE || base >= ctx->f->nvals) return 0; 925 out->v.ind.base = base; 926 out->v.ind.index = VAL_NONE; 927 if (op->v.ind.index != REG_NONE) { 928 Val index = gvn_find(ctx, (Val)op->v.ind.index); 929 if (index == VAL_NONE || index >= ctx->f->nvals) return 0; 930 out->v.ind.index = index; 931 out->v.ind.log2_scale = op->v.ind.log2_scale; 932 } 933 out->v.ind.ofs = op->v.ind.ofs; 934 return 1; 935 } 936 case OPK_LOCAL: 937 out->v.imm = (i64)op->v.frame_slot; 938 return 1; 939 case OPK_GLOBAL: 940 out->v.imm = (i64)op->v.global.sym; 941 return 1; 942 default: 943 return 0; 944 } 945 } 946 947 static int gvn_mem_root_from_addr_val(GvnCtx* ctx, Val addr_val, u32 depth, 948 u8* kind_out, i64* id_out, 949 i64* offset_out, int* singleton_out) { 950 Inst* def = NULL; 951 if (!ctx || depth > 4u) return 0; 952 addr_val = gvn_find(ctx, addr_val); 953 if (!val_def_inst(ctx->f, addr_val, &def)) return 0; 954 955 if ((IROp)def->op == IR_ADDR_OF && def->nopnds >= 2) { 956 Operand* lv = &def->opnds[1]; 957 if (lv->kind == OPK_LOCAL) { 958 *kind_out = ALIAS_LOCAL; 959 *id_out = (i64)lv->v.frame_slot; 960 *offset_out = 0; 961 *singleton_out = 1; 962 return 1; 963 } 964 if (lv->kind == OPK_GLOBAL) { 965 *kind_out = ALIAS_GLOBAL; 966 *id_out = (i64)lv->v.global.sym; 967 *offset_out = lv->v.global.addend; 968 *singleton_out = 1; 969 return 1; 970 } 971 return 0; 972 } 973 974 if ((IROp)def->op == IR_COPY && def->nopnds >= 2 && 975 def->opnds[1].kind == OPK_REG) { 976 return gvn_mem_root_from_addr_val(ctx, (Val)def->opnds[1].v.reg, depth + 1u, 977 kind_out, id_out, offset_out, 978 singleton_out); 979 } 980 981 if ((IROp)def->op == IR_BINOP && def->nopnds >= 3 && 982 (BinOp)def->extra.imm == BO_IADD) { 983 i64 c = 0; 984 if (def->opnds[1].kind == OPK_REG && 985 gvn_const_for_operand(ctx, &def->opnds[2], &c) && c == 0) { 986 return gvn_mem_root_from_addr_val(ctx, (Val)def->opnds[1].v.reg, 987 depth + 1u, kind_out, id_out, 988 offset_out, singleton_out); 989 } 990 if (def->opnds[2].kind == OPK_REG && 991 gvn_const_for_operand(ctx, &def->opnds[1], &c) && c == 0) { 992 return gvn_mem_root_from_addr_val(ctx, (Val)def->opnds[2].v.reg, 993 depth + 1u, kind_out, id_out, 994 offset_out, singleton_out); 995 } 996 } 997 998 return 0; 999 } 1000 1001 static int gvn_mem_root_from_access(GvnCtx* ctx, const Operand* addr, 1002 const MemAccess* mem, u8* kind_out, 1003 i64* id_out, i64* offset_out, 1004 int* singleton_out) { 1005 u8 kind = ALIAS_UNKNOWN; 1006 i64 id = 0; 1007 i64 offset = 0; 1008 int singleton = 0; 1009 1010 if (addr) { 1011 switch ((OpKind)addr->kind) { 1012 case OPK_LOCAL: 1013 kind = ALIAS_LOCAL; 1014 id = (i64)addr->v.frame_slot; 1015 singleton = 1; 1016 break; 1017 case OPK_GLOBAL: 1018 kind = ALIAS_GLOBAL; 1019 id = (i64)addr->v.global.sym; 1020 offset = addr->v.global.addend; 1021 singleton = 1; 1022 break; 1023 case OPK_INDIRECT: 1024 offset = addr->v.ind.ofs; 1025 if (addr->v.ind.index != REG_NONE) singleton = 0; 1026 if (ctx) { 1027 Val base = gvn_find(ctx, (Val)addr->v.ind.base); 1028 u8 akind; 1029 i64 aid; 1030 i64 aofs; 1031 int asing; 1032 if (gvn_mem_root_from_addr_val(ctx, base, 0, &akind, &aid, &aofs, 1033 &asing)) { 1034 kind = akind; 1035 id = aid; 1036 offset += aofs; 1037 singleton = asing && addr->v.ind.index == REG_NONE; 1038 } 1039 } 1040 break; 1041 default: 1042 break; 1043 } 1044 } 1045 1046 if (kind == ALIAS_UNKNOWN && mem) { 1047 kind = mem->alias.kind; 1048 switch ((AliasKind)kind) { 1049 case ALIAS_LOCAL: 1050 id = mem->alias.v.local_id; 1051 break; 1052 case ALIAS_GLOBAL: 1053 id = (i64)mem->alias.v.global; 1054 break; 1055 case ALIAS_PARAM: 1056 id = (i64)mem->alias.v.param_idx; 1057 break; 1058 case ALIAS_STRING: 1059 id = (i64)mem->alias.v.string_id; 1060 break; 1061 case ALIAS_HEAP: 1062 id = 0; 1063 break; 1064 default: 1065 kind = ALIAS_UNKNOWN; 1066 id = 0; 1067 break; 1068 } 1069 } 1070 1071 *kind_out = kind; 1072 *id_out = id; 1073 *offset_out = offset; 1074 *singleton_out = singleton; 1075 return kind != ALIAS_UNKNOWN; 1076 } 1077 1078 static GvnMemVersion* gvn_mem_version(GvnCtx* ctx, u8 kind, u16 addr_space, 1079 i64 root_id) { 1080 GvnMemState* s = &ctx->mem; 1081 for (u32 i = 0; i < s->nroots; ++i) { 1082 GvnMemVersion* r = &s->roots[i]; 1083 if (r->root_kind == kind && r->addr_space == addr_space && 1084 r->root_id == root_id) 1085 return r; 1086 } 1087 if (s->nroots == s->cap) { 1088 u32 ncap = s->cap ? s->cap * 2u : 16u; 1089 GvnMemVersion* roots = arena_array(ctx->f->arena, GvnMemVersion, ncap); 1090 if (s->roots) memcpy(roots, s->roots, sizeof(roots[0]) * s->nroots); 1091 s->roots = roots; 1092 s->cap = ncap; 1093 } 1094 GvnMemVersion* r = &s->roots[s->nroots++]; 1095 memset(r, 0, sizeof *r); 1096 r->root_kind = kind; 1097 r->addr_space = addr_space; 1098 r->root_id = root_id; 1099 return r; 1100 } 1101 1102 static void gvn_mem_state_copy(GvnCtx* ctx, GvnMemState* dst, 1103 const GvnMemState* src) { 1104 memset(dst, 0, sizeof *dst); 1105 dst->unknown_version = src->unknown_version; 1106 if (src->nroots) { 1107 dst->roots = arena_array(ctx->f->arena, GvnMemVersion, src->nroots); 1108 memcpy(dst->roots, src->roots, sizeof(dst->roots[0]) * src->nroots); 1109 dst->nroots = src->nroots; 1110 dst->cap = src->nroots; 1111 } 1112 if (src->navail) { 1113 dst->avail = arena_array(ctx->f->arena, GvnMemAvail, src->navail); 1114 memcpy(dst->avail, src->avail, sizeof(dst->avail[0]) * src->navail); 1115 dst->navail = src->navail; 1116 dst->cap_avail = src->navail; 1117 } 1118 } 1119 1120 static int gvn_mem_key_tracks_unknown(GvnCtx* ctx, u8 kind, i64 root_id) { 1121 if (kind == ALIAS_LOCAL && root_id > 0 && 1122 (u32)root_id <= ctx->f->nframe_slots) 1123 return ctx->local_escaped && ctx->local_escaped[root_id]; 1124 return kind != ALIAS_STRING; 1125 } 1126 1127 static int gvn_mem_call_clobbers_root(GvnCtx* ctx, u8 kind, i64 root_id) { 1128 if (kind == ALIAS_LOCAL && root_id > 0 && 1129 (u32)root_id <= ctx->f->nframe_slots) 1130 return !ctx->local_escaped || ctx->local_escaped[root_id]; 1131 return kind != ALIAS_STRING; 1132 } 1133 1134 static void gvn_mem_avail_remove_at(GvnMemState* s, u32 i) { 1135 if (i + 1u < s->navail) 1136 memmove(&s->avail[i], &s->avail[i + 1u], 1137 sizeof(s->avail[0]) * (s->navail - i - 1u)); 1138 --s->navail; 1139 } 1140 1141 static void gvn_mem_avail_add(GvnCtx* ctx, const GvnKey* key, Val val) { 1142 GvnMemState* s = &ctx->mem; 1143 for (u32 i = 0; i < s->navail; ++i) { 1144 if (gvn_key_equal(&s->avail[i].key, key)) { 1145 s->avail[i].val = val; 1146 return; 1147 } 1148 } 1149 if (s->navail == s->cap_avail) { 1150 u32 ncap = s->cap_avail ? s->cap_avail * 2u : 16u; 1151 GvnMemAvail* avail = arena_array(ctx->f->arena, GvnMemAvail, ncap); 1152 if (s->avail) memcpy(avail, s->avail, sizeof(avail[0]) * s->navail); 1153 s->avail = avail; 1154 s->cap_avail = ncap; 1155 } 1156 s->avail[s->navail].key = *key; 1157 s->avail[s->navail].val = val; 1158 ++s->navail; 1159 } 1160 1161 static int gvn_mem_avail_find(GvnCtx* ctx, const GvnKey* key, u32 b, u32 i, 1162 Val* out) { 1163 for (u32 a = 0; a < ctx->mem.navail; ++a) { 1164 Val v = ctx->mem.avail[a].val; 1165 if (!gvn_key_equal(&ctx->mem.avail[a].key, key)) continue; 1166 if (v == VAL_NONE || v >= ctx->f->nvals) continue; 1167 if (!gvn_leader_dominates_site(ctx->analysis, ctx->f->val_def_block[v], 1168 ctx->f->val_def_inst[v], b, i)) 1169 continue; 1170 *out = v; 1171 return 1; 1172 } 1173 return 0; 1174 } 1175 1176 static void gvn_mem_avail_remove_call_clobbered(GvnCtx* ctx) { 1177 GvnMemState* s = &ctx->mem; 1178 for (u32 i = 0; i < s->navail;) { 1179 GvnMemKey* m = &s->avail[i].key.mem; 1180 if (!m->valid || m->root_kind == ALIAS_UNKNOWN || 1181 gvn_mem_call_clobbers_root(ctx, m->root_kind, m->root_id)) 1182 gvn_mem_avail_remove_at(s, i); 1183 else 1184 ++i; 1185 } 1186 } 1187 1188 static void gvn_mem_barrier(GvnCtx* ctx) { 1189 ++ctx->mem.unknown_version; 1190 for (u32 i = 0; i < ctx->mem.nroots; ++i) ++ctx->mem.roots[i].version; 1191 ctx->mem.navail = 0; 1192 } 1193 1194 static void gvn_mem_call_barrier(GvnCtx* ctx) { 1195 ++ctx->mem.unknown_version; 1196 for (u32 i = 0; i < ctx->mem.nroots; ++i) { 1197 GvnMemVersion* r = &ctx->mem.roots[i]; 1198 if (gvn_mem_call_clobbers_root(ctx, r->root_kind, r->root_id)) ++r->version; 1199 } 1200 gvn_mem_avail_remove_call_clobbered(ctx); 1201 } 1202 1203 static void gvn_mem_bump_store(GvnCtx* ctx, u8 kind, u16 addr_space, 1204 i64 root_id) { 1205 if (kind == ALIAS_UNKNOWN) { 1206 gvn_mem_barrier(ctx); 1207 return; 1208 } 1209 ++gvn_mem_version(ctx, kind, addr_space, root_id)->version; 1210 } 1211 1212 static int gvn_make_memory_key(GvnCtx* ctx, const Inst* in, u32 addr_idx, 1213 KitCgTypeId val_ty, u8 val_cls, GvnKey* key) { 1214 const MemAccess* mem; 1215 const Operand* addr; 1216 u8 root_kind; 1217 i64 root_id; 1218 i64 offset; 1219 int singleton; 1220 1221 if (!in || addr_idx >= in->nopnds) return 0; 1222 mem = &in->extra.mem; 1223 addr = &in->opnds[addr_idx]; 1224 if (opt_mem_observable(mem)) return 0; 1225 1226 memset(key, 0, sizeof *key); 1227 key->op = IR_LOAD; 1228 key->type = val_ty; 1229 key->cls = val_cls; 1230 key->mem.valid = 1; 1231 key->mem.addr_space = mem->addr_space; 1232 key->mem.flags = mem->flags; 1233 key->mem.mem_type = mem->type; 1234 key->mem.size = mem->size; 1235 key->mem.align = mem->align; 1236 (void)gvn_mem_root_from_access(ctx, addr, mem, &root_kind, &root_id, &offset, 1237 &singleton); 1238 key->mem.root_kind = root_kind; 1239 key->mem.root_id = root_id; 1240 key->mem.offset = offset; 1241 if (gvn_mem_key_tracks_unknown(ctx, root_kind, root_id)) 1242 key->mem.unknown_version = ctx->mem.unknown_version; 1243 if (root_kind != ALIAS_UNKNOWN) { 1244 key->mem.root_version = 1245 gvn_mem_version(ctx, root_kind, mem->addr_space, root_id)->version; 1246 } 1247 1248 if (!singleton) { 1249 key->mem.has_addr = 1; 1250 key->nops = 1; 1251 if (!gvn_make_addr_operand_key(ctx, addr, &key->ops[0])) return 0; 1252 } 1253 return 1; 1254 } 1255 1256 static int gvn_operand_key_less(const GvnOperandKey* a, 1257 const GvnOperandKey* b) { 1258 if (a->kind != b->kind) return a->kind < b->kind; 1259 if (a->type != b->type) return a->type < b->type; 1260 if (a->cls != b->cls) return a->cls < b->cls; 1261 if (a->kind == OPK_INDIRECT) { 1262 if (a->v.ind.base != b->v.ind.base) return a->v.ind.base < b->v.ind.base; 1263 if (a->v.ind.index != b->v.ind.index) 1264 return a->v.ind.index < b->v.ind.index; 1265 if (a->v.ind.ofs != b->v.ind.ofs) return a->v.ind.ofs < b->v.ind.ofs; 1266 return a->v.ind.log2_scale < b->v.ind.log2_scale; 1267 } 1268 if (a->kind == OPK_REG) return a->v.reg < b->v.reg; 1269 return a->v.imm < b->v.imm; 1270 } 1271 1272 static int gvn_make_key(GvnCtx* ctx, const Inst* in, GvnKey* key) { 1273 memset(key, 0, sizeof *key); 1274 if (in->def == VAL_NONE || in->def >= ctx->f->nvals) return 0; 1275 key->op = in->op; 1276 key->type = ctx->f->val_type[in->def]; 1277 key->cls = ctx->f->val_cls[in->def]; 1278 key->tag = in->extra.imm; 1279 1280 switch ((IROp)in->op) { 1281 case IR_LOAD_IMM: 1282 case IR_CONST_I: 1283 key->nops = 0; 1284 key->tag = in->extra.imm; 1285 return 1; 1286 case IR_COPY: 1287 if (in->nopnds < 2) return 0; 1288 key->nops = 1; 1289 return gvn_make_operand_key(ctx, &in->opnds[1], &key->ops[0]); 1290 case IR_UNOP: 1291 case IR_CONVERT: 1292 if (in->nopnds < 2) return 0; 1293 key->nops = 1; 1294 return gvn_make_operand_key(ctx, &in->opnds[1], &key->ops[0]); 1295 case IR_BINOP: 1296 case IR_CMP: 1297 if (in->nopnds < 3) return 0; 1298 key->nops = 2; 1299 if (!gvn_make_operand_key(ctx, &in->opnds[1], &key->ops[0]) || 1300 !gvn_make_operand_key(ctx, &in->opnds[2], &key->ops[1])) 1301 return 0; 1302 if (((IROp)in->op == IR_BINOP && 1303 kit_ir_binop_is_commutative_int((BinOp)in->extra.imm)) || 1304 ((IROp)in->op == IR_CMP && 1305 kit_ir_cmp_is_commutative_int((CmpOp)in->extra.imm))) { 1306 if (gvn_operand_key_less(&key->ops[1], &key->ops[0])) { 1307 GvnOperandKey tmp = key->ops[0]; 1308 key->ops[0] = key->ops[1]; 1309 key->ops[1] = tmp; 1310 } 1311 } 1312 return 1; 1313 default: 1314 return 0; 1315 } 1316 } 1317 1318 static void gvn_replace_one_use(Func* f, const OptUse* use, Val repl) { 1319 Inst* in = &f->blocks[use->block].insts[use->inst]; 1320 switch ((OptUseKind)use->kind) { 1321 case OPT_USE_OPERAND: 1322 use->operand->v.reg = (Reg)repl; 1323 use->operand->type = f->val_type[repl]; 1324 use->operand->cls = f->val_cls[repl]; 1325 break; 1326 case OPT_USE_INDIRECT_BASE: 1327 use->operand->v.ind.base = (Reg)repl; 1328 break; 1329 case OPT_USE_INDIRECT_INDEX: 1330 use->operand->v.ind.index = (Reg)repl; 1331 break; 1332 case OPT_USE_PHI_INPUT: { 1333 IRPhiAux* aux = (IRPhiAux*)in->extra.aux; 1334 if (aux && use->phi_pred_index < aux->npreds) 1335 aux->pred_vals[use->phi_pred_index] = repl; 1336 break; 1337 } 1338 default: 1339 break; 1340 } 1341 } 1342 1343 static int gvn_replace_dominated_uses(GvnCtx* ctx, Val old, Val repl, u32 rb, 1344 u32 ri) { 1345 Func* f = ctx->f; 1346 int changed = 0; 1347 if (old == VAL_NONE || old >= f->nvals || repl == VAL_NONE || 1348 repl >= f->nvals || old == repl || !gvn_same_shape(f, old, repl)) 1349 return 0; 1350 for (u32 u = f->opt_first_use_by_val[old]; u != OPT_USE_NONE; 1351 u = f->opt_uses[u].next_for_val) { 1352 OptUse* use = &f->opt_uses[u]; 1353 if (!gvn_leader_dominates_use(ctx, rb, ri, use)) continue; 1354 gvn_replace_one_use(f, use, repl); 1355 changed = 1; 1356 } 1357 if (changed) { 1358 ctx->parent[old] = repl; 1359 ctx->constants[old] = ctx->constants[gvn_find(ctx, repl)]; 1360 } 1361 return changed; 1362 } 1363 1364 static void gvn_note_const(GvnCtx* ctx, Inst* in) { 1365 if (in->def == VAL_NONE || in->def >= ctx->f->nvals) return; 1366 if ((IROp)in->op == IR_LOAD_IMM || (IROp)in->op == IR_CONST_I) { 1367 ctx->constants[in->def].valid = 1; 1368 ctx->constants[in->def].value = in->extra.imm; 1369 } 1370 } 1371 1372 static int gvn_scalar_candidate(const Inst* in) { 1373 if (!in || in->def == VAL_NONE || in->ndefs) return 0; 1374 switch ((IROp)in->op) { 1375 case IR_CONST_I: 1376 case IR_LOAD_IMM: 1377 case IR_COPY: 1378 case IR_BINOP: 1379 case IR_UNOP: 1380 case IR_CMP: 1381 case IR_CONVERT: 1382 return 1; 1383 default: 1384 return 0; 1385 } 1386 } 1387 1388 static int gvn_fold_branch(GvnCtx* ctx, u32 b, Inst* in) { 1389 Block* bl = &ctx->f->blocks[b]; 1390 i64 a, c, taken; 1391 u32 target; 1392 switch ((IROp)in->op) { 1393 case IR_CONDBR: 1394 if (in->nopnds < 1 || bl->nsucc < 2) return 0; 1395 if (in->opnds[0].kind != OPK_REG) return 0; 1396 if (!gvn_const_for_operand(ctx, &in->opnds[0], &a)) return 0; 1397 target = bl->succ[a != 0 ? 0u : 1u]; 1398 break; 1399 case IR_CMP_BRANCH: 1400 if (in->nopnds < 2 || bl->nsucc < 2) return 0; 1401 if (in->opnds[0].kind != OPK_REG && in->opnds[1].kind != OPK_REG) 1402 return 0; 1403 if (!gvn_const_for_operand(ctx, &in->opnds[0], &a) || 1404 !gvn_const_for_operand(ctx, &in->opnds[1], &c)) 1405 return 0; 1406 if (!gvn_fold_cmp(ctx->f, (CmpOp)in->extra.imm, in->opnds[0].type, a, c, 1407 &taken)) 1408 return 0; 1409 target = bl->succ[taken ? 0u : 1u]; 1410 break; 1411 default: 1412 return 0; 1413 } 1414 1415 in->op = IR_BR; 1416 in->def = VAL_NONE; 1417 in->ndefs = 0; 1418 in->defs = NULL; 1419 in->nopnds = 0; 1420 in->opnds = NULL; 1421 bl->succ[0] = target; 1422 bl->nsucc = 1; 1423 return 1; 1424 } 1425 1426 static int gvn_inst_memory_barrier(const Inst* in) { 1427 switch ((IROp)in->op) { 1428 case IR_CALL: 1429 case IR_AGG_COPY: 1430 case IR_AGG_SET: 1431 case IR_BITFIELD_LOAD: 1432 case IR_BITFIELD_STORE: 1433 case IR_VA_START: 1434 case IR_VA_ARG: 1435 case IR_VA_END: 1436 case IR_VA_COPY: 1437 case IR_ATOMIC_LOAD: 1438 case IR_ATOMIC_STORE: 1439 case IR_ATOMIC_RMW: 1440 case IR_ATOMIC_CAS: 1441 case IR_FENCE: 1442 case IR_ASM_BLOCK: 1443 case IR_INTRINSIC: 1444 return 1; 1445 default: 1446 return 0; 1447 } 1448 } 1449 1450 static int gvn_visit_memory_inst(GvnCtx* ctx, u32 b, u32 i, Inst* in) { 1451 if ((IROp)in->op == IR_LOAD) { 1452 if (opt_mem_observable(&in->extra.mem)) { 1453 gvn_mem_barrier(ctx); 1454 return 1; 1455 } 1456 if (in->def == VAL_NONE || in->def >= ctx->f->nvals) return 1; 1457 GvnKey key; 1458 if (!gvn_make_memory_key(ctx, in, 1u, ctx->f->val_type[in->def], 1459 ctx->f->val_cls[in->def], &key)) 1460 return 1; 1461 Val leader = VAL_NONE; 1462 if (gvn_mem_avail_find(ctx, &key, b, i, &leader)) { 1463 if (gvn_replace_dominated_uses(ctx, in->def, leader, 1464 ctx->f->val_def_block[leader], 1465 ctx->f->val_def_inst[leader])) 1466 ctx->changed = 1; 1467 return 1; 1468 } 1469 gvn_mem_avail_add(ctx, &key, in->def); 1470 return 1; 1471 } 1472 1473 if ((IROp)in->op == IR_STORE) { 1474 u8 root_kind; 1475 i64 root_id; 1476 i64 offset; 1477 int singleton; 1478 Val src; 1479 if (opt_mem_observable(&in->extra.mem)) { 1480 gvn_mem_barrier(ctx); 1481 return 1; 1482 } 1483 if (in->nopnds < 2) { 1484 gvn_mem_barrier(ctx); 1485 return 1; 1486 } 1487 (void)gvn_mem_root_from_access(ctx, &in->opnds[0], &in->extra.mem, 1488 &root_kind, &root_id, &offset, &singleton); 1489 (void)offset; 1490 (void)singleton; 1491 gvn_mem_bump_store(ctx, root_kind, in->extra.mem.addr_space, root_id); 1492 if (in->opnds[1].kind != OPK_REG) return 1; 1493 src = gvn_find(ctx, (Val)in->opnds[1].v.reg); 1494 if (src == VAL_NONE || src >= ctx->f->nvals) return 1; 1495 GvnKey key; 1496 if (!gvn_make_memory_key(ctx, in, 0u, ctx->f->val_type[src], 1497 ctx->f->val_cls[src], &key)) 1498 return 1; 1499 gvn_mem_avail_add(ctx, &key, src); 1500 return 1; 1501 } 1502 1503 if ((IROp)in->op == IR_CALL) { 1504 gvn_mem_call_barrier(ctx); 1505 return 1; 1506 } 1507 1508 if (gvn_inst_memory_barrier(in)) { 1509 gvn_mem_barrier(ctx); 1510 return 1; 1511 } 1512 return 0; 1513 } 1514 1515 static void gvn_visit_inst(GvnCtx* ctx, u32 b, u32 i) { 1516 Inst* in = &ctx->f->blocks[b].insts[i]; 1517 if (gvn_fold_branch(ctx, b, in)) { 1518 ctx->changed = 1; 1519 ctx->cfg_changed = 1; 1520 return; 1521 } 1522 if (gvn_visit_memory_inst(ctx, b, i, in)) return; 1523 if (!gvn_scalar_candidate(in)) return; 1524 1525 if (gvn_try_fold_inst(ctx, in)) ctx->changed = 1; 1526 gvn_note_const(ctx, in); 1527 1528 GvnKey key; 1529 if (!gvn_make_key(ctx, in, &key)) return; 1530 1531 Val leader = VAL_NONE; 1532 if (gvn_table_find(ctx, &key, b, i, &leader)) { 1533 if (gvn_replace_dominated_uses(ctx, in->def, leader, 1534 ctx->f->val_def_block[leader], 1535 ctx->f->val_def_inst[leader])) 1536 ctx->changed = 1; 1537 return; 1538 } 1539 gvn_table_add(&ctx->table, &key, in->def, b, i); 1540 } 1541 1542 static int gvn_raw_const_zero(Func* f, Val v) { 1543 Inst* def = NULL; 1544 return val_def_inst(f, v, &def) && def && (IROp)def->op == IR_LOAD_IMM && 1545 def->extra.imm == 0; 1546 } 1547 1548 static int gvn_raw_operand_zero(Func* f, const Operand* op) { 1549 if (!op) return 0; 1550 if (op->kind == OPK_IMM) return op->v.imm == 0; 1551 if (op->kind == OPK_REG) return gvn_raw_const_zero(f, (Val)op->v.reg); 1552 return 0; 1553 } 1554 1555 static int gvn_raw_local_addr_root(Func* f, Val v, u32 depth, FrameSlot* out) { 1556 Inst* def = NULL; 1557 if (!f || depth > 4u || v == VAL_NONE) return 0; 1558 if (!val_def_inst(f, v, &def) || !def) return 0; 1559 if ((IROp)def->op == IR_ADDR_OF && def->nopnds >= 2 && 1560 def->opnds[1].kind == OPK_LOCAL) { 1561 *out = def->opnds[1].v.frame_slot; 1562 return 1; 1563 } 1564 if ((IROp)def->op == IR_COPY && def->nopnds >= 2 && 1565 def->opnds[1].kind == OPK_REG) 1566 return gvn_raw_local_addr_root(f, (Val)def->opnds[1].v.reg, depth + 1u, 1567 out); 1568 if ((IROp)def->op == IR_BINOP && def->nopnds >= 3) { 1569 if ((BinOp)def->extra.imm == BO_IADD) { 1570 if (def->opnds[1].kind == OPK_REG && 1571 gvn_raw_operand_zero(f, &def->opnds[2])) 1572 return gvn_raw_local_addr_root(f, (Val)def->opnds[1].v.reg, depth + 1u, 1573 out); 1574 if (def->opnds[2].kind == OPK_REG && 1575 gvn_raw_operand_zero(f, &def->opnds[1])) 1576 return gvn_raw_local_addr_root(f, (Val)def->opnds[2].v.reg, depth + 1u, 1577 out); 1578 } 1579 if ((BinOp)def->extra.imm == BO_ISUB && def->opnds[1].kind == OPK_REG && 1580 gvn_raw_operand_zero(f, &def->opnds[2])) 1581 return gvn_raw_local_addr_root(f, (Val)def->opnds[1].v.reg, depth + 1u, 1582 out); 1583 } 1584 return 0; 1585 } 1586 1587 static void gvn_mark_escaped_operand(GvnCtx* ctx, const Operand* op) { 1588 FrameSlot fs = FRAME_SLOT_NONE; 1589 if (!op) return; 1590 if (op->kind == OPK_LOCAL) { 1591 fs = op->v.frame_slot; 1592 if (fs > 0 && (u32)fs <= ctx->f->nframe_slots) ctx->local_escaped[fs] = 1; 1593 } else if (op->kind == OPK_REG) { 1594 if (gvn_raw_local_addr_root(ctx->f, (Val)op->v.reg, 0, &fs) && fs > 0 && 1595 (u32)fs <= ctx->f->nframe_slots) 1596 ctx->local_escaped[fs] = 1; 1597 } else if (op->kind == OPK_INDIRECT) { 1598 if (gvn_raw_local_addr_root(ctx->f, (Val)op->v.ind.base, 0, &fs) && 1599 fs > 0 && (u32)fs <= ctx->f->nframe_slots) 1600 ctx->local_escaped[fs] = 1; 1601 } 1602 } 1603 1604 static void gvn_mark_escaped_abivalue(GvnCtx* ctx, const CGABIValue* v) { 1605 if (!v) return; 1606 gvn_mark_escaped_operand(ctx, &v->storage); 1607 for (u32 i = 0; i < v->nparts; ++i) 1608 gvn_mark_escaped_operand(ctx, &v->parts[i].op); 1609 } 1610 1611 static void gvn_collect_local_escapes(GvnCtx* ctx) { 1612 Func* f = ctx->f; 1613 for (u32 b = 0; b < f->nblocks; ++b) { 1614 Block* bl = &f->blocks[b]; 1615 for (u32 i = 0; i < bl->ninsts; ++i) { 1616 Inst* in = &bl->insts[i]; 1617 switch ((IROp)in->op) { 1618 case IR_LOAD: 1619 case IR_ADDR_OF: 1620 case IR_COPY: 1621 break; 1622 case IR_STORE: 1623 if (in->nopnds >= 2) gvn_mark_escaped_operand(ctx, &in->opnds[1]); 1624 break; 1625 case IR_BINOP: 1626 if (in->nopnds >= 3 && (BinOp)in->extra.imm == BO_IADD && 1627 (gvn_raw_operand_zero(f, &in->opnds[1]) || 1628 gvn_raw_operand_zero(f, &in->opnds[2]))) 1629 break; 1630 if (in->nopnds >= 3 && (BinOp)in->extra.imm == BO_ISUB && 1631 gvn_raw_operand_zero(f, &in->opnds[2])) 1632 break; 1633 for (u32 o = 1; o < in->nopnds; ++o) 1634 gvn_mark_escaped_operand(ctx, &in->opnds[o]); 1635 break; 1636 case IR_CALL: { 1637 IRCallAux* aux = (IRCallAux*)in->extra.aux; 1638 if (!aux) break; 1639 if (aux->use_plan_replay) { 1640 gvn_mark_escaped_operand(ctx, &aux->plan.callee); 1641 for (u32 a = 0; a < aux->plan.nargs; ++a) 1642 gvn_mark_escaped_operand(ctx, &aux->plan.args[a].src); 1643 for (u32 r = 0; r < aux->plan.nrets; ++r) 1644 gvn_mark_escaped_operand(ctx, &aux->plan.rets[r].dst); 1645 } else { 1646 gvn_mark_escaped_operand(ctx, &aux->desc.callee); 1647 for (u32 a = 0; a < aux->desc.nargs; ++a) 1648 gvn_mark_escaped_abivalue(ctx, &aux->desc.args[a]); 1649 gvn_mark_escaped_abivalue(ctx, &aux->desc.ret); 1650 } 1651 break; 1652 } 1653 case IR_RET: { 1654 IRRetAux* aux = (IRRetAux*)in->extra.aux; 1655 if (aux && aux->present) gvn_mark_escaped_abivalue(ctx, &aux->val); 1656 break; 1657 } 1658 default: 1659 if (gvn_inst_memory_barrier(in)) 1660 for (u32 o = 0; o < in->nopnds; ++o) 1661 gvn_mark_escaped_operand(ctx, &in->opnds[o]); 1662 break; 1663 } 1664 } 1665 } 1666 } 1667 1668 static int gvn_mem_state_has_avail(const GvnMemState* s, const GvnKey* key, 1669 Val val) { 1670 for (u32 i = 0; i < s->navail; ++i) 1671 if (s->avail[i].val == val && gvn_key_equal(&s->avail[i].key, key)) 1672 return 1; 1673 return 0; 1674 } 1675 1676 static void gvn_mem_merge_block_in(GvnCtx* ctx, u32 b) { 1677 Func* f = ctx->f; 1678 Block* bl = &f->blocks[b]; 1679 GvnBlockMemState* bs = &ctx->block_mem[b]; 1680 memset(&bs->in, 0, sizeof bs->in); 1681 if (!bl->npreds) return; 1682 const GvnMemState* first = NULL; 1683 for (u32 p = 0; p < bl->npreds; ++p) { 1684 u32 pred = bl->preds[p]; 1685 if (pred >= f->nblocks || !ctx->analysis->reachable[pred]) continue; 1686 if (!ctx->block_mem[pred].out_valid) return; 1687 if (!first) first = &ctx->block_mem[pred].out; 1688 } 1689 if (!first) return; 1690 gvn_mem_state_copy(ctx, &bs->in, first); 1691 for (u32 i = 0; i < bs->in.navail;) { 1692 GvnMemAvail* a = &bs->in.avail[i]; 1693 int keep = 1; 1694 for (u32 p = 0; p < bl->npreds; ++p) { 1695 u32 pred = bl->preds[p]; 1696 if (pred >= f->nblocks || !ctx->analysis->reachable[pred]) continue; 1697 if (!gvn_mem_state_has_avail(&ctx->block_mem[pred].out, &a->key, 1698 a->val)) { 1699 keep = 0; 1700 break; 1701 } 1702 } 1703 if (keep) 1704 ++i; 1705 else 1706 gvn_mem_avail_remove_at(&bs->in, i); 1707 } 1708 } 1709 1710 static void gvn_realign_phi_preds(Func* f) { 1711 for (u32 b = 0; b < f->nblocks; ++b) { 1712 Block* bl = &f->blocks[b]; 1713 for (u32 i = 0; i < bl->ninsts; ++i) { 1714 Inst* phi = &bl->insts[i]; 1715 if ((IROp)phi->op != IR_PHI) break; 1716 IRPhiAux* aux = (IRPhiAux*)phi->extra.aux; 1717 if (!aux) continue; 1718 u32 old_n = aux->npreds; 1719 u32* old_blocks = aux->pred_blocks; 1720 Val* old_vals = aux->pred_vals; 1721 u32* pred_blocks = 1722 bl->npreds ? arena_array(f->arena, u32, bl->npreds) : NULL; 1723 Val* pred_vals = 1724 bl->npreds ? arena_zarray(f->arena, Val, bl->npreds) : NULL; 1725 for (u32 p = 0; p < bl->npreds; ++p) { 1726 pred_blocks[p] = bl->preds[p]; 1727 pred_vals[p] = phi->def; 1728 for (u32 old = 0; old < old_n; ++old) { 1729 if (old_blocks && old_blocks[old] == bl->preds[p]) { 1730 pred_vals[p] = old_vals ? old_vals[old] : VAL_NONE; 1731 break; 1732 } 1733 } 1734 } 1735 aux->npreds = bl->npreds; 1736 aux->pred_blocks = pred_blocks; 1737 aux->pred_vals = pred_vals; 1738 } 1739 } 1740 } 1741 1742 void opt_gvn(Func* f) { 1743 if (!f || f->opt_rewritten) return; 1744 if (!f->opt_reg_ssa && f->npregs > 1) { 1745 opt_rebuild_def_use(f); 1746 return; 1747 } 1748 1749 opt_rebuild_def_use(f); 1750 OptAnalysis a; 1751 memset(&a, 0, sizeof a); 1752 opt_analysis_build_dominators(f, &a); 1753 1754 GvnCtx ctx; 1755 memset(&ctx, 0, sizeof ctx); 1756 ctx.f = f; 1757 ctx.analysis = &a; 1758 ctx.parent = arena_array(f->arena, Val, f->nvals ? f->nvals : 1u); 1759 ctx.constants = arena_zarray(f->arena, GvnConst, f->nvals ? f->nvals : 1u); 1760 ctx.block_mem = 1761 arena_zarray(f->arena, GvnBlockMemState, f->nblocks ? f->nblocks : 1u); 1762 ctx.local_escaped = 1763 arena_zarray(f->arena, u8, f->nframe_slots ? f->nframe_slots + 1u : 1u); 1764 for (Val v = 0; v < f->nvals; ++v) ctx.parent[v] = v; 1765 gvn_table_init(&ctx.table, f); 1766 gvn_collect_local_escapes(&ctx); 1767 1768 for (u32 ri = 0; ri < a.nrpo; ++ri) { 1769 u32 b = a.rpo[ri]; 1770 if (b >= f->nblocks || !a.reachable[b]) continue; 1771 gvn_mem_merge_block_in(&ctx, b); 1772 gvn_mem_state_copy(&ctx, &ctx.mem, &ctx.block_mem[b].in); 1773 for (u32 i = 0; i < f->blocks[b].ninsts; ++i) gvn_visit_inst(&ctx, b, i); 1774 gvn_mem_state_copy(&ctx, &ctx.block_mem[b].out, &ctx.mem); 1775 ctx.block_mem[b].out_valid = 1; 1776 } 1777 1778 if (ctx.cfg_changed) { 1779 opt_analysis_invalidate(f, OPT_ANALYSIS_CFG | OPT_ANALYSIS_DEF_USE | 1780 OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP); 1781 opt_build_cfg(f); 1782 gvn_realign_phi_preds(f); 1783 } else if (ctx.changed) { 1784 opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE); 1785 } 1786 opt_rebuild_def_use(f); 1787 } 1788 1789 static int dse_key_equal(const DseKey* a, const DseKey* b) { 1790 return a->root_kind == b->root_kind && a->addr_space == b->addr_space && 1791 a->flags == b->flags && a->mem_type == b->mem_type && 1792 a->size == b->size && a->align == b->align && 1793 a->root_id == b->root_id && a->offset == b->offset; 1794 } 1795 1796 static int dse_key_same_root(const DseKey* a, const DseKey* b) { 1797 return a->root_kind == b->root_kind && a->addr_space == b->addr_space && 1798 a->root_id == b->root_id; 1799 } 1800 1801 static int dse_ranges_overlap(const DseKey* a, const DseKey* b) { 1802 i64 ae = a->offset + (i64)a->size; 1803 i64 be = b->offset + (i64)b->size; 1804 return a->offset < be && b->offset < ae; 1805 } 1806 1807 static int dse_range_covers(const DseKey* cover, const DseKey* covered) { 1808 i64 ce = cover->offset + (i64)cover->size; 1809 i64 de = covered->offset + (i64)covered->size; 1810 return cover->offset <= covered->offset && ce >= de; 1811 } 1812 1813 static int dse_root_call_clobbered(DseCtx* ctx, const DseKey* key) { 1814 if (key->root_kind == ALIAS_LOCAL && key->root_id > 0 && 1815 (u32)key->root_id <= ctx->f->nframe_slots) 1816 return ctx->local_escaped && ctx->local_escaped[key->root_id]; 1817 return key->root_kind != ALIAS_STRING; 1818 } 1819 1820 static int dse_root_observable_at_exit(DseCtx* ctx, const DseKey* key) { 1821 if (key->root_kind == ALIAS_LOCAL && key->root_id > 0 && 1822 (u32)key->root_id <= ctx->f->nframe_slots) 1823 return ctx->local_escaped && ctx->local_escaped[key->root_id]; 1824 return key->root_kind != ALIAS_UNKNOWN; 1825 } 1826 1827 static int dse_make_key(DseCtx* ctx, const Inst* in, u32 addr_idx, 1828 DseKey* out) { 1829 u8 root_kind; 1830 i64 root_id; 1831 i64 offset; 1832 int singleton; 1833 1834 if (!ctx || !in || addr_idx >= in->nopnds || 1835 opt_mem_observable(&in->extra.mem)) 1836 return 0; 1837 if (in->extra.mem.size == 0) return 0; 1838 if (!gvn_mem_root_from_access(&ctx->gvn, &in->opnds[addr_idx], &in->extra.mem, 1839 &root_kind, &root_id, &offset, &singleton)) 1840 return 0; 1841 if (!singleton || root_kind == ALIAS_UNKNOWN) return 0; 1842 1843 memset(out, 0, sizeof *out); 1844 out->root_kind = root_kind; 1845 out->addr_space = in->extra.mem.addr_space; 1846 out->flags = in->extra.mem.flags; 1847 out->mem_type = in->extra.mem.type; 1848 out->size = in->extra.mem.size; 1849 out->align = in->extra.mem.align; 1850 out->root_id = root_id; 1851 out->offset = offset; 1852 return 1; 1853 } 1854 1855 static u32 dse_key_intern(DseCtx* ctx, const DseKey* key) { 1856 for (u32 i = 0; i < ctx->nkeys; ++i) 1857 if (dse_key_equal(&ctx->keys[i], key)) return i; 1858 if (ctx->nkeys == ctx->cap_keys) { 1859 u32 ncap = ctx->cap_keys ? ctx->cap_keys * 2u : 16u; 1860 DseKey* keys = arena_array(ctx->f->arena, DseKey, ncap); 1861 if (ctx->keys) memcpy(keys, ctx->keys, sizeof(keys[0]) * ctx->nkeys); 1862 ctx->keys = keys; 1863 ctx->cap_keys = ncap; 1864 } 1865 ctx->keys[ctx->nkeys] = *key; 1866 return ctx->nkeys++; 1867 } 1868 1869 static void dse_bitset_clear(DseCtx* ctx, DseBitset* bs) { 1870 memset(bs->words, 0, sizeof(bs->words[0]) * ctx->words); 1871 } 1872 1873 static int dse_bitset_copy(DseCtx* ctx, DseBitset* dst, const DseBitset* src) { 1874 int changed = 1875 memcmp(dst->words, src->words, sizeof(dst->words[0]) * ctx->words) != 0; 1876 if (changed) 1877 memcpy(dst->words, src->words, sizeof(dst->words[0]) * ctx->words); 1878 return changed; 1879 } 1880 1881 static void dse_bitset_or(DseCtx* ctx, DseBitset* dst, const DseBitset* src) { 1882 for (u32 w = 0; w < ctx->words; ++w) dst->words[w] |= src->words[w]; 1883 } 1884 1885 static int dse_bitset_test(const DseBitset* bs, u32 bit) { 1886 return (bs->words[bit / 64u] & (1ull << (bit % 64u))) != 0; 1887 } 1888 1889 static void dse_bitset_set(DseBitset* bs, u32 bit) { 1890 bs->words[bit / 64u] |= 1ull << (bit % 64u); 1891 } 1892 1893 static void dse_bitset_reset(DseBitset* bs, u32 bit) { 1894 bs->words[bit / 64u] &= ~(1ull << (bit % 64u)); 1895 } 1896 1897 static int dse_live_overlaps(DseCtx* ctx, const DseBitset* live, 1898 const DseKey* key) { 1899 for (u32 i = 0; i < ctx->nkeys; ++i) { 1900 if (!dse_bitset_test(live, i)) continue; 1901 if (dse_key_same_root(&ctx->keys[i], key) && 1902 dse_ranges_overlap(&ctx->keys[i], key)) 1903 return 1; 1904 } 1905 return 0; 1906 } 1907 1908 static void dse_live_mark_overlaps(DseCtx* ctx, DseBitset* live, 1909 const DseKey* key) { 1910 for (u32 i = 0; i < ctx->nkeys; ++i) { 1911 if (dse_key_same_root(&ctx->keys[i], key) && 1912 dse_ranges_overlap(&ctx->keys[i], key)) 1913 dse_bitset_set(live, i); 1914 } 1915 } 1916 1917 static void dse_live_clear_covered(DseCtx* ctx, DseBitset* live, 1918 const DseKey* key) { 1919 for (u32 i = 0; i < ctx->nkeys; ++i) { 1920 if (dse_key_same_root(&ctx->keys[i], key) && 1921 dse_range_covers(key, &ctx->keys[i])) 1922 dse_bitset_reset(live, i); 1923 } 1924 } 1925 1926 static void dse_live_mark_all(DseCtx* ctx, DseBitset* live) { 1927 for (u32 i = 0; i < ctx->nkeys; ++i) dse_bitset_set(live, i); 1928 } 1929 1930 static void dse_live_mark_call_clobbered(DseCtx* ctx, DseBitset* live) { 1931 for (u32 i = 0; i < ctx->nkeys; ++i) 1932 if (dse_root_call_clobbered(ctx, &ctx->keys[i])) dse_bitset_set(live, i); 1933 } 1934 1935 static void dse_live_mark_exit_observable(DseCtx* ctx, DseBitset* live) { 1936 for (u32 i = 0; i < ctx->nkeys; ++i) 1937 if (dse_root_observable_at_exit(ctx, &ctx->keys[i])) 1938 dse_bitset_set(live, i); 1939 } 1940 1941 static int dse_store_site_key(DseCtx* ctx, u32 block, u32 inst, u32* out) { 1942 if (!ctx->store_key_by_inst || block >= ctx->f->nblocks || 1943 inst >= ctx->f->blocks[block].ninsts || !ctx->store_key_by_inst[block]) 1944 return 0; 1945 *out = ctx->store_key_by_inst[block][inst]; 1946 if (*out != DSE_KEY_NONE) return 1; 1947 return 0; 1948 } 1949 1950 static int dse_inst_full_barrier(const Inst* in) { 1951 return gvn_inst_memory_barrier(in); 1952 } 1953 1954 static void dse_transfer_inst(DseCtx* ctx, DseBitset* live, u32 block, u32 inst, 1955 int mark_dead) { 1956 Inst* in = &ctx->f->blocks[block].insts[inst]; 1957 DseKey key; 1958 u32 key_id; 1959 1960 switch ((IROp)in->op) { 1961 case IR_LOAD: 1962 if (opt_mem_observable(&in->extra.mem)) { 1963 dse_live_mark_all(ctx, live); 1964 } else if (dse_make_key(ctx, in, 1u, &key)) { 1965 dse_live_mark_overlaps(ctx, live, &key); 1966 } else { 1967 dse_live_mark_all(ctx, live); 1968 } 1969 break; 1970 case IR_STORE: 1971 if (!dse_store_site_key(ctx, block, inst, &key_id)) { 1972 dse_live_mark_all(ctx, live); 1973 break; 1974 } 1975 key = ctx->keys[key_id]; 1976 if (mark_dead && !dse_live_overlaps(ctx, live, &key)) { 1977 in->op = IR_NOP; 1978 in->nopnds = 0; 1979 in->opnds = NULL; 1980 ctx->changed = 1; 1981 } 1982 dse_live_clear_covered(ctx, live, &key); 1983 break; 1984 case IR_CALL: 1985 dse_live_mark_call_clobbered(ctx, live); 1986 break; 1987 default: 1988 if (dse_inst_full_barrier(in)) dse_live_mark_all(ctx, live); 1989 break; 1990 } 1991 } 1992 1993 static void dse_collect_constants(GvnCtx* ctx) { 1994 for (u32 b = 0; b < ctx->f->nblocks; ++b) { 1995 Block* bl = &ctx->f->blocks[b]; 1996 for (u32 i = 0; i < bl->ninsts; ++i) gvn_note_const(ctx, &bl->insts[i]); 1997 } 1998 } 1999 2000 static void dse_collect_stores(DseCtx* ctx) { 2001 Func* f = ctx->f; 2002 ctx->store_key_by_inst = 2003 arena_zarray(f->arena, u32*, f->nblocks ? f->nblocks : 1u); 2004 for (u32 b = 0; b < f->nblocks; ++b) { 2005 Block* bl = &f->blocks[b]; 2006 ctx->store_key_by_inst[b] = 2007 arena_array(f->arena, u32, bl->ninsts ? bl->ninsts : 1u); 2008 for (u32 i = 0; i < bl->ninsts; ++i) 2009 ctx->store_key_by_inst[b][i] = DSE_KEY_NONE; 2010 for (u32 i = 0; i < bl->ninsts; ++i) { 2011 Inst* in = &bl->insts[i]; 2012 DseKey key; 2013 if ((IROp)in->op != IR_STORE) continue; 2014 if (!dse_make_key(ctx, in, 0u, &key)) continue; 2015 ctx->store_key_by_inst[b][i] = dse_key_intern(ctx, &key); 2016 } 2017 } 2018 } 2019 2020 static DseBitset dse_bitset_new(DseCtx* ctx) { 2021 DseBitset bs; 2022 bs.words = arena_zarray(ctx->f->arena, u64, ctx->words ? ctx->words : 1u); 2023 return bs; 2024 } 2025 2026 static void dse_init_block_sets(DseCtx* ctx) { 2027 Func* f = ctx->f; 2028 ctx->words = (ctx->nkeys + 63u) / 64u; 2029 if (!ctx->words) ctx->words = 1u; 2030 ctx->block = 2031 arena_zarray(f->arena, DseBlockState, f->nblocks ? f->nblocks : 1u); 2032 for (u32 b = 0; b < f->nblocks; ++b) { 2033 ctx->block[b].in = dse_bitset_new(ctx); 2034 ctx->block[b].out = dse_bitset_new(ctx); 2035 } 2036 ctx->scratch_in = dse_bitset_new(ctx); 2037 ctx->scratch_out = dse_bitset_new(ctx); 2038 } 2039 2040 static void dse_compute_block_out(DseCtx* ctx, u32 b, DseBitset* out) { 2041 Func* f = ctx->f; 2042 Block* bl = &f->blocks[b]; 2043 dse_bitset_clear(ctx, out); 2044 if (!bl->nsucc) { 2045 dse_live_mark_exit_observable(ctx, out); 2046 return; 2047 } 2048 for (u32 s = 0; s < bl->nsucc; ++s) { 2049 u32 succ = bl->succ[s]; 2050 if (succ < f->nblocks) dse_bitset_or(ctx, out, &ctx->block[succ].in); 2051 } 2052 } 2053 2054 static void dse_transfer_block(DseCtx* ctx, u32 b, const DseBitset* out, 2055 DseBitset* in, int mark_dead) { 2056 Block* bl = &ctx->f->blocks[b]; 2057 dse_bitset_copy(ctx, in, out); 2058 for (u32 ri = bl->ninsts; ri > 0; --ri) 2059 dse_transfer_inst(ctx, in, b, ri - 1u, mark_dead); 2060 } 2061 2062 static void dse_refresh_def_locations(Func* f) { 2063 for (u32 b = 0; b < f->nblocks; ++b) { 2064 Block* bl = &f->blocks[b]; 2065 for (u32 i = 0; i < bl->ninsts; ++i) { 2066 Inst* in = &bl->insts[i]; 2067 if (in->def != VAL_NONE && in->def < f->nvals) { 2068 f->val_def_block[in->def] = b; 2069 f->val_def_inst[in->def] = i; 2070 } 2071 for (u32 d = 0; d < in->ndefs; ++d) { 2072 Val v = in->defs[d]; 2073 if (v != VAL_NONE && v < f->nvals) { 2074 f->val_def_block[v] = b; 2075 f->val_def_inst[v] = i; 2076 } 2077 } 2078 } 2079 } 2080 } 2081 2082 static void dse_compact_nops(Func* f) { 2083 for (u32 b = 0; b < f->nblocks; ++b) { 2084 Block* bl = &f->blocks[b]; 2085 u32 w = 0; 2086 for (u32 i = 0; i < bl->ninsts; ++i) { 2087 if ((IROp)bl->insts[i].op == IR_NOP) continue; 2088 bl->insts[w++] = bl->insts[i]; 2089 } 2090 bl->ninsts = w; 2091 } 2092 dse_refresh_def_locations(f); 2093 } 2094 2095 void opt_dse(Func* f) { 2096 if (!f || f->opt_rewritten) return; 2097 if (!f->opt_reg_ssa && f->npregs > 1) { 2098 opt_rebuild_def_use(f); 2099 return; 2100 } 2101 2102 opt_rebuild_def_use(f); 2103 DseCtx ctx; 2104 memset(&ctx, 0, sizeof ctx); 2105 ctx.f = f; 2106 ctx.local_escaped = 2107 arena_zarray(f->arena, u8, f->nframe_slots ? f->nframe_slots + 1u : 1u); 2108 2109 memset(&ctx.gvn, 0, sizeof ctx.gvn); 2110 ctx.gvn.f = f; 2111 ctx.gvn.parent = arena_array(f->arena, Val, f->nvals ? f->nvals : 1u); 2112 ctx.gvn.constants = 2113 arena_zarray(f->arena, GvnConst, f->nvals ? f->nvals : 1u); 2114 ctx.gvn.local_escaped = ctx.local_escaped; 2115 for (Val v = 0; v < f->nvals; ++v) ctx.gvn.parent[v] = v; 2116 gvn_collect_local_escapes(&ctx.gvn); 2117 dse_collect_constants(&ctx.gvn); 2118 dse_collect_stores(&ctx); 2119 if (!ctx.nkeys) { 2120 opt_rebuild_def_use(f); 2121 return; 2122 } 2123 2124 dse_init_block_sets(&ctx); 2125 int changed = 1; 2126 while (changed) { 2127 changed = 0; 2128 for (u32 rb = f->nblocks; rb > 0; --rb) { 2129 u32 b = rb - 1u; 2130 dse_compute_block_out(&ctx, b, &ctx.scratch_out); 2131 dse_transfer_block(&ctx, b, &ctx.scratch_out, &ctx.scratch_in, 0); 2132 changed |= dse_bitset_copy(&ctx, &ctx.block[b].out, &ctx.scratch_out); 2133 changed |= dse_bitset_copy(&ctx, &ctx.block[b].in, &ctx.scratch_in); 2134 } 2135 } 2136 2137 for (u32 b = 0; b < f->nblocks; ++b) { 2138 dse_transfer_block(&ctx, b, &ctx.block[b].out, &ctx.scratch_in, 1); 2139 } 2140 2141 if (ctx.changed) { 2142 dse_compact_nops(f); 2143 opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE); 2144 } 2145 opt_rebuild_def_use(f); 2146 } 2147 2148 static void licm_mark_body(Func* f, const u8* reachable, u32 header, u32 latch, 2149 u8* body, u32* stack) { 2150 u32 sp = 0; 2151 if (!body[header]) body[header] = 1; 2152 if (!body[latch]) { 2153 body[latch] = 1; 2154 stack[sp++] = latch; 2155 } 2156 2157 while (sp) { 2158 u32 b = stack[--sp]; 2159 if (b == header) continue; 2160 Block* bl = &f->blocks[b]; 2161 for (u32 p = 0; p < bl->npreds; ++p) { 2162 u32 pred = bl->preds[p]; 2163 if (pred >= f->nblocks || !reachable[pred] || body[pred]) continue; 2164 body[pred] = 1; 2165 stack[sp++] = pred; 2166 } 2167 } 2168 } 2169 2170 static u32 licm_collect_loops(Func* f, OptAnalysis* a, LicmLoop** out) { 2171 u32 nloops = 0; 2172 LicmLoop* loops = 2173 arena_zarray(f->arena, LicmLoop, f->nblocks ? f->nblocks : 1u); 2174 u8* scratch = arena_zarray(f->arena, u8, f->nblocks ? f->nblocks : 1u); 2175 u32* stack = arena_array(f->arena, u32, f->nblocks ? f->nblocks : 1u); 2176 2177 for (u32 header = 0; header < f->nblocks; ++header) { 2178 if (!a->reachable || !a->reachable[header]) continue; 2179 memset(scratch, 0, f->nblocks * sizeof scratch[0]); 2180 int has_loop = 0; 2181 2182 for (u32 latch = 0; latch < f->nblocks; ++latch) { 2183 if (!a->reachable[latch]) continue; 2184 Block* lb = &f->blocks[latch]; 2185 for (u32 s = 0; s < lb->nsucc; ++s) { 2186 if (lb->succ[s] != header) continue; 2187 if (!opt_analysis_dominates(a, header, latch)) continue; 2188 has_loop = 1; 2189 licm_mark_body(f, a->reachable, header, latch, scratch, stack); 2190 } 2191 } 2192 if (!has_loop) continue; 2193 2194 u32 preheader = OPT_BLOCK_NONE; 2195 Block* hb = &f->blocks[header]; 2196 for (u32 p = 0; p < hb->npreds; ++p) { 2197 u32 pred = hb->preds[p]; 2198 if (pred >= f->nblocks || scratch[pred]) continue; 2199 if (preheader != OPT_BLOCK_NONE) { 2200 preheader = OPT_BLOCK_NONE; 2201 break; 2202 } 2203 preheader = pred; 2204 } 2205 if (preheader == OPT_BLOCK_NONE) continue; 2206 if (f->blocks[preheader].nsucc != 1 || 2207 f->blocks[preheader].succ[0] != header) 2208 continue; 2209 2210 u8* body = arena_array(f->arena, u8, f->nblocks ? f->nblocks : 1u); 2211 memcpy(body, scratch, f->nblocks * sizeof body[0]); 2212 loops[nloops].header = header; 2213 loops[nloops].preheader = preheader; 2214 loops[nloops].body = body; 2215 ++nloops; 2216 } 2217 2218 *out = loops; 2219 return nloops; 2220 } 2221 2222 static int licm_safe_binop(BinOp op) { 2223 switch (op) { 2224 case BO_SDIV: 2225 case BO_UDIV: 2226 case BO_SREM: 2227 case BO_UREM: 2228 case BO_FDIV: 2229 return 0; 2230 default: 2231 return 1; 2232 } 2233 } 2234 2235 static int licm_candidate(Func* f, const Inst* in) { 2236 if (!in || in->def == VAL_NONE || in->ndefs || in->flags) return 0; 2237 if (in->def >= f->nvals || opt_inst_has_side_effect(f, in)) return 0; 2238 switch ((IROp)in->op) { 2239 case IR_CONST_I: 2240 case IR_CONST_BYTES: 2241 case IR_LOAD_IMM: 2242 case IR_LOAD_CONST: 2243 case IR_LOAD_LABEL_ADDR: 2244 case IR_COPY: 2245 case IR_ADDR_OF: 2246 case IR_UNOP: 2247 case IR_CMP: 2248 case IR_CONVERT: 2249 return 1; 2250 case IR_BINOP: 2251 return licm_safe_binop((BinOp)in->extra.imm); 2252 default: 2253 return 0; 2254 } 2255 } 2256 2257 static int licm_operand_invariant(Func* f, const u8* body, const Operand* op) { 2258 if (!op) return 1; 2259 if (op->kind == OPK_REG) { 2260 Val v = (Val)op->v.reg; 2261 if (v == VAL_NONE || v >= f->nvals) return 0; 2262 u32 def_block = f->val_def_block[v]; 2263 return def_block < f->nblocks && !body[def_block]; 2264 } 2265 if (op->kind == OPK_INDIRECT) { 2266 Val v = (Val)op->v.ind.base; 2267 if (v == VAL_NONE || v >= f->nvals) return 0; 2268 u32 def_block = f->val_def_block[v]; 2269 return def_block < f->nblocks && !body[def_block]; 2270 } 2271 return 1; 2272 } 2273 2274 static int licm_inst_invariant(Func* f, const LicmLoop* loop, const Inst* in) { 2275 if (!licm_candidate(f, in)) return 0; 2276 for (u32 i = 0; i < in->nopnds; ++i) { 2277 const Operand* op = &in->opnds[i]; 2278 if (op->kind == OPK_REG && opt_val_in_inst_defs(in, (Val)op->v.reg)) 2279 continue; 2280 if (!licm_operand_invariant(f, loop->body, op)) return 0; 2281 } 2282 return 1; 2283 } 2284 2285 static void licm_insert_inst(Func* f, u32 block, u32 at, Inst in) { 2286 Block* bl = &f->blocks[block]; 2287 if (at > bl->ninsts) at = bl->ninsts; 2288 if (bl->ninsts == bl->cap) { 2289 u32 ncap = bl->cap ? bl->cap * 2u : 8u; 2290 while (ncap < bl->ninsts + 1u) ncap *= 2u; 2291 Inst* insts = arena_zarray(f->arena, Inst, ncap); 2292 if (at) memcpy(insts, bl->insts, sizeof(insts[0]) * at); 2293 if (bl->ninsts > at) 2294 memcpy(&insts[at + 1u], &bl->insts[at], 2295 sizeof(insts[0]) * (bl->ninsts - at)); 2296 bl->insts = insts; 2297 bl->cap = ncap; 2298 } else { 2299 for (u32 i = bl->ninsts; i > at; --i) bl->insts[i] = bl->insts[i - 1u]; 2300 } 2301 bl->insts[at] = in; 2302 ++bl->ninsts; 2303 } 2304 2305 static void licm_remove_inst(Func* f, u32 block, u32 at) { 2306 Block* bl = &f->blocks[block]; 2307 if (at >= bl->ninsts) return; 2308 for (u32 i = at + 1u; i < bl->ninsts; ++i) bl->insts[i - 1u] = bl->insts[i]; 2309 --bl->ninsts; 2310 } 2311 2312 static u32 licm_preheader_insert_pos(Func* f, u32 preheader) { 2313 Block* bl = &f->blocks[preheader]; 2314 if (bl->ninsts && o2_is_terminator(&bl->insts[bl->ninsts - 1u])) 2315 return bl->ninsts - 1u; 2316 return bl->ninsts; 2317 } 2318 2319 static void licm_hoist_inst(Func* f, const LicmLoop* loop, u32 block, 2320 u32 inst) { 2321 Inst moved = f->blocks[block].insts[inst]; 2322 u32 at = licm_preheader_insert_pos(f, loop->preheader); 2323 licm_insert_inst(f, loop->preheader, at, moved); 2324 licm_remove_inst(f, block, inst); 2325 if (moved.def != VAL_NONE && moved.def < f->nvals) { 2326 f->val_def_block[moved.def] = loop->preheader; 2327 f->val_def_inst[moved.def] = at; 2328 } 2329 } 2330 2331 void opt_licm(Func* f) { 2332 if (!f || f->opt_rewritten) return; 2333 if (!f->opt_reg_ssa && f->npregs > 1) { 2334 opt_rebuild_def_use(f); 2335 return; 2336 } 2337 2338 OptAnalysis a; 2339 memset(&a, 0, sizeof a); 2340 opt_analysis_build_dominators(f, &a); 2341 LicmLoop* loops = NULL; 2342 u32 nloops = licm_collect_loops(f, &a, &loops); 2343 int changed = 0; 2344 2345 for (u32 l = 0; l < nloops; ++l) { 2346 LicmLoop* loop = &loops[l]; 2347 int again = 1; 2348 while (again) { 2349 again = 0; 2350 for (u32 b = 0; b < f->nblocks; ++b) { 2351 if (!loop->body[b] || b == loop->preheader) continue; 2352 Block* bl = &f->blocks[b]; 2353 for (u32 i = 0; i < bl->ninsts;) { 2354 Inst* in = &bl->insts[i]; 2355 if (!licm_inst_invariant(f, loop, in)) { 2356 ++i; 2357 continue; 2358 } 2359 licm_hoist_inst(f, loop, b, i); 2360 changed = 1; 2361 again = 1; 2362 bl = &f->blocks[b]; 2363 } 2364 } 2365 } 2366 } 2367 2368 if (changed) { 2369 dse_refresh_def_locations(f); 2370 opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE); 2371 } 2372 opt_rebuild_def_use(f); 2373 } 2374 2375 static int pressure_candidate(const Inst* in) { 2376 if (!in || in->def == VAL_NONE || in->ndefs) return 0; 2377 switch ((IROp)in->op) { 2378 case IR_LOAD_IMM: 2379 case IR_CONST_I: 2380 return 1; 2381 default: 2382 return 0; 2383 } 2384 } 2385 2386 static int pressure_barrier(const Inst* in) { 2387 if (!in) return 1; 2388 if (o2_is_terminator(in)) return 1; 2389 switch ((IROp)in->op) { 2390 case IR_LOAD: 2391 case IR_STORE: 2392 case IR_LOAD_CONST: 2393 case IR_AGG_COPY: 2394 case IR_AGG_SET: 2395 case IR_BITFIELD_LOAD: 2396 case IR_BITFIELD_STORE: 2397 case IR_CALL: 2398 case IR_SCOPE_BEGIN: 2399 case IR_SCOPE_END: 2400 case IR_BREAK_TO: 2401 case IR_CONTINUE_TO: 2402 case IR_ALLOCA: 2403 case IR_VA_START: 2404 case IR_VA_ARG: 2405 case IR_VA_END: 2406 case IR_VA_COPY: 2407 case IR_ATOMIC_LOAD: 2408 case IR_ATOMIC_STORE: 2409 case IR_ATOMIC_RMW: 2410 case IR_ATOMIC_CAS: 2411 case IR_FENCE: 2412 case IR_ASM_BLOCK: 2413 case IR_INTRINSIC: 2414 return 1; 2415 default: 2416 return 0; 2417 } 2418 } 2419 2420 static int pressure_single_use(Func* f, Val v, OptUse* out) { 2421 u32 n = 0; 2422 OptUse one; 2423 memset(&one, 0, sizeof one); 2424 for (u32 u = f->opt_first_use_by_val[v]; u != OPT_USE_NONE; 2425 u = f->opt_uses[u].next_for_val) { 2426 one = f->opt_uses[u]; 2427 ++n; 2428 if (n > 1u) return 0; 2429 } 2430 if (n != 1u) return 0; 2431 *out = one; 2432 return 1; 2433 } 2434 2435 static int pressure_can_move_within_block(Func* f, u32 b, u32 from, u32 to) { 2436 if (b >= f->nblocks || from >= to) return 0; 2437 Block* bl = &f->blocks[b]; 2438 if (to > bl->ninsts) return 0; 2439 for (u32 i = from + 1u; i < to; ++i) 2440 if (pressure_barrier(&bl->insts[i])) return 0; 2441 return 1; 2442 } 2443 2444 static PressureBlockPlan* pressure_block_plan(Func* f, PressurePlan* plan, 2445 u32 b) { 2446 if (!plan->blocks) 2447 plan->blocks = 2448 arena_zarray(f->arena, PressureBlockPlan, f->nblocks ? f->nblocks : 1u); 2449 PressureBlockPlan* bp = &plan->blocks[b]; 2450 if (!bp->move_src) { 2451 Block* bl = &f->blocks[b]; 2452 u32 n = bl->ninsts ? bl->ninsts : 1u; 2453 bp->move_src = arena_zarray(f->arena, u8, n); 2454 bp->first_before = arena_array(f->arena, u32, n); 2455 bp->last_before = arena_array(f->arena, u32, n); 2456 bp->next_move = arena_array(f->arena, u32, n); 2457 for (u32 i = 0; i < n; ++i) { 2458 bp->first_before[i] = OPT_BLOCK_NONE; 2459 bp->last_before[i] = OPT_BLOCK_NONE; 2460 bp->next_move[i] = OPT_BLOCK_NONE; 2461 } 2462 } 2463 return bp; 2464 } 2465 2466 static void pressure_plan_move(Func* f, PressurePlan* plan, u32 b, u32 from, 2467 u32 to) { 2468 PressureBlockPlan* bp = pressure_block_plan(f, plan, b); 2469 if (bp->move_src[from]) return; 2470 bp->move_src[from] = 1; 2471 bp->next_move[from] = OPT_BLOCK_NONE; 2472 if (bp->first_before[to] == OPT_BLOCK_NONE) { 2473 bp->first_before[to] = from; 2474 } else { 2475 bp->next_move[bp->last_before[to]] = from; 2476 } 2477 bp->last_before[to] = from; 2478 plan->nmoves++; 2479 } 2480 2481 static int pressure_plan(Func* f, PressurePlan* plan) { 2482 memset(plan, 0, sizeof *plan); 2483 for (Val v = 1; v < f->nvals; ++v) { 2484 u32 db = f->val_def_block[v]; 2485 u32 di = f->val_def_inst[v]; 2486 if (db >= f->nblocks || di >= f->blocks[db].ninsts) continue; 2487 Inst* def = &f->blocks[db].insts[di]; 2488 if (def->def != v || !pressure_candidate(def)) continue; 2489 2490 OptUse use; 2491 if (!pressure_single_use(f, v, &use)) continue; 2492 if (use.kind != OPT_USE_OPERAND) continue; 2493 if (use.block != db) continue; 2494 if (use.inst <= di + 1u) continue; 2495 if (f->blocks[use.block].loop_depth > f->blocks[db].loop_depth) continue; 2496 if (!pressure_can_move_within_block(f, db, di, use.inst)) continue; 2497 2498 pressure_plan_move(f, plan, db, di, use.inst); 2499 } 2500 return plan->nmoves != 0; 2501 } 2502 2503 static void pressure_apply_block(Func* f, u32 b, PressureBlockPlan* bp) { 2504 Block* bl = &f->blocks[b]; 2505 Inst* old = bl->insts; 2506 Inst* insts = arena_array(f->arena, Inst, bl->ninsts ? bl->ninsts : 1u); 2507 u32 w = 0; 2508 for (u32 i = 0; i < bl->ninsts; ++i) { 2509 for (u32 m = bp->first_before[i]; m != OPT_BLOCK_NONE; 2510 m = bp->next_move[m]) { 2511 insts[w++] = old[m]; 2512 } 2513 if (!bp->move_src[i]) insts[w++] = old[i]; 2514 } 2515 bl->insts = insts; 2516 bl->cap = bl->ninsts; 2517 if (w != bl->ninsts) { 2518 SrcLoc loc = {0, 0, 0}; 2519 compiler_panic(f->c, loc, "opt pressure-relief: bad rewrite (%u, %u)", 2520 (unsigned)w, (unsigned)bl->ninsts); 2521 } 2522 } 2523 2524 static void pressure_apply(Func* f, PressurePlan* plan) { 2525 if (!plan->blocks || !plan->nmoves) return; 2526 for (u32 b = 0; b < f->nblocks; ++b) { 2527 PressureBlockPlan* bp = &plan->blocks[b]; 2528 if (bp->move_src) pressure_apply_block(f, b, bp); 2529 } 2530 dse_refresh_def_locations(f); 2531 opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE); 2532 } 2533 2534 void opt_pressure_relief(Func* f) { 2535 if (!f || f->opt_rewritten) return; 2536 if (!f->opt_reg_ssa && f->npregs > 1) { 2537 opt_rebuild_def_use(f); 2538 return; 2539 } 2540 2541 PressurePlan plan; 2542 opt_rebuild_def_use(f); 2543 if (pressure_plan(f, &plan)) pressure_apply(f, &plan); 2544 opt_rebuild_def_use(f); 2545 } 2546 2547 static int ssa_combine_cmp_def(Func* f, Val v, Inst** out) { 2548 Inst* def = NULL; 2549 if (!val_def_inst(f, v, &def)) return 0; 2550 if ((IROp)def->op != IR_CMP || def->nopnds < 3) return 0; 2551 *out = def; 2552 return 1; 2553 } 2554 2555 static int ssa_combine_fold_cmp_branch(Func* f) { 2556 int changed = 0; 2557 for (u32 b = 0; b < f->nblocks; ++b) { 2558 Block* bl = &f->blocks[b]; 2559 if (!bl->ninsts || bl->nsucc < 2) continue; 2560 Inst* br = &bl->insts[bl->ninsts - 1u]; 2561 if ((IROp)br->op != IR_CONDBR || br->nopnds < 1) continue; 2562 if (br->opnds[0].kind != OPK_REG) continue; 2563 Inst* cmp = NULL; 2564 if (!ssa_combine_cmp_def(f, (Val)br->opnds[0].v.reg, &cmp)) continue; 2565 2566 Operand* opnds = arena_array(f->arena, Operand, 2); 2567 opnds[0] = cmp->opnds[1]; 2568 opnds[1] = cmp->opnds[2]; 2569 br->op = IR_CMP_BRANCH; 2570 br->opnds = opnds; 2571 br->nopnds = 2; 2572 br->extra.imm = cmp->extra.imm; 2573 changed = 1; 2574 } 2575 return changed; 2576 } 2577 2578 static int ssa_combine_fold_addr_uses(Func* f) { 2579 int changed = 0; 2580 opt_rebuild_def_use(f); 2581 u32 nvals = f->nvals; 2582 for (Val v = 1; v < nvals; ++v) { 2583 Inst* def = NULL; 2584 if (!addr_def_inst(f, v, &def)) continue; 2585 Operand addr = def->opnds[1]; 2586 if (addr.kind != OPK_LOCAL) continue; 2587 2588 for (u32 u = f->opt_first_use_by_val[v]; u != OPT_USE_NONE; 2589 u = f->opt_uses[u].next_for_val) { 2590 OptUse* use = &f->opt_uses[u]; 2591 /* Only fold zero-EA uses to OPK_LOCAL. EA-shaped uses keep the EA 2592 * on the load/store; OPK_LOCAL cannot carry the offset/index. */ 2593 if (addr_use_foldable_kind(f, use) != 1) continue; 2594 Inst* mem = &f->blocks[use->block].insts[use->inst]; 2595 Operand folded = addr; 2596 folded.type = mem->extra.mem.type ? mem->extra.mem.type : addr.type; 2597 *use->operand = folded; 2598 changed = 1; 2599 } 2600 } 2601 return changed; 2602 } 2603 2604 void opt_ssa_combine(Func* f) { 2605 if (!f || f->opt_rewritten) return; 2606 if (!f->opt_reg_ssa && f->npregs > 1) { 2607 opt_rebuild_def_use(f); 2608 return; 2609 } 2610 2611 opt_rebuild_def_use(f); 2612 int changed = ssa_combine_fold_cmp_branch(f); 2613 changed |= ssa_combine_fold_addr_uses(f); 2614 if (changed) opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE); 2615 opt_rebuild_def_use(f); 2616 }