pass_addr_fold.c (36623B)
1 /* O1 HIR address-folding passes. 2 * 3 * Split out of pass_o2.c so the always-on O1 lowering pipeline does not depend 4 * on the (currently disabled) O2 pass translation unit. These three passes run 5 * at every opt_level >= 1: 6 * 7 * - opt_addr_xform_pregs: PReg-namespace addr-of-local folding 8 * - opt_promote_scalar_locals: promote non-escaped scalar locals to PRegs 9 * - opt_addr_of_global_cse: hoist/CSE duplicate ADDR_OF(global) 10 */ 11 #include <kit/cg.h> 12 #include <string.h> 13 14 #include "opt/opt_internal.h" 15 16 /* Private copy of the tiny inst-removal helper shared with pass_o2.c's 17 * opt_addr_xform. Both are file-local statics, so there is no link conflict. */ 18 static void addr_inst_remove(Inst* in) { 19 in->op = IR_NOP; 20 in->def = VAL_NONE; 21 in->ndefs = 0; 22 in->defs = NULL; 23 in->nopnds = 0; 24 in->opnds = NULL; 25 } 26 27 /* PReg-namespace variant of opt_addr_xform for the O1 pipeline (no SSA, no 28 * Val-keyed def-use chains). Scans the whole function once per candidate 29 * IR_ADDR_OF def to classify uses of its PReg result. 30 * 31 * Use classifications (see addr_xform_pregs_classify_use): 32 * 33 * OPF_ESCAPE The use is something other than a non-observable 34 * IR_LOAD/IR_STORE base operand. The IR_ADDR_OF cannot 35 * be folded; the local's address truly escapes. 36 * OPF_FOLD_LOCAL Zero-EA use: `OPK_INDIRECT(base=p, ofs=0, index=NONE)` 37 * in load/store base position. Foldable to OPK_LOCAL. 38 * OPF_FOLD_EA EA-shaped use: same load/store base position, but with 39 * nonzero `ofs` or `index != REG_NONE`. The EA must stay 40 * on the load/store (the operand layout for OPK_LOCAL 41 * cannot carry the EA today), so the operand is left 42 * alone and the IR_ADDR_OF def must stay alive to feed 43 * the OPK_INDIRECT base. The use is still recognized as 44 * "non-escape" for downstream analysis (e.g. scalar 45 * promotion's non-escape check). 46 * 47 * After classification: if any use is OPF_ESCAPE, no rewrite happens. If 48 * every use is OPF_FOLD_LOCAL, fold all uses to OPK_LOCAL and NOP the 49 * IR_ADDR_OF. If a mix of OPF_FOLD_LOCAL and OPF_FOLD_EA, fold the 50 * zero-EA uses but keep the IR_ADDR_OF alive for the EA-shaped uses. */ 51 52 typedef enum AddrXformUseClass { 53 OPF_ESCAPE = 0, 54 OPF_FOLD_LOCAL = 1, 55 OPF_FOLD_EA = 2, 56 } AddrXformUseClass; 57 58 static int addr_xform_pregs_main_op_position_ok(Inst* in, u32 op_idx) { 59 if ((IROp)in->op != IR_LOAD && (IROp)in->op != IR_STORE) return 0; 60 if (opt_mem_observable(&in->extra.mem)) return 0; 61 if ((IROp)in->op == IR_LOAD && op_idx != 1u) return 0; 62 if ((IROp)in->op == IR_STORE && op_idx != 0u) return 0; 63 return 1; 64 } 65 66 static AddrXformUseClass addr_xform_pregs_classify_use(Inst* in, Operand* op, 67 u32 op_idx) { 68 if (op->kind != OPK_INDIRECT) return OPF_ESCAPE; 69 if (!addr_xform_pregs_main_op_position_ok(in, op_idx)) return OPF_ESCAPE; 70 if (op->v.ind.ofs == 0 && op->v.ind.index == (Reg)REG_NONE) 71 return OPF_FOLD_LOCAL; 72 return OPF_FOLD_EA; 73 } 74 75 static int addr_xform_pregs_op_uses(const Operand* op, PReg p) { 76 if (!op) return 0; 77 if (op->kind == OPK_REG && (PReg)op->v.reg == p) return 1; 78 if (op->kind == OPK_INDIRECT) { 79 if ((PReg)op->v.ind.base == p) return 1; 80 if (op->v.ind.index != (Reg)REG_NONE && (PReg)op->v.ind.index == p) 81 return 1; 82 } 83 return 0; 84 } 85 86 static int addr_xform_pregs_abivalue_uses(const CGABIValue* v, PReg p) { 87 if (!v) return 0; 88 if (addr_xform_pregs_op_uses(&v->storage, p)) return 1; 89 for (u32 i = 0; i < v->nparts; ++i) 90 if (addr_xform_pregs_op_uses((const Operand*)&v->parts[i].op, p)) return 1; 91 return 0; 92 } 93 94 static int addr_xform_pregs_aux_uses(Inst* in, PReg p) { 95 switch ((IROp)in->op) { 96 case IR_CALL: { 97 IRCallAux* aux = (IRCallAux*)in->extra.aux; 98 if (!aux) return 0; 99 if (aux->use_plan_replay) { 100 if (addr_xform_pregs_op_uses(&aux->plan.callee, p)) return 1; 101 for (u32 i = 0; i < aux->plan.nargs; ++i) 102 if (addr_xform_pregs_op_uses(&aux->plan.args[i].src, p)) return 1; 103 for (u32 i = 0; i < aux->plan.nrets; ++i) 104 if (addr_xform_pregs_op_uses(&aux->plan.rets[i].dst, p)) return 1; 105 } else { 106 if (addr_xform_pregs_op_uses(&aux->desc.callee, p)) return 1; 107 for (u32 i = 0; i < aux->desc.nargs; ++i) 108 if (addr_xform_pregs_abivalue_uses( 109 (const CGABIValue*)&aux->desc.args[i], p)) 110 return 1; 111 if (addr_xform_pregs_abivalue_uses(&aux->desc.ret, p)) return 1; 112 } 113 return 0; 114 } 115 case IR_RET: { 116 IRRetAux* aux = (IRRetAux*)in->extra.aux; 117 if (!aux || !aux->present) return 0; 118 return addr_xform_pregs_abivalue_uses(&aux->val, p); 119 } 120 case IR_SCOPE_BEGIN: 121 return 0; 122 case IR_ASM_BLOCK: { 123 IRAsmAux* aux = (IRAsmAux*)in->extra.aux; 124 if (!aux) return 0; 125 for (u32 i = 0; i < aux->nin; ++i) 126 if (addr_xform_pregs_op_uses(&aux->in_ops[i], p)) return 1; 127 for (u32 i = 0; i < aux->nout; ++i) 128 if (addr_xform_pregs_op_uses(&aux->out_ops[i], p)) return 1; 129 return 0; 130 } 131 case IR_INTRINSIC: { 132 IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; 133 if (!aux) return 0; 134 for (u32 i = 0; i < aux->narg; ++i) 135 if (addr_xform_pregs_op_uses(&aux->args[i], p)) return 1; 136 for (u32 i = 0; i < aux->ndst; ++i) 137 if (addr_xform_pregs_op_uses(&aux->dsts[i], p)) return 1; 138 return 0; 139 } 140 default: 141 return 0; 142 } 143 } 144 145 /* Returns nonzero if every use of `p` is foldable (OPF_FOLD_LOCAL or 146 * OPF_FOLD_EA) and at least one use exists. *out_has_ea is set to 1 if any 147 * use was OPF_FOLD_EA; in that case the rewrite must keep the IR_ADDR_OF 148 * alive (the EA-shaped use still names p as the OPK_INDIRECT base). */ 149 static int addr_xform_pregs_classify(Func* f, PReg p, Inst* def_inst, 150 int* out_has_ea) { 151 int has_foldable_use = 0; 152 int has_ea = 0; 153 for (u32 b = 0; b < f->nblocks; ++b) { 154 Block* bl = &f->blocks[b]; 155 for (u32 i = 0; i < bl->ninsts; ++i) { 156 Inst* in = &bl->insts[i]; 157 if (in == def_inst) continue; 158 for (u32 o = 0; o < in->nopnds; ++o) { 159 Operand* op = &in->opnds[o]; 160 if (!addr_xform_pregs_op_uses(op, p)) continue; 161 AddrXformUseClass uc = addr_xform_pregs_classify_use(in, op, o); 162 if (uc == OPF_ESCAPE) return 0; 163 has_foldable_use = 1; 164 if (uc == OPF_FOLD_EA) has_ea = 1; 165 } 166 if (addr_xform_pregs_aux_uses(in, p)) return 0; 167 } 168 } 169 if (out_has_ea) *out_has_ea = has_ea; 170 return has_foldable_use; 171 } 172 173 void opt_addr_xform_pregs(Func* f) { 174 if (!f || f->opt_reg_ssa || f->opt_rewritten) return; 175 int changed = 0; 176 for (u32 b = 0; b < f->nblocks; ++b) { 177 Block* bl = &f->blocks[b]; 178 for (u32 i = 0; i < bl->ninsts; ++i) { 179 Inst* in = &bl->insts[i]; 180 if ((IROp)in->op != IR_ADDR_OF) continue; 181 if (in->nopnds < 2) continue; 182 if (in->opnds[0].kind != OPK_REG) continue; 183 if (in->opnds[1].kind != OPK_LOCAL) continue; 184 PReg p = (PReg)in->opnds[0].v.reg; 185 if (!opt_reg_valid(f, p)) continue; 186 int has_ea = 0; 187 if (!addr_xform_pregs_classify(f, p, in, &has_ea)) continue; 188 Operand local = in->opnds[1]; 189 /* Fold every zero-EA use of p to OPK_LOCAL. EA-shaped uses are left 190 * as OPK_INDIRECT(base=p, ofs, index, log2_scale) so the EA stays on 191 * the load/store; the IR_ADDR_OF def must survive to feed them. */ 192 for (u32 bb = 0; bb < f->nblocks; ++bb) { 193 Block* rb = &f->blocks[bb]; 194 for (u32 ii = 0; ii < rb->ninsts; ++ii) { 195 Inst* use = &rb->insts[ii]; 196 if (use == in) continue; 197 for (u32 o = 0; o < use->nopnds; ++o) { 198 Operand* op = &use->opnds[o]; 199 if (op->kind != OPK_INDIRECT) continue; 200 if ((PReg)op->v.ind.base != p) continue; 201 if (op->v.ind.ofs != 0 || op->v.ind.index != (Reg)REG_NONE) 202 continue; /* EA-shaped; leave alone */ 203 Operand folded = local; 204 folded.type = 205 use->extra.mem.type ? use->extra.mem.type : local.type; 206 *op = folded; 207 } 208 } 209 } 210 if (!has_ea) addr_inst_remove(in); 211 changed = 1; 212 } 213 } 214 /* After folding, walk all frame slots and clear FSF_ADDR_TAKEN on any 215 * slot whose surviving IR_ADDR_OF defs (if any) have all been retired. 216 * The frontend-set ADDR_TAKEN flag is conservative; if we proved the 217 * address no longer escapes, downstream passes (opt_promote_scalar_locals) 218 * can take advantage of the actual non-escape state. */ 219 if (changed) { 220 u8* still_taken = 221 arena_zarray(f->arena, u8, f->nframe_slots ? f->nframe_slots : 1u); 222 for (u32 b = 0; b < f->nblocks; ++b) { 223 Block* bl = &f->blocks[b]; 224 for (u32 i = 0; i < bl->ninsts; ++i) { 225 Inst* in = &bl->insts[i]; 226 if ((IROp)in->op != IR_ADDR_OF) continue; 227 if (in->nopnds < 2 || in->opnds[1].kind != OPK_LOCAL) continue; 228 FrameSlot slot = in->opnds[1].v.frame_slot; 229 if (slot && slot <= f->nframe_slots) still_taken[slot - 1u] = 1; 230 } 231 } 232 for (u32 s = 0; s < f->nframe_slots; ++s) { 233 if (!still_taken[s]) f->frame_slots[s].flags &= (u16)~FSF_ADDR_TAKEN; 234 } 235 } 236 if (changed) 237 opt_analysis_invalidate( 238 f, OPT_ANALYSIS_DEF_USE | OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP); 239 } 240 241 /* Scalar local promotion for the O1 pipeline. Runs after 242 * `opt_addr_xform_pregs` has folded zero-EA `OPK_INDIRECT(p)` uses to 243 * `OPK_LOCAL(slot)` and retired non-escaping `IR_ADDR_OF` defs. For each 244 * frame slot that is now only referenced as the base of matching-type, 245 * non-observable `IR_LOAD`/`IR_STORE`, the slot is replaced by a fresh 246 * mutable PReg: each store becomes `IR_COPY P_slot, src` (or `IR_LOAD_IMM` 247 * for an immediate source), each load becomes `IR_COPY dst, P_slot`. The 248 * slot becomes unreferenced and the backend drops it from the frame. 249 * 250 * A mutable PReg in `-O1` IR has the same data-flow semantics as a named 251 * memory cell that does not escape (multiple defs, multiple uses, value at 252 * a use comes from whichever def reaches it via CFG edges). No phis are 253 * required because the IR model has no phis; PReg flow becomes hard-reg 254 * flow after regalloc, and regalloc already handles it. 255 * 256 * Conditions for promotion (per slot): 257 * 258 * 1. Slot kind is FS_LOCAL (real locals, not spills, sret, alloca). 259 * 2. Slot has no FSF_ADDR_TAKEN, FSF_VOLATILE flag (after 260 * `opt_addr_xform_pregs` has cleared the conservative ADDR_TAKEN 261 * flag for slots whose IR_ADDR_OF defs were all retired). 262 * 3. Slot's declared type is scalar (int, float, bool, ptr, enum). 263 * 4. Every appearance of `OPK_LOCAL(slot)` in any instruction operand is 264 * either: 265 * - `IR_LOAD.opnds[1]` with matching `access.type == slot.type`, 266 * no observable mem flags, dst is OPK_REG; 267 * - `IR_STORE.opnds[0]` with matching `access.type == slot.type`, 268 * no observable mem flags, src is OPK_REG or OPK_IMM. 269 * 5. Slot does not appear in any aux operand position (calls, asm, etc.) 270 * or as an OPK_LOCAL anywhere else (e.g., a surviving IR_ADDR_OF). 271 * 272 * Param-slot case: FS_PARAM slots are excluded. The backend prologue is 273 * responsible for moving the ABI-incoming hard reg into the slot, and that 274 * move is not visible in the IR (there is no `IR_STORE OPK_LOCAL(slot)` to 275 * rewrite). At O1 the wrapper already places scalar params in REG storage 276 * when the frontend does not force a memory home, so the param's value 277 * arrives in a PReg without needing this pass. If a future scheme records 278 * the entry-move as a synthetic IR_STORE OPK_LOCAL(slot), this pass would 279 * promote it the same way it promotes any other store-to-slot. */ 280 281 static int promote_local_type_is_scalar(Func* f, KitCgTypeId ty) { 282 if (!ty) return 0; 283 KitCgTypeKind kind = kit_cg_type_kind((KitCompiler*)f->c, ty); 284 switch (kind) { 285 case KIT_CG_TYPE_BOOL: 286 case KIT_CG_TYPE_INT: 287 case KIT_CG_TYPE_FLOAT: 288 case KIT_CG_TYPE_PTR: 289 case KIT_CG_TYPE_ENUM: 290 return 1; 291 default: 292 return 0; 293 } 294 } 295 296 static int promote_op_uses_slot(const Operand* op, FrameSlot slot) { 297 return op && op->kind == OPK_LOCAL && op->v.frame_slot == slot; 298 } 299 300 static int promote_abivalue_uses_slot(const CGABIValue* v, FrameSlot slot) { 301 if (!v) return 0; 302 if (promote_op_uses_slot(&v->storage, slot)) return 1; 303 for (u32 i = 0; i < v->nparts; ++i) 304 if (promote_op_uses_slot((const Operand*)&v->parts[i].op, slot)) return 1; 305 return 0; 306 } 307 308 static int promote_aux_uses_slot(const Inst* in, FrameSlot slot) { 309 switch ((IROp)in->op) { 310 case IR_CALL: { 311 IRCallAux* aux = (IRCallAux*)in->extra.aux; 312 if (!aux) return 0; 313 if (aux->use_plan_replay) { 314 if (promote_op_uses_slot(&aux->plan.callee, slot)) return 1; 315 for (u32 i = 0; i < aux->plan.nargs; ++i) 316 if (promote_op_uses_slot(&aux->plan.args[i].src, slot)) return 1; 317 for (u32 i = 0; i < aux->plan.nrets; ++i) 318 if (promote_op_uses_slot(&aux->plan.rets[i].dst, slot)) return 1; 319 } else { 320 if (promote_op_uses_slot(&aux->desc.callee, slot)) return 1; 321 for (u32 i = 0; i < aux->desc.nargs; ++i) 322 if (promote_abivalue_uses_slot((const CGABIValue*)&aux->desc.args[i], 323 slot)) 324 return 1; 325 if (promote_abivalue_uses_slot(&aux->desc.ret, slot)) return 1; 326 } 327 return 0; 328 } 329 case IR_RET: { 330 IRRetAux* aux = (IRRetAux*)in->extra.aux; 331 if (!aux || !aux->present) return 0; 332 return promote_abivalue_uses_slot(&aux->val, slot); 333 } 334 case IR_SCOPE_BEGIN: 335 return 0; 336 case IR_ASM_BLOCK: { 337 IRAsmAux* aux = (IRAsmAux*)in->extra.aux; 338 if (!aux) return 0; 339 for (u32 i = 0; i < aux->nin; ++i) 340 if (promote_op_uses_slot(&aux->in_ops[i], slot)) return 1; 341 for (u32 i = 0; i < aux->nout; ++i) 342 if (promote_op_uses_slot(&aux->out_ops[i], slot)) return 1; 343 return 0; 344 } 345 case IR_INTRINSIC: { 346 IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; 347 if (!aux) return 0; 348 for (u32 i = 0; i < aux->narg; ++i) 349 if (promote_op_uses_slot(&aux->args[i], slot)) return 1; 350 for (u32 i = 0; i < aux->ndst; ++i) 351 if (promote_op_uses_slot(&aux->dsts[i], slot)) return 1; 352 return 0; 353 } 354 default: 355 return 0; 356 } 357 } 358 359 /* Per-inst check. Returns: 360 * 1 = "instruction touches slot in a promotable position" (load/store base). 361 * 0 = "instruction does not touch slot at all". 362 * -1 = "instruction touches slot in a non-promotable way" (e.g., wrong 363 * operand position, type mismatch, observable flags, aux use). */ 364 static int promote_inst_classify(const Inst* in, FrameSlot slot, 365 KitCgTypeId slot_ty) { 366 int touched = 0; 367 /* IR_LOAD: opnds[0]=dst REG, opnds[1]=addr (allowed: OPK_LOCAL slot). */ 368 if ((IROp)in->op == IR_LOAD) { 369 if (in->nopnds >= 2 && promote_op_uses_slot(&in->opnds[1], slot)) { 370 if (opt_mem_observable(&in->extra.mem)) return -1; 371 if (in->opnds[0].kind != OPK_REG) return -1; 372 KitCgTypeId at = in->extra.mem.type; 373 if (at && at != slot_ty) return -1; 374 touched = 1; 375 } 376 /* opnds[0] is the dst REG — never OPK_LOCAL by construction. */ 377 if (in->nopnds >= 1 && promote_op_uses_slot(&in->opnds[0], slot)) return -1; 378 } else if ((IROp)in->op == IR_STORE) { 379 if (in->nopnds >= 1 && promote_op_uses_slot(&in->opnds[0], slot)) { 380 if (opt_mem_observable(&in->extra.mem)) return -1; 381 if (in->nopnds < 2) return -1; 382 Operand* src = &in->opnds[1]; 383 if (src->kind != OPK_REG && src->kind != OPK_IMM) return -1; 384 KitCgTypeId at = in->extra.mem.type; 385 if (at && at != slot_ty) return -1; 386 touched = 1; 387 } 388 /* opnds[1] is the src value — should never be OPK_LOCAL for a scalar. */ 389 if (in->nopnds >= 2 && promote_op_uses_slot(&in->opnds[1], slot)) return -1; 390 } else { 391 /* Any other instruction with an OPK_LOCAL(slot) operand blocks promotion. 392 */ 393 for (u32 o = 0; o < in->nopnds; ++o) 394 if (promote_op_uses_slot(&in->opnds[o], slot)) return -1; 395 } 396 if (promote_aux_uses_slot(in, slot)) return -1; 397 return touched; 398 } 399 400 /* Rewrite an `IR_STORE OPK_LOCAL(slot), src` into a PReg def. If src is 401 * OPK_IMM, emit IR_LOAD_IMM into preg; otherwise emit IR_COPY. */ 402 static void promote_rewrite_store(Func* f, Inst* in, PReg preg, KitCgTypeId ty, 403 u8 cls) { 404 Operand src = in->opnds[1]; 405 Operand* opnds = arena_array(f->arena, Operand, 2); 406 memset(&opnds[0], 0, sizeof opnds[0]); 407 opnds[0].kind = OPK_REG; 408 opnds[0].type = ty; 409 opnds[0].cls = cls; 410 opnds[0].v.reg = (Reg)preg; 411 in->type = ty; 412 in->def = (Val)preg; 413 if (src.kind == OPK_IMM) { 414 in->op = IR_LOAD_IMM; 415 in->nopnds = 1; 416 in->opnds = opnds; 417 in->extra.imm = src.v.imm; 418 } else { 419 opnds[1] = src; 420 opnds[1].type = ty; 421 opnds[1].cls = cls; 422 in->op = IR_COPY; 423 in->nopnds = 2; 424 in->opnds = opnds; 425 memset(&in->extra, 0, sizeof in->extra); 426 } 427 } 428 429 /* Rewrite an `IR_LOAD dst, OPK_LOCAL(slot)` into `IR_COPY dst, preg`. */ 430 static void promote_rewrite_load(Func* f, Inst* in, PReg preg, KitCgTypeId ty, 431 u8 cls) { 432 Operand dst = in->opnds[0]; 433 Operand* opnds = arena_array(f->arena, Operand, 2); 434 opnds[0] = dst; 435 opnds[0].type = ty; 436 opnds[0].cls = cls; 437 memset(&opnds[1], 0, sizeof opnds[1]); 438 opnds[1].kind = OPK_REG; 439 opnds[1].type = ty; 440 opnds[1].cls = cls; 441 opnds[1].v.reg = (Reg)preg; 442 in->op = IR_COPY; 443 in->type = ty; 444 in->nopnds = 2; 445 in->opnds = opnds; 446 memset(&in->extra, 0, sizeof in->extra); 447 } 448 449 void opt_promote_scalar_locals(Func* f) { 450 if (!f || f->opt_reg_ssa || f->opt_rewritten) return; 451 if (!f->nframe_slots) return; 452 int changed = 0; 453 for (u32 sidx = 0; sidx < f->nframe_slots; ++sidx) { 454 IRFrameSlot* slot = &f->frame_slots[sidx]; 455 FrameSlot id = slot->id; 456 /* FS_PARAM slots are owned by the backend prologue (which copies the 457 * ABI-incoming hard reg into the slot before any user IR runs); there 458 * is no IR-level store to rewrite. At O1, the wrapper already places 459 * scalar params in REG storage when the frontend does not force a 460 * memory home, so the FS_PARAM promotion path is normally a no-op. 461 * Only promote FS_LOCAL slots. */ 462 if (slot->kind != FS_LOCAL) continue; 463 if (slot->flags & (FSF_ADDR_TAKEN | FSF_VOLATILE)) continue; 464 if (!promote_local_type_is_scalar(f, slot->type)) continue; 465 int touched_count = 0; 466 int rejected = 0; 467 for (u32 b = 0; b < f->nblocks && !rejected; ++b) { 468 Block* bl = &f->blocks[b]; 469 for (u32 i = 0; i < bl->ninsts; ++i) { 470 Inst* in = &bl->insts[i]; 471 int r = promote_inst_classify(in, id, slot->type); 472 if (r < 0) { 473 rejected = 1; 474 break; 475 } 476 touched_count += r; 477 } 478 } 479 if (rejected || !touched_count) continue; 480 u8 cls = opt_value_reg_class(f->c, slot->type); 481 PReg preg = ir_alloc_preg(f, slot->type, cls); 482 for (u32 b = 0; b < f->nblocks; ++b) { 483 Block* bl = &f->blocks[b]; 484 for (u32 i = 0; i < bl->ninsts; ++i) { 485 Inst* in = &bl->insts[i]; 486 if ((IROp)in->op == IR_LOAD && in->nopnds >= 2 && 487 promote_op_uses_slot(&in->opnds[1], id)) { 488 promote_rewrite_load(f, in, preg, slot->type, cls); 489 } else if ((IROp)in->op == IR_STORE && in->nopnds >= 2 && 490 promote_op_uses_slot(&in->opnds[0], id)) { 491 promote_rewrite_store(f, in, preg, slot->type, cls); 492 } 493 } 494 } 495 /* The frame slot is now unreferenced. Leave the slot table entry in 496 * place (compaction would require remapping every other slot id); 497 * the backend's frame layout pass simply omits unreferenced slots. */ 498 changed = 1; 499 } 500 if (changed) 501 opt_analysis_invalidate( 502 f, OPT_ANALYSIS_DEF_USE | OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP); 503 } 504 505 /* CSE-style hoist of `IR_ADDR_OF(OPK_GLOBAL{sym, addend})` defs that appear 506 * more than once in the same function. The address is a link-time constant 507 * (TLS and IFUNC live on separate IROps), so all occurrences compute the 508 * same value; consolidating to a single entry-block def shrinks each loop 509 * body by the per-iter `adrp`/`add` pair the backend would otherwise re-emit. 510 * 511 * Implementation: 512 * - Walk all insts, group ADDR_OF defs by (sym, addend). 513 * - For each key with >= 2 defs: allocate a fresh PReg, materialize one 514 * IR_ADDR_OF in block 0 (after any IR_PARAM_DECL prologue), build a 515 * preg-remap from each original def-PReg to the new PReg, and NOP each 516 * original def. 517 * - One IR walk applies the remap to every operand `v.reg` / 518 * `v.ind.base`. 519 * 520 * Runs after opt_addr_xform_pregs so local addr-of has already been folded 521 * out; the remaining IR_ADDR_OF defs are global. */ 522 523 typedef struct AddrCseEntry { 524 ObjSymId sym; 525 i64 addend; 526 PReg canonical; /* freshly allocated PReg, def in block 0 */ 527 KitCgTypeId addr_type; /* operand[0].type from the first def */ 528 u8 cls; /* operand[0].cls from the first def */ 529 u32 count; /* number of original ADDR_OF defs seen */ 530 } AddrCseEntry; 531 532 static u32 addr_cse_find_or_add(AddrCseEntry** entries, u32* n, u32* cap, 533 Arena* arena, ObjSymId sym, i64 addend) { 534 for (u32 i = 0; i < *n; ++i) { 535 if ((*entries)[i].sym == sym && (*entries)[i].addend == addend) return i; 536 } 537 if (*n == *cap) { 538 u32 ncap = *cap ? *cap * 2u : 16u; 539 AddrCseEntry* nv = arena_array(arena, AddrCseEntry, ncap); 540 if (*entries) memcpy(nv, *entries, sizeof(AddrCseEntry) * (*n)); 541 *entries = nv; 542 *cap = ncap; 543 } 544 u32 idx = (*n)++; 545 AddrCseEntry* e = &(*entries)[idx]; 546 memset(e, 0, sizeof *e); 547 e->sym = sym; 548 e->addend = addend; 549 e->canonical = PREG_NONE; 550 e->count = 0; 551 return idx; 552 } 553 554 static void addr_cse_apply_to_operand(Operand* op, const PReg* remap) { 555 /* remap is zero-initialized; 0 means "no remap" (preg 0 is reserved as 556 * unused). PREG_NONE = 0xffffffff and would be a valid remap target but 557 * we never produce that. */ 558 if (!op) return; 559 if (op->kind == OPK_REG) { 560 PReg p = (PReg)op->v.reg; 561 if (p != PREG_NONE && p != 0 && remap[p] != 0) op->v.reg = remap[p]; 562 } else if (op->kind == OPK_INDIRECT) { 563 PReg p = (PReg)op->v.ind.base; 564 if (p != PREG_NONE && p != 0 && remap[p] != 0) op->v.ind.base = remap[p]; 565 if (op->v.ind.index != (Reg)REG_NONE) { 566 PReg pi = (PReg)op->v.ind.index; 567 if (pi != PREG_NONE && pi != 0 && remap[pi] != 0) 568 op->v.ind.index = remap[pi]; 569 } 570 } 571 } 572 573 static void addr_cse_apply_to_inst(Inst* in, const PReg* remap) { 574 for (u32 o = 0; o < in->nopnds; ++o) 575 addr_cse_apply_to_operand(&in->opnds[o], remap); 576 /* IR_CALL aux carries operands too; rewrite both replay variants. */ 577 if ((IROp)in->op == IR_CALL) { 578 IRCallAux* aux = (IRCallAux*)in->extra.aux; 579 if (!aux) return; 580 if (aux->use_plan_replay) { 581 addr_cse_apply_to_operand(&aux->plan.callee, remap); 582 for (u32 i = 0; i < aux->plan.nargs; ++i) 583 addr_cse_apply_to_operand(&aux->plan.args[i].src, remap); 584 for (u32 i = 0; i < aux->plan.nrets; ++i) 585 addr_cse_apply_to_operand(&aux->plan.rets[i].dst, remap); 586 } else { 587 addr_cse_apply_to_operand(&aux->desc.callee, remap); 588 for (u32 i = 0; i < aux->desc.nargs; ++i) { 589 CGABIValue* v = (CGABIValue*)&aux->desc.args[i]; 590 addr_cse_apply_to_operand(&v->storage, remap); 591 for (u32 k = 0; k < v->nparts; ++k) 592 addr_cse_apply_to_operand((Operand*)&v->parts[k].op, remap); 593 } 594 addr_cse_apply_to_operand(&aux->desc.ret.storage, remap); 595 for (u32 k = 0; k < aux->desc.ret.nparts; ++k) 596 addr_cse_apply_to_operand((Operand*)&aux->desc.ret.parts[k].op, remap); 597 } 598 } else if ((IROp)in->op == IR_RET) { 599 IRRetAux* aux = (IRRetAux*)in->extra.aux; 600 if (aux && aux->present) { 601 addr_cse_apply_to_operand(&aux->val.storage, remap); 602 for (u32 k = 0; k < aux->val.nparts; ++k) 603 addr_cse_apply_to_operand((Operand*)&aux->val.parts[k].op, remap); 604 } 605 } else if ((IROp)in->op == IR_ASM_BLOCK) { 606 IRAsmAux* aux = (IRAsmAux*)in->extra.aux; 607 if (!aux) return; 608 for (u32 i = 0; i < aux->nin; ++i) 609 addr_cse_apply_to_operand(&aux->in_ops[i], remap); 610 for (u32 i = 0; i < aux->nout; ++i) 611 addr_cse_apply_to_operand(&aux->out_ops[i], remap); 612 } else if ((IROp)in->op == IR_INTRINSIC) { 613 IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; 614 if (!aux) return; 615 for (u32 i = 0; i < aux->narg; ++i) 616 addr_cse_apply_to_operand(&aux->args[i], remap); 617 for (u32 i = 0; i < aux->ndst; ++i) 618 addr_cse_apply_to_operand(&aux->dsts[i], remap); 619 } 620 } 621 622 Inst* opt_block_insert_at(Func* f, Block* bl, u32 at, u32 k) { 623 if (at > bl->ninsts) at = bl->ninsts; 624 if (bl->ninsts + k > bl->cap) { 625 u32 ncap = bl->cap ? bl->cap : 8u; 626 while (ncap < bl->ninsts + k) ncap *= 2u; 627 Inst* nb = arena_zarray(f->arena, Inst, ncap); 628 if (bl->insts && at) memcpy(nb, bl->insts, sizeof(Inst) * at); 629 if (bl->insts && bl->ninsts > at) 630 memcpy(nb + at + k, bl->insts + at, sizeof(Inst) * (bl->ninsts - at)); 631 bl->insts = nb; 632 bl->cap = ncap; 633 } else { 634 if (bl->ninsts > at) 635 memmove(bl->insts + at + k, bl->insts + at, 636 sizeof(Inst) * (bl->ninsts - at)); 637 } 638 for (u32 i = 0; i < k; ++i) memset(&bl->insts[at + i], 0, sizeof(Inst)); 639 bl->ninsts += k; 640 return &bl->insts[at]; 641 } 642 643 void opt_addr_of_global_cse(Func* f) { 644 if (!f || f->opt_reg_ssa || f->opt_rewritten) return; 645 if (f->nblocks == 0) return; 646 647 /* Pass 1: index ADDR_OF(global) defs by (sym, addend). */ 648 AddrCseEntry* entries = NULL; 649 u32 n_entries = 0; 650 u32 cap_entries = 0; 651 for (u32 b = 0; b < f->nblocks; ++b) { 652 Block* bl = &f->blocks[b]; 653 for (u32 i = 0; i < bl->ninsts; ++i) { 654 Inst* in = &bl->insts[i]; 655 if ((IROp)in->op != IR_ADDR_OF) continue; 656 if (in->nopnds < 2) continue; 657 if (in->opnds[0].kind != OPK_REG) continue; 658 if (in->opnds[1].kind != OPK_GLOBAL) continue; 659 u32 idx = addr_cse_find_or_add(&entries, &n_entries, &cap_entries, 660 f->arena, in->opnds[1].v.global.sym, 661 in->opnds[1].v.global.addend); 662 AddrCseEntry* e = &entries[idx]; 663 if (e->count == 0) { 664 e->addr_type = in->opnds[0].type; 665 e->cls = in->opnds[0].cls; 666 } 667 ++e->count; 668 } 669 } 670 if (!n_entries) return; 671 672 /* Pass 2: for each duplicate key, allocate a canonical PReg. */ 673 u32 dup_count = 0; 674 for (u32 i = 0; i < n_entries; ++i) { 675 if (entries[i].count >= 2) { 676 entries[i].canonical = 677 ir_alloc_preg(f, entries[i].addr_type, entries[i].cls); 678 ++dup_count; 679 } 680 } 681 if (!dup_count) return; 682 683 /* Pass 3: walk again, build per-old-PReg remap and NOP duplicate defs. */ 684 PReg* remap = arena_zarray(f->arena, PReg, opt_reg_count(f)); 685 for (u32 b = 0; b < f->nblocks; ++b) { 686 Block* bl = &f->blocks[b]; 687 for (u32 i = 0; i < bl->ninsts; ++i) { 688 Inst* in = &bl->insts[i]; 689 if ((IROp)in->op != IR_ADDR_OF) continue; 690 if (in->nopnds < 2) continue; 691 if (in->opnds[0].kind != OPK_REG) continue; 692 if (in->opnds[1].kind != OPK_GLOBAL) continue; 693 u32 idx = addr_cse_find_or_add(&entries, &n_entries, &cap_entries, 694 f->arena, in->opnds[1].v.global.sym, 695 in->opnds[1].v.global.addend); 696 if (entries[idx].canonical == PREG_NONE) continue; /* singleton */ 697 PReg old = (PReg)in->opnds[0].v.reg; 698 if (opt_reg_valid(f, old)) remap[old] = entries[idx].canonical; 699 /* NOP the original def. */ 700 in->op = IR_NOP; 701 in->def = VAL_NONE; 702 in->ndefs = 0; 703 in->defs = NULL; 704 in->nopnds = 0; 705 in->opnds = NULL; 706 } 707 } 708 709 /* Pass 4: hoist a single ADDR_OF for each duplicated key to the entry 710 * block, inserted after any leading IR_PARAM_DECL instructions. */ 711 if (f->entry >= f->nblocks) return; 712 Block* entry = &f->blocks[f->entry]; 713 u32 insert_at = 0; 714 while (insert_at < entry->ninsts && 715 (IROp)entry->insts[insert_at].op == IR_PARAM_DECL) 716 ++insert_at; 717 Inst* slot = opt_block_insert_at(f, entry, insert_at, dup_count); 718 u32 w = 0; 719 for (u32 i = 0; i < n_entries; ++i) { 720 if (entries[i].canonical == PREG_NONE) continue; 721 Inst* in = &slot[w++]; 722 in->op = (u16)IR_ADDR_OF; 723 in->def = (Val)entries[i].canonical; 724 in->type = entries[i].addr_type; 725 in->nopnds = 2; 726 in->opnds = arena_array(f->arena, Operand, 2); 727 memset(&in->opnds[0], 0, sizeof(Operand)); 728 in->opnds[0].kind = OPK_REG; 729 in->opnds[0].cls = entries[i].cls; 730 in->opnds[0].type = entries[i].addr_type; 731 in->opnds[0].v.reg = entries[i].canonical; 732 memset(&in->opnds[1], 0, sizeof(Operand)); 733 in->opnds[1].kind = OPK_GLOBAL; 734 in->opnds[1].cls = entries[i].cls; 735 in->opnds[1].type = entries[i].addr_type; 736 in->opnds[1].v.global.sym = entries[i].sym; 737 in->opnds[1].v.global.addend = entries[i].addend; 738 ir_assign_inst_id(f, in); 739 } 740 741 /* Pass 5: apply remap to all operand uses in the function. */ 742 for (u32 b = 0; b < f->nblocks; ++b) { 743 Block* bl = &f->blocks[b]; 744 for (u32 i = 0; i < bl->ninsts; ++i) { 745 addr_cse_apply_to_inst(&bl->insts[i], remap); 746 } 747 } 748 749 opt_analysis_invalidate( 750 f, OPT_ANALYSIS_DEF_USE | OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP); 751 } 752 753 /* LICM-lite for constant materialization. A LOAD_IMM has no inputs, so its 754 * result is loop-invariant and valid anywhere the entry block dominates (i.e. 755 * everywhere). When such a def lives inside a loop, the backend would otherwise 756 * re-materialize the constant (movz/movk) on every iteration; hoisting one copy 757 * to the entry block and reusing it removes that per-iteration work, mirroring 758 * opt_addr_of_global_cse for the `adrp`/`add` case. 759 * 760 * Scope/cost guards: 761 * - Only hoist constants that actually appear inside a loop (loop_depth > 0); 762 * non-loop constants gain nothing and would only lengthen live ranges. 763 * - Skip imm 0: the emit path stores a constant 0 from the hardware zero 764 * register (no materialization at all), which beats holding 0 in an 765 * allocated register across the loop. 766 * 767 * Must run after opt_build_loop_tree (reads block loop_depth) and before 768 * register allocation. */ 769 typedef struct ConstHoistEntry { 770 i64 imm; 771 KitCgTypeId type; 772 PReg canonical; 773 u8 cls; 774 u8 in_loop; 775 } ConstHoistEntry; 776 777 static u32 const_hoist_find_or_add(ConstHoistEntry** entries, u32* n, u32* cap, 778 Arena* arena, i64 imm, KitCgTypeId type, 779 u8 cls) { 780 for (u32 i = 0; i < *n; ++i) { 781 if ((*entries)[i].imm == imm && (*entries)[i].type == type && 782 (*entries)[i].cls == cls) 783 return i; 784 } 785 if (*n == *cap) { 786 u32 ncap = *cap ? *cap * 2u : 16u; 787 ConstHoistEntry* nv = arena_array(arena, ConstHoistEntry, ncap); 788 if (*entries) memcpy(nv, *entries, sizeof(ConstHoistEntry) * (*n)); 789 *entries = nv; 790 *cap = ncap; 791 } 792 u32 idx = (*n)++; 793 ConstHoistEntry* e = &(*entries)[idx]; 794 memset(e, 0, sizeof *e); 795 e->imm = imm; 796 e->type = type; 797 e->cls = cls; 798 e->canonical = PREG_NONE; 799 return idx; 800 } 801 802 static void const_hoist_count_def(Func* f, Inst* in, Operand* op, int is_def, 803 void* ud) { 804 u32* counts = (u32*)ud; 805 (void)in; 806 if (!is_def || op->kind != OPK_REG) return; 807 PReg p = (PReg)op->v.reg; 808 if (opt_reg_valid(f, p)) ++counts[p]; 809 } 810 811 /* True for a LOAD_IMM whose constant is a hoist candidate: a non-zero immediate 812 * defining a register that has exactly one def in the function. The single-def 813 * requirement is the safety condition for remapping every use of that register 814 * to one canonical entry def — the PReg form here is non-SSA, so a mutable 815 * local promoted by opt_promote_scalar_locals (e.g. a loop counter initialized 816 * to a constant) is multiply defined; remapping it would discard the other 817 * defs (the increment) and break the loop. */ 818 static int const_hoist_candidate(const Func* f, const Inst* in, 819 const u32* def_counts) { 820 if ((IROp)in->op != IR_LOAD_IMM || in->nopnds < 1) return 0; 821 if (in->opnds[0].kind != OPK_REG || in->extra.imm == 0) return 0; 822 PReg p = (PReg)in->opnds[0].v.reg; 823 return opt_reg_valid(f, p) && def_counts[p] == 1; 824 } 825 826 void opt_hoist_loop_consts(Func* f) { 827 if (!f || f->opt_reg_ssa || f->opt_rewritten) return; 828 if (f->nblocks == 0 || f->entry >= f->nblocks) return; 829 830 /* Count defs per PReg so const_hoist_candidate can reject multiply-defined 831 * registers (mutable promoted locals) — see its comment. */ 832 u32* def_counts = arena_zarray(f->arena, u32, opt_reg_count(f)); 833 for (u32 b = 0; b < f->nblocks; ++b) { 834 Block* bl = &f->blocks[b]; 835 for (u32 i = 0; i < bl->ninsts; ++i) 836 opt_walk_inst_operands(f, &bl->insts[i], const_hoist_count_def, 837 def_counts); 838 } 839 840 /* Pass 1: index LOAD_IMM defs by (imm, type, cls); flag keys seen in a loop. 841 */ 842 ConstHoistEntry* entries = NULL; 843 u32 n_entries = 0, cap_entries = 0; 844 for (u32 b = 0; b < f->nblocks; ++b) { 845 Block* bl = &f->blocks[b]; 846 int in_loop = bl->loop_depth > 0; 847 for (u32 i = 0; i < bl->ninsts; ++i) { 848 Inst* in = &bl->insts[i]; 849 if (!const_hoist_candidate(f, in, def_counts)) continue; 850 u32 idx = const_hoist_find_or_add(&entries, &n_entries, &cap_entries, 851 f->arena, in->extra.imm, 852 in->opnds[0].type, in->opnds[0].cls); 853 if (in_loop) entries[idx].in_loop = 1; 854 } 855 } 856 857 /* Pass 2: allocate a canonical PReg for each constant that appears in a loop. 858 */ 859 u32 dup_count = 0; 860 for (u32 i = 0; i < n_entries; ++i) { 861 if (!entries[i].in_loop) continue; 862 entries[i].canonical = ir_alloc_preg(f, entries[i].type, entries[i].cls); 863 ++dup_count; 864 } 865 if (!dup_count) return; 866 867 /* Pass 3: NOP every def of a hoisted constant, mapping its PReg to the 868 * canonical one (consolidates loop and non-loop occurrences alike). */ 869 PReg* remap = arena_zarray(f->arena, PReg, opt_reg_count(f)); 870 for (u32 b = 0; b < f->nblocks; ++b) { 871 Block* bl = &f->blocks[b]; 872 for (u32 i = 0; i < bl->ninsts; ++i) { 873 Inst* in = &bl->insts[i]; 874 if (!const_hoist_candidate(f, in, def_counts)) continue; 875 u32 idx = const_hoist_find_or_add(&entries, &n_entries, &cap_entries, 876 f->arena, in->extra.imm, 877 in->opnds[0].type, in->opnds[0].cls); 878 if (entries[idx].canonical == PREG_NONE) continue; 879 PReg old = (PReg)in->opnds[0].v.reg; 880 if (opt_reg_valid(f, old)) remap[old] = entries[idx].canonical; 881 in->op = IR_NOP; 882 in->def = VAL_NONE; 883 in->ndefs = 0; 884 in->defs = NULL; 885 in->nopnds = 0; 886 in->opnds = NULL; 887 } 888 } 889 890 /* Pass 4: emit one LOAD_IMM per hoisted constant in the entry block, after 891 * any leading IR_PARAM_DECL prologue. */ 892 Block* entry = &f->blocks[f->entry]; 893 u32 insert_at = 0; 894 while (insert_at < entry->ninsts && 895 (IROp)entry->insts[insert_at].op == IR_PARAM_DECL) 896 ++insert_at; 897 Inst* slot = opt_block_insert_at(f, entry, insert_at, dup_count); 898 u32 w = 0; 899 for (u32 i = 0; i < n_entries; ++i) { 900 if (entries[i].canonical == PREG_NONE) continue; 901 Inst* in = &slot[w++]; 902 in->op = (u16)IR_LOAD_IMM; 903 in->def = (Val)entries[i].canonical; 904 in->type = entries[i].type; 905 in->extra.imm = entries[i].imm; 906 in->nopnds = 1; 907 in->opnds = arena_array(f->arena, Operand, 1); 908 memset(&in->opnds[0], 0, sizeof(Operand)); 909 in->opnds[0].kind = OPK_REG; 910 in->opnds[0].cls = entries[i].cls; 911 in->opnds[0].type = entries[i].type; 912 in->opnds[0].v.reg = entries[i].canonical; 913 ir_assign_inst_id(f, in); 914 } 915 916 /* Pass 5: apply remap to all operand uses (reuses the ADDR_OF-CSE walker, 917 * which also covers IR_CALL aux operands). */ 918 for (u32 b = 0; b < f->nblocks; ++b) { 919 Block* bl = &f->blocks[b]; 920 for (u32 i = 0; i < bl->ninsts; ++i) 921 addr_cse_apply_to_inst(&bl->insts[i], remap); 922 } 923 924 opt_analysis_invalidate( 925 f, OPT_ANALYSIS_DEF_USE | OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP); 926 }