pass_combine.c (54369B)
1 #include <kit/cg.h> 2 #include <stdint.h> 3 #include <string.h> 4 5 #include "cg/ir_eval.h" 6 #include "core/arena.h" 7 #include "opt/opt_internal.h" 8 9 enum { 10 COMBINE_CG_TYPE_SEG_SHIFT = 6, 11 COMBINE_CG_TYPE_SEG_MASK = (1u << COMBINE_CG_TYPE_SEG_SHIFT) - 1u, 12 COMBINE_CG_TYPE_BUILTIN_SEG = 1u, 13 }; 14 15 /* O1 combine, MIR-shaped (see mir-gen.c:8808-9146). One forward pass per BB 16 * maintains last-def / last-mem-def tracking; per-BB fixpoint loop iterates 17 * until no fold fires. Rewrites in this file: 18 * 19 * 1. Substitute producer source into consumer operand 20 * (IR_COPY / IR_LOAD_IMM / IR_CONVERT producer; into register slot, 21 * indirect base, or indirect index). 22 * 2. Address-mode synthesis (IR_BINOP IADD/ISHL producer; into OPK_INDIRECT 23 * base/index/scale/ofs). 24 * 3. Sink producer into single-use IR_COPY destination. 25 * 4. combine_exts: fold ext-of-ext chains with size+signedness rules. 26 * 27 * Spill compaction (store-then-reload, etc.) and self-copy removal continue 28 * to run in opt_combine_compact_block, unchanged. */ 29 30 /* ---- shared helpers (operand inspection / counting) ---- */ 31 32 static int same_reg_operand(const Operand* a, const Operand* b) { 33 return a->kind == OPK_REG && b->kind == OPK_REG && a->cls == b->cls && 34 a->v.reg == b->v.reg; 35 } 36 37 static int frame_slot_is_spill(Func* f, FrameSlot fs) { 38 if (fs == FRAME_SLOT_NONE || fs > f->nframe_slots) return 0; 39 return f->frame_slots[fs - 1u].kind == FS_SPILL; 40 } 41 42 static int spill_local_slot(Func* f, const Operand* addr, const MemAccess* mem, 43 FrameSlot* out) { 44 if (!addr || addr->kind != OPK_LOCAL) return 0; 45 if (opt_mem_observable(mem)) return 0; 46 if (mem->alias.kind != ALIAS_LOCAL) return 0; 47 if (mem->alias.v.local_id != (i32)addr->v.frame_slot) return 0; 48 if (!frame_slot_is_spill(f, addr->v.frame_slot)) return 0; 49 *out = addr->v.frame_slot; 50 return 1; 51 } 52 53 static int same_spill_access(Func* f, const Inst* a, const Inst* b, 54 FrameSlot* slot_out) { 55 FrameSlot as = FRAME_SLOT_NONE; 56 FrameSlot bs = FRAME_SLOT_NONE; 57 if (!spill_local_slot(f, &a->opnds[0], &a->extra.mem, &as)) return 0; 58 if (!spill_local_slot(f, &b->opnds[0], &b->extra.mem, &bs)) return 0; 59 if (as != bs) return 0; 60 if (a->extra.mem.size != b->extra.mem.size) return 0; 61 if (a->extra.mem.addr_space != b->extra.mem.addr_space) return 0; 62 if (slot_out) *slot_out = as; 63 return 1; 64 } 65 66 static int load_spill_slot(Func* f, const Inst* in, FrameSlot* slot_out) { 67 if ((IROp)in->op != IR_LOAD || in->nopnds < 2) return 0; 68 return spill_local_slot(f, &in->opnds[1], &in->extra.mem, slot_out); 69 } 70 71 static int store_spill_slot(Func* f, const Inst* in, FrameSlot* slot_out) { 72 if ((IROp)in->op != IR_STORE || in->nopnds < 2) return 0; 73 return spill_local_slot(f, &in->opnds[0], &in->extra.mem, slot_out); 74 } 75 76 static int same_spill_slot_and_size(Func* f, const Inst* a, const Inst* b) { 77 FrameSlot as = FRAME_SLOT_NONE; 78 FrameSlot bs = FRAME_SLOT_NONE; 79 if ((IROp)a->op == IR_LOAD) { 80 if (!load_spill_slot(f, a, &as)) return 0; 81 } else if (!store_spill_slot(f, a, &as)) { 82 return 0; 83 } 84 if ((IROp)b->op == IR_LOAD) { 85 if (!load_spill_slot(f, b, &bs)) return 0; 86 } else if (!store_spill_slot(f, b, &bs)) { 87 return 0; 88 } 89 return as == bs && a->extra.mem.size == b->extra.mem.size && 90 a->extra.mem.addr_space == b->extra.mem.addr_space; 91 } 92 93 static int same_phys_reg(const Operand* a, const Operand* b) { 94 return a && b && a->kind == OPK_REG && b->kind == OPK_REG && 95 a->cls == b->cls && a->v.reg == b->v.reg; 96 } 97 98 /* Count register references inside a single operand. An OPK_INDIRECT can 99 * reference the queried register twice (e.g. `[r4 + r4 * 4]`); we return the 100 * true count so single-use accounting is exact. */ 101 static int count_operand_phys_uses(const Operand* op, const Operand* r) { 102 if (!op || !r || r->kind != OPK_REG) return 0; 103 if (op->kind == OPK_REG) 104 return op->cls == r->cls && op->v.reg == r->v.reg ? 1 : 0; 105 if (op->kind == OPK_INDIRECT) { 106 if (r->cls != RC_INT) return 0; 107 int n = 0; 108 if (op->v.ind.base == r->v.reg) ++n; 109 if (op->v.ind.index != (Reg)REG_NONE && op->v.ind.index == r->v.reg) ++n; 110 return n; 111 } 112 return 0; 113 } 114 115 static int operand_uses_phys_reg(const Operand* op, const Operand* r) { 116 return count_operand_phys_uses(op, r) > 0; 117 } 118 119 static int abi_uses_phys_reg(const CGABIValue* v, const Operand* r) { 120 int n = 0; 121 if (!v) return 0; 122 n += count_operand_phys_uses(&v->storage, r); 123 for (u32 i = 0; i < v->nparts; ++i) 124 n += count_operand_phys_uses(&v->parts[i].op, r); 125 return n; 126 } 127 128 static int inst_uses_phys_reg(const Inst* in, const Operand* r) { 129 int n = 0; 130 switch ((IROp)in->op) { 131 case IR_COPY: 132 case IR_CONVERT: 133 case IR_UNOP: 134 case IR_VA_ARG: 135 if (in->nopnds >= 2) n += count_operand_phys_uses(&in->opnds[1], r); 136 break; 137 case IR_LOAD: 138 case IR_ADDR_OF: 139 case IR_BITFIELD_LOAD: 140 case IR_ATOMIC_LOAD: 141 if (in->nopnds >= 2) n += count_operand_phys_uses(&in->opnds[1], r); 142 break; 143 case IR_BINOP: 144 case IR_CMP: 145 if (in->nopnds >= 2) n += count_operand_phys_uses(&in->opnds[1], r); 146 if (in->nopnds >= 3) n += count_operand_phys_uses(&in->opnds[2], r); 147 break; 148 case IR_STORE: 149 case IR_AGG_COPY: 150 case IR_AGG_SET: 151 case IR_BITFIELD_STORE: 152 case IR_VA_COPY: 153 if (in->nopnds >= 1) n += count_operand_phys_uses(&in->opnds[0], r); 154 if (in->nopnds >= 2) n += count_operand_phys_uses(&in->opnds[1], r); 155 break; 156 case IR_CALL: { 157 IRCallAux* aux = (IRCallAux*)in->extra.aux; 158 if (!aux) break; 159 if (aux->use_plan_replay) { 160 n += count_operand_phys_uses(&aux->plan.callee, r); 161 for (u32 i = 0; i < aux->plan.nargs; ++i) 162 n += count_operand_phys_uses(&aux->plan.args[i].src, r); 163 } else { 164 n += count_operand_phys_uses(&aux->desc.callee, r); 165 for (u32 i = 0; i < aux->desc.nargs; ++i) 166 n += abi_uses_phys_reg(&aux->desc.args[i], r); 167 } 168 break; 169 } 170 case IR_CMP_BRANCH: 171 case IR_CONDBR: 172 case IR_SWITCH: 173 case IR_INDIRECT_BRANCH: 174 for (u32 i = 0; i < in->nopnds; ++i) 175 n += count_operand_phys_uses(&in->opnds[i], r); 176 break; 177 case IR_RET: { 178 IRRetAux* aux = (IRRetAux*)in->extra.aux; 179 if (aux && aux->present) n += abi_uses_phys_reg(&aux->val, r); 180 break; 181 } 182 case IR_SCOPE_BEGIN: 183 break; 184 case IR_ALLOCA: 185 if (in->nopnds >= 2) n += count_operand_phys_uses(&in->opnds[1], r); 186 break; 187 case IR_VA_START: 188 case IR_VA_END: 189 if (in->nopnds >= 1) n += count_operand_phys_uses(&in->opnds[0], r); 190 break; 191 case IR_ATOMIC_STORE: 192 if (in->nopnds >= 1) n += count_operand_phys_uses(&in->opnds[0], r); 193 if (in->nopnds >= 2) n += count_operand_phys_uses(&in->opnds[1], r); 194 break; 195 case IR_ATOMIC_RMW: 196 if (in->nopnds >= 2) n += count_operand_phys_uses(&in->opnds[1], r); 197 if (in->nopnds >= 3) n += count_operand_phys_uses(&in->opnds[2], r); 198 break; 199 case IR_ATOMIC_CAS: 200 if (in->nopnds >= 3) n += count_operand_phys_uses(&in->opnds[2], r); 201 if (in->nopnds >= 4) n += count_operand_phys_uses(&in->opnds[3], r); 202 if (in->nopnds >= 5) n += count_operand_phys_uses(&in->opnds[4], r); 203 break; 204 case IR_ASM_BLOCK: { 205 IRAsmAux* aux = (IRAsmAux*)in->extra.aux; 206 if (!aux) break; 207 for (u32 i = 0; i < aux->nin; ++i) 208 n += count_operand_phys_uses(&aux->in_ops[i], r); 209 break; 210 } 211 case IR_INTRINSIC: { 212 IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; 213 if (!aux) break; 214 for (u32 i = 0; i < aux->narg; ++i) 215 n += count_operand_phys_uses(&aux->args[i], r); 216 break; 217 } 218 default: 219 break; 220 } 221 return n; 222 } 223 224 static int abi_defines_phys_reg(const CGABIValue* v, const Operand* r) { 225 int n = 0; 226 if (!v) return 0; 227 if (same_phys_reg(&v->storage, r)) ++n; 228 for (u32 i = 0; i < v->nparts; ++i) 229 if (same_phys_reg(&v->parts[i].op, r)) ++n; 230 return n; 231 } 232 233 static int inst_defines_phys_reg(const Inst* in, const Operand* r) { 234 if (!r || r->kind != OPK_REG) return 0; 235 switch ((IROp)in->op) { 236 case IR_LOAD_IMM: 237 case IR_LOAD_CONST: 238 case IR_LOAD_LABEL_ADDR: 239 case IR_COPY: 240 case IR_LOAD: 241 case IR_ADDR_OF: 242 case IR_TLS_ADDR_OF: 243 case IR_BITFIELD_LOAD: 244 case IR_BINOP: 245 case IR_UNOP: 246 case IR_CMP: 247 case IR_CONVERT: 248 case IR_ALLOCA: 249 case IR_VA_ARG: 250 case IR_ATOMIC_LOAD: 251 case IR_ATOMIC_RMW: 252 return in->nopnds >= 1 && same_phys_reg(&in->opnds[0], r); 253 case IR_CALL: { 254 IRCallAux* aux = (IRCallAux*)in->extra.aux; 255 if (!aux) return 0; 256 if (aux->use_plan_replay) { 257 for (u32 i = 0; i < aux->plan.nargs; ++i) 258 if (aux->plan.args[i].dst_kind == CG_CALL_PLAN_REG && 259 r->cls == aux->plan.args[i].cls && 260 r->v.reg == aux->plan.args[i].dst_reg) 261 return 1; 262 for (u32 i = 0; i < aux->plan.nrets; ++i) 263 if ((r->cls == aux->plan.rets[i].cls && 264 r->v.reg == aux->plan.rets[i].src_reg) || 265 same_phys_reg(&aux->plan.rets[i].dst, r)) 266 return 1; 267 return 0; 268 } 269 return abi_defines_phys_reg(&aux->desc.ret, r); 270 } 271 case IR_ATOMIC_CAS: 272 return (in->nopnds >= 1 && same_phys_reg(&in->opnds[0], r)) || 273 (in->nopnds >= 2 && same_phys_reg(&in->opnds[1], r)); 274 case IR_ASM_BLOCK: { 275 IRAsmAux* aux = (IRAsmAux*)in->extra.aux; 276 if (!aux) return 0; 277 for (u32 i = 0; i < aux->nout; ++i) 278 if (same_phys_reg(&aux->out_ops[i], r)) return 1; 279 if (r->cls < OPT_REG_CLASSES && r->v.reg < 32 && 280 (aux->clobber_mask[r->cls] & (1u << r->v.reg))) 281 return 1; 282 return 0; 283 } 284 case IR_INTRINSIC: { 285 IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; 286 if (!aux) return 0; 287 for (u32 i = 0; i < aux->ndst; ++i) 288 if (same_phys_reg(&aux->dsts[i], r)) return 1; 289 return 0; 290 } 291 default: 292 return 0; 293 } 294 } 295 296 /* True if `in` may write to memory. Used to invalidate memory-reading 297 * producers (IR_LOAD) when the consumer is past an intervening write. */ 298 static int inst_writes_memory(const Inst* in) { 299 switch ((IROp)in->op) { 300 case IR_STORE: 301 case IR_AGG_COPY: 302 case IR_AGG_SET: 303 case IR_BITFIELD_STORE: 304 case IR_ATOMIC_STORE: 305 case IR_ATOMIC_RMW: 306 case IR_ATOMIC_CAS: 307 case IR_CALL: 308 case IR_ASM_BLOCK: 309 case IR_INTRINSIC: 310 return 1; 311 default: 312 return 0; 313 } 314 } 315 316 static int inst_reads_memory(const Inst* in) { 317 switch ((IROp)in->op) { 318 case IR_LOAD: 319 case IR_BITFIELD_LOAD: 320 case IR_ATOMIC_LOAD: 321 return 1; 322 default: 323 return 0; 324 } 325 } 326 327 /* ---- substitution-slot whitelist (operand positions that accept a register 328 * substitute, an immediate substitute, or are valid for ext-of-ext folding) 329 * ---- */ 330 331 typedef enum SubstKind { 332 SK_REG, /* register-to-register (IR_COPY producer) */ 333 SK_IMM, /* register-to-immediate (IR_LOAD_IMM producer) */ 334 SK_CV, /* register-to-register through identical convert (IR_CONVERT prod) */ 335 } SubstKind; 336 337 /* Returns 1 if the given operand-index `idx` of `in` is foldable for `kind`. 338 * SK_REG / SK_CV: register substitution slots. SK_IMM: immediate substitution 339 * slots. */ 340 static int combine_subst_slot(const Inst* in, u32 idx, SubstKind kind, 341 int copy_imm_ok) { 342 switch ((IROp)in->op) { 343 case IR_COPY: 344 /* Normally IR_COPY stays register-to-register so that, after coalescing 345 * assigns src and dst the same hard reg, it becomes a self-copy combine 346 * removes. The O1 path never coalesces, so folding the immediate 347 * (copy_imm_ok) collapses `load_imm rT,k; copy rD,rT` into `copy rD,#k`, 348 * which the emit path turns into a single `load_imm rD,k`. */ 349 return (kind != SK_IMM || copy_imm_ok) && idx == 1; 350 case IR_UNOP: 351 return kind != SK_IMM && idx == 1; 352 case IR_CONVERT: 353 return kind != SK_IMM && idx == 1; 354 case IR_BINOP: 355 case IR_CMP: 356 return idx == 1 || idx == 2; 357 case IR_CMP_BRANCH: 358 return idx == 0 || idx == 1; 359 case IR_CONDBR: 360 return kind != SK_IMM && idx == 0; 361 case IR_STORE: 362 /* opnds[0] is the address; register subst into the indirect base/index 363 * is handled separately. The data slot at idx==1 accepts reg or imm. */ 364 return idx == 1; 365 case IR_LOAD: 366 case IR_ATOMIC_LOAD: 367 case IR_ATOMIC_RMW: 368 case IR_BITFIELD_LOAD: 369 case IR_BITFIELD_STORE: 370 case IR_AGG_COPY: 371 case IR_AGG_SET: 372 /* OPK_INDIRECT base/index substitution is handled inside 373 * subst_consumer_operands; these slots take no direct OPK_REG. */ 374 return 0; 375 case IR_ALLOCA: 376 return kind != SK_IMM && idx == 1; 377 default: 378 return 0; 379 } 380 } 381 382 /* ---- per-BB tracking context ---- */ 383 384 typedef struct CombineCtx { 385 Func* f; 386 Block* bl; 387 const OptHardBlockLive* hard_live; 388 /* Index of the most recent inst in this BB that defined (cls, reg); 389 * -1 means no definition seen this BB. */ 390 i32 last_def[OPT_REG_CLASSES][OPT_MAX_HARD_REGS]; 391 i32 last_mem_def; 392 int block_change_p; 393 } CombineCtx; 394 395 static void ctx_reset(CombineCtx* ctx) { 396 memset(ctx->last_def, 0xff, sizeof ctx->last_def); /* all -1 */ 397 ctx->last_mem_def = -1; 398 ctx->block_change_p = 0; 399 } 400 401 /* True if `in` is a barrier that conservatively invalidates all prior 402 * producers in this BB (modelled after the old find_single_direct_use, which 403 * treats every IR_CALL as a clobber regardless of explicit ABI describes). */ 404 static int inst_is_clobber_barrier(const Inst* in) { 405 switch ((IROp)in->op) { 406 case IR_CALL: 407 case IR_ASM_BLOCK: 408 case IR_INTRINSIC: 409 return 1; 410 default: 411 return 0; 412 } 413 } 414 415 static void ctx_record_reg_def(CombineCtx* ctx, const Operand* op, i32 i) { 416 if (op->kind != OPK_REG) return; 417 if (op->cls >= OPT_REG_CLASSES || op->v.reg >= OPT_MAX_HARD_REGS) return; 418 ctx->last_def[op->cls][op->v.reg] = i; 419 } 420 421 /* After processing inst at index `i`, mark each register it defines (incl. 422 * implicit clobbers from CALL / ASM / INTRINSIC) as defined here. Mark 423 * memory if this inst writes. 424 * 425 * Walks the inst's destination operand(s) directly rather than probing every 426 * (cls, reg) pair. The barrier case (CALL/ASM/INTRINSIC) keeps a bulk mark 427 * because explicit defs/clobbers aren't always populated in IRCallAux at 428 * this point in the pipeline. */ 429 static void ctx_record(CombineCtx* ctx, const Inst* in, i32 i) { 430 if (inst_writes_memory(in)) ctx->last_mem_def = i; 431 if (inst_is_clobber_barrier(in)) { 432 for (u32 c = 0; c < OPT_REG_CLASSES; ++c) 433 for (u32 r = 0; r < OPT_MAX_HARD_REGS; ++r) ctx->last_def[c][r] = i; 434 return; 435 } 436 switch ((IROp)in->op) { 437 case IR_LOAD_IMM: 438 case IR_LOAD_CONST: 439 case IR_LOAD_LABEL_ADDR: 440 case IR_COPY: 441 case IR_LOAD: 442 case IR_ADDR_OF: 443 case IR_TLS_ADDR_OF: 444 case IR_BITFIELD_LOAD: 445 case IR_BINOP: 446 case IR_UNOP: 447 case IR_CMP: 448 case IR_CONVERT: 449 case IR_ALLOCA: 450 case IR_VA_ARG: 451 case IR_ATOMIC_LOAD: 452 case IR_ATOMIC_RMW: 453 if (in->nopnds >= 1) ctx_record_reg_def(ctx, &in->opnds[0], i); 454 break; 455 case IR_ATOMIC_CAS: 456 if (in->nopnds >= 1) ctx_record_reg_def(ctx, &in->opnds[0], i); 457 if (in->nopnds >= 2) ctx_record_reg_def(ctx, &in->opnds[1], i); 458 break; 459 default: 460 break; 461 } 462 } 463 464 /* Lookup the producer of (cls, reg) in this BB, if any. Returns -1 if no 465 * definer has been seen yet in this BB. */ 466 static i32 ctx_producer_of(const CombineCtx* ctx, u8 cls, Reg reg) { 467 if (reg >= OPT_MAX_HARD_REGS || cls >= OPT_REG_CLASSES) return -1; 468 return ctx->last_def[cls][reg]; 469 } 470 471 /* Does (cls, reg) get redefined strictly after `since_idx` (exclusive) in this 472 * BB, given current ctx state? We use last_def: if last_def[cls][reg] > 473 * since_idx, then yes. (since_idx is typically the producer's index.) */ 474 static int ctx_def_changed_since(const CombineCtx* ctx, u8 cls, Reg reg, 475 i32 since_idx) { 476 if (reg >= OPT_MAX_HARD_REGS || cls >= OPT_REG_CLASSES) return 0; 477 return ctx->last_def[cls][reg] > since_idx; 478 } 479 480 static i32 ctx_prev_def_before(const CombineCtx* ctx, const Operand* reg, 481 i32 before_idx) { 482 if (!reg || reg->kind != OPK_REG) return -1; 483 for (i32 j = before_idx - 1; j >= 0; --j) { 484 const Inst* prev = &ctx->bl->insts[j]; 485 if (inst_is_clobber_barrier(prev) || inst_defines_phys_reg(prev, reg)) 486 return j; 487 } 488 return -1; 489 } 490 491 static void ctx_restore_removed_def(CombineCtx* ctx, const Operand* reg, 492 i32 removed_idx) { 493 if (!reg || reg->kind != OPK_REG) return; 494 if (reg->cls >= OPT_REG_CLASSES || reg->v.reg >= OPT_MAX_HARD_REGS) return; 495 if (ctx->last_def[reg->cls][reg->v.reg] == removed_idx) 496 ctx->last_def[reg->cls][reg->v.reg] = 497 ctx_prev_def_before(ctx, reg, removed_idx); 498 } 499 500 /* ---- forward-scan use accounting (used for single-use checks) ---- */ 501 502 /* Count phys uses of `def` within the live range starting just after 503 * `prod_idx`. The live range ends at the first inst that redefines `def`'s 504 * physical register or that is a clobber barrier (CALL/ASM/INTRINSIC); uses 505 * after that point belong to a different live range of the same physreg and 506 * are irrelevant to this producer. 507 * 508 * `*killed_in_block_out` is set non-zero when the live range terminates 509 * inside the block (caller may then skip the cross-block live-out check). 510 * 511 * Without this live-range scoping, reuse of a scratch physreg later in the 512 * block (`sxtw x12, ...; mov x13, x12; mov x12, ...`) makes every fold look 513 * multi-use and combine rejects almost everything. */ 514 /* True if `in` redefines or clobbers `def`'s physical register. Consults the 515 * instruction's precise def/clobber set (the same one liveness uses) rather 516 * than a blanket clobber-barrier test: a call clobbers only its caller-saved 517 * set, so a value held in a callee-saved register survives it. Treating every 518 * call as killing every register made `killed` spuriously true and let combine 519 * skip the cross-block live-out check, deleting a still-live def. */ 520 static int inst_kills_phys_reg(Func* f, const Inst* in, const Operand* def) { 521 OptHardRegSet use, d; 522 if (!def || def->kind != OPK_REG || def->cls >= OPT_REG_CLASSES || 523 def->v.reg >= 32) 524 return 0; 525 opt_hard_inst_use_def(f, in, &use, &d); 526 return (d.cls[def->cls] & (1u << def->v.reg)) != 0; 527 } 528 529 static int count_uses_in_live_range(Func* f, const Block* bl, i32 prod_idx, 530 const Operand* def, 531 int* killed_in_block_out) { 532 int n = 0; 533 int killed = 0; 534 for (i32 i = prod_idx + 1; i < (i32)bl->ninsts; ++i) { 535 const Inst* in = &bl->insts[i]; 536 n += inst_uses_phys_reg(in, def); 537 if (inst_kills_phys_reg(f, in, def)) { 538 killed = 1; 539 break; 540 } 541 } 542 if (killed_in_block_out) *killed_in_block_out = killed; 543 return n; 544 } 545 546 static int use_after_clobber_before_redef(const Block* bl, i32 prod_idx, 547 const Operand* def) { 548 int saw_clobber = 0; 549 for (i32 i = prod_idx + 1; i < (i32)bl->ninsts; ++i) { 550 const Inst* in = &bl->insts[i]; 551 if (saw_clobber && inst_uses_phys_reg(in, def)) return 1; 552 if (inst_defines_phys_reg(in, def)) return 0; 553 if (inst_is_clobber_barrier(in)) saw_clobber = 1; 554 } 555 return 0; 556 } 557 558 /* ---- ConvKind helpers (for combine_exts) ---- */ 559 560 static u32 builtin_int_bytes(KitCgTypeId t) { 561 if ((t >> COMBINE_CG_TYPE_SEG_SHIFT) != COMBINE_CG_TYPE_BUILTIN_SEG) return 0; 562 KitCgBuiltinType b = (KitCgBuiltinType)(t & COMBINE_CG_TYPE_SEG_MASK); 563 switch (b) { 564 case KIT_CG_BUILTIN_BOOL: 565 case KIT_CG_BUILTIN_I8: 566 return 1; 567 case KIT_CG_BUILTIN_I16: 568 return 2; 569 case KIT_CG_BUILTIN_I32: 570 return 4; 571 case KIT_CG_BUILTIN_I64: 572 return 8; 573 case KIT_CG_BUILTIN_I128: 574 return 16; 575 default: 576 return 0; 577 } 578 } 579 580 /* For an IR_CONVERT inst, decode (src_bytes, dst_bytes, sign_p). Returns 0 581 * if this isn't an integer ext convert (CV_SEXT or CV_ZEXT). */ 582 static int ext_params(const Inst* in, u32* src_bytes_out, u32* dst_bytes_out, 583 int* sign_p_out) { 584 if ((IROp)in->op != IR_CONVERT || in->nopnds < 2) return 0; 585 ConvKind k = (ConvKind)in->extra.imm; 586 if (k != CV_SEXT && k != CV_ZEXT) return 0; 587 u32 sb = builtin_int_bytes(in->opnds[1].type); 588 u32 db = builtin_int_bytes(in->opnds[0].type); 589 if (!sb || !db || db < sb) return 0; 590 *src_bytes_out = sb; 591 *dst_bytes_out = db; 592 *sign_p_out = (k == CV_SEXT); 593 return 1; 594 } 595 596 /* Width in bytes of a scalar integer or pointer type (1..8), else 0. Mirrors 597 * pass_simplify's simplify_width: builtin ints decode without the compiler, 598 * pointers fall back to the type-size query. */ 599 static u32 combine_scalar_width_bytes(Func* f, KitCgTypeId t) { 600 u32 b = builtin_int_bytes(t); 601 if (b) return b > 8u ? 0u : b; 602 if (f->c && kit_cg_type_kind((KitCompiler*)f->c, t) == KIT_CG_TYPE_PTR) { 603 u64 sz = kit_cg_type_size((KitCompiler*)f->c, t); 604 if (sz && sz <= 8u) return (u32)sz; 605 } 606 return 0; 607 } 608 609 /* Compute the constant produced by applying an integer/pointer convert `k` 610 * (src width `sb`, dst width `db`, both BYTES) to immediate `imm`. Returns 0 611 * for kinds that aren't a bit-preserving integer/pointer move (the float 612 * conversions reinterpret the value and must not be folded this way). The 613 * materialized-constant model: a load_imm puts (u64)imm into a register, so a 614 * widening move/zext keeps the low `sb` bits and a trunc/narrowing keeps the 615 * low `db` bits. 616 * 617 * The arithmetic is the shared cg/ir_eval convert core (in BITS); combine's 618 * byte widths convert at the boundary. combine's BITCAST is the bit-preserving 619 * register move (== ZEXT here, no equal-width requirement), so it maps to 620 * CV_ZEXT rather than the shared eval's stricter equal-width BITCAST. */ 621 static int const_convert_value(ConvKind k, i64 imm, u32 sb, u32 db, i64* out) { 622 if (!sb || !db) return 0; 623 if (k == CV_BITCAST) k = CV_ZEXT; 624 return kit_ir_eval_convert(k, sb * 8u, db * 8u, imm, out); 625 } 626 627 /* ---- producer-retarget legality (for sink rewrite) ---- */ 628 629 /* combine retargets the destination of a commutative producer, so it includes 630 * the FP commutatives (FADD/FMUL) on top of the shared integer set. */ 631 static int binop_is_commutative(BinOp op) { 632 return kit_ir_binop_is_commutative_int(op) || op == BO_FADD || op == BO_FMUL; 633 } 634 635 /* Is `producer` an op whose destination register can be safely retargeted 636 * without backend consultation? Excludes ops with implicit destination 637 * constraints, ABI pinning, multi-def outputs, or aux-driven dst layout. */ 638 static int producer_retargetable_op(IROp op) { 639 switch (op) { 640 case IR_BINOP: 641 case IR_UNOP: 642 case IR_LOAD_IMM: 643 case IR_LOAD_CONST: 644 case IR_LOAD: 645 case IR_CONVERT: 646 case IR_CMP: 647 case IR_ADDR_OF: 648 case IR_LOAD_LABEL_ADDR: 649 return 1; 650 default: 651 return 0; 652 } 653 } 654 655 /* Can `producer`'s destination be retargeted to `new_dst` without violating 656 * IR shape? For binop, may signal that a commutative-swap is needed. */ 657 static int retarget_producer_legal(Inst* producer, const Operand* new_dst, 658 int* swap_binop) { 659 *swap_binop = 0; 660 if (!new_dst || new_dst->kind != OPK_REG) return 0; 661 if (producer->nopnds < 1 || producer->opnds[0].kind != OPK_REG) return 0; 662 if (producer->opnds[0].cls != new_dst->cls) return 0; 663 if (producer->opnds[0].type != new_dst->type) return 0; 664 665 switch ((IROp)producer->op) { 666 case IR_LOAD_IMM: 667 case IR_LOAD_CONST: 668 case IR_LOAD_LABEL_ADDR: 669 case IR_ADDR_OF: 670 /* Single-defining ops with no register source to alias against. */ 671 return 1; 672 case IR_UNOP: 673 case IR_CONVERT: 674 case IR_LOAD: 675 return 1; 676 case IR_CMP: 677 return 1; 678 case IR_BINOP: { 679 if (producer->nopnds < 3) return 0; 680 int dst_is_lhs = operand_uses_phys_reg(&producer->opnds[1], new_dst); 681 int dst_is_rhs = operand_uses_phys_reg(&producer->opnds[2], new_dst); 682 if (!dst_is_lhs && !dst_is_rhs) return 1; 683 if (dst_is_lhs) return 1; 684 if (binop_is_commutative((BinOp)producer->extra.imm)) { 685 *swap_binop = 1; 686 return 1; 687 } 688 return 0; 689 } 690 default: 691 return 0; 692 } 693 } 694 695 static int first_return_reg(Func* f, u8 cls, Reg* out) { 696 if (!f || cls >= OPT_REG_CLASSES) return 0; 697 u32 mask = f->opt_ret_regs[cls]; 698 for (Reg r = 0; r < 32; ++r) { 699 if (mask & (1u << r)) { 700 *out = r; 701 return 1; 702 } 703 } 704 return 0; 705 } 706 707 static int ret_scalar_storage(CGABIValue* v, Operand** out) { 708 if (!v || v->storage.kind != OPK_REG) return 0; 709 if (v->nparts > 1) return 0; 710 *out = &v->storage; 711 return 1; 712 } 713 714 static int is_target_scratch_reg(Func* f, u8 cls, Reg reg) { 715 if (!f || cls >= OPT_REG_CLASSES) return 0; 716 for (u32 i = 0; i < f->opt_scratch_reg_count[cls]; ++i) { 717 if (f->opt_scratch_regs[cls][i] == reg) return 1; 718 } 719 return 0; 720 } 721 722 /* ---- Rewrite 1: substitute producer source into uses ---- */ 723 724 /* Set the indirect's base or index to `new_reg`. */ 725 static void set_indirect_field(Operand* ind, Reg old_reg, Reg new_reg) { 726 if (ind->v.ind.base == old_reg) ind->v.ind.base = new_reg; 727 if (ind->v.ind.index != (Reg)REG_NONE && ind->v.ind.index == old_reg) 728 ind->v.ind.index = new_reg; 729 } 730 731 /* Substitute `def` -> `src` in a single aux register slot (no slot-index 732 * whitelist; caller has already restricted to SK_REG and ensured src is a 733 * register). Returns 1 if rewritten. */ 734 static int subst_one_aux_operand(Operand* op, const Operand* def, 735 const Operand* src) { 736 if (op->kind == OPK_REG && same_phys_reg(op, def)) { 737 *op = *src; 738 return 1; 739 } 740 if (op->kind == OPK_INDIRECT && src->kind == OPK_REG && def->cls == RC_INT && 741 src->cls == RC_INT && 742 (op->v.ind.base == def->v.reg || 743 (op->v.ind.index != (Reg)REG_NONE && op->v.ind.index == def->v.reg))) { 744 set_indirect_field(op, def->v.reg, src->v.reg); 745 return 1; 746 } 747 return 0; 748 } 749 750 static int subst_abivalue_uses(CGABIValue* v, const Operand* def, 751 const Operand* src) { 752 int n = 0; 753 n += subst_one_aux_operand(&v->storage, def, src); 754 for (u32 i = 0; i < v->nparts; ++i) 755 n += subst_one_aux_operand((Operand*)&v->parts[i].op, def, src); 756 return n; 757 } 758 759 /* Walk the operands of consumer `in` and try to substitute uses of `def` 760 * (producer's destination, OPK_REG) with `src` (producer's source, OPK_REG 761 * or OPK_IMM). For OPK_INDIRECT operands, substitution into base/index is 762 * only valid for OPK_REG `src`. Returns the number of operands actually 763 * rewritten. */ 764 static int subst_consumer_operands(Inst* in, const Operand* def, 765 const Operand* src, SubstKind kind, 766 int copy_imm_ok) { 767 int n = 0; 768 for (u32 oi = 0; oi < in->nopnds; ++oi) { 769 Operand* op = &in->opnds[oi]; 770 /* Direct OPK_REG substitution: requires the slot to be on the whitelist. */ 771 if (op->kind == OPK_REG && same_phys_reg(op, def) && 772 combine_subst_slot(in, oi, kind, copy_imm_ok)) { 773 *op = *src; 774 ++n; 775 continue; 776 } 777 /* Indirect base/index substitution: only OPK_REG src may land here. The 778 * substitution only changes which register computes the address, not the 779 * value being stored — safe for IR_STORE / IR_ATOMIC_STORE / 780 * IR_BITFIELD_STORE / IR_AGG_COPY / IR_AGG_SET as well. */ 781 if (op->kind == OPK_INDIRECT && kind == SK_REG && src->kind == OPK_REG && 782 def->kind == OPK_REG && def->cls == RC_INT && src->cls == RC_INT && 783 (op->v.ind.base == def->v.reg || 784 (op->v.ind.index != (Reg)REG_NONE && op->v.ind.index == def->v.reg))) { 785 set_indirect_field(op, def->v.reg, src->v.reg); 786 ++n; 787 } 788 } 789 790 /* Aux register uses: IR_CALL plan/desc args + callee. SK_REG only — 791 * substituting an immediate or convert into an ABI-bound use would break 792 * the call lowering. IR_RET is intentionally excluded: the return register 793 * is live-out by ABI, so a producer copying into it stays alive even after 794 * substituting val.storage, and emit_ret would emit a redundant second 795 * move. */ 796 if (kind == SK_REG) { 797 if ((IROp)in->op == IR_CALL) { 798 IRCallAux* aux = (IRCallAux*)in->extra.aux; 799 if (aux) { 800 if (aux->use_plan_replay) { 801 n += subst_one_aux_operand(&aux->plan.callee, def, src); 802 for (u32 k = 0; k < aux->plan.nargs; ++k) 803 n += subst_one_aux_operand(&aux->plan.args[k].src, def, src); 804 } else { 805 n += subst_one_aux_operand(&aux->desc.callee, def, src); 806 for (u32 k = 0; k < aux->desc.nargs; ++k) 807 n += subst_abivalue_uses((CGABIValue*)&aux->desc.args[k], def, src); 808 } 809 } 810 } 811 } 812 return n; 813 } 814 815 /* Try to substitute the producer of (cls, reg) into uses of that register 816 * in `in`. Returns 1 if at least one operand was rewritten. */ 817 static int try_substitute_for_reg(CombineCtx* ctx, Inst* in, i32 i, u8 cls, 818 Reg reg) { 819 i32 prod_idx = ctx_producer_of(ctx, cls, reg); 820 if (prod_idx < 0 || prod_idx >= i) return 0; 821 Inst* prod = &ctx->bl->insts[prod_idx]; 822 IROp pop = (IROp)prod->op; 823 if (pop != IR_COPY && pop != IR_LOAD_IMM && pop != IR_CONVERT) return 0; 824 if (prod->nopnds < 1 || prod->opnds[0].kind != OPK_REG) return 0; 825 if (prod->opnds[0].cls != cls || prod->opnds[0].v.reg != reg) return 0; 826 827 Operand def = prod->opnds[0]; 828 SubstKind kind; 829 Operand src_op; 830 memset(&src_op, 0, sizeof src_op); 831 if (pop == IR_COPY) { 832 if (prod->nopnds < 2 || prod->opnds[1].kind != OPK_REG) return 0; 833 if (ctx_def_changed_since(ctx, prod->opnds[1].cls, prod->opnds[1].v.reg, 834 prod_idx)) 835 return 0; 836 if (is_target_scratch_reg(ctx->f, prod->opnds[1].cls, prod->opnds[1].v.reg)) 837 return 0; 838 kind = SK_REG; 839 src_op = prod->opnds[1]; 840 } else if (pop == IR_LOAD_IMM) { 841 kind = SK_IMM; 842 src_op.kind = OPK_IMM; 843 src_op.cls = def.cls; 844 src_op.type = def.type; 845 src_op.v.imm = prod->extra.imm; 846 } else { /* IR_CONVERT */ 847 if (prod->nopnds < 2 || prod->opnds[1].kind != OPK_REG) return 0; 848 /* Fold only when consumer is also a convert with matching shape (the 849 * broader ext-of-ext rules run in try_combine_exts). */ 850 if ((IROp)in->op != IR_CONVERT) return 0; 851 if (in->nopnds < 2 || in->opnds[1].kind != OPK_REG) return 0; 852 if (in->extra.imm != prod->extra.imm) return 0; 853 if (in->opnds[1].type != prod->opnds[0].type) return 0; 854 if (in->opnds[0].type != prod->opnds[0].type) return 0; 855 if (ctx_def_changed_since(ctx, prod->opnds[1].cls, prod->opnds[1].v.reg, 856 prod_idx)) 857 return 0; 858 kind = SK_CV; 859 src_op = prod->opnds[1]; 860 } 861 862 /* For a register-source IR_COPY this is local copy propagation: rewriting 863 * one use of the copy's dest to its source is value-safe whenever the source 864 * is unchanged between the copy and this consumer — already verified by the 865 * ctx_def_changed_since check above (which treats CALL/ASM/INTRINSIC as a 866 * clobber of every register). The copy itself is not removed here; if all its 867 * uses fold away and it is not live-out, post-combine DCE deletes it. So no 868 * single-use or cross-block live-out restriction is required, which lets the 869 * O1 path collapse the multi-use reg->reg copy chains it would otherwise 870 * leave behind (it never runs the O2 coalescer). 871 * 872 * The immediate / convert producers don't propagate a register value, so 873 * they keep the stricter single-use + live-out gate: substituting them into 874 * every use would duplicate a (possibly multi-instruction) materialization 875 * rather than shorten a copy chain. */ 876 if (kind != SK_REG) { 877 int killed = 0; 878 int uses_total = 879 count_uses_in_live_range(ctx->f, ctx->bl, prod_idx, &def, &killed); 880 if (uses_total != 1) return 0; 881 if (!killed && opt_block_live_out_has_phys_reg(ctx->f, ctx->hard_live, 882 ctx->bl->id, &def)) 883 return 0; 884 } 885 886 /* O1 (no coalescing) folds immediates into IR_COPY; O2 leaves the copy 887 * register-to-register so coalescing + self-copy removal handles it. */ 888 int copy_imm_ok = ctx->f && !ctx->f->opt_coalesce_parent; 889 int n = subst_consumer_operands(in, &def, &src_op, kind, copy_imm_ok); 890 if (n > 0) { 891 ctx->block_change_p = 1; 892 return 1; 893 } 894 return 0; 895 } 896 897 /* Mark (cls, reg) as already-attempted for this consumer so the same producer 898 * is not looked up twice when a register appears in multiple operand slots 899 * (e.g. `[r4 + r4*4]`, or `add r1, r4, r4`). */ 900 static int seen_mark(u32 seen[OPT_REG_CLASSES], u8 cls, Reg reg) { 901 if (cls >= OPT_REG_CLASSES || reg >= 32) return 1; 902 u32 bit = 1u << reg; 903 if (seen[cls] & bit) return 1; 904 seen[cls] |= bit; 905 return 0; 906 } 907 908 static void try_substitute_aux_operand(CombineCtx* ctx, Inst* in, i32 i, 909 const Operand* op, 910 u32 seen[OPT_REG_CLASSES], int* any) { 911 if (op->kind == OPK_REG) { 912 if (!seen_mark(seen, op->cls, op->v.reg)) 913 *any |= try_substitute_for_reg(ctx, in, i, op->cls, op->v.reg); 914 } else if (op->kind == OPK_INDIRECT) { 915 if (op->v.ind.base != (Reg)REG_NONE && 916 !seen_mark(seen, RC_INT, op->v.ind.base)) 917 *any |= try_substitute_for_reg(ctx, in, i, RC_INT, op->v.ind.base); 918 if (op->v.ind.index != (Reg)REG_NONE && 919 !seen_mark(seen, RC_INT, op->v.ind.index)) 920 *any |= try_substitute_for_reg(ctx, in, i, RC_INT, op->v.ind.index); 921 } 922 } 923 924 static void try_substitute_aux_abivalue(CombineCtx* ctx, Inst* in, i32 i, 925 const CGABIValue* v, 926 u32 seen[OPT_REG_CLASSES], int* any) { 927 try_substitute_aux_operand(ctx, in, i, &v->storage, seen, any); 928 for (u32 k = 0; k < v->nparts; ++k) 929 try_substitute_aux_operand(ctx, in, i, &v->parts[k].op, seen, any); 930 } 931 932 /* Try to substitute producers into operand slots of `in`. Walks the direct 933 * operands of `in` and looks up only the producers of registers actually 934 * referenced (typically 2-3 per inst). Also walks IR_CALL aux register uses 935 * (args + callee) so reg->reg copies feeding call args or the callee get 936 * propagated. See subst_consumer_operands for why IR_RET is excluded. */ 937 static int try_substitute(CombineCtx* ctx, Inst* in, i32 i) { 938 int any = 0; 939 u32 seen[OPT_REG_CLASSES] = {0}; 940 for (u32 oi = 0; oi < in->nopnds; ++oi) { 941 const Operand* op = &in->opnds[oi]; 942 try_substitute_aux_operand(ctx, in, i, op, seen, &any); 943 } 944 if ((IROp)in->op == IR_CALL) { 945 IRCallAux* aux = (IRCallAux*)in->extra.aux; 946 if (aux) { 947 if (aux->use_plan_replay) { 948 try_substitute_aux_operand(ctx, in, i, &aux->plan.callee, seen, &any); 949 for (u32 k = 0; k < aux->plan.nargs; ++k) 950 try_substitute_aux_operand(ctx, in, i, &aux->plan.args[k].src, seen, 951 &any); 952 } else { 953 try_substitute_aux_operand(ctx, in, i, &aux->desc.callee, seen, &any); 954 for (u32 k = 0; k < aux->desc.nargs; ++k) 955 try_substitute_aux_abivalue( 956 ctx, in, i, (const CGABIValue*)&aux->desc.args[k], seen, &any); 957 } 958 } 959 } 960 return any; 961 } 962 963 /* ---- Rewrite 2: address-mode synthesis ---- */ 964 965 /* Try to fold add/shl producers into one OPK_INDIRECT operand of `in`. */ 966 static int try_addr_synth_one_op(CombineCtx* ctx, Inst* in, i32 i, 967 Operand* op) { 968 (void)in; 969 if (op->kind != OPK_INDIRECT) return 0; 970 int any = 0; 971 972 /* (a/c) base producer is IR_BINOP IADD. reg+reg synthesizes a base+index 973 * pair (requires no existing index, sub-rule a); reg+imm folds the immediate 974 * into ofs (sub-rule c, ok with or without an existing index). */ 975 if (op->v.ind.base != (Reg)REG_NONE) { 976 Reg b = op->v.ind.base; 977 i32 prod_idx = ctx_producer_of(ctx, RC_INT, b); 978 if (prod_idx >= 0 && prod_idx < i) { 979 Inst* prod = &ctx->bl->insts[prod_idx]; 980 if ((IROp)prod->op == IR_BINOP && prod->nopnds == 3 && 981 (BinOp)prod->extra.imm == BO_IADD && prod->opnds[0].kind == OPK_REG && 982 prod->opnds[0].cls == RC_INT && prod->opnds[0].v.reg == b) { 983 Operand prod_def = prod->opnds[0]; 984 /* Single-use within producer's live range (the only further use 985 * before next redef/barrier). */ 986 int killed = 0; 987 int uses_after = count_uses_in_live_range(ctx->f, ctx->bl, prod_idx, 988 &prod_def, &killed); 989 if (uses_after == 1 && 990 (killed || !opt_block_live_out_has_phys_reg( 991 ctx->f, ctx->hard_live, ctx->bl->id, &prod_def))) { 992 Operand lhs = prod->opnds[1]; 993 Operand rhs = prod->opnds[2]; 994 int has_no_index = (op->v.ind.index == (Reg)REG_NONE); 995 /* reg + reg: base = lhs, index = rhs, scale=0. Needs an empty 996 * index slot — cannot stack two indices. */ 997 if (has_no_index && lhs.kind == OPK_REG && rhs.kind == OPK_REG && 998 lhs.cls == RC_INT && rhs.cls == RC_INT && 999 !ctx_def_changed_since(ctx, RC_INT, lhs.v.reg, prod_idx) && 1000 !ctx_def_changed_since(ctx, RC_INT, rhs.v.reg, prod_idx)) { 1001 op->v.ind.base = lhs.v.reg; 1002 op->v.ind.index = rhs.v.reg; 1003 op->v.ind.log2_scale = 0; 1004 any = 1; 1005 } 1006 /* reg + imm: base = lhs, fold imm into ofs. Safe with or without 1007 * an existing index — only the offset is mutated. */ 1008 else if (lhs.kind == OPK_REG && rhs.kind == OPK_IMM && 1009 lhs.cls == RC_INT && 1010 !ctx_def_changed_since(ctx, RC_INT, lhs.v.reg, prod_idx)) { 1011 i64 sum = (i64)op->v.ind.ofs + rhs.v.imm; 1012 if (sum >= INT32_MIN && sum <= INT32_MAX) { 1013 op->v.ind.base = lhs.v.reg; 1014 op->v.ind.ofs = (i32)sum; 1015 any = 1; 1016 } 1017 } 1018 /* imm + reg: base = rhs, fold imm into ofs (commutative IADD). */ 1019 else if (lhs.kind == OPK_IMM && rhs.kind == OPK_REG && 1020 rhs.cls == RC_INT && 1021 !ctx_def_changed_since(ctx, RC_INT, rhs.v.reg, prod_idx)) { 1022 i64 sum = (i64)op->v.ind.ofs + lhs.v.imm; 1023 if (sum >= INT32_MIN && sum <= INT32_MAX) { 1024 op->v.ind.base = rhs.v.reg; 1025 op->v.ind.ofs = (i32)sum; 1026 any = 1; 1027 } 1028 } 1029 } 1030 } 1031 } 1032 } 1033 1034 /* (b) index producer is IR_BINOP ISHL reg, imm with scale=0. */ 1035 if (op->v.ind.index != (Reg)REG_NONE && op->v.ind.log2_scale == 0) { 1036 Reg idx = op->v.ind.index; 1037 i32 prod_idx = ctx_producer_of(ctx, RC_INT, idx); 1038 if (prod_idx >= 0 && prod_idx < i) { 1039 Inst* prod = &ctx->bl->insts[prod_idx]; 1040 if ((IROp)prod->op == IR_BINOP && prod->nopnds == 3 && 1041 (BinOp)prod->extra.imm == BO_SHL && prod->opnds[0].kind == OPK_REG && 1042 prod->opnds[0].cls == RC_INT && prod->opnds[0].v.reg == idx && 1043 prod->opnds[1].kind == OPK_REG && prod->opnds[1].cls == RC_INT && 1044 prod->opnds[2].kind == OPK_IMM) { 1045 i64 sh = prod->opnds[2].v.imm; 1046 if (sh >= 1 && sh <= 3) { 1047 Operand prod_def = prod->opnds[0]; 1048 int killed = 0; 1049 int uses_after = count_uses_in_live_range(ctx->f, ctx->bl, prod_idx, 1050 &prod_def, &killed); 1051 /* `<= 2` allows the degenerate [base == index] case where the SHL 1052 * dst appears twice in the same indirect; rewriting only the index 1053 * still yields equivalent arithmetic when base also held the SHL 1054 * dst (base unchanged: r*4; index rewritten: r_src*4; r*4 == r*4). */ 1055 if (uses_after >= 1 && uses_after <= 2 && 1056 (killed || !opt_block_live_out_has_phys_reg( 1057 ctx->f, ctx->hard_live, ctx->bl->id, &prod_def)) && 1058 !ctx_def_changed_since(ctx, RC_INT, prod->opnds[1].v.reg, 1059 prod_idx)) { 1060 op->v.ind.index = prod->opnds[1].v.reg; 1061 op->v.ind.log2_scale = (u8)sh; 1062 any = 1; 1063 } 1064 } 1065 } 1066 } 1067 } 1068 1069 return any; 1070 } 1071 1072 static int try_addr_synth(CombineCtx* ctx, Inst* in, i32 i) { 1073 int any = 0; 1074 for (u32 oi = 0; oi < in->nopnds; ++oi) { 1075 if (in->opnds[oi].kind == OPK_INDIRECT) { 1076 if (try_addr_synth_one_op(ctx, in, i, &in->opnds[oi])) { 1077 any = 1; 1078 ctx->block_change_p = 1; 1079 } 1080 } 1081 } 1082 return any; 1083 } 1084 1085 /* ---- Rewrite 3: sink producer into single-use IR_COPY destination ---- */ 1086 1087 static int try_sink(CombineCtx* ctx, Inst* in, i32 i) { 1088 if ((IROp)in->op != IR_COPY || in->nopnds < 2) return 0; 1089 if (in->opnds[0].kind != OPK_REG || in->opnds[1].kind != OPK_REG) return 0; 1090 if (same_phys_reg(&in->opnds[0], &in->opnds[1])) return 0; 1091 1092 Operand src = in->opnds[1]; 1093 Operand dst = in->opnds[0]; 1094 i32 prod_idx = ctx_producer_of(ctx, src.cls, src.v.reg); 1095 if (prod_idx < 0 || prod_idx >= i) return 0; 1096 Inst* prod = &ctx->bl->insts[prod_idx]; 1097 if (!producer_retargetable_op((IROp)prod->op)) return 0; 1098 if (prod->nopnds < 1 || prod->opnds[0].kind != OPK_REG) return 0; 1099 if (!same_phys_reg(&prod->opnds[0], &src)) return 0; 1100 1101 /* Producer's source operands must not have been redefined since. */ 1102 for (u32 oi = 1; oi < prod->nopnds; ++oi) { 1103 const Operand* p = &prod->opnds[oi]; 1104 if (p->kind == OPK_REG) { 1105 if (ctx_def_changed_since(ctx, p->cls, p->v.reg, prod_idx)) return 0; 1106 } else if (p->kind == OPK_INDIRECT) { 1107 if (ctx_def_changed_since(ctx, RC_INT, p->v.ind.base, prod_idx)) return 0; 1108 if (p->v.ind.index != (Reg)REG_NONE && 1109 ctx_def_changed_since(ctx, RC_INT, p->v.ind.index, prod_idx)) 1110 return 0; 1111 } 1112 } 1113 /* If producer reads memory, no intervening memory write. */ 1114 if (inst_reads_memory(prod) && ctx->last_mem_def > prod_idx) return 0; 1115 /* Retargeting moves the producer's write to `dst` earlier. That is only 1116 * legal if the hard register is dead throughout the interval before the 1117 * copy that originally wrote it. */ 1118 for (i32 j = prod_idx + 1; j < i; ++j) { 1119 Inst* mid = &ctx->bl->insts[j]; 1120 if (inst_uses_phys_reg(mid, &dst) || inst_defines_phys_reg(mid, &dst)) 1121 return 0; 1122 } 1123 1124 /* Producer dst must have exactly one use within its live range (the copy 1125 * at i). If the live range terminates inside the block, the cross-block 1126 * live-out check is moot. */ 1127 int killed = 0; 1128 int uses_total = 1129 count_uses_in_live_range(ctx->f, ctx->bl, prod_idx, &src, &killed); 1130 if (uses_total != 1) return 0; 1131 if (killed && use_after_clobber_before_redef(ctx->bl, prod_idx, &src)) 1132 return 0; 1133 if (!killed && opt_block_live_out_has_phys_reg(ctx->f, ctx->hard_live, 1134 ctx->bl->id, &src)) 1135 return 0; 1136 1137 int swap_binop = 0; 1138 if (!retarget_producer_legal(prod, &dst, &swap_binop)) return 0; 1139 1140 if (swap_binop) { 1141 Operand tmp = prod->opnds[1]; 1142 prod->opnds[1] = prod->opnds[2]; 1143 prod->opnds[2] = tmp; 1144 } 1145 prod->opnds[0] = dst; 1146 if (prod->def != VAL_NONE) prod->def = (Val)dst.v.reg; 1147 1148 /* Update last-def: producer no longer defines src, now defines dst. If src 1149 * had an earlier reaching definition in the block, keep it visible for 1150 * later source-availability checks. */ 1151 ctx_restore_removed_def(ctx, &src, prod_idx); 1152 ctx->last_def[dst.cls][dst.v.reg] = prod_idx; 1153 1154 /* Mark the copy NOP'd so compact removes it. */ 1155 in->op = IR_NOP; 1156 in->def = VAL_NONE; 1157 in->ndefs = 0; 1158 in->defs = NULL; 1159 in->nopnds = 0; 1160 in->opnds = NULL; 1161 ctx->block_change_p = 1; 1162 return 1; 1163 } 1164 1165 /* ---- Rewrite 4: combine_exts (ext-of-ext chains) ---- */ 1166 1167 static int try_combine_exts(CombineCtx* ctx, Inst* in, i32 i) { 1168 if ((IROp)in->op != IR_CONVERT || in->nopnds < 2) return 0; 1169 if (in->opnds[1].kind != OPK_REG) return 0; 1170 u32 sb, db; 1171 int outer_sign; 1172 if (!ext_params(in, &sb, &db, &outer_sign)) return 0; 1173 1174 Reg src_reg = in->opnds[1].v.reg; 1175 u8 src_cls = in->opnds[1].cls; 1176 i32 prod_idx = ctx_producer_of(ctx, src_cls, src_reg); 1177 if (prod_idx < 0 || prod_idx >= i) return 0; 1178 Inst* prod = &ctx->bl->insts[prod_idx]; 1179 u32 isb, idb; 1180 int inner_sign; 1181 if (!ext_params(prod, &isb, &idb, &inner_sign)) return 0; 1182 if (prod->opnds[1].kind != OPK_REG) return 0; 1183 if (ctx_def_changed_since(ctx, prod->opnds[1].cls, prod->opnds[1].v.reg, 1184 prod_idx)) 1185 return 0; 1186 1187 /* Single-use of inner ext dst within its live range (only this insn). */ 1188 Operand prod_def = prod->opnds[0]; 1189 int killed = 0; 1190 int uses_total = 1191 count_uses_in_live_range(ctx->f, ctx->bl, prod_idx, &prod_def, &killed); 1192 if (uses_total != 1) return 0; 1193 if (!killed && opt_block_live_out_has_phys_reg(ctx->f, ctx->hard_live, 1194 ctx->bl->id, &prod_def)) 1195 return 0; 1196 1197 /* Outer width `db`, inner width `idb`. Inner reads `isb` bytes. */ 1198 u32 outer_in = sb; /* outer's source width */ 1199 u32 inner_out = idb; /* inner's output width */ 1200 if (outer_in > inner_out) return 0; /* sanity: inner feeds outer */ 1201 1202 /* If inner is an identity-shaped convert that reads back its own def 1203 * register (e.g. `convert rB, rB`), the rewrite would set in.opnds[1] to 1204 * the same register it already holds. Reporting a change spins the 1205 * per-BB fixpoint forever. Skip. */ 1206 if (prod->opnds[1].kind == OPK_REG && in->opnds[1].kind == OPK_REG && 1207 prod->opnds[1].cls == in->opnds[1].cls && 1208 prod->opnds[1].v.reg == in->opnds[1].v.reg) 1209 return 0; 1210 1211 /* Pattern A: outer width <= inner-source width. Outer can reach past the 1212 * inner ext to the inner source directly, keeping outer's signedness. 1213 * MIR allows any signedness combination here. */ 1214 if (db <= isb) { 1215 in->opnds[1] = prod->opnds[1]; 1216 in->opnds[1].type = prod->opnds[1].type; 1217 ctx->block_change_p = 1; 1218 return 1; 1219 } 1220 1221 /* Pattern B: inner-source width < outer width. Allowed iff outer is signed 1222 * OR inner is unsigned; excludes the unsafe uext(sext) combination. */ 1223 if (isb < db && (outer_sign || !inner_sign)) { 1224 in->opnds[1] = prod->opnds[1]; 1225 in->opnds[1].type = prod->opnds[1].type; 1226 /* outer's ConvKind already encodes the outer signedness; keep it. */ 1227 ctx->block_change_p = 1; 1228 return 1; 1229 } 1230 1231 return 0; 1232 } 1233 1234 /* ---- Existing IR_RET retarget (kept; runs in the forward pass) ---- */ 1235 1236 static int try_ret_retarget(Func* f, Block* bl, i32 i) { 1237 if (!f->opt_rewritten || i <= 0) return 0; 1238 Inst* in = &bl->insts[i]; 1239 if ((IROp)in->op != IR_RET) return 0; 1240 IRRetAux* aux = (IRRetAux*)in->extra.aux; 1241 Operand* ret_op = NULL; 1242 Reg ret_reg = REG_NONE; 1243 if (!aux || !aux->present || !ret_scalar_storage(&aux->val, &ret_op) || 1244 !first_return_reg(f, ret_op->cls, &ret_reg) || ret_reg == (Reg)REG_NONE || 1245 ret_reg == ret_op->v.reg) 1246 return 0; 1247 Inst* producer = &bl->insts[i - 1u]; 1248 Operand ret_dst = *ret_op; 1249 ret_dst.v.reg = ret_reg; 1250 int swap_binop = 0; 1251 if (producer->nopnds < 1 || !same_phys_reg(&producer->opnds[0], ret_op) || 1252 !retarget_producer_legal(producer, &ret_dst, &swap_binop)) 1253 return 0; 1254 if (swap_binop) { 1255 Operand tmp = producer->opnds[1]; 1256 producer->opnds[1] = producer->opnds[2]; 1257 producer->opnds[2] = tmp; 1258 } 1259 producer->opnds[0] = ret_dst; 1260 *ret_op = ret_dst; 1261 return 1; 1262 } 1263 1264 /* ---- Rewrite 5: constant-fold a convert of a load_imm into a load_imm ---- 1265 * 1266 * `load_imm rT,k ; convert rD,rT` where the convert is a bit-preserving 1267 * integer/pointer move collapses to `load_imm rD,k'` (k' = convert applied to 1268 * k). The original load_imm, now dead, is removed by post-combine DCE. 1269 * 1270 * This is the convert-shaped sibling of the load_imm-into-copy fold (see 1271 * try_substitute / combine_subst_slot). It matters for pointer-typed constant 1272 * call args (e.g. NewTreeNode((void*)0, ...)): the arg-register hint reaches 1273 * the convert's def but not its load_imm source, so without this the source 1274 * lands in a scratch and the convert emits a `mov rD, scratch`. Folding lets 1275 * the load_imm inherit the convert's (hinted) dst directly — `movz x0, 0`. */ 1276 static int try_fold_const_convert(CombineCtx* ctx, Inst* in, i32 i) { 1277 if ((IROp)in->op != IR_CONVERT || in->nopnds < 2) return 0; 1278 if (in->opnds[0].kind != OPK_REG || in->opnds[1].kind != OPK_REG) return 0; 1279 /* Integer/pointer domain only: a load_imm produces an int-class value, and 1280 * the bit-preserving convert kinds we fold (BITCAST/ZEXT/SEXT/TRUNC) keep it 1281 * int-class. A class change (RC_INT->RC_FP, e.g. an int->float bitcast) is 1282 * not bit-preserving and must not collapse to a load_imm. */ 1283 if (in->opnds[0].cls != RC_INT || in->opnds[1].cls != RC_INT) return 0; 1284 1285 i32 prod_idx = ctx_producer_of(ctx, in->opnds[1].cls, in->opnds[1].v.reg); 1286 if (prod_idx < 0 || prod_idx >= i) return 0; 1287 Inst* prod = &ctx->bl->insts[prod_idx]; 1288 if ((IROp)prod->op != IR_LOAD_IMM || prod->nopnds < 1 || 1289 prod->opnds[0].kind != OPK_REG || 1290 !same_phys_reg(&prod->opnds[0], &in->opnds[1])) 1291 return 0; 1292 1293 u32 sb = combine_scalar_width_bytes(ctx->f, in->opnds[1].type); 1294 u32 db = combine_scalar_width_bytes(ctx->f, in->opnds[0].type); 1295 i64 value = 0; 1296 if (!const_convert_value((ConvKind)in->extra.imm, prod->extra.imm, sb, db, 1297 &value)) 1298 return 0; 1299 1300 /* Single-use + not-live-out gate on the load_imm dst, mirroring the SK_IMM 1301 * gate in try_substitute_for_reg: only fold when this convert is the sole 1302 * use, so we replace rather than duplicate the materialization. */ 1303 Operand src_def = prod->opnds[0]; 1304 int killed = 0; 1305 if (count_uses_in_live_range(ctx->f, ctx->bl, prod_idx, &src_def, &killed) != 1306 1) 1307 return 0; 1308 if (!killed && opt_block_live_out_has_phys_reg(ctx->f, ctx->hard_live, 1309 ctx->bl->id, &src_def)) 1310 return 0; 1311 1312 /* Rewrite the convert in place into a load_imm of the converted constant. 1313 * The dst operand (opnds[0]) is the hinted arg/return reg and is kept; the 1314 * source operand is dropped. The dead load_imm at prod_idx is left for DCE. 1315 */ 1316 in->op = (u16)IR_LOAD_IMM; 1317 in->type = in->opnds[0].type; 1318 in->nopnds = 1; 1319 in->extra.imm = value; 1320 ctx->block_change_p = 1; 1321 return 1; 1322 } 1323 1324 /* ---- per-BB driver ---- */ 1325 1326 static int opt_combine_fold_block(Func* f, Block* bl, 1327 const OptHardBlockLive* hard_live) { 1328 enum { enable_o1_combine_rewrites = 1 }; 1329 enum { enable_o1_sink_rewrites = 1 }; 1330 CombineCtx ctx; 1331 ctx.f = f; 1332 ctx.bl = bl; 1333 ctx.hard_live = hard_live; 1334 ctx_reset(&ctx); 1335 1336 for (i32 i = 0; i < (i32)bl->ninsts; ++i) { 1337 Inst* in = &bl->insts[i]; 1338 1339 if (enable_o1_combine_rewrites && try_ret_retarget(f, bl, i)) { 1340 ctx.block_change_p = 1; 1341 /* The producer's def changed; update ctx for the producer at i-1. */ 1342 Inst* prev = &bl->insts[i - 1]; 1343 Operand probe; 1344 memset(&probe, 0, sizeof probe); 1345 probe.kind = OPK_REG; 1346 for (u8 c = 0; c < OPT_REG_CLASSES; ++c) { 1347 probe.cls = c; 1348 for (Reg r = 0; r < OPT_MAX_HARD_REGS; ++r) { 1349 probe.v.reg = r; 1350 if (ctx.last_def[c][r] == i - 1 && 1351 !inst_defines_phys_reg(prev, &probe)) 1352 ctx_restore_removed_def(&ctx, &probe, i - 1); 1353 } 1354 } 1355 ctx_record(&ctx, prev, i - 1); 1356 } 1357 1358 /* Skip NOPs left by prior sink rewrites. */ 1359 if ((IROp)in->op == IR_NOP) continue; 1360 1361 if (enable_o1_sink_rewrites && try_sink(&ctx, in, i)) { 1362 /* sink NOP'd the copy and updated ctx. */ 1363 continue; 1364 } 1365 1366 if (enable_o1_combine_rewrites) { 1367 try_fold_const_convert(&ctx, in, i); 1368 try_combine_exts(&ctx, in, i); 1369 try_substitute(&ctx, in, i); 1370 try_addr_synth(&ctx, in, i); 1371 } 1372 1373 ctx_record(&ctx, in, i); 1374 } 1375 return ctx.block_change_p; 1376 } 1377 1378 static int opt_combine_compact_block(Func* f, Block* bl) { 1379 u32 w = 0; 1380 int changed = 0; 1381 for (u32 i = 0; i < bl->ninsts; ++i) { 1382 Inst* in = &bl->insts[i]; 1383 1384 /* Drop NOPs (e.g. copies sunk by try_sink). */ 1385 if ((IROp)in->op == IR_NOP) { 1386 changed = 1; 1387 continue; 1388 } 1389 1390 if ((IROp)in->op == IR_COPY && in->nopnds == 2 && 1391 same_reg_operand(&in->opnds[0], &in->opnds[1])) { 1392 changed = 1; 1393 continue; 1394 } 1395 1396 if (w) { 1397 Inst* prev = &bl->insts[w - 1u]; 1398 if ((IROp)prev->op == IR_STORE && (IROp)in->op == IR_LOAD && 1399 same_spill_slot_and_size(f, prev, in) && 1400 same_reg_operand(&prev->opnds[1], &in->opnds[0])) { 1401 changed = 1; 1402 continue; 1403 } 1404 if ((IROp)prev->op == IR_LOAD && (IROp)in->op == IR_STORE && 1405 same_spill_slot_and_size(f, prev, in) && 1406 same_reg_operand(&prev->opnds[0], &in->opnds[1])) { 1407 changed = 1; 1408 continue; 1409 } 1410 if ((IROp)prev->op == IR_LOAD && (IROp)in->op == IR_LOAD && 1411 same_spill_slot_and_size(f, prev, in) && 1412 same_reg_operand(&prev->opnds[0], &in->opnds[0])) { 1413 changed = 1; 1414 continue; 1415 } 1416 if ((IROp)prev->op == IR_STORE && (IROp)in->op == IR_STORE && 1417 same_spill_access(f, prev, in, NULL)) { 1418 bl->insts[w - 1u] = *in; 1419 changed = 1; 1420 continue; 1421 } 1422 } 1423 1424 bl->insts[w++] = *in; 1425 } 1426 bl->ninsts = w; 1427 return changed; 1428 } 1429 1430 void opt_combine(Func* f) { 1431 if (!f) return; 1432 opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE); 1433 OptHardBlockLive* hard_live = opt_maybe_build_hard_live(f); 1434 /* Per-BB fixpoint, matching MIR's combine loop (mir-gen.c:9037-9038). */ 1435 /* Per-BB fixpoint with a defensive iteration cap. Each rewrite is supposed 1436 * to be monotonic (it only fires when it strictly improves the IR), so we 1437 * shouldn't hit the cap in practice — but a noisy abort beats a hang if a 1438 * future rewrite accidentally oscillates. */ 1439 enum { MAX_COMBINE_ITERS = 64 }; 1440 for (u32 b = 0; b < f->nblocks; ++b) { 1441 Block* bl = &f->blocks[b]; 1442 for (int iter = 0; iter < MAX_COMBINE_ITERS; ++iter) { 1443 int folded = opt_combine_fold_block(f, bl, hard_live); 1444 int compacted = opt_combine_compact_block(f, bl); 1445 if (!folded && !compacted) break; 1446 } 1447 } 1448 }