pass_lower.c (81403B)
1 #include <stdlib.h> 2 #include <string.h> 3 4 #include "core/arena.h" 5 #include "core/core.h" 6 #include "core/metrics.h" 7 #include "core/pool.h" 8 #include "core/slice.h" 9 #include "core/strbuf.h" 10 #include "opt/opt_internal.h" 11 12 enum { 13 OPT_CG_TYPE_SEG_SHIFT = 6, 14 OPT_CG_TYPE_SEG_MASK = (1u << OPT_CG_TYPE_SEG_SHIFT) - 1u, 15 OPT_CG_TYPE_BUILTIN_SEG = 1u, 16 }; 17 18 static u32 type_size_fallback(const Func* f, KitCgTypeId t) { 19 KitCgBuiltinType b; 20 if (!t) return f->opt_target.ptr_size ? f->opt_target.ptr_size : 8u; 21 if ((t >> OPT_CG_TYPE_SEG_SHIFT) != OPT_CG_TYPE_BUILTIN_SEG) { 22 return f->opt_target.ptr_size ? f->opt_target.ptr_size : 8u; 23 } 24 b = (KitCgBuiltinType)(t & OPT_CG_TYPE_SEG_MASK); 25 switch (b) { 26 case KIT_CG_BUILTIN_BOOL: 27 case KIT_CG_BUILTIN_I8: 28 return 1; 29 case KIT_CG_BUILTIN_I16: 30 return 2; 31 case KIT_CG_BUILTIN_I32: 32 case KIT_CG_BUILTIN_F32: 33 return 4; 34 case KIT_CG_BUILTIN_I64: 35 case KIT_CG_BUILTIN_F64: 36 return 8; 37 case KIT_CG_BUILTIN_I128: 38 return 16; 39 case KIT_CG_BUILTIN_VOID: 40 return 0; 41 case KIT_CG_BUILTIN_VARARG_STATE: 42 case KIT_CG_BUILTIN_COUNT: 43 default: 44 return f->opt_target.ptr_size ? f->opt_target.ptr_size : 8u; 45 } 46 } 47 48 static u32 bit_words(u32 npregs) { return (npregs + 63u) / 64u; } 49 50 static void bit_set(u64* bits, PReg v) { 51 u32 w = v / 64u; 52 u64 mask = 1ull << (v % 64u); 53 u64 old = bits[w]; 54 bits[w] = old | mask; 55 } 56 static void bit_clear(u64* bits, PReg v) { 57 u32 w = v / 64u; 58 u64 mask = 1ull << (v % 64u); 59 u64 old = bits[w]; 60 bits[w] = old & ~mask; 61 } 62 static int bit_has(const u64* bits, PReg v) { 63 u32 w = v / 64u; 64 u64 mask = 1ull << (v % 64u); 65 u64 old = bits[w]; 66 return (old & mask) != 0; 67 } 68 69 typedef struct BitsCtx { 70 u64* use; 71 u64* def; 72 } BitsCtx; 73 74 typedef struct InstRefs { 75 PReg* uses; 76 PReg* defs; 77 u32 nuses; 78 u32 ndefs; 79 u32 use_cap; 80 u32 def_cap; 81 } InstRefs; 82 83 static void collect_bits(Func* f, Inst* in, Operand* op, int is_def, 84 void* arg) { 85 (void)in; 86 BitsCtx* c = (BitsCtx*)arg; 87 if (op->kind != OPK_REG) return; 88 PReg v = (PReg)op->v.reg; 89 if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return; 90 if (is_def) 91 bit_set(c->def, v); 92 else 93 bit_set(c->use, v); 94 } 95 96 static void refs_reset(InstRefs* refs) { 97 refs->nuses = 0; 98 refs->ndefs = 0; 99 } 100 101 static void refs_push(Func* f, PReg** pregs, u32* npregs, u32* cap, PReg v) { 102 for (u32 i = 0; i < *npregs; ++i) 103 if ((*pregs)[i] == v) return; 104 if (*npregs == *cap) { 105 u32 ncap = *cap ? *cap * 2u : 8u; 106 PReg* nv = arena_array(f->arena, PReg, ncap); 107 if (*pregs) memcpy(nv, *pregs, sizeof((*pregs)[0]) * *npregs); 108 *pregs = nv; 109 *cap = ncap; 110 } 111 (*pregs)[(*npregs)++] = v; 112 } 113 114 static void refs_collect(Func* f, Inst* in, Operand* op, int is_def, 115 void* arg) { 116 (void)in; 117 InstRefs* refs = (InstRefs*)arg; 118 if (op->kind != OPK_REG) return; 119 PReg v = (PReg)op->v.reg; 120 if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return; 121 if (is_def) 122 refs_push(f, &refs->defs, &refs->ndefs, &refs->def_cap, v); 123 else 124 refs_push(f, &refs->uses, &refs->nuses, &refs->use_cap, v); 125 } 126 127 static int refs_has_def(const InstRefs* refs, PReg v) { 128 for (u32 i = 0; i < refs->ndefs; ++i) 129 if (refs->defs[i] == v) return 1; 130 return 0; 131 } 132 133 static void live_update_refs_before(u64* live, const InstRefs* refs) { 134 for (u32 i = 0; i < refs->ndefs; ++i) bit_clear(live, refs->defs[i]); 135 for (u32 i = 0; i < refs->nuses; ++i) bit_set(live, refs->uses[i]); 136 } 137 138 static u32 live_update_refs_before_active(u64* live, u32 active_words, 139 u32 nwords, const InstRefs* refs) { 140 for (u32 i = 0; i < refs->ndefs; ++i) { 141 PReg v = refs->defs[i]; 142 if (v == PREG_NONE || v == 0) continue; 143 u32 w = v / 64u; 144 if (w < active_words) live[w] &= ~(1ull << (v % 64u)); 145 } 146 while (active_words && live[active_words - 1u] == 0) --active_words; 147 for (u32 i = 0; i < refs->nuses; ++i) { 148 PReg v = refs->uses[i]; 149 if (v == PREG_NONE || v == 0) continue; 150 u32 w = v / 64u; 151 if (w >= nwords) continue; 152 live[w] |= 1ull << (v % 64u); 153 if (active_words <= w) active_words = w + 1u; 154 } 155 return active_words; 156 } 157 158 static void forbid_preg_reg(Func* f, PReg v, u8 cls, Reg r) { 159 if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f) || 160 cls >= OPT_REG_CLASSES || r >= 32) 161 return; 162 if (opt_reg_cls(f, v) != cls) return; 163 f->preg_info[v].forbidden_hard_regs |= 1u << r; 164 } 165 166 static void apply_fixed_asm_operand(Func* f, Operand* op, i32 fixed, 167 u8 fixed_cls) { 168 if (!op || op->kind != OPK_REG || fixed < 0) return; 169 PReg v = (PReg)op->v.reg; 170 if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return; 171 if (fixed_cls >= OPT_REG_CLASSES || opt_reg_cls(f, v) != fixed_cls) { 172 SrcLoc loc = {0, 0, 0}; 173 compiler_panic(f->c, loc, "opt asm: fixed register class mismatch"); 174 } 175 f->preg_info[v].tied_hard_reg = fixed; 176 } 177 178 static void apply_allowed_asm_operand(Func* f, Operand* op, u32 allowed, 179 u8 allowed_cls) { 180 u32 hard_mask = 0; 181 if (!op || op->kind != OPK_REG || !allowed) return; 182 PReg v = (PReg)op->v.reg; 183 if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return; 184 if (allowed_cls >= OPT_REG_CLASSES || opt_reg_cls(f, v) != allowed_cls) { 185 SrcLoc loc = {0, 0, 0}; 186 compiler_panic(f->c, loc, "opt asm: allowed register class mismatch"); 187 } 188 for (u32 i = 0; i < f->opt_hard_reg_count[allowed_cls]; ++i) { 189 Reg r = f->opt_hard_regs[allowed_cls][i]; 190 if (r < 32) hard_mask |= 1u << r; 191 } 192 if (f->preg_info[v].allowed_hard_regs) { 193 u32 both = f->preg_info[v].allowed_hard_regs & allowed; 194 if (!both) { 195 SrcLoc loc = {0, 0, 0}; 196 compiler_panic(f->c, loc, "opt asm: conflicting allowed register sets"); 197 } 198 f->preg_info[v].allowed_hard_regs = both; 199 } else { 200 f->preg_info[v].allowed_hard_regs = allowed; 201 } 202 f->preg_info[v].forbidden_hard_regs |= hard_mask & ~allowed; 203 } 204 205 static void apply_asm_register_constraints(Func* f, Inst* in, u64* use, 206 u64* def, u64* live_after) { 207 IRAsmAux* aux = (IRAsmAux*)in->extra.aux; 208 if (!aux || !f->preg_info) return; 209 210 for (u32 i = 0; i < aux->nout; ++i) { 211 i32 fixed = aux->out_fixed_regs ? aux->out_fixed_regs[i] : -1; 212 u8 cls = aux->out_fixed_cls ? aux->out_fixed_cls[i] : 0; 213 u32 allowed = aux->out_allowed_masks ? aux->out_allowed_masks[i] : 0; 214 u8 allowed_cls = aux->out_allowed_cls ? aux->out_allowed_cls[i] : 0; 215 apply_allowed_asm_operand(f, &aux->out_ops[i], allowed, allowed_cls); 216 apply_fixed_asm_operand(f, &aux->out_ops[i], fixed, cls); 217 } 218 for (u32 i = 0; i < aux->nin; ++i) { 219 i32 fixed = aux->in_fixed_regs ? aux->in_fixed_regs[i] : -1; 220 u8 cls = aux->in_fixed_cls ? aux->in_fixed_cls[i] : 0; 221 u32 allowed = aux->in_allowed_masks ? aux->in_allowed_masks[i] : 0; 222 u8 allowed_cls = aux->in_allowed_cls ? aux->in_allowed_cls[i] : 0; 223 apply_allowed_asm_operand(f, &aux->in_ops[i], allowed, allowed_cls); 224 apply_fixed_asm_operand(f, &aux->in_ops[i], fixed, cls); 225 } 226 227 for (u32 cls = 0; cls < OPT_REG_CLASSES; ++cls) { 228 u32 mask = aux->clobber_mask[cls]; 229 if (!mask) continue; 230 for (Reg r = 0; r < 32; ++r) { 231 if ((mask & (1u << r)) == 0) continue; 232 for (PReg v = 1; v < opt_reg_count(f); ++v) { 233 if (!bit_has(use, v) && !bit_has(def, v) && 234 !(live_after && bit_has(live_after, v))) 235 continue; 236 forbid_preg_reg(f, v, (u8)cls, r); 237 } 238 } 239 } 240 } 241 242 /* Apply the per-instruction fixed-register clobbers recorded in machinization 243 * (Func.inst_clobbers). A register the instruction's encoding destroys must not 244 * hold any value live AFTER the instruction unless that value is (re)defined 245 * here — so forbid each clobbered register for every live-after, non-def value. 246 * Values that merely die at the instruction (dying uses) need no constraint: 247 * the backend stages them into/out of the fixed registers itself. */ 248 static void apply_machine_reg_clobbers(Func* f, Inst* in, u64* def, 249 u64* live_after) { 250 if (!f->preg_info || !f->inst_clobbers || in->id == INST_ID_NONE || 251 in->id >= f->inst_clobbers_cap) 252 return; 253 for (u32 cls = 0; cls < OPT_REG_CLASSES; ++cls) { 254 u32 mask = f->inst_clobbers[in->id][cls]; 255 if (!mask) continue; 256 for (Reg r = 0; r < 32; ++r) { 257 if ((mask & (1u << r)) == 0) continue; 258 for (PReg v = 1; v < opt_reg_count(f); ++v) { 259 if (!(live_after && bit_has(live_after, v)) || bit_has(def, v)) 260 continue; 261 if ((u8)opt_reg_cls(f, v) != (u8)cls) continue; 262 f->preg_info[v].forbidden_hard_regs |= 1u << r; 263 /* Record this as a hard clobber so the return-register hint won't later 264 * clear the forbid and place the value in a register the instruction 265 * destroys (see set_preg_pref_to_ret_reg). */ 266 f->preg_info[v].clobbered_hard_regs |= 1u << r; 267 } 268 } 269 } 270 } 271 272 static int phys_arg_reg_for_index(Func* f, u8 cls, u32 abi_index, Reg* out) { 273 if (!f || cls >= OPT_REG_CLASSES) return 0; 274 for (u32 i = 0; i < f->opt_phys_reg_count[cls]; ++i) { 275 const CGPhysRegInfo* pi = &f->opt_phys_regs[cls][i]; 276 if ((pi->flags & CG_REG_ARG) && pi->abi_index == abi_index) { 277 if (out) *out = pi->reg; 278 return 1; 279 } 280 } 281 return 0; 282 } 283 284 static int hard_available(Func* f, u8 cls, Reg r); 285 286 static int is_caller_saved(Func* f, u8 cls, Reg r) { 287 if (cls >= OPT_REG_CLASSES || r >= 32) return 0; 288 return (f->opt_caller_saved[cls] & (1u << r)) != 0; 289 } 290 291 static Reg first_ret_reg(Func* f, u8 cls) { 292 if (!f || cls >= OPT_REG_CLASSES) return REG_NONE; 293 u32 mask = f->opt_ret_regs[cls]; 294 for (Reg r = 0; r < 32; ++r) 295 if (mask & (1u << r)) return r; 296 return REG_NONE; 297 } 298 299 static void set_preg_pref_to_ret_reg(Func* f, const Operand* op) { 300 if (!op || op->kind != OPK_REG) return; 301 PReg v = (PReg)op->v.reg; 302 if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return; 303 u8 cls = f->preg_info[v].cls; 304 if (cls >= OPT_REG_CLASSES) return; 305 Reg hint = first_ret_reg(f, cls); 306 if (hint == REG_NONE || hint >= 32) return; 307 /* Don't override a real pin. */ 308 if (f->preg_info[v].tied_hard_reg >= 0) return; 309 /* A value live across an instruction that clobbers the ret reg cannot live 310 * there; skip the hint so the allocator places it elsewhere and the return 311 * copy moves it into the ret reg (e.g. an accumulator returned past a va_arg, 312 * which uses rax). */ 313 if (f->preg_info[v].clobbered_hard_regs & (1u << hint)) return; 314 /* The hint reg may not be in opt_hard_regs (e.g. x0 on aa64 is reserved 315 * as the ABI ret reg, outside aa_int_allocable); the allocator's 316 * preferred-reg branch will still consider it via the unit-overlap 317 * precision check. Clear any leftover soft forbid bit so the hint isn't 318 * silently blocked. */ 319 f->preg_info[v].forbidden_hard_regs &= ~(1u << hint); 320 f->preg_info[v].preferred_hard_reg = (i8)hint; 321 } 322 323 static void set_preg_pref_for_abivalue(Func* f, const CGABIValue* v) { 324 if (!v) return; 325 set_preg_pref_to_ret_reg(f, &v->storage); 326 for (u32 i = 0; i < v->nparts; ++i) 327 set_preg_pref_to_ret_reg(f, &v->parts[i].op); 328 } 329 330 /* Soft hint: prefer a specific ABI register for `op`'s PReg. Symmetric to 331 * set_preg_pref_to_ret_reg but takes an arbitrary hint reg (the matching 332 * arg reg for the i-th call argument). */ 333 static void set_preg_pref_to_arg_reg(Func* f, const Operand* op, Reg hint) { 334 if (!op || op->kind != OPK_REG) return; 335 if (hint == REG_NONE || hint >= 32) return; 336 PReg v = (PReg)op->v.reg; 337 if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return; 338 u8 cls = f->preg_info[v].cls; 339 if (cls >= OPT_REG_CLASSES) return; 340 if (f->preg_info[v].tied_hard_reg >= 0) return; 341 if (f->preg_info[v].preferred_hard_reg >= 0) return; 342 f->preg_info[v].forbidden_hard_regs &= ~(1u << hint); 343 f->preg_info[v].preferred_hard_reg = (i8)hint; 344 } 345 346 /* Hint each single-PReg-stored param toward its own incoming ABI reg. When 347 * the allocator picks the incoming reg, bind_param sees src==dst and emits 348 * no entry move (aa_bind_native_param checks at native.c:3227). Live-range 349 * conflicts at body use sites still go through the normal allocator check, 350 * so cross-call params that need a callee-save get one. */ 351 /* True iff `f` contains any IR_CALL flagged as a tail call. Tail-call arg 352 * routing goes through the backend shuffle which can permute the caller's 353 * incoming arg regs into different positions for the callee — pinning each 354 * param PReg to its own incoming reg turns those permutations into multi-reg 355 * cycles the shuffle can't break. Symmetric to the per-call tail skip in 356 * set_preg_pref_for_call_args. */ 357 static int func_has_tail_call(const Func* f) { 358 if (!f) return 0; 359 for (u32 b = 0; b < f->nblocks; ++b) { 360 const Block* bl = &f->blocks[b]; 361 for (u32 i = 0; i < bl->ninsts; ++i) { 362 const Inst* in = &bl->insts[i]; 363 if ((IROp)in->op != IR_CALL) continue; 364 const IRCallAux* aux = (const IRCallAux*)in->extra.aux; 365 if (aux && (aux->desc.flags & CG_CALL_TAIL)) return 1; 366 } 367 } 368 return 0; 369 } 370 371 static void set_preg_pref_for_params(Func* f) { 372 if (!f || !f->preg_info || !f->nparams) return; 373 if (func_has_tail_call(f)) return; 374 /* Per-class ABI arg cursors. Drives from per-param ABI info rather than 375 * f->desc.abi so this fires on paths where only f->params[i].abi is set. */ 376 u32 next_int = 0; 377 u32 next_fp = 0; 378 /* An sret pointer passed in the first integer argument register consumes 379 * that slot (SysV-x64 rdi, Win64 rcx, RISC-V a0); ABIs that return it in a 380 * dedicated register (AArch64 x8) do not. Driven by the ABI descriptor so no 381 * arch identity is needed here. */ 382 if (f->desc.abi && f->desc.abi->sret_consumes_int_arg) next_int = 1; 383 for (u32 i = 0; i < f->nparams; ++i) { 384 IRParam* p = &f->params[i]; 385 const ABIArgInfo* ai = p->abi; 386 if (!ai || ai->kind == ABI_ARG_IGNORE) continue; 387 if (ai->kind == ABI_ARG_INDIRECT) { 388 ++next_int; 389 continue; 390 } 391 if (ai->kind != ABI_ARG_DIRECT) continue; 392 /* Only hint single-part DIRECT params whose home is a single PReg. 393 * Aggregate / split params take the bind_param frame-store path. */ 394 int single_part_to_preg = 395 (ai->nparts == 1) && (p->storage.kind == CG_LOCAL_STORAGE_REG); 396 if (single_part_to_preg) { 397 const ABIArgPart* part = &ai->parts[0]; 398 u8 cls = (part->cls == ABI_CLASS_FP) ? RC_FP : RC_INT; 399 u32* counter = (cls == RC_FP) ? &next_fp : &next_int; 400 Reg hint = REG_NONE; 401 if (*counter < 8u && phys_arg_reg_for_index(f, cls, *counter, &hint)) { 402 PReg v = (PReg)p->storage.v.reg; 403 if (v != PREG_NONE && v != 0 && v < opt_reg_count(f) && 404 f->preg_info[v].cls == cls && f->preg_info[v].tied_hard_reg < 0 && 405 f->preg_info[v].preferred_hard_reg < 0 && hint != REG_NONE && 406 hint < 32) { 407 f->preg_info[v].forbidden_hard_regs &= ~(1u << hint); 408 f->preg_info[v].preferred_hard_reg = (i8)hint; 409 } 410 } 411 } 412 /* Advance the ABI cursors for every part of this param's home, regardless 413 * of whether we hinted, so subsequent params see the right slot. */ 414 for (u16 j = 0; j < ai->nparts; ++j) { 415 u32* c = (ai->parts[j].cls == ABI_CLASS_FP) ? &next_fp : &next_int; 416 *c += 1u; 417 } 418 } 419 } 420 421 /* For each IR_CALL arg whose source storage is a single OPK_REG, hint that 422 * PReg to the matching ABI arg register. Sequential int/fp counters mirror 423 * the per-class arg slot assignment used by set_preg_pref_for_params. Skips 424 * variadic, has_sret, and indirect/aggregate args: they need per-target 425 * counter logic that hasn't been factored out of plan_call. */ 426 static void set_preg_pref_for_call_args(Func* f, const CGCallDesc* desc) { 427 if (!f || !desc) return; 428 /* Tail calls handle arg routing through the tail-call shuffle in the 429 * backend, which can resolve permutations (e.g. swap of caller's incoming 430 * x0/x1 into tail-call x1/x0). Hinting the arg source PRegs across that 431 * shuffle creates cycles bind_param / the entry-bind moves can't unbreak, 432 * miscompiling permute / cycle cases (toy 24, 27, 28, ...). Skip. */ 433 if (desc->flags & CG_CALL_TAIL) return; 434 const ABIFuncInfo* abi = desc->abi; 435 if (!abi && f->c && f->c->abi) 436 abi = abi_cg_func_info(f->c->abi, desc->fn_type); 437 if (!abi || abi->variadic || abi->has_sret) return; 438 u32 next_int = 0, next_fp = 0; 439 for (u32 i = 0; i < desc->nargs; ++i) { 440 if (i >= abi->nparams) break; 441 const CGABIValue* a = &desc->args[i]; 442 const ABIArgInfo* ai = &abi->params[i]; 443 if (ai->kind == ABI_ARG_IGNORE) continue; 444 if (ai->kind == ABI_ARG_INDIRECT) { 445 ++next_int; 446 continue; 447 } 448 if (ai->kind != ABI_ARG_DIRECT || ai->nparts == 0) continue; 449 const ABIArgPart* part0 = &ai->parts[0]; 450 u8 cls = part0->cls == ABI_CLASS_FP ? RC_FP : RC_INT; 451 u32* counter = cls == RC_FP ? &next_fp : &next_int; 452 if (*counter < 8u && ai->nparts == 1) { 453 Reg hint = REG_NONE; 454 if (phys_arg_reg_for_index(f, cls, *counter, &hint)) 455 set_preg_pref_to_arg_reg(f, &a->storage, hint); 456 } 457 for (u16 p = 0; p < ai->nparts; ++p) { 458 u32* c = (ai->parts[p].cls == ABI_CLASS_FP) ? &next_fp : &next_int; 459 *c += 1u; 460 } 461 } 462 } 463 464 /* Propagate preferred_hard_reg backward across IR_COPY chains. When the 465 * frontend emits `copy def=v_arg opnds=[v_arg, v_value]` to carry a value 466 * into a call's arg slot, set_preg_pref_for_call_args hints v_arg to the 467 * ABI dest (e.g. x0). But the underlying producer (v_value, e.g. the load 468 * result) is unhinted and lands in a generic caller-save (e.g. x8); the 469 * copy then emits `mov x0, x8`. Propagating the hint from v_arg to v_value 470 * lets both land at x0 and turns the copy into an identity move that 471 * combine elides. Walks insts; for each IR_COPY whose def has a hint and 472 * whose single OPK_REG source operand has none, propagate (and clear the 473 * source's forbid for the hint reg). Safe because the copy itself dies 474 * once both sides share the reg. */ 475 static void propagate_hint_through_copies(Func* f) { 476 if (!f || !f->preg_info) return; 477 for (u32 b = 0; b < f->nblocks; ++b) { 478 Block* bl = &f->blocks[b]; 479 for (u32 i = 0; i < bl->ninsts; ++i) { 480 Inst* in = &bl->insts[i]; 481 if ((IROp)in->op != IR_COPY) continue; 482 if (in->nopnds < 2 || in->opnds[0].kind != OPK_REG || 483 in->opnds[1].kind != OPK_REG) 484 continue; 485 PReg dst = (PReg)in->opnds[0].v.reg; 486 PReg src = (PReg)in->opnds[1].v.reg; 487 if (dst == PREG_NONE || dst == 0 || dst >= opt_reg_count(f)) continue; 488 if (src == PREG_NONE || src == 0 || src >= opt_reg_count(f)) continue; 489 i8 dst_pref = f->preg_info[dst].preferred_hard_reg; 490 if (dst_pref < 0) continue; 491 if (f->preg_info[src].tied_hard_reg >= 0) continue; 492 if (f->preg_info[src].preferred_hard_reg >= 0) continue; 493 if (f->preg_info[dst].cls != f->preg_info[src].cls) continue; 494 /* Don't propagate the hint into a value clobbered out of that register by 495 * a machine instruction live across it (e.g. a loop accumulator returned 496 * past a va_arg/idiv): it cannot live there. Leave it allocated 497 * elsewhere; the copy moves it into the hinted register. */ 498 if (f->preg_info[src].clobbered_hard_regs & (1u << (Reg)dst_pref)) 499 continue; 500 f->preg_info[src].forbidden_hard_regs &= ~(1u << (Reg)dst_pref); 501 f->preg_info[src].preferred_hard_reg = dst_pref; 502 } 503 } 504 } 505 506 /* Set a soft "prefer the ABI return reg" hint on: 507 * - IR_CALL result PRegs (so emit_call's `mov result, x0` is elided) 508 * - IR_RET value PRegs (so emit_ret's `mov x0, value` is elided) 509 * - IR_CALL arg source PRegs (so emit_call's `mov x0, src` is elided); 510 * hints are propagated backward through IR_COPY so the actual producer 511 * of the value also prefers the ABI dest reg. 512 * 513 * The hint is a tie-breaker only — see hard_reg_alloc_score. The allocator's 514 * existing conflict checks already exclude regs with real interference (e.g. 515 * a result PReg live across another call cannot pick x0). */ 516 static void apply_abi_aliasing_hints(Func* f) { 517 if (!f || !f->preg_info) return; 518 set_preg_pref_for_params(f); 519 for (u32 b = 0; b < f->nblocks; ++b) { 520 Block* bl = &f->blocks[b]; 521 for (u32 i = 0; i < bl->ninsts; ++i) { 522 Inst* in = &bl->insts[i]; 523 if ((IROp)in->op == IR_CALL) { 524 IRCallAux* aux = (IRCallAux*)in->extra.aux; 525 if (aux) { 526 set_preg_pref_for_abivalue(f, &aux->desc.ret); 527 set_preg_pref_for_call_args(f, &aux->desc); 528 } 529 } else if ((IROp)in->op == IR_RET) { 530 IRRetAux* aux = (IRRetAux*)in->extra.aux; 531 if (aux && aux->present) set_preg_pref_for_abivalue(f, &aux->val); 532 } 533 } 534 } 535 propagate_hint_through_copies(f); 536 } 537 538 /* --------------------------------------------------------------------------- 539 * Register allocator, MIR-shaped. 540 * 541 * Data structures and assignment algorithm mirror MIR's reg_alloc/assign 542 * (mir-gen.c:7551-7728, simplified_p branch). Conflict detection uses a 543 * point-indexed bitmap of locations (hard regs + stack slots) instead of 544 * a sorted interval vector per hard register. 545 * 546 * used_locs[p * loc_words .. p * loc_words + loc_words) -- one row per 547 * compressed 548 * program point 549 * 550 * Bit indices: 551 * 0 .. hard_loc_bits-1 -> hard registers (hard_loc_bit(cls, r)) 552 * hard_loc_bits + k -> stack slot index k (k < stack_slot_count) 553 * 554 * Live-range splitting (`get_hard_reg_with_split`, `lr_gap_t`, `split()`) 555 * is deferred per doc/plan/OPTIMIZER.md. 556 * ------------------------------------------------------------------------- */ 557 558 typedef struct OptAllocator OptAllocator; 559 560 static int hard_available(Func* f, u8 cls, Reg r) { 561 if (cls >= OPT_REG_CLASSES) return 0; 562 for (u32 i = 0; i < f->opt_hard_reg_count[cls]; ++i) 563 if (f->opt_hard_regs[cls][i] == r) return 1; 564 return 0; 565 } 566 567 static const CGPhysRegInfo* phys_info_for(Func* f, u8 cls, Reg r) { 568 if (cls >= OPT_REG_CLASSES) return NULL; 569 for (u32 i = 0; i < f->opt_phys_reg_count[cls]; ++i) 570 if (f->opt_phys_regs[cls][i].reg == r) return &f->opt_phys_regs[cls][i]; 571 return NULL; 572 } 573 574 static FrameSlot spill_slot_for(Func* f, PReg v) { 575 FrameSlot existing = opt_preg_spill_slot(f, v); 576 if (existing != FRAME_SLOT_NONE) return existing; 577 FrameSlotDesc d; 578 memset(&d, 0, sizeof d); 579 d.type = opt_reg_type(f, v); 580 d.size = type_size_fallback(f, opt_reg_type(f, v)); 581 d.align = d.size >= 8 ? 8 : d.size; 582 d.kind = FS_SPILL; 583 f->preg_info[v].spill_slot = ir_frame_slot_new(f, &d); 584 return f->preg_info[v].spill_slot; 585 } 586 587 static u32 hard_loc_bit(u8 cls, Reg r) { return ((u32)cls * 32u) + (u32)r; } 588 589 typedef struct OptAllocCandidate { 590 PReg v; /* coalesce-root PReg with live ranges */ 591 u32 spill_cost; 592 u32 live_length; 593 u8 tied; 594 u8 pad[3]; 595 } OptAllocCandidate; 596 597 typedef struct OptAllocGroupInfo { 598 PReg root; 599 u32 spill_cost; 600 u32 live_length; 601 u32 first; 602 u32 last; 603 i32 tied_hard_reg; 604 u32 forbidden_hard_regs; 605 u32 allowed_hard_regs; 606 u8 cls; 607 u8 pad[3]; 608 } OptAllocGroupInfo; 609 610 typedef struct OptAllocator { 611 OptLoc* locs; /* per-PReg result (cls, hard_reg, spill_slot) */ 612 613 /* Per-point bitmap of locations. used_locs[p * loc_words + w] is word w 614 * of the bitmap for compressed program point p. Bit indices: 615 * 0 .. hard_loc_bits - 1 -> hard regs (hard_loc_bit) 616 * hard_loc_bits + stack_idx -> stack slot indices */ 617 u64* used_locs; 618 u32 point_count; 619 u32 loc_words; /* width of one row, in u64 words */ 620 u32 hard_loc_bits; /* OPT_REG_CLASSES * 32 */ 621 622 /* Stack slot table (parallel arrays). */ 623 FrameSlot* stack_slots; 624 u32 stack_slot_count; 625 u32 stack_slot_cap; 626 627 /* hard_open[hard_loc_bit] is 1 if at least one PReg has been assigned to 628 * this hard reg in the current function. Drives `hard_reg_alloc_score`'s 629 * callee-save bias. */ 630 u8* hard_open; 631 632 /* Scratch bitmap of loc_words. Reused per candidate. */ 633 u64* conflict_locs; 634 635 /* Metrics. */ 636 u64 hard_point_visits; /* points scanned during hard-reg conflict probe */ 637 u64 stack_point_visits; /* points scanned during stack-slot probe */ 638 u64 hard_word_ors; /* word-OR operations into conflict_locs */ 639 u64 stack_word_ors; 640 u64 hard_mark_points; /* points marked when assigning a hard reg */ 641 u64 stack_mark_points; 642 } OptAllocator; 643 644 /* Bitmap helpers over loc_words-wide rows of used_locs. */ 645 static u64* used_locs_row(OptAllocator* a, u32 p) { 646 return &a->used_locs[(u64)p * a->loc_words]; 647 } 648 649 static int loc_bit_in_conflict(const u64* conflict_locs, u32 bit) { 650 return (conflict_locs[bit / 64u] & (1ull << (bit % 64u))) != 0; 651 } 652 653 static u32 alloc_loc_words_for_bits(u32 bits) { return (bits + 63u) / 64u; } 654 655 static void alloc_grow_loc_words(Func* f, OptAllocator* a, u32 need_words) { 656 if (need_words <= a->loc_words) return; 657 u32 new_words = a->loc_words ? a->loc_words : 1u; 658 while (new_words < need_words) new_words *= 2u; 659 u64* nb = arena_zarray(f->arena, u64, (u64)a->point_count * new_words); 660 if (a->used_locs && a->loc_words) { 661 for (u32 p = 0; p < a->point_count; ++p) 662 memcpy(&nb[(u64)p * new_words], &a->used_locs[(u64)p * a->loc_words], 663 sizeof(u64) * a->loc_words); 664 } 665 a->used_locs = nb; 666 a->loc_words = new_words; 667 u64* nc = arena_zarray(f->arena, u64, new_words); 668 a->conflict_locs = nc; 669 } 670 671 static u32 alloc_alloc_stack_slot(Func* f, OptAllocator* a, FrameSlot fs) { 672 if (a->stack_slot_count == a->stack_slot_cap) { 673 u32 ncap = a->stack_slot_cap ? a->stack_slot_cap * 2u : 16u; 674 FrameSlot* ns = arena_array(f->arena, FrameSlot, ncap); 675 if (a->stack_slots) 676 memcpy(ns, a->stack_slots, 677 sizeof(a->stack_slots[0]) * a->stack_slot_count); 678 a->stack_slots = ns; 679 a->stack_slot_cap = ncap; 680 } 681 u32 idx = a->stack_slot_count++; 682 a->stack_slots[idx] = fs; 683 u32 needed_bits = a->hard_loc_bits + a->stack_slot_count; 684 u32 needed_words = alloc_loc_words_for_bits(needed_bits); 685 alloc_grow_loc_words(f, a, needed_words); 686 return idx; 687 } 688 689 static u32 hard_reg_alloc_score(Func* f, const OptAllocator* a, 690 const OptPRegInfo* vi, Reg hr) { 691 const CGPhysRegInfo* pi = phys_info_for(f, vi->cls, hr); 692 u32 score = pi ? pi->spill_cost : 0; 693 if (vi->live_across_call_freq) { 694 if (is_caller_saved(f, vi->cls, hr)) 695 score += 1000u + vi->live_across_call_freq; 696 else 697 score += 20u; 698 } else if (!is_caller_saved(f, vi->cls, hr)) { 699 u32 bit = hard_loc_bit(vi->cls, hr); 700 int already_open = 701 a->hard_open && bit < a->hard_loc_bits && a->hard_open[bit]; 702 if (!already_open) score += pi ? pi->copy_cost : 50u; 703 } 704 /* Soft hint: tie-break toward a preferred hard reg by adding +1 to every 705 * non-hinted choice. Used by apply_abi_aliasing_hints to put IR_CALL 706 * results / IR_RET values into the ABI return register so emit_call / 707 * emit_ret can elide the materialization move. Stays well below the 708 * live-across-call (+1000) penalty and the callee-save (+20) gap so it 709 * never overrides real costs. */ 710 if (vi->preferred_hard_reg >= 0 && (Reg)vi->preferred_hard_reg != hr) 711 score += 1u; 712 return score; 713 } 714 715 static int alloc_candidate_higher(const OptAllocCandidate* a, 716 const OptAllocCandidate* b) { 717 if (a->tied != b->tied) return a->tied > b->tied; 718 if (a->spill_cost != b->spill_cost) return a->spill_cost > b->spill_cost; 719 if (a->live_length != b->live_length) return a->live_length < b->live_length; 720 return a->v < b->v; 721 } 722 723 static int alloc_candidate_cmp(const void* va, const void* vb) { 724 const OptAllocCandidate* a = (const OptAllocCandidate*)va; 725 const OptAllocCandidate* b = (const OptAllocCandidate*)vb; 726 if (alloc_candidate_higher(a, b)) return -1; 727 if (alloc_candidate_higher(b, a)) return 1; 728 return 0; 729 } 730 731 static void alloc_sort_candidates(OptAllocCandidate* cands, u32 n) { 732 if (n > 1) qsort(cands, n, sizeof(cands[0]), alloc_candidate_cmp); 733 } 734 735 static PReg alloc_coalesce_root(Func* f, PReg v) { 736 if (!f->opt_coalesce_parent || v == PREG_NONE || v >= opt_reg_count(f)) 737 return v; 738 PReg p = (PReg)f->opt_coalesce_parent[v]; 739 while (p != f->opt_coalesce_parent[p]) p = (PReg)f->opt_coalesce_parent[p]; 740 while (v != p) { 741 PReg n = (PReg)f->opt_coalesce_parent[v]; 742 f->opt_coalesce_parent[v] = p; 743 v = n; 744 } 745 return p; 746 } 747 748 static int alloc_group_member(Func* f, PReg root, PReg v) { 749 return alloc_coalesce_root(f, v) == root; 750 } 751 752 static void alloc_group_info(Func* f, const OptLiveRangeSet* ranges, PReg root, 753 OptAllocGroupInfo* out) { 754 memset(out, 0, sizeof *out); 755 out->root = root; 756 out->first = (u32)~0u; 757 out->tied_hard_reg = -1; 758 out->cls = f->preg_info[root].cls; 759 for (PReg v = 1; v < opt_reg_count(f); ++v) { 760 if (ranges->first_range_by_preg[v] == OPT_RANGE_NONE) continue; 761 if (!alloc_group_member(f, root, v)) continue; 762 OptPRegInfo* vi = &f->preg_info[v]; 763 out->spill_cost += vi->frequency ? vi->frequency : vi->spill_cost; 764 out->live_length += vi->live_length; 765 u32 first = vi->first_pos ? vi->first_pos - 1u : 0; 766 if (first < out->first) out->first = first; 767 if (vi->last_pos > out->last) out->last = vi->last_pos; 768 out->forbidden_hard_regs |= vi->forbidden_hard_regs; 769 if (vi->allowed_hard_regs) { 770 if (out->allowed_hard_regs) { 771 u32 both = out->allowed_hard_regs & vi->allowed_hard_regs; 772 if (!both) { 773 SrcLoc loc = {0, 0, 0}; 774 compiler_panic(f->c, loc, 775 "opt asm: conflicting allowed register sets"); 776 } 777 out->allowed_hard_regs = both; 778 } else { 779 out->allowed_hard_regs = vi->allowed_hard_regs; 780 } 781 } 782 if (vi->tied_hard_reg >= 0) out->tied_hard_reg = vi->tied_hard_reg; 783 } 784 if (out->first == (u32)~0u) out->first = 0; 785 } 786 787 static void opt_init_preg_info_from_ranges(Func* f, 788 const OptLiveRangeSet* ranges) { 789 OptPRegInfo* old = f->preg_info; 790 OptPRegInfo* info = arena_zarray(f->arena, OptPRegInfo, 791 opt_reg_count(f) ? opt_reg_count(f) : 1u); 792 for (PReg v = 0; v < opt_reg_count(f); ++v) { 793 i32 tied = old ? old[v].tied_hard_reg : -1; 794 u32 forbidden = old ? old[v].forbidden_hard_regs : 0; 795 u32 allowed = old ? old[v].allowed_hard_regs : 0; 796 u32 clobbered = old ? old[v].clobbered_hard_regs : 0; 797 u32 old_frequency = old ? old[v].frequency : 0; 798 i8 pref = old ? old[v].preferred_hard_reg : (i8)-1; 799 OptPRegInfo* vi = &info[v]; 800 vi->tied_hard_reg = tied; 801 vi->preferred_hard_reg = pref; 802 vi->hard_reg = REG_NONE; 803 vi->spill_slot = FRAME_SLOT_NONE; 804 vi->alloc_kind = OPT_ALLOC_NONE; 805 vi->cls = opt_reg_cls(f, v); 806 vi->forbidden_hard_regs = forbidden; 807 vi->allowed_hard_regs = allowed; 808 vi->clobbered_hard_regs = clobbered; 809 if (!ranges || v == PREG_NONE || v == 0 || 810 ranges->first_range_by_preg[v] == OPT_RANGE_NONE) { 811 continue; 812 } 813 u32 first = (u32)~0u; 814 u32 last = 0; 815 for (u32 r = ranges->first_range_by_preg[v]; r != OPT_RANGE_NONE; 816 r = ranges->ranges[r].next) { 817 const OptLiveRange* lr = &ranges->ranges[r]; 818 if (lr->start < first) first = lr->start; 819 if (lr->end > last) last = lr->end; 820 } 821 vi->first_pos = first == (u32)~0u ? 0 : first + 1u; 822 vi->last_pos = last; 823 vi->live_length = ranges->live_length_by_preg[v]; 824 vi->use_freq = ranges->use_freq_by_preg[v]; 825 vi->def_freq = ranges->def_freq_by_preg[v]; 826 vi->live_block_freq = ranges->live_block_freq_by_preg[v]; 827 vi->live_across_call_freq = ranges->live_across_call_freq_by_preg[v]; 828 vi->spill_cost = ranges->spill_cost_by_preg[v]; 829 vi->frequency = vi->spill_cost; 830 if (old_frequency > vi->frequency) vi->frequency = old_frequency; 831 } 832 f->preg_info = info; 833 } 834 835 static void bits_clear(u64* bits, u32 words) { 836 for (u32 i = 0; i < words; ++i) bits[i] = 0; 837 } 838 839 static void live_update_before(u64* live, const u64* use, const u64* def, 840 u32 words) { 841 for (u32 w = 0; w < words; ++w) live[w] = (live[w] & ~def[w]) | use[w]; 842 } 843 844 static void live_copy_block_out(Func* f, const OptLiveInfo* live_info, u32 b, 845 u64* live, u32 words) { 846 (void)f; 847 bits_clear(live, words); 848 if (live_info) { 849 const OptBitset* out = &live_info->blocks[b].live_out; 850 for (u32 w = 0; w < words && w < out->nwords; ++w) live[w] = out->words[w]; 851 } 852 } 853 854 static u32 live_copy_block_out_active(const OptLiveInfo* live_info, u32 b, 855 u64* live, u32 words, 856 u32 old_active_words) { 857 for (u32 w = 0; w < old_active_words; ++w) live[w] = 0; 858 if (!live_info) return 0; 859 const OptBitset* out = &live_info->blocks[b].live_out; 860 u32 active = words < out->active_words ? words : out->active_words; 861 for (u32 w = 0; w < active; ++w) live[w] = out->words[w]; 862 return active; 863 } 864 865 static void opt_apply_asm_constraints_from_live(Func* f, 866 const OptLiveInfo* live_info) { 867 int has_asm = 0; 868 for (u32 b = 0; b < f->nblocks && !has_asm; ++b) { 869 Block* bl = &f->blocks[b]; 870 for (u32 i = 0; i < bl->ninsts; ++i) { 871 if ((IROp)bl->insts[i].op == IR_ASM_BLOCK) { 872 has_asm = 1; 873 break; 874 } 875 } 876 } 877 /* The live walk drives both inline-asm operand constraints and 878 * per-instruction fixed-register clobbers (Func.inst_clobbers); run it if 879 * either is present. */ 880 if (!has_asm && !f->inst_clobbers) return; 881 882 u32 words = live_info ? live_info->words : f->opt_live_words; 883 if (!words) words = bit_words(opt_reg_count(f)); 884 f->opt_live_words = (u16)words; 885 886 u64* live = arena_zarray(f->arena, u64, words ? words : 1u); 887 u64* use = arena_zarray(f->arena, u64, words ? words : 1u); 888 u64* def = arena_zarray(f->arena, u64, words ? words : 1u); 889 for (u32 b = 0; b < f->nblocks; ++b) { 890 Block* bl = &f->blocks[b]; 891 live_copy_block_out(f, live_info, b, live, words); 892 for (u32 ri = bl->ninsts; ri > 0; --ri) { 893 u32 i = ri - 1u; 894 Inst* in = &bl->insts[i]; 895 bits_clear(use, words); 896 bits_clear(def, words); 897 BitsCtx bc = {use, def}; 898 opt_walk_inst_operands(f, in, collect_bits, &bc); 899 if ((IROp)in->op == IR_ASM_BLOCK) 900 apply_asm_register_constraints(f, in, use, def, live); 901 apply_machine_reg_clobbers(f, in, def, live); 902 live_update_before(live, use, def, words); 903 } 904 } 905 } 906 907 static int spill_slot_compatible(Func* f, FrameSlot fs, PReg v) { 908 if (fs == FRAME_SLOT_NONE || fs > f->nframe_slots) return 0; 909 IRFrameSlot* s = &f->frame_slots[fs - 1u]; 910 u32 size = type_size_fallback(f, opt_reg_type(f, v)); 911 u32 align = size >= 8 ? 8 : size; 912 if (s->kind != FS_SPILL) return 0; 913 if (s->size < size) return 0; 914 if (s->align < align) return 0; 915 return 1; 916 } 917 918 /* Compute conflict_locs = union of used_locs[j] for j in every live range 919 * point of every PReg in `root`'s coalesce group. */ 920 static void alloc_compute_group_conflicts(Func* f, OptAllocator* a, 921 const OptLiveRangeSet* ranges, 922 PReg root) { 923 for (u32 w = 0; w < a->loc_words; ++w) a->conflict_locs[w] = 0; 924 for (PReg v = 1; v < opt_reg_count(f); ++v) { 925 if (ranges->first_range_by_preg[v] == OPT_RANGE_NONE) continue; 926 if (!alloc_group_member(f, root, v)) continue; 927 for (u32 ri = ranges->first_range_by_preg[v]; ri != OPT_RANGE_NONE; 928 ri = ranges->ranges[ri].next) { 929 const OptLiveRange* lr = &ranges->ranges[ri]; 930 u32 end = lr->end < a->point_count ? lr->end : a->point_count; 931 for (u32 j = lr->start; j < end; ++j) { 932 ++a->hard_point_visits; 933 const u64* row = used_locs_row(a, j); 934 for (u32 w = 0; w < a->loc_words; ++w) { 935 a->conflict_locs[w] |= row[w]; 936 ++a->hard_word_ors; 937 } 938 } 939 } 940 } 941 } 942 943 /* Mark `loc_bit` as occupied at every point covered by `root`'s group's 944 * live ranges. */ 945 static void alloc_mark_group_loc(Func* f, OptAllocator* a, 946 const OptLiveRangeSet* ranges, PReg root, 947 u32 loc_bit) { 948 u32 w = loc_bit / 64u; 949 u64 mask = 1ull << (loc_bit % 64u); 950 for (PReg v = 1; v < opt_reg_count(f); ++v) { 951 if (ranges->first_range_by_preg[v] == OPT_RANGE_NONE) continue; 952 if (!alloc_group_member(f, root, v)) continue; 953 for (u32 ri = ranges->first_range_by_preg[v]; ri != OPT_RANGE_NONE; 954 ri = ranges->ranges[ri].next) { 955 const OptLiveRange* lr = &ranges->ranges[ri]; 956 u32 end = lr->end < a->point_count ? lr->end : a->point_count; 957 for (u32 j = lr->start; j < end; ++j) { 958 ++a->hard_mark_points; 959 used_locs_row(a, j)[w] |= mask; 960 } 961 } 962 } 963 } 964 965 static void alloc_assign_group_hard(Func* f, OptAllocator* a, 966 const OptLiveRangeSet* ranges, PReg root, 967 Reg r) { 968 u8 cls = f->preg_info[root].cls; 969 u32 bit = hard_loc_bit(cls, r); 970 for (PReg v = 1; v < opt_reg_count(f); ++v) { 971 if (ranges->first_range_by_preg[v] == OPT_RANGE_NONE) continue; 972 if (!alloc_group_member(f, root, v)) continue; 973 OptPRegInfo* vi = &f->preg_info[v]; 974 vi->alloc_kind = OPT_ALLOC_HARD; 975 vi->hard_reg = r; 976 a->locs[v].kind = OPT_LOC_HARD; 977 a->locs[v].cls = vi->cls; 978 a->locs[v].hard_reg = r; 979 a->locs[v].spill_slot = FRAME_SLOT_NONE; 980 } 981 alloc_mark_group_loc(f, a, ranges, root, bit); 982 if (bit < a->hard_loc_bits) a->hard_open[bit] = 1; 983 } 984 985 static void alloc_assign_group_stack(Func* f, OptAllocator* a, 986 const OptLiveRangeSet* ranges, PReg root) { 987 /* Try to reuse an existing stack slot whose bit is clear in conflict_locs 988 * and whose frame slot is compatible. The conflict_locs scratch must 989 * already be populated for `root` by the caller. */ 990 u32 stack_idx = (u32)~0u; 991 for (u32 k = 0; k < a->stack_slot_count; ++k) { 992 u32 bit = a->hard_loc_bits + k; 993 if (loc_bit_in_conflict(a->conflict_locs, bit)) continue; 994 if (!spill_slot_compatible(f, a->stack_slots[k], root)) continue; 995 stack_idx = k; 996 break; 997 } 998 if (stack_idx == (u32)~0u) { 999 FrameSlot fs = spill_slot_for(f, root); 1000 stack_idx = alloc_alloc_stack_slot(f, a, fs); 1001 /* alloc_alloc_stack_slot may have widened a->loc_words: refresh 1002 * conflict_locs (callers don't reuse it after this). */ 1003 } 1004 FrameSlot slot = a->stack_slots[stack_idx]; 1005 for (PReg v = 1; v < opt_reg_count(f); ++v) { 1006 if (ranges->first_range_by_preg[v] == OPT_RANGE_NONE) continue; 1007 if (!alloc_group_member(f, root, v)) continue; 1008 OptPRegInfo* vi = &f->preg_info[v]; 1009 vi->spill_slot = slot; 1010 vi->alloc_kind = OPT_ALLOC_SPILL; 1011 a->locs[v].kind = OPT_LOC_STACK; 1012 a->locs[v].cls = vi->cls; 1013 a->locs[v].hard_reg = REG_NONE; 1014 a->locs[v].spill_slot = slot; 1015 } 1016 u32 bit = a->hard_loc_bits + stack_idx; 1017 u32 w = bit / 64u; 1018 u64 mask = 1ull << (bit % 64u); 1019 for (PReg v = 1; v < opt_reg_count(f); ++v) { 1020 if (ranges->first_range_by_preg[v] == OPT_RANGE_NONE) continue; 1021 if (!alloc_group_member(f, root, v)) continue; 1022 for (u32 ri = ranges->first_range_by_preg[v]; ri != OPT_RANGE_NONE; 1023 ri = ranges->ranges[ri].next) { 1024 const OptLiveRange* lr = &ranges->ranges[ri]; 1025 u32 end = lr->end < a->point_count ? lr->end : a->point_count; 1026 for (u32 j = lr->start; j < end; ++j) { 1027 ++a->stack_mark_points; 1028 used_locs_row(a, j)[w] |= mask; 1029 } 1030 } 1031 } 1032 } 1033 1034 static int alloc_group_conflicts_bit(const OptAllocator* a, u32 bit) { 1035 if (bit / 64u >= a->loc_words) return 1; 1036 return loc_bit_in_conflict(a->conflict_locs, bit); 1037 } 1038 1039 static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges, 1040 OptAllocator* a, int allow_live_range_split) { 1041 (void)allow_live_range_split; /* live-range splitting deferred per 1042 doc/plan/OPTIMIZER.md; the parameter is 1043 passed through for ABI compatibility. */ 1044 memset(a, 0, sizeof *a); 1045 a->point_count = ranges->point_count ? ranges->point_count : 1u; 1046 a->hard_loc_bits = OPT_REG_CLASSES * 32u; 1047 a->loc_words = alloc_loc_words_for_bits(a->hard_loc_bits); 1048 a->used_locs = 1049 arena_zarray(f->arena, u64, (u64)a->point_count * a->loc_words); 1050 a->conflict_locs = arena_zarray(f->arena, u64, a->loc_words); 1051 a->locs = 1052 arena_zarray(f->arena, OptLoc, opt_reg_count(f) ? opt_reg_count(f) : 1u); 1053 a->hard_open = arena_zarray(f->arena, u8, a->hard_loc_bits); 1054 a->stack_slots = NULL; 1055 a->stack_slot_count = 0; 1056 a->stack_slot_cap = 0; 1057 1058 /* Build candidate list: every coalesce-root PReg that has live ranges. */ 1059 u32 ncands = 0; 1060 for (PReg v = 1; v < opt_reg_count(f); ++v) { 1061 if (ranges->first_range_by_preg[v] == OPT_RANGE_NONE) continue; 1062 if (alloc_coalesce_root(f, v) != v) continue; 1063 ++ncands; 1064 } 1065 OptAllocCandidate* cands = 1066 arena_array(f->arena, OptAllocCandidate, ncands ? ncands : 1u); 1067 u32 n = 0; 1068 for (PReg v = 1; v < opt_reg_count(f); ++v) { 1069 if (ranges->first_range_by_preg[v] == OPT_RANGE_NONE) continue; 1070 PReg root = alloc_coalesce_root(f, v); 1071 if (root != v) continue; 1072 OptAllocGroupInfo gi; 1073 alloc_group_info(f, ranges, root, &gi); 1074 cands[n].v = v; 1075 cands[n].spill_cost = gi.spill_cost; 1076 cands[n].live_length = gi.live_length; 1077 cands[n].tied = gi.tied_hard_reg >= 0; 1078 memset(cands[n].pad, 0, sizeof cands[n].pad); 1079 ++n; 1080 } 1081 alloc_sort_candidates(cands, n); 1082 1083 for (u32 i = 0; i < n; ++i) { 1084 PReg v = cands[i].v; 1085 OptAllocGroupInfo gi; 1086 alloc_group_info(f, ranges, v, &gi); 1087 OptPRegInfo* vi = &f->preg_info[v]; 1088 u8 cls = gi.cls; 1089 alloc_compute_group_conflicts(f, a, ranges, v); 1090 1091 if (gi.tied_hard_reg >= 0) { 1092 Reg fixed = (Reg)gi.tied_hard_reg; 1093 /* Machinize has already validated inline-asm hard-register pins against 1094 * the target's operand-register policy. Some legal pins are ABI registers 1095 * outside the standard allocable set (aa64 x0, rv64 a7), so the allocator 1096 * accepts validated physical registers here and relies on the 1097 * conflict/clobber checks below for placement correctness. */ 1098 if (!hard_available(f, cls, fixed) && !phys_info_for(f, cls, fixed)) { 1099 SrcLoc loc = {0, 0, 0}; 1100 compiler_panic( 1101 f->c, loc, 1102 "opt regalloc: fixed hard reg %u unavailable in class %u", 1103 (unsigned)fixed, (unsigned)cls); 1104 } 1105 if (fixed < 32 && (gi.forbidden_hard_regs & (1u << fixed))) { 1106 SrcLoc loc = {0, 0, 0}; 1107 compiler_panic(f->c, loc, 1108 "opt regalloc: fixed hard reg %u is clobbered", 1109 (unsigned)fixed); 1110 } 1111 if (fixed >= 32 || 1112 (gi.allowed_hard_regs && 1113 (gi.allowed_hard_regs & (1u << fixed)) == 0)) { 1114 SrcLoc loc = {0, 0, 0}; 1115 compiler_panic(f->c, loc, 1116 "opt regalloc: fixed hard reg %u violates asm " 1117 "constraint", 1118 (unsigned)fixed); 1119 } 1120 u32 bit = hard_loc_bit(cls, fixed); 1121 if (fixed >= 32 || alloc_group_conflicts_bit(a, bit)) { 1122 SrcLoc loc = {0, 0, 0}; 1123 compiler_panic(f->c, loc, "opt regalloc: conflicting fixed hard reg %u", 1124 (unsigned)fixed); 1125 } 1126 alloc_assign_group_hard(f, a, ranges, v, fixed); 1127 continue; 1128 } 1129 1130 int found = 0; 1131 Reg best = REG_NONE; 1132 u32 best_score = 0xffffffffu; 1133 for (u32 r = 0; r < f->opt_hard_reg_count[cls]; ++r) { 1134 Reg hr = f->opt_hard_regs[cls][r]; 1135 if (hr >= 32) continue; 1136 if (gi.allowed_hard_regs && (gi.allowed_hard_regs & (1u << hr)) == 0) 1137 continue; 1138 if (gi.forbidden_hard_regs & (1u << hr)) continue; 1139 u32 bit = hard_loc_bit(cls, hr); 1140 if (alloc_group_conflicts_bit(a, bit)) continue; 1141 u32 score = hard_reg_alloc_score(f, a, vi, hr); 1142 if (!found || score < best_score) { 1143 found = 1; 1144 best = hr; 1145 best_score = score; 1146 } 1147 } 1148 if (gi.allowed_hard_regs) { 1149 for (Reg hr = 0; hr < 32; ++hr) { 1150 const CGPhysRegInfo* pi; 1151 if ((gi.allowed_hard_regs & (1u << hr)) == 0) continue; 1152 if (hard_available(f, cls, hr)) continue; 1153 if (gi.forbidden_hard_regs & (1u << hr)) continue; 1154 pi = phys_info_for(f, cls, hr); 1155 if (!pi || (pi->flags & CG_REG_RESERVED)) continue; 1156 u32 bit = hard_loc_bit(cls, hr); 1157 if (alloc_group_conflicts_bit(a, bit)) continue; 1158 u32 score = hard_reg_alloc_score(f, a, vi, hr); 1159 if (!found || score < best_score) { 1160 found = 1; 1161 best = hr; 1162 best_score = score; 1163 } 1164 } 1165 } 1166 /* Also consider the preferred hard reg if it's outside the standard 1167 * allocable set (e.g. x0 on aa64: reserved as the ABI ret reg, not in 1168 * aa_int_allocable). Used by apply_abi_aliasing_hints to let an IR_CALL 1169 * result PReg or IR_RET value PReg land directly in x0, eliding the 1170 * materialization move emit_call/emit_ret would otherwise emit. Only 1171 * short-lived PRegs benefit: a value live across a call cannot survive in 1172 * a caller-saved hint reg, and this is the *only* path that can reach an 1173 * out-of-allocable-set reg, so guard it explicitly. The +1000 caller-save 1174 * penalty in hard_reg_alloc_score only deflects the hint when a cheaper 1175 * reg is found; under high register pressure (found == 0) the fallback 1176 * below would otherwise take the hint reg regardless of score, parking a 1177 * cross-call value in x0 where it collides with the next call's result. */ 1178 if (vi->preferred_hard_reg >= 0 && 1179 !(vi->live_across_call_freq && 1180 is_caller_saved(f, cls, (Reg)vi->preferred_hard_reg))) { 1181 Reg hint = (Reg)vi->preferred_hard_reg; 1182 int already_tried = 0; 1183 if (hint < 32 && 1184 (!gi.allowed_hard_regs || (gi.allowed_hard_regs & (1u << hint)))) { 1185 for (u32 r = 0; r < f->opt_hard_reg_count[cls]; ++r) { 1186 if (f->opt_hard_regs[cls][r] == hint) { 1187 already_tried = 1; 1188 break; 1189 } 1190 } 1191 if (!already_tried && !(gi.forbidden_hard_regs & (1u << hint))) { 1192 u32 bit = hard_loc_bit(cls, hint); 1193 int hint_safe = !alloc_group_conflicts_bit(a, bit); 1194 /* The bitmap conflict can be falsely positive when an 1195 * already-assigned PReg ends exactly where v begins — the 1196 * swap-friendly pattern like `sub x0, x21, x0`, where the previous 1197 * call's result occupies x0 and the sub both reads it and writes 1198 * the new value. Fall back to a precise per-PReg interference check 1199 * that allows the unit-length overlap (same rule used by 1200 * opt_coalesce_ranges for moves). */ 1201 if (!hint_safe) { 1202 int real_conflict = 0; 1203 for (PReg u = 1; u < opt_reg_count(f); ++u) { 1204 if (u == v) continue; 1205 const OptPRegInfo* ui = &f->preg_info[u]; 1206 if (ui->alloc_kind != OPT_ALLOC_HARD) continue; 1207 if (ui->hard_reg != hint) continue; 1208 if (opt_ranges_overlap_kind(ranges, u, v) >= 2) { 1209 real_conflict = 1; 1210 break; 1211 } 1212 } 1213 if (!real_conflict) hint_safe = 1; 1214 } 1215 if (hint_safe) { 1216 u32 score = hard_reg_alloc_score(f, a, vi, hint); 1217 if (!found || score < best_score) { 1218 found = 1; 1219 best = hint; 1220 best_score = score; 1221 } 1222 } 1223 } 1224 } 1225 } 1226 if (found) { 1227 alloc_assign_group_hard(f, a, ranges, v, best); 1228 } else if (gi.allowed_hard_regs) { 1229 SrcLoc loc = {0, 0, 0}; 1230 compiler_panic(f->c, loc, 1231 "opt regalloc: no hard register satisfies asm constraint"); 1232 } else { 1233 alloc_assign_group_stack(f, a, ranges, v); 1234 } 1235 } 1236 1237 /* Report storage metrics in u64 words (used_locs is the only bitmap). */ 1238 u32 total_words = a->point_count * a->loc_words; 1239 u32 hard_words = alloc_loc_words_for_bits(a->hard_loc_bits) * a->point_count; 1240 if (hard_words > total_words) hard_words = total_words; 1241 f->opt_alloc_hard_loc_words = hard_words; 1242 f->opt_alloc_stack_loc_words = total_words - hard_words; 1243 f->opt_alloc_stack_slots = a->stack_slot_count; 1244 f->opt_used_loc_words = total_words; 1245 f->opt_alloc_hard_point_visits = a->hard_point_visits; 1246 f->opt_alloc_stack_point_visits = a->stack_point_visits; 1247 f->opt_alloc_hard_word_ors = a->hard_word_ors; 1248 f->opt_alloc_stack_word_ors = a->stack_word_ors; 1249 f->opt_alloc_hard_mark_points = a->hard_mark_points; 1250 f->opt_alloc_stack_mark_points = a->stack_mark_points; 1251 f->preg_locs = a->locs; 1252 } 1253 1254 typedef struct RewriteList { 1255 Inst* data; 1256 u32 n; 1257 u32 cap; 1258 } RewriteList; 1259 1260 typedef enum EdgeMatKind { 1261 EDGE_MAT_STORE, 1262 EDGE_MAT_RELOAD, 1263 } EdgeMatKind; 1264 1265 typedef struct EdgeMat { 1266 u32 pred; 1267 u32 succ; 1268 PReg v; 1269 Reg hard_reg; 1270 u8 kind; 1271 u8 pad[3]; 1272 } EdgeMat; 1273 1274 typedef struct EdgeMatPlan { 1275 EdgeMat* mats; 1276 u32 n; 1277 u32 cap; 1278 } EdgeMatPlan; 1279 1280 typedef struct RewriteOut { 1281 Inst* data; 1282 u32 cap; 1283 u32 start; 1284 } RewriteOut; 1285 1286 static Inst* list_push(Func* f, RewriteList* l, IROp op) { 1287 if (l->n == l->cap) { 1288 u32 ncap = l->cap ? l->cap * 2u : 16u; 1289 Inst* nb = arena_zarray(f->arena, Inst, ncap); 1290 if (l->data) memcpy(nb, l->data, sizeof(Inst) * l->n); 1291 l->data = nb; 1292 l->cap = ncap; 1293 } 1294 Inst* in = &l->data[l->n++]; 1295 memset(in, 0, sizeof *in); 1296 in->op = (u16)op; 1297 ir_assign_inst_id(f, in); 1298 ++f->opt_rewrite_inserted_insts; 1299 return in; 1300 } 1301 1302 static void list_reset(RewriteList* l) { 1303 if (l) l->n = 0; 1304 } 1305 1306 static void out_init(Func* f, RewriteOut* out, u32 cap) { 1307 out->cap = cap ? cap : 16u; 1308 out->data = arena_zarray(f->arena, Inst, out->cap); 1309 out->start = out->cap; 1310 } 1311 1312 static Inst* out_push_front(Func* f, RewriteOut* out, IROp op) { 1313 if (out->start == 0) { 1314 u32 n = out->cap; 1315 u32 ncap = out->cap ? out->cap * 2u : 16u; 1316 Inst* nb = arena_zarray(f->arena, Inst, ncap); 1317 if (out->data && n) 1318 memcpy(nb + (ncap - n), out->data + out->start, sizeof(Inst) * n); 1319 out->data = nb; 1320 out->cap = ncap; 1321 out->start = ncap - n; 1322 } 1323 Inst* in = &out->data[--out->start]; 1324 memset(in, 0, sizeof *in); 1325 in->op = (u16)op; 1326 ir_assign_inst_id(f, in); 1327 return in; 1328 } 1329 1330 static void out_prepend_list_reverse(Func* f, RewriteOut* out, 1331 const RewriteList* list) { 1332 for (u32 i = list->n; i > 0; --i) { 1333 Inst* dst = out_push_front(f, out, (IROp)list->data[i - 1u].op); 1334 *dst = list->data[i - 1u]; 1335 } 1336 } 1337 1338 static void out_prepend_inst(Func* f, RewriteOut* out, const Inst* in) { 1339 Inst* dst = out_push_front(f, out, (IROp)in->op); 1340 *dst = *in; 1341 } 1342 1343 static MemAccess spill_mem(Func* f, PReg v) { 1344 MemAccess m; 1345 memset(&m, 0, sizeof m); 1346 m.type = opt_reg_type(f, v); 1347 m.size = type_size_fallback(f, opt_reg_type(f, v)); 1348 m.align = m.size >= 8 ? 8 : m.size; 1349 m.alias.kind = ALIAS_LOCAL; 1350 m.alias.v.local_id = (i32)spill_slot_for(f, v); 1351 return m; 1352 } 1353 1354 static Operand spill_addr(Func* f, PReg v) { 1355 Operand o; 1356 memset(&o, 0, sizeof o); 1357 o.kind = OPK_LOCAL; 1358 o.cls = RC_INT; 1359 o.type = opt_reg_type(f, v); 1360 o.v.frame_slot = spill_slot_for(f, v); 1361 return o; 1362 } 1363 1364 static Operand hard_operand(Func* f, PReg v) { 1365 Operand o; 1366 memset(&o, 0, sizeof o); 1367 o.kind = OPK_REG; 1368 o.cls = opt_preg_loc_cls(f, v); 1369 o.type = opt_reg_type(f, v); 1370 o.v.reg = opt_preg_hard_reg(f, v); 1371 return o; 1372 } 1373 1374 static Operand hard_operand_reg(Func* f, PReg v, Reg hard_reg) { 1375 Operand o; 1376 memset(&o, 0, sizeof o); 1377 o.kind = OPK_REG; 1378 o.cls = opt_preg_loc_cls(f, v); 1379 o.type = opt_reg_type(f, v); 1380 o.v.reg = hard_reg; 1381 return o; 1382 } 1383 1384 static void append_store_preg(Func* f, RewriteList* out, PReg v) { 1385 Inst* st = list_push(f, out, IR_STORE); 1386 st->opnds = arena_array(f->arena, Operand, 2); 1387 st->opnds[0] = spill_addr(f, v); 1388 st->opnds[1] = hard_operand(f, v); 1389 st->nopnds = 2; 1390 st->extra.mem = spill_mem(f, v); 1391 } 1392 1393 static void append_load_preg(Func* f, RewriteList* out, PReg v) { 1394 Inst* ld = list_push(f, out, IR_LOAD); 1395 ld->opnds = arena_array(f->arena, Operand, 2); 1396 ld->opnds[0] = hard_operand(f, v); 1397 ld->opnds[1] = spill_addr(f, v); 1398 ld->nopnds = 2; 1399 ld->extra.mem = spill_mem(f, v); 1400 } 1401 1402 static void append_store_preg_hard(Func* f, RewriteList* out, PReg v, 1403 Reg hard_reg) { 1404 Inst* st = list_push(f, out, IR_STORE); 1405 st->opnds = arena_array(f->arena, Operand, 2); 1406 st->opnds[0] = spill_addr(f, v); 1407 st->opnds[1] = hard_operand_reg(f, v, hard_reg); 1408 st->nopnds = 2; 1409 st->extra.mem = spill_mem(f, v); 1410 } 1411 1412 static void append_load_preg_hard(Func* f, RewriteList* out, PReg v, 1413 Reg hard_reg) { 1414 Inst* ld = list_push(f, out, IR_LOAD); 1415 ld->opnds = arena_array(f->arena, Operand, 2); 1416 ld->opnds[0] = hard_operand_reg(f, v, hard_reg); 1417 ld->opnds[1] = spill_addr(f, v); 1418 ld->nopnds = 2; 1419 ld->extra.mem = spill_mem(f, v); 1420 } 1421 1422 static Reg scratch_for(Func* f, u8 cls, u32* next) { 1423 u32 n = f->opt_scratch_reg_count[cls]; 1424 if (!n) return REG_NONE; 1425 Reg r = f->opt_scratch_regs[cls][*next % n]; 1426 ++*next; 1427 return r; 1428 } 1429 1430 typedef struct RewriteCtx { 1431 RewriteList* before; 1432 RewriteList* after; 1433 u32 next_scratch[OPT_REG_CLASSES]; 1434 u32 raw_point; 1435 } RewriteCtx; 1436 1437 static const OptAllocSegment* split_segment_at(Func* f, PReg v, u32 raw_point) { 1438 if (!f->opt_first_segment_by_preg || v >= opt_reg_count(f)) return NULL; 1439 for (u32 si = f->opt_first_segment_by_preg[v]; si != OPT_RANGE_NONE; 1440 si = f->opt_alloc_segments[si].next) { 1441 const OptAllocSegment* s = &f->opt_alloc_segments[si]; 1442 if (s->start <= raw_point && raw_point < s->end) return s; 1443 } 1444 return NULL; 1445 } 1446 1447 static void rewrite_one_operand(Func* f, Inst* owner, Operand* op, int is_def, 1448 void* arg) { 1449 RewriteCtx* c = (RewriteCtx*)arg; 1450 if (op->kind != OPK_REG) return; 1451 PReg v = (PReg)op->v.reg; 1452 if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return; 1453 u8 alloc_kind = opt_preg_alloc_kind(f, v); 1454 if (alloc_kind == OPT_ALLOC_HARD) { 1455 op->v.reg = opt_preg_hard_reg(f, v); 1456 return; 1457 } 1458 if (alloc_kind == OPT_ALLOC_SPLIT) { 1459 const OptAllocSegment* seg = split_segment_at(f, v, c->raw_point); 1460 if (seg && seg->loc_kind == OPT_LOC_HARD) { 1461 op->v.reg = seg->hard_reg; 1462 return; 1463 } 1464 } else if (alloc_kind != OPT_ALLOC_SPILL) { 1465 return; 1466 } 1467 u8 cls = opt_preg_loc_cls(f, v); 1468 Reg scratch = scratch_for(f, cls, &c->next_scratch[cls]); 1469 if (scratch == (Reg)REG_NONE) { 1470 SrcLoc loc = owner ? owner->loc : (SrcLoc){0, 0, 0}; 1471 compiler_panic(f->c, loc, 1472 "opt rewrite: no scratch register for spilled class %u", 1473 (unsigned)cls); 1474 } 1475 op->v.reg = scratch; 1476 if (!is_def) { 1477 Inst* ld = list_push(f, c->before, IR_LOAD); 1478 ld->opnds = arena_array(f->arena, Operand, 2); 1479 ld->opnds[0] = *op; 1480 ld->opnds[1] = spill_addr(f, v); 1481 ld->nopnds = 2; 1482 ld->extra.mem = spill_mem(f, v); 1483 } else { 1484 Inst* st = list_push(f, c->after, IR_STORE); 1485 st->opnds = arena_array(f->arena, Operand, 2); 1486 st->opnds[0] = spill_addr(f, v); 1487 st->opnds[1] = *op; 1488 st->nopnds = 2; 1489 st->extra.mem = spill_mem(f, v); 1490 } 1491 } 1492 1493 static void rewrite_call_arg_operand(Func* f, Operand* op) { 1494 if (!op || op->kind != OPK_REG) return; 1495 PReg v = (PReg)op->v.reg; 1496 if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return; 1497 u8 alloc_kind = opt_preg_alloc_kind(f, v); 1498 if (alloc_kind == OPT_ALLOC_HARD) { 1499 op->v.reg = opt_preg_hard_reg(f, v); 1500 } else if (alloc_kind == OPT_ALLOC_SPILL || alloc_kind == OPT_ALLOC_SPLIT) { 1501 *op = spill_addr(f, v); 1502 } 1503 } 1504 1505 static void rewrite_store_value_operand(Func* f, Inst* owner, Operand* op, 1506 RewriteCtx* ctx) { 1507 PReg v; 1508 u8 alloc_kind; 1509 const OptAllocSegment* seg; 1510 if (!op || op->kind != OPK_REG) return; 1511 v = (PReg)op->v.reg; 1512 if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return; 1513 alloc_kind = opt_preg_alloc_kind(f, v); 1514 if (alloc_kind == OPT_ALLOC_HARD) { 1515 op->v.reg = opt_preg_hard_reg(f, v); 1516 return; 1517 } 1518 if (alloc_kind == OPT_ALLOC_SPLIT) { 1519 seg = split_segment_at(f, v, ctx->raw_point); 1520 if (seg && seg->loc_kind == OPT_LOC_HARD) { 1521 op->v.reg = seg->hard_reg; 1522 return; 1523 } 1524 *op = spill_addr(f, v); 1525 return; 1526 } 1527 if (alloc_kind == OPT_ALLOC_SPILL) { 1528 *op = spill_addr(f, v); 1529 return; 1530 } 1531 rewrite_one_operand(f, owner, op, 0, ctx); 1532 } 1533 1534 static void rewrite_call_arg_indirect_base(Func* f, Inst* owner, Operand* op, 1535 RewriteCtx* ctx) { 1536 if (!op || op->kind != OPK_INDIRECT) return; 1537 Operand base = *op; 1538 base.kind = OPK_REG; 1539 base.cls = RC_INT; 1540 base.v.reg = op->v.ind.base; 1541 rewrite_one_operand(f, owner, &base, 0, ctx); 1542 op->v.ind.base = base.v.reg; 1543 } 1544 1545 static void rewrite_call_arg_value(Func* f, Inst* owner, CGABIValue* v, 1546 RewriteCtx* ctx) { 1547 if (!v) return; 1548 rewrite_call_arg_indirect_base(f, owner, &v->storage, ctx); 1549 rewrite_call_arg_operand(f, &v->storage); 1550 for (u32 i = 0; i < v->nparts; ++i) { 1551 Operand* op = (Operand*)&v->parts[i].op; 1552 rewrite_call_arg_indirect_base(f, owner, op, ctx); 1553 rewrite_call_arg_operand(f, op); 1554 } 1555 } 1556 1557 typedef struct RewriteCallSaveCtx { 1558 Func* f; 1559 RewriteList* out; 1560 const InstRefs* refs; 1561 const Inst* call; 1562 int emit_restore; 1563 } RewriteCallSaveCtx; 1564 1565 static void rewrite_call_save_one(PReg v, void* arg) { 1566 RewriteCallSaveCtx* c = (RewriteCallSaveCtx*)arg; 1567 Func* f = c->f; 1568 if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return; 1569 if (c->refs && refs_has_def(c->refs, v)) return; 1570 if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_HARD) return; 1571 u8 cls = opt_preg_loc_cls(f, v); 1572 Reg hr = opt_preg_hard_reg(f, v); 1573 if (cls >= OPT_REG_CLASSES || hr >= 32u) return; 1574 if ((opt_call_clobber_mask_for(f, c->call, cls) & (1u << hr)) == 0) return; 1575 if (c->emit_restore) 1576 append_load_preg(f, c->out, v); 1577 else 1578 append_store_preg(f, c->out, v); 1579 } 1580 1581 static void append_live_call_saves(Func* f, RewriteList* out, const Inst* call, 1582 const u64* live_after, u32 live_active_words, 1583 const InstRefs* refs, 1584 const PReg* call_save_pregs, 1585 u32 ncall_save_pregs, int emit_restore) { 1586 RewriteCallSaveCtx ctx = {f, out, refs, call, emit_restore}; 1587 f->opt_rewrite_live_words_touched += ncall_save_pregs; 1588 for (u32 i = 0; i < ncall_save_pregs; ++i) { 1589 PReg v = call_save_pregs[i]; 1590 u32 w = v / 64u; 1591 if (w >= live_active_words) continue; 1592 if (!bit_has(live_after, v)) continue; 1593 rewrite_call_save_one(v, &ctx); 1594 } 1595 } 1596 1597 static PReg* rewrite_collect_call_save_pregs(Func* f, u32* count_out) { 1598 u32 n = 0; 1599 for (PReg v = 1; v < opt_reg_count(f); ++v) { 1600 OptPRegInfo* vi = &f->preg_info[v]; 1601 if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_HARD) continue; 1602 if (!vi->live_across_call_freq) continue; 1603 ++n; 1604 } 1605 PReg* pregs = arena_array(f->arena, PReg, n ? n : 1u); 1606 u32 w = 0; 1607 for (PReg v = 1; v < opt_reg_count(f); ++v) { 1608 OptPRegInfo* vi = &f->preg_info[v]; 1609 if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_HARD) continue; 1610 if (!vi->live_across_call_freq) continue; 1611 pregs[w++] = v; 1612 } 1613 *count_out = n; 1614 return pregs; 1615 } 1616 1617 static void append_split_reloads_at(Func* f, RewriteList* out, u32 block, 1618 u32 raw_point) { 1619 if (!f->opt_first_segment_by_preg) return; 1620 for (PReg v = 1; v < opt_reg_count(f); ++v) { 1621 if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_SPLIT) continue; 1622 for (u32 si = f->opt_first_segment_by_preg[v]; si != OPT_RANGE_NONE; 1623 si = f->opt_alloc_segments[si].next) { 1624 const OptAllocSegment* s = &f->opt_alloc_segments[si]; 1625 if (s->block == block && s->start == raw_point && s->reload_at_start && 1626 s->loc_kind == OPT_LOC_HARD) { 1627 append_load_preg_hard(f, out, v, s->hard_reg); 1628 } 1629 } 1630 } 1631 } 1632 1633 static void append_split_stores_at(Func* f, RewriteList* out, u32 block, 1634 u32 raw_point) { 1635 if (!f->opt_first_segment_by_preg) return; 1636 for (PReg v = 1; v < opt_reg_count(f); ++v) { 1637 if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_SPLIT) continue; 1638 for (u32 si = f->opt_first_segment_by_preg[v]; si != OPT_RANGE_NONE; 1639 si = f->opt_alloc_segments[si].next) { 1640 const OptAllocSegment* s = &f->opt_alloc_segments[si]; 1641 if (s->block == block && s->end == raw_point && s->store_at_end && 1642 s->loc_kind == OPT_LOC_HARD) { 1643 append_store_preg_hard(f, out, v, s->hard_reg); 1644 } 1645 } 1646 } 1647 } 1648 1649 static void edge_mat_push(Func* f, EdgeMatPlan* plan, u32 pred, u32 succ, 1650 PReg v, Reg hard_reg, EdgeMatKind kind) { 1651 if (!plan || pred >= f->nblocks || succ >= f->nblocks) return; 1652 if (hard_reg == (Reg)REG_NONE) return; 1653 if (plan->n == plan->cap) { 1654 u32 ncap = plan->cap ? plan->cap * 2u : 16u; 1655 EdgeMat* nm = arena_array(f->arena, EdgeMat, ncap); 1656 if (plan->mats) memcpy(nm, plan->mats, sizeof(plan->mats[0]) * plan->n); 1657 plan->mats = nm; 1658 plan->cap = ncap; 1659 } 1660 EdgeMat* m = &plan->mats[plan->n++]; 1661 memset(m, 0, sizeof *m); 1662 m->pred = pred; 1663 m->succ = succ; 1664 m->v = v; 1665 m->hard_reg = hard_reg; 1666 m->kind = (u8)kind; 1667 } 1668 1669 static void prepare_split_edge_materialization(Func* f, EdgeMatPlan* plan) { 1670 if (!f || !plan || !f->opt_first_segment_by_preg) return; 1671 u32* block_base = arena_array(f->arena, u32, f->nblocks ? f->nblocks : 1u); 1672 u32 raw = 0; 1673 for (u32 b = 0; b < f->nblocks; ++b) { 1674 block_base[b] = raw; 1675 raw += f->blocks[b].ninsts ? f->blocks[b].ninsts : 1u; 1676 } 1677 1678 for (PReg v = 1; v < opt_reg_count(f); ++v) { 1679 if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_SPLIT) continue; 1680 for (u32 si = f->opt_first_segment_by_preg[v]; si != OPT_RANGE_NONE; 1681 si = f->opt_alloc_segments[si].next) { 1682 OptAllocSegment* s = &f->opt_alloc_segments[si]; 1683 if (s->loc_kind != OPT_LOC_HARD || s->block >= f->nblocks) continue; 1684 Block* bl = &f->blocks[s->block]; 1685 u32 block_start = block_base[s->block]; 1686 u32 block_end = block_start + (bl->ninsts ? bl->ninsts : 1u); 1687 if (s->reload_at_start && s->start == block_start && bl->npreds) { 1688 for (u32 p = 0; p < bl->npreds; ++p) 1689 edge_mat_push(f, plan, bl->preds[p], s->block, v, s->hard_reg, 1690 EDGE_MAT_RELOAD); 1691 s->reload_at_start = 0; 1692 } 1693 if (s->store_at_end && s->end == block_end && bl->nsucc) { 1694 for (u32 e = 0; e < bl->nsucc; ++e) 1695 edge_mat_push(f, plan, s->block, bl->succ[e], v, s->hard_reg, 1696 EDGE_MAT_STORE); 1697 s->store_at_end = 0; 1698 } 1699 } 1700 } 1701 } 1702 1703 static int lower_is_terminator(const Inst* in) { 1704 if (!in) return 0; 1705 switch ((IROp)in->op) { 1706 case IR_BR: 1707 case IR_CONDBR: 1708 case IR_CMP_BRANCH: 1709 case IR_SWITCH: 1710 case IR_INDIRECT_BRANCH: 1711 case IR_RET: 1712 case IR_UNREACHABLE: 1713 case IR_BREAK_TO: 1714 case IR_CONTINUE_TO: 1715 return 1; 1716 case IR_INTRINSIC: { 1717 IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; 1718 return aux && (aux->kind == INTRIN_LONGJMP || aux->kind == INTRIN_TRAP); 1719 } 1720 default: 1721 return 0; 1722 } 1723 } 1724 1725 static void block_insert_list(Func* f, u32 block, const RewriteList* list) { 1726 if (!f || block >= f->nblocks || !list || !list->n) return; 1727 Block* bl = &f->blocks[block]; 1728 u32 term = bl->ninsts && lower_is_terminator(&bl->insts[bl->ninsts - 1u]); 1729 u32 insert_at = bl->ninsts - term; 1730 Inst* insts = arena_zarray(f->arena, Inst, bl->ninsts + list->n); 1731 if (insert_at) memcpy(insts, bl->insts, sizeof(Inst) * insert_at); 1732 memcpy(insts + insert_at, list->data, sizeof(Inst) * list->n); 1733 if (bl->ninsts > insert_at) { 1734 memcpy(insts + insert_at + list->n, bl->insts + insert_at, 1735 sizeof(Inst) * (bl->ninsts - insert_at)); 1736 } 1737 bl->insts = insts; 1738 bl->ninsts += list->n; 1739 bl->cap = bl->ninsts; 1740 } 1741 1742 typedef struct EdgePlace { 1743 u32 pred; 1744 u32 succ; 1745 u32 block; 1746 } EdgePlace; 1747 1748 static u32 edge_place_for(Func* f, EdgePlace** places, u32* nplaces, u32* cap, 1749 u32 pred, u32 succ) { 1750 for (u32 i = 0; i < *nplaces; ++i) 1751 if ((*places)[i].pred == pred && (*places)[i].succ == succ) 1752 return (*places)[i].block; 1753 if (*nplaces == *cap) { 1754 u32 ncap = *cap ? *cap * 2u : 16u; 1755 EdgePlace* np = arena_array(f->arena, EdgePlace, ncap); 1756 if (*places) memcpy(np, *places, sizeof((*places)[0]) * *nplaces); 1757 *places = np; 1758 *cap = ncap; 1759 } 1760 u32 block = opt_split_edge(f, pred, succ); 1761 EdgePlace* p = &(*places)[(*nplaces)++]; 1762 p->pred = pred; 1763 p->succ = succ; 1764 p->block = block; 1765 return block; 1766 } 1767 1768 static void apply_split_edge_materialization(Func* f, const EdgeMatPlan* plan) { 1769 if (!f || !plan || !plan->n) return; 1770 EdgePlace* places = NULL; 1771 u32 nplaces = 0; 1772 u32 place_cap = 0; 1773 for (u32 i = 0; i < plan->n; ++i) { 1774 const EdgeMat* m = &plan->mats[i]; 1775 if (m->pred < f->nblocks && m->succ < f->nblocks) 1776 (void)edge_place_for(f, &places, &nplaces, &place_cap, m->pred, m->succ); 1777 } 1778 for (u32 pass = 0; pass < 2; ++pass) { 1779 EdgeMatKind kind = pass == 0 ? EDGE_MAT_STORE : EDGE_MAT_RELOAD; 1780 for (u32 i = 0; i < plan->n; ++i) { 1781 const EdgeMat* m = &plan->mats[i]; 1782 if ((EdgeMatKind)m->kind != kind) continue; 1783 u32 block = 1784 edge_place_for(f, &places, &nplaces, &place_cap, m->pred, m->succ); 1785 RewriteList list; 1786 memset(&list, 0, sizeof list); 1787 if (kind == EDGE_MAT_STORE) 1788 append_store_preg_hard(f, &list, m->v, m->hard_reg); 1789 else 1790 append_load_preg_hard(f, &list, m->v, m->hard_reg); 1791 block_insert_list(f, block, &list); 1792 } 1793 } 1794 opt_build_cfg(f); 1795 } 1796 1797 static void rewrite_func(Func* f, const OptLiveInfo* live_info) { 1798 u32 words = live_info ? live_info->words : f->opt_live_words; 1799 if (!words) words = bit_words(opt_reg_count(f)); 1800 f->opt_live_words = (u16)words; 1801 f->opt_rewrite_inserted_insts = 0; 1802 f->opt_rewrite_live_words_touched = 0; 1803 1804 u64* live = arena_zarray(f->arena, u64, words ? words : 1u); 1805 u32 ncall_save_pregs = 0; 1806 PReg* call_save_pregs = rewrite_collect_call_save_pregs(f, &ncall_save_pregs); 1807 InstRefs refs; 1808 memset(&refs, 0, sizeof refs); 1809 u32 live_active_words = 0; 1810 u32* block_base = arena_array(f->arena, u32, f->nblocks ? f->nblocks : 1u); 1811 u32 raw = 0; 1812 for (u32 b = 0; b < f->nblocks; ++b) { 1813 block_base[b] = raw; 1814 raw += f->blocks[b].ninsts ? f->blocks[b].ninsts : 1u; 1815 } 1816 for (u32 b = 0; b < f->nblocks; ++b) { 1817 Block* bl = &f->blocks[b]; 1818 RewriteOut out; 1819 out_init(f, &out, bl->ninsts + 16u); 1820 RewriteList before, after, call_saves, call_restores; 1821 memset(&before, 0, sizeof before); 1822 memset(&after, 0, sizeof after); 1823 memset(&call_saves, 0, sizeof call_saves); 1824 memset(&call_restores, 0, sizeof call_restores); 1825 live_active_words = live_copy_block_out_active(live_info, b, live, words, 1826 live_active_words); 1827 f->opt_rewrite_live_words_touched += live_active_words; 1828 1829 for (u32 ri = bl->ninsts; ri > 0; --ri) { 1830 u32 i = ri - 1u; 1831 Inst in = bl->insts[i]; 1832 if ((IROp)in.op == IR_PARAM_DECL) { 1833 out_prepend_inst(f, &out, &in); 1834 continue; 1835 } 1836 refs_reset(&refs); 1837 opt_walk_inst_operands(f, &in, refs_collect, &refs); 1838 list_reset(&before); 1839 list_reset(&after); 1840 list_reset(&call_saves); 1841 list_reset(&call_restores); 1842 RewriteCtx ctx; 1843 memset(&ctx, 0, sizeof ctx); 1844 ctx.before = &before; 1845 ctx.after = &after; 1846 ctx.raw_point = block_base[b] + i; 1847 if ((IROp)in.op == IR_CALL) { 1848 IRCallAux* aux = (IRCallAux*)in.extra.aux; 1849 if (aux) { 1850 if (aux->use_plan_replay) { 1851 rewrite_one_operand(f, &in, &aux->plan.callee, 0, &ctx); 1852 for (u32 k = 0; k < aux->plan.nargs; ++k) { 1853 rewrite_call_arg_indirect_base(f, &in, &aux->plan.args[k].src, 1854 &ctx); 1855 rewrite_call_arg_operand(f, &aux->plan.args[k].src); 1856 } 1857 for (u32 k = 0; k < aux->plan.nrets; ++k) 1858 rewrite_one_operand(f, &in, &aux->plan.rets[k].dst, 1, &ctx); 1859 } else { 1860 rewrite_one_operand(f, &in, &aux->desc.callee, 0, &ctx); 1861 for (u32 k = 0; k < aux->desc.nargs; ++k) 1862 rewrite_call_arg_value(f, &in, (CGABIValue*)&aux->desc.args[k], 1863 &ctx); 1864 opt_walk_abivalue(f, &in, &aux->desc.ret, 1, rewrite_one_operand, 1865 &ctx); 1866 } 1867 } 1868 } else if ((IROp)in.op == IR_STORE && in.nopnds >= 2) { 1869 opt_walk_operand(f, &in, &in.opnds[0], 0, rewrite_one_operand, &ctx); 1870 rewrite_store_value_operand(f, &in, &in.opnds[1], &ctx); 1871 } else { 1872 opt_walk_inst_operands(f, &in, rewrite_one_operand, &ctx); 1873 } 1874 if ((IROp)in.op == IR_CALL) { 1875 append_live_call_saves(f, &call_saves, &in, live, live_active_words, 1876 &refs, call_save_pregs, ncall_save_pregs, 0); 1877 append_live_call_saves(f, &call_restores, &in, live, live_active_words, 1878 &refs, call_save_pregs, ncall_save_pregs, 1); 1879 } 1880 append_split_reloads_at(f, &before, b, block_base[b] + i); 1881 append_split_stores_at(f, &after, b, block_base[b] + i + 1u); 1882 1883 out_prepend_list_reverse(f, &out, &after); 1884 out_prepend_list_reverse(f, &out, &call_restores); 1885 out_prepend_inst(f, &out, &in); 1886 out_prepend_list_reverse(f, &out, &call_saves); 1887 out_prepend_list_reverse(f, &out, &before); 1888 live_active_words = 1889 live_update_refs_before_active(live, live_active_words, words, &refs); 1890 f->opt_rewrite_live_words_touched += refs.nuses + refs.ndefs; 1891 } 1892 bl->insts = out.data + out.start; 1893 bl->ninsts = out.cap - out.start; 1894 bl->cap = bl->ninsts; 1895 } 1896 f->opt_rewritten = 1; 1897 } 1898 1899 void opt_lower_to_mir(Func* f, const OptLiveInfo* live_info) { 1900 if (!f) return; 1901 Func phys = *f; 1902 phys.blocks = f->blocks; 1903 phys.opt_rewritten = 0; 1904 phys.mir = NULL; 1905 1906 rewrite_func(&phys, live_info); 1907 1908 MFunc* m = arena_zarray(f->arena, MFunc, 1); 1909 m->blocks = phys.blocks; 1910 m->nblocks = phys.nblocks; 1911 m->entry = phys.entry; 1912 m->emit_order_n = phys.emit_order_n; 1913 m->emit_order_cap = phys.emit_order_n; 1914 m->emit_order = 1915 arena_array(f->arena, u32, phys.emit_order_n ? phys.emit_order_n : 1u); 1916 if (phys.emit_order_n) 1917 memcpy(m->emit_order, phys.emit_order, sizeof(u32) * phys.emit_order_n); 1918 1919 f->mir = m; 1920 f->blocks = phys.blocks; 1921 f->nblocks = phys.nblocks; 1922 f->blocks_cap = phys.blocks_cap; 1923 f->frame_slots = phys.frame_slots; 1924 f->nframe_slots = phys.nframe_slots; 1925 f->frame_slots_cap = phys.frame_slots_cap; 1926 f->next_inst_id = phys.next_inst_id; 1927 f->opt_rewrite_inserted_insts = phys.opt_rewrite_inserted_insts; 1928 f->opt_rewrite_live_words_touched = phys.opt_rewrite_live_words_touched; 1929 } 1930 1931 static void rewrite_dump_sb(Writer* w, const StrBuf* sb) { 1932 kit_writer_write(w, strbuf_cstr(sb), strbuf_len(sb)); 1933 } 1934 1935 void opt_rewrite_dump(Func* f, Writer* w) { 1936 if (!f || !w) return; 1937 char buf[96]; 1938 StrBuf sb; 1939 strbuf_init(&sb, buf, sizeof buf); 1940 strbuf_puts(&sb, "rewrite blocks="); 1941 strbuf_put_u64(&sb, (u64)(unsigned)f->nblocks); 1942 strbuf_puts(&sb, " pregs="); 1943 strbuf_put_u64(&sb, (u64)(unsigned)opt_reg_count(f)); 1944 strbuf_puts(&sb, " rewritten="); 1945 strbuf_put_u64(&sb, (u64)(unsigned)f->opt_rewritten); 1946 strbuf_putc(&sb, '\n'); 1947 rewrite_dump_sb(w, &sb); 1948 for (u32 b = 0; b < f->nblocks; ++b) { 1949 Block* bl = &f->blocks[b]; 1950 strbuf_reset(&sb); 1951 strbuf_puts(&sb, "block "); 1952 strbuf_put_u64(&sb, (u64)(unsigned)b); 1953 strbuf_puts(&sb, " insts="); 1954 strbuf_put_u64(&sb, (u64)(unsigned)bl->ninsts); 1955 strbuf_putc(&sb, '\n'); 1956 rewrite_dump_sb(w, &sb); 1957 for (u32 i = 0; i < bl->ninsts; ++i) { 1958 Inst* in = &bl->insts[i]; 1959 strbuf_reset(&sb); 1960 strbuf_puts(&sb, " "); 1961 strbuf_put_u64(&sb, (u64)(unsigned)i); 1962 strbuf_puts(&sb, " op="); 1963 strbuf_put_u64(&sb, (u64)(unsigned)in->op); 1964 strbuf_puts(&sb, " operands="); 1965 strbuf_put_u64(&sb, (u64)(unsigned)in->nopnds); 1966 strbuf_putc(&sb, '\n'); 1967 rewrite_dump_sb(w, &sb); 1968 } 1969 } 1970 } 1971 1972 static int all_defs_dead(Func* f, Inst* in, u64* live) { 1973 if (in->def != 0 && in->def < opt_reg_count(f) && bit_has(live, in->def)) 1974 return 0; 1975 for (u32 i = 0; i < in->ndefs; ++i) { 1976 PReg r = (PReg)in->defs[i]; 1977 if (r != 0 && r < opt_reg_count(f) && bit_has(live, r)) return 0; 1978 } 1979 return 1; 1980 } 1981 1982 void opt_dead_def_elim_with_live(Func* f, const OptLiveInfo* live_info) { 1983 u32 words = live_info ? live_info->words : f->opt_live_words; 1984 if (!words) words = bit_words(opt_reg_count(f)); 1985 f->opt_dde_live_words_touched = 0; 1986 InstRefs refs; 1987 memset(&refs, 0, sizeof refs); 1988 for (u32 b = 0; b < f->nblocks; ++b) { 1989 Block* bl = &f->blocks[b]; 1990 u64* live = arena_zarray(f->arena, u64, words ? words : 1u); 1991 f->opt_dde_live_words_touched += words; 1992 live_copy_block_out(f, live_info, b, live, words); 1993 1994 Inst* new_insts = arena_array(f->arena, Inst, bl->ninsts); 1995 u32 w = 0; 1996 for (u32 ri = bl->ninsts; ri > 0; --ri) { 1997 u32 i = ri - 1u; 1998 Inst* in = &bl->insts[i]; 1999 if (!opt_inst_has_side_effect(f, in) && all_defs_dead(f, in, live)) { 2000 continue; 2001 } 2002 new_insts[w++] = *in; 2003 2004 refs_reset(&refs); 2005 opt_walk_inst_operands(f, in, refs_collect, &refs); 2006 live_update_refs_before(live, &refs); 2007 f->opt_dde_live_words_touched += refs.nuses + refs.ndefs; 2008 } 2009 2010 for (u32 i = 0; i < w / 2; ++i) { 2011 Inst tmp = new_insts[i]; 2012 new_insts[i] = new_insts[w - 1 - i]; 2013 new_insts[w - 1 - i] = tmp; 2014 } 2015 2016 bl->insts = new_insts; 2017 bl->ninsts = w; 2018 bl->cap = w; 2019 } 2020 } 2021 2022 void opt_dead_def_elim(Func* f) { 2023 OptLiveInfo live; 2024 opt_live_blocks(f, &live); 2025 opt_dead_def_elim_with_live(f, &live); 2026 } 2027 2028 /* Collect the def and use PRegs of one instruction (operands + aux fan-out). */ 2029 typedef struct AllocVerifyRefs { 2030 PReg defs[256]; 2031 u32 ndefs; 2032 PReg uses[256]; 2033 u32 nuses; 2034 } AllocVerifyRefs; 2035 2036 static void alloc_verify_collect(Func* f, Inst* in, Operand* op, int is_def, 2037 void* ctx) { 2038 (void)in; 2039 AllocVerifyRefs* r = (AllocVerifyRefs*)ctx; 2040 PReg p; 2041 if (!op || op->kind != OPK_REG) return; 2042 p = (PReg)op->v.reg; 2043 if (p == 0 || p >= opt_reg_count(f)) return; 2044 if (is_def) { 2045 if (r->ndefs < 256u) r->defs[r->ndefs++] = p; 2046 } else { 2047 if (r->nuses < 256u) r->uses[r->nuses++] = p; 2048 } 2049 } 2050 2051 /* Post-allocation interference verifier for the O1 (no-coalesce) path. Since 2052 * O1 never coalesces, two *distinct* PRegs that are live simultaneously must 2053 * never share a hard register. Re-derive per-instruction liveness from the 2054 * block live-out sets and panic if any definition collides with a different 2055 * live value in the same hard reg — turning a silent miscompile into a hard 2056 * error. */ 2057 static void opt_verify_alloc(Func* f, const OptLiveInfo* live) { 2058 u32 nregs = opt_reg_count(f); 2059 u8* cur; 2060 if (nregs <= 1u || !live) return; 2061 /* No "left in incoming reg" pre-check: the hint path's 2062 * opt_ranges_overlap_kind precision check already permits the unit-overlap 2063 * between a param PReg and its own incoming reg (= "no entry move"), and 2064 * the standard allocator's bitmap rejects every other overlap. The 2065 * per-instruction interference scan below is the residual safety net. */ 2066 cur = arena_array(f->arena, u8, nregs); 2067 for (u32 b = 0; b < f->nblocks; ++b) { 2068 Block* bl = &f->blocks[b]; 2069 const OptBlockLive* lb = &live->blocks[b]; 2070 memset(cur, 0, nregs); 2071 for (PReg p = 1; p < nregs; ++p) 2072 if (opt_bitset_has(&lb->live_out, p)) cur[p] = 1u; 2073 for (u32 ri = bl->ninsts; ri > 0; --ri) { 2074 Inst* in = &bl->insts[ri - 1u]; 2075 AllocVerifyRefs refs; 2076 refs.ndefs = 0; 2077 refs.nuses = 0; 2078 opt_walk_inst_operands(f, in, alloc_verify_collect, &refs); 2079 for (u32 k = 0; k < refs.ndefs; ++k) { 2080 PReg d = refs.defs[k]; 2081 u8 d_kind = opt_preg_alloc_kind(f, d); 2082 if (d_kind != OPT_ALLOC_HARD && d_kind != OPT_ALLOC_SPILL) continue; 2083 for (PReg p = 1; p < nregs; ++p) { 2084 u8 p_kind; 2085 if (!cur[p] || p == d) continue; 2086 p_kind = opt_preg_alloc_kind(f, p); 2087 if (p_kind == OPT_ALLOC_HARD && d_kind == OPT_ALLOC_HARD && 2088 opt_preg_loc_cls(f, p) == opt_preg_loc_cls(f, d) && 2089 opt_preg_hard_reg(f, p) == opt_preg_hard_reg(f, d)) { 2090 SrcLoc loc = {0, 0, 0}; 2091 compiler_panic(f->c, loc, 2092 "opt regalloc: O1 interference — pregs %u and %u " 2093 "share cls%u reg%u, both live at block %u inst %u " 2094 "(op %u)", 2095 (unsigned)d, (unsigned)p, 2096 (unsigned)opt_preg_loc_cls(f, d), 2097 (unsigned)opt_preg_hard_reg(f, d), b, ri - 1u, 2098 (unsigned)in->op); 2099 } 2100 if (p_kind == OPT_ALLOC_SPILL && d_kind == OPT_ALLOC_SPILL && 2101 opt_preg_spill_slot(f, p) == opt_preg_spill_slot(f, d)) { 2102 SrcLoc loc = {0, 0, 0}; 2103 compiler_panic(f->c, loc, 2104 "opt regalloc: O1 interference — pregs %u and %u " 2105 "share spill slot %u, both live at block %u inst %u " 2106 "(op %u)", 2107 (unsigned)d, (unsigned)p, 2108 (unsigned)opt_preg_spill_slot(f, d), b, ri - 1u, 2109 (unsigned)in->op); 2110 } 2111 } 2112 } 2113 for (u32 k = 0; k < refs.ndefs; ++k) cur[refs.defs[k]] = 0u; 2114 for (u32 k = 0; k < refs.nuses; ++k) cur[refs.uses[k]] = 1u; 2115 } 2116 } 2117 } 2118 2119 static void opt_regalloc_place(Func* f, int allow_live_range_split, 2120 OptLiveInfo* live_out) { 2121 metrics_scope_begin(f->c, "opt.live_ranges.regalloc"); 2122 OptLiveInfo live; 2123 opt_live_blocks(f, &live); 2124 OptLiveRangeSet ranges; 2125 opt_live_ranges_build(f, &live, &ranges); 2126 opt_init_preg_info_from_ranges(f, &ranges); 2127 opt_apply_asm_constraints_from_live(f, &live); 2128 apply_abi_aliasing_hints(f); 2129 /* The ABI hint passes clear forbid bits to steer values toward ABI registers 2130 * (ret reg, arg regs). Restore the hard machine-clobber forbids afterward so 2131 * no allocation path (normal, preferred-reg, or two-address group) can place 2132 * a value in a register that an instruction live across it destroys. */ 2133 if (f->preg_info) 2134 for (PReg v = 1; v < opt_reg_count(f); ++v) 2135 f->preg_info[v].forbidden_hard_regs |= 2136 f->preg_info[v].clobbered_hard_regs; 2137 /* MIR coalesces only at -O2 (mir-gen.c:9431); match that here. At O1 the 2138 * point-bitmap allocator emits copies through the natural conflict-free 2139 * path. IRF_NO_COALESCE protects SSA edge copies inserted at O2. */ 2140 if (allow_live_range_split) opt_coalesce_ranges(f, &ranges); 2141 metrics_count(f->c, "opt.live_words", f->opt_live_words); 2142 metrics_count(f->c, "opt.ranges", ranges.nranges); 2143 metrics_count(f->c, "opt.range_points", ranges.point_count); 2144 metrics_count(f->c, "opt.range_raw_points", ranges.raw_point_count); 2145 metrics_count(f->c, "opt.range_max_per_preg", ranges.max_ranges_per_preg); 2146 metrics_count(f->c, "opt.range_max_length", ranges.max_live_length); 2147 metrics_count(f->c, "opt.range_whole_block_spans", ranges.whole_block_spans); 2148 metrics_count(f->c, "opt.live.bitset_words_touched", 2149 live.bitset_words_touched); 2150 metrics_count(f->c, "opt.live.dataflow_iterations", live.dataflow_iterations); 2151 metrics_count(f->c, "opt.live.dataflow_block_visits", 2152 live.dataflow_block_visits); 2153 metrics_count(f->c, "opt.range.point_visits", ranges.range_point_visits); 2154 metrics_count(f->c, "opt.range.preg_scans", ranges.preg_scans); 2155 metrics_count(f->c, "opt.range.live_words_touched", 2156 ranges.live_words_touched); 2157 metrics_count(f->c, "opt.conflict_bytes", 0); 2158 metrics_scope_end(f->c, "opt.live_ranges.regalloc"); 2159 2160 OptAllocator alloc; 2161 opt_assign_ranges(f, &ranges, &alloc, allow_live_range_split); 2162 if (!allow_live_range_split) opt_verify_alloc(f, &live); 2163 if (live_out) *live_out = live; 2164 } 2165 2166 void opt_regalloc_locations(Func* f, int allow_live_range_split, 2167 OptLiveInfo* live_out) { 2168 opt_regalloc_place(f, allow_live_range_split, live_out); 2169 } 2170 2171 void opt_regalloc(Func* f, int allow_live_range_split) { 2172 OptLiveInfo live; 2173 opt_regalloc_place(f, allow_live_range_split, &live); 2174 EdgeMatPlan edge_mats; 2175 memset(&edge_mats, 0, sizeof edge_mats); 2176 if (allow_live_range_split) prepare_split_edge_materialization(f, &edge_mats); 2177 rewrite_func(f, &live); 2178 apply_split_edge_materialization(f, &edge_mats); 2179 }