opt.c (38045B)
1 #include <kit/config.h> 2 #include <string.h> 3 4 #include "abi/abi.h" 5 #include "cg/ir.h" 6 #include "cg/ir_recorder.h" 7 #include "cg/native_asm.h" 8 #include "cg/native_direct_target.h" 9 #include "cg/type.h" 10 #include "core/arena.h" 11 #include "core/core.h" 12 #include "core/diag.h" 13 #include "core/hashmap.h" 14 #include "core/metrics.h" 15 #include "core/pool.h" 16 #include "core/slice.h" 17 #include "core/strbuf.h" 18 #include "debug/debug.h" 19 #include "obj/symresolve.h" 20 #include "opt/opt_internal.h" 21 22 #undef Operand 23 #undef CGCallDesc 24 #undef CGFuncDesc 25 #undef CGParamDesc 26 #undef CGScopeDesc 27 28 /* Fixpoint bound for the whole-program inliner. opt_inline internally clamps to 29 * 4; an inlined straightline body introduces no new call sites, so a small 30 * bound converges. */ 31 #define OPT_WHOLE_PROGRAM_INLINE_ITERS 4 32 33 typedef struct OptImpl { 34 Compiler* c; 35 CgTarget* target; 36 NativeTarget* native; 37 int level; 38 /* Whole-program (LTO) mode: defer all per-function emission to finalize so 39 * the module-wide sweep can GC dead symbols and run cross-function inlining 40 * over the full reachable set. Enabled whenever the optimizer runs (-O1 and 41 * above; see opt_cgtarget_new). The ARM64 path already defers 42 * unconditionally; this generalizes that to every arch and adds the inliner. 43 */ 44 int whole_program; 45 CgFinishPolicy finish_policy; 46 Writer* dump_writer; 47 /* Registry of functions recorded so far, for tiny-inline callee lookup. 48 * `lowered_cache` is parallel to `cg_by_sym`: a lazily re-lowered 49 * pre-machinize Func for the callee, reused across all call sites. All on 50 * c->tu (persist for the whole translation unit). */ 51 CgIrFunc** cg_by_sym; 52 Func** lowered_cache; 53 u32 ncg; 54 u32 cg_cap; 55 } OptImpl; 56 57 HASHMAP_DEFINE(OptFuncIndex, ObjSymId, u32, hash_u32); 58 59 /* A symbol whose definition can be replaced at link time must not have its body 60 * inlined — the inlined copy would defeat the override. Weak definitions are 61 * interposable in every output kind, so they are never safe to inline. (The 62 * broader default-visibility interposition under -shared is governed by the 63 * preserved set; see doc/plan/LTO.md §5/§9.) */ 64 static int opt_cg_func_interposable(OptImpl* o, const CgIrFunc* cg) { 65 const ObjSym* s; 66 if (!cg || cg->desc.sym == OBJ_SYM_NONE) return 0; 67 if (cg->desc.sym_bind == SB_WEAK) return 1; 68 s = obj_symbol_get(o->target->obj, cg->desc.sym); 69 return s && s->bind == SB_WEAK; 70 } 71 72 /* Lower a recorded function to the pre-machinize Func used by the inliners, and 73 * mark interposable definitions INLINE_NEVER so neither the streaming 74 * tiny-inliner nor the whole-program inliner fuses their bodies into callers. 75 * Marking the callee's policy is honored by effective_inline_policy in both. */ 76 static Func* opt_lower_for_inline(OptImpl* o, const CgIrFunc* cg) { 77 Func* f = opt_func_from_cg_ir(o->c, cg); 78 if (f && opt_cg_func_interposable(o, cg)) 79 f->desc.inline_policy = KIT_CG_INLINE_NEVER; 80 return f; 81 } 82 83 /* Lazily re-lower (and cache) the pre-machinize Func for a recorded callee 84 * symbol. Returns NULL for forward-defined callees not yet recorded. */ 85 static Func* opt_tiny_callee_lookup(void* ctx, ObjSymId sym) { 86 OptImpl* o = (OptImpl*)ctx; 87 for (u32 i = 0; i < o->ncg; ++i) { 88 if (o->cg_by_sym[i]->desc.sym != sym) continue; 89 if (!o->lowered_cache[i]) 90 o->lowered_cache[i] = opt_lower_for_inline(o, o->cg_by_sym[i]); 91 return o->lowered_cache[i]; 92 } 93 return NULL; 94 } 95 96 static void opt_registry_add(OptImpl* o, CgIrFunc* f) { 97 if (o->ncg == o->cg_cap) { 98 u32 ncap = o->cg_cap ? o->cg_cap * 2u : 16u; 99 CgIrFunc** ncg = arena_array(o->c->tu, CgIrFunc*, ncap); 100 Func** nlc = arena_zarray(o->c->tu, Func*, ncap); 101 if (o->ncg) { 102 memcpy(ncg, o->cg_by_sym, sizeof(ncg[0]) * o->ncg); 103 memcpy(nlc, o->lowered_cache, sizeof(nlc[0]) * o->ncg); 104 } 105 o->cg_by_sym = ncg; 106 o->lowered_cache = nlc; 107 o->cg_cap = ncap; 108 } 109 o->cg_by_sym[o->ncg++] = f; 110 } 111 112 static void opt_dbg_dump(OptImpl* o, Func* f, const char* tag) { 113 const char* s = kit_debug_getenv("KIT_DUMP"); 114 KitWriter* w = NULL; 115 size_t len = 0; 116 const uint8_t* bytes; 117 if (!s) return; 118 if (strcmp(s, "1") != 0 && strcmp(s, tag) != 0) return; 119 kit_writer_mem(o->c->ctx->heap, &w); 120 opt_ir_dump(f, w); 121 bytes = kit_writer_mem_bytes(w, &len); 122 compiler_panic(o->c, f->desc.loc, "DUMP %s:\n%.*s", tag, (int)len, 123 (const char*)bytes); 124 } 125 126 /* CFG-prep prefix shared by the streaming and whole-program pipelines: lower's 127 * raw blocks -> built CFG -> jump cleanup -> rebuilt CFG -> local simplify. In 128 * whole-program mode this runs on every reachable function before opt_inline 129 * sees the FuncSet, so the inliner observes the same block shape the streaming 130 * path does. */ 131 static void opt_o1_native_prepare(OptImpl* o, Func* f) { 132 if (!o->native) 133 compiler_panic(o->c, f ? f->desc.loc : (SrcLoc){0, 0, 0}, 134 "O1 optimizer requires a native target"); 135 opt_dbg_dump(o, f, "entry"); 136 137 metrics_count(o->c, "opt.funcs", 1); 138 metrics_count(o->c, "opt.blocks", f->nblocks); 139 metrics_count(o->c, "opt.pregs", f->npregs); 140 141 metrics_scope_begin(o->c, "opt.cfg.build_1"); 142 opt_build_cfg(f); 143 metrics_scope_end(o->c, "opt.cfg.build_1"); 144 metrics_scope_begin(o->c, "opt.cfg.jump_cleanup_cfg"); 145 opt_jump_cleanup(f, OPT_JUMP_CLEANUP_CFG); 146 metrics_scope_end(o->c, "opt.cfg.jump_cleanup_cfg"); 147 metrics_scope_begin(o->c, "opt.cfg.build_2"); 148 opt_build_cfg(f); 149 metrics_scope_end(o->c, "opt.cfg.build_2"); 150 metrics_scope_begin(o->c, "opt.cfg.simplify_local"); 151 opt_simplify_local(f); 152 metrics_scope_end(o->c, "opt.cfg.simplify_local"); 153 } 154 155 /* The machinize-through-emit suffix. `cfg_dirty` is set by the whole-program 156 * path: opt_inline mutated the caller's blocks in place and left CFG analysis 157 * stale, so the CFG must be rebuilt before tiny-inline/verify/machinize. The 158 * streaming path passes 0 (prepare just built it). */ 159 static void opt_o1_native_finish(OptImpl* o, Func* f, int cfg_dirty) { 160 OptLiveInfo live; 161 OptLiveInfo regalloc_live; 162 163 if (cfg_dirty) { 164 /* opt_inline maintains succ + emit_order only; rebuild preds/CFG and merge 165 * the BR-glue chains back into straight-line blocks (build_cfg -> 166 * jump_cleanup -> build_cfg) before any pass that needs the analysis. */ 167 opt_build_cfg(f); 168 opt_jump_cleanup(f, OPT_JUMP_CLEANUP_CFG); 169 opt_build_cfg(f); 170 } 171 172 metrics_scope_begin(o->c, "opt.o1.tiny_inline"); 173 int inlined = opt_try_tiny_inline(f, opt_tiny_callee_lookup, o); 174 metrics_scope_end(o->c, "opt.o1.tiny_inline"); 175 if (inlined) { 176 /* inline_call_site invalidated CFG and left preds stale (it maintains 177 * succ + emit_order only); rebuilding is required before verify/machinize. 178 * Then merge the BR-glue chain (pre -> body -> cont) back into 179 * straight-line blocks so regalloc sees no artificial boundaries, mirroring 180 * the prologue's build_cfg -> jump_cleanup -> build_cfg idiom. */ 181 opt_build_cfg(f); 182 opt_jump_cleanup(f, OPT_JUMP_CLEANUP_CFG); 183 opt_build_cfg(f); 184 } 185 186 metrics_scope_begin(o->c, "opt.cfg.verify"); 187 opt_verify(f, "lowering-cfg"); 188 metrics_scope_end(o->c, "opt.cfg.verify"); 189 190 metrics_scope_begin(o->c, "opt.machinize"); 191 opt_machinize_native(f, o->native); 192 metrics_scope_end(o->c, "opt.machinize"); 193 metrics_scope_begin(o->c, "opt.machinize.verify"); 194 opt_verify(f, "lowering-machinize"); 195 metrics_scope_end(o->c, "opt.machinize.verify"); 196 197 metrics_scope_begin(o->c, "opt.o1.addr_xform_pregs"); 198 opt_addr_xform_pregs(f); 199 metrics_scope_end(o->c, "opt.o1.addr_xform_pregs"); 200 metrics_scope_begin(o->c, "opt.o1.addr_xform.verify"); 201 opt_verify(f, "o1-addr-xform"); 202 metrics_scope_end(o->c, "opt.o1.addr_xform.verify"); 203 metrics_scope_begin(o->c, "opt.o1.promote_scalar_locals"); 204 opt_promote_scalar_locals(f); 205 metrics_scope_end(o->c, "opt.o1.promote_scalar_locals"); 206 metrics_scope_begin(o->c, "opt.o1.promote_scalar.verify"); 207 opt_verify(f, "o1-promote-scalar"); 208 metrics_scope_end(o->c, "opt.o1.promote_scalar.verify"); 209 metrics_scope_begin(o->c, "opt.o1.addr_of_global_cse"); 210 opt_addr_of_global_cse(f); 211 metrics_scope_end(o->c, "opt.o1.addr_of_global_cse"); 212 metrics_scope_begin(o->c, "opt.o1.addr_of_global.verify"); 213 opt_verify(f, "o1-addr-global-cse"); 214 metrics_scope_end(o->c, "opt.o1.addr_of_global.verify"); 215 216 metrics_scope_begin(o->c, "opt.build_loop_tree"); 217 opt_build_loop_tree(f); 218 metrics_scope_end(o->c, "opt.build_loop_tree"); 219 metrics_scope_begin(o->c, "opt.o1.lower_loop_imm"); 220 opt_lower_loop_imm_operands(f, o->native); 221 metrics_scope_end(o->c, "opt.o1.lower_loop_imm"); 222 opt_verify(f, "o1-lower-loop-imm"); 223 metrics_scope_begin(o->c, "opt.o1.hoist_loop_consts"); 224 opt_hoist_loop_consts(f); 225 metrics_scope_end(o->c, "opt.o1.hoist_loop_consts"); 226 opt_verify(f, "o1-hoist-loop-consts"); 227 228 metrics_scope_begin(o->c, "opt.live_blocks.pre_dde"); 229 memset(&live, 0, sizeof live); 230 opt_live_blocks(f, &live); 231 metrics_count(o->c, "opt.live_words", f->opt_live_words); 232 metrics_scope_end(o->c, "opt.live_blocks.pre_dde"); 233 metrics_scope_begin(o->c, "opt.dead_def_elim"); 234 opt_dead_def_elim_with_live(f, &live); 235 metrics_scope_end(o->c, "opt.dead_def_elim"); 236 237 metrics_scope_begin(o->c, "opt.regalloc"); 238 memset(®alloc_live, 0, sizeof regalloc_live); 239 opt_regalloc_locations(f, 0, ®alloc_live); 240 metrics_scope_end(o->c, "opt.regalloc"); 241 metrics_scope_begin(o->c, "opt.regalloc.verify"); 242 opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE); 243 opt_verify(f, "post-regalloc"); 244 metrics_scope_end(o->c, "opt.regalloc.verify"); 245 246 metrics_scope_begin(o->c, "opt.lower_mir"); 247 opt_lower_to_mir(f, ®alloc_live); 248 metrics_scope_end(o->c, "opt.lower_mir"); 249 metrics_scope_begin(o->c, "opt.lower_mir.verify"); 250 opt_mir_verify(f, "lower-mir"); 251 metrics_scope_end(o->c, "opt.lower_mir.verify"); 252 metrics_scope_begin(o->c, "opt.combine"); 253 opt_mir_combine(f); 254 metrics_scope_end(o->c, "opt.combine"); 255 metrics_scope_begin(o->c, "opt.combine.verify"); 256 opt_mir_verify(f, "post-mir-combine"); 257 metrics_scope_end(o->c, "opt.combine.verify"); 258 metrics_scope_begin(o->c, "opt.dce"); 259 opt_mir_dce(f); 260 metrics_scope_end(o->c, "opt.dce"); 261 metrics_scope_begin(o->c, "opt.dce.verify"); 262 opt_mir_verify(f, "post-mir-dce"); 263 metrics_scope_end(o->c, "opt.dce.verify"); 264 metrics_scope_begin(o->c, "opt.post_ra.jump_cleanup_cfg"); 265 opt_mir_jump_cleanup(f, OPT_JUMP_CLEANUP_CFG); 266 metrics_scope_end(o->c, "opt.post_ra.jump_cleanup_cfg"); 267 metrics_scope_begin(o->c, "opt.post_ra.build_cfg"); 268 opt_mir_build_cfg(f); 269 metrics_scope_end(o->c, "opt.post_ra.build_cfg"); 270 metrics_scope_begin(o->c, "opt.post_ra.verify"); 271 opt_mir_verify(f, "post-mir-jump-cfg"); 272 metrics_scope_end(o->c, "opt.post_ra.verify"); 273 metrics_scope_begin(o->c, "opt.post_ra.jump_cleanup_layout"); 274 opt_mir_jump_cleanup(f, OPT_JUMP_CLEANUP_LAYOUT); 275 metrics_scope_end(o->c, "opt.post_ra.jump_cleanup_layout"); 276 277 opt_dbg_dump(o, f, "pre-emit"); 278 metrics_scope_begin(o->c, "opt.emit"); 279 if (o->native->mc && o->native->mc->debug) 280 debug_func_select(o->native->mc->debug, f->desc.sym); 281 opt_emit_native(o->c, f, o->native); 282 if (o->native->mc && o->native->mc->debug) 283 debug_func_end(o->native->mc->debug); 284 metrics_scope_end(o->c, "opt.emit"); 285 } 286 287 /* Streaming pipeline for one function: prepare + finish, back to back. Used by 288 * the eager per-function path (x64/rv64 below -O2) and by the ARM64/-O2 sweep 289 * for functions with no cross-function inlining to do. */ 290 static void opt_run_o1_native(OptImpl* o, Func* f) { 291 metrics_scope_begin(o->c, "opt.o1.total"); 292 opt_o1_native_prepare(o, f); 293 opt_o1_native_finish(o, f, /*cfg_dirty=*/0); 294 metrics_scope_end(o->c, "opt.o1.total"); 295 } 296 297 /* Sibling of opt_run_o1_native for the threaded interpreter. Runs the maximal 298 * target-independent subset and stops before opt_machinize_native: no regalloc, 299 * no MIR, no native emission. The result is a Func with virtual PRegs that the 300 * interpreter loader consumes directly. opt_verify is intentionally omitted — 301 * it is a debug aid and some checks assume the machinize-completed shape. */ 302 Func* opt_run_o1_interp(Compiler* c, const CgIrFunc* cg) { 303 Func* f; 304 OptLiveInfo live; 305 metrics_scope_begin(c, "opt.interp.total"); 306 f = opt_func_from_cg_ir(c, cg); 307 opt_build_cfg(f); 308 opt_jump_cleanup(f, OPT_JUMP_CLEANUP_CFG); 309 opt_build_cfg(f); 310 opt_simplify_local(f); 311 /* Target-independent local/escape optimizations. These run after machinize 312 * in the native pipeline but depend only on the PReg/frame-slot view, not on 313 * physical-register pools, so they are safe here and shrink the work the 314 * interpreter does (promote scalar locals to PRegs, fold addr-of-local, 315 * CSE addr-of-global). */ 316 opt_addr_xform_pregs(f); 317 opt_promote_scalar_locals(f); 318 opt_addr_of_global_cse(f); 319 opt_build_loop_tree(f); 320 memset(&live, 0, sizeof live); 321 opt_live_blocks(f, &live); 322 opt_dead_def_elim_with_live(f, &live); 323 metrics_scope_end(c, "opt.interp.total"); 324 { 325 if (kit_debug_getenv("KIT_DUMP_INTERP")) { 326 KitWriter* w = NULL; 327 size_t len = 0; 328 const uint8_t* bytes; 329 kit_writer_mem(c->ctx->heap, &w); 330 opt_ir_dump(f, w); 331 bytes = kit_writer_mem_bytes(w, &len); 332 diag_emit(c->ctx->diag, KIT_DIAG_NOTE, (SrcLoc){0, 0, 0}, 333 "INTERP IR:\n%.*s", (int)len, (const char*)bytes); 334 } 335 } 336 return f; 337 } 338 339 #if KIT_INTERP_ENABLED 340 /* Defined in src/interp/lower.c. Lowers a post-opt_run_o1_interp Func into the 341 * program's bytecode and registers it by symbol (ObjSymId, for internal calls) 342 * and by unmangled C name (for entry lookup). Declared here (rather than 343 * including the interp header into opt) to keep the dependency one-way. */ 344 void interp_capture_func(void* program, Func* f, ObjSymId sym, const char* name, 345 u32 name_len, const ObjBuilder* obj); 346 347 static void opt_maybe_capture_interp(OptImpl* o, const CgIrFunc* cg) { 348 ObjSymId sym; 349 Slice name; 350 if (!o->c->interp_sink) return; 351 sym = cg->desc.sym; 352 name = 353 (sym != OBJ_SYM_NONE) 354 ? pool_slice(o->c->global, obj_symbol_get(o->target->obj, sym)->name) 355 : SLICE_NULL; 356 interp_capture_func(o->c->interp_sink, opt_run_o1_interp(o->c, cg), sym, 357 name.s, (u32)name.len, o->target->obj); 358 } 359 #else 360 static void opt_maybe_capture_interp(OptImpl* o, const CgIrFunc* cg) { 361 (void)o; 362 (void)cg; 363 } 364 #endif 365 366 static void opt_dbg_dump_cg(OptImpl* o, const CgIrFunc* f) { 367 KitWriter* w = NULL; 368 size_t len = 0; 369 const uint8_t* bytes; 370 if (!kit_debug_getenv("KIT_DUMPCG")) return; 371 kit_writer_mem(o->c->ctx->heap, &w); 372 cg_ir_func_dump(f, w); 373 bytes = kit_writer_mem_bytes(w, &len); 374 compiler_panic(o->c, f->desc.loc, "CGIR:\n%.*s", (int)len, 375 (const char*)bytes); 376 } 377 378 static void opt_on_func(void* user, CgIrFunc* cg_func) { 379 OptImpl* o = (OptImpl*)user; 380 Func* f; 381 /* Register before lowering so later callers can resolve this (complete) 382 * function as a tiny-inline callee. */ 383 opt_registry_add(o, cg_func); 384 opt_dbg_dump_cg(o, cg_func); 385 /* The dump writer renders the semantic CG IR tape — the IR as recorded, 386 * before lowering to the optimizer's CFG form. */ 387 if (o->dump_writer) cg_ir_func_dump(cg_func, o->dump_writer); 388 /* Defer emission to the finalize sweep whenever whole-program mode is on — 389 * the same path for every arch. The sweep does GC + cross-function inlining 390 * over the full reachable set. The eager emit below is the fallback for a 391 * (currently unreachable) non-whole-program configuration. */ 392 if (o->whole_program) return; 393 metrics_scope_begin(o->c, "opt.o1.cg_ir_lower"); 394 f = opt_func_from_cg_ir(o->c, cg_func); 395 metrics_scope_end(o->c, "opt.o1.cg_ir_lower"); 396 opt_run_o1_native(o, f); 397 opt_maybe_capture_interp(o, cg_func); 398 } 399 400 static int opt_module_has_asm(const CgIrModule* module) { 401 if (!module) return 0; 402 if (module->nfile_scope_asms) return 1; 403 for (u32 i = 0; i < module->nfuncs; ++i) { 404 const CgIrFunc* f = module->funcs[i]; 405 if (!f || f->removed) continue; 406 for (u32 k = 0; k < f->ninsts; ++k) 407 if (f->insts[k].op == CG_IR_ASM_BLOCK) return 1; 408 } 409 return 0; 410 } 411 412 static int opt_sym_in_preserved_section(OptImpl* o, ObjSymId sym, 413 const ObjSym* s) { 414 const Section* sec; 415 const ObjAtom* atom; 416 ObjAtomId aid; 417 if (!o || !s || s->section_id == OBJ_SEC_NONE) return 0; 418 sec = obj_section_get(o->target->obj, s->section_id); 419 if (sec && ((sec->flags & SF_RETAIN) || sec->sem == SSEM_INIT_ARRAY || 420 sec->sem == SSEM_FINI_ARRAY || sec->sem == SSEM_PREINIT_ARRAY)) 421 return 1; 422 aid = obj_atom_find_symbol(o->target->obj, sym); 423 atom = obj_atom_get(o->target->obj, aid); 424 return atom && (atom->flags & OBJ_ATOM_RETAIN); 425 } 426 427 static void opt_build_preserved_set(OptImpl* o, ObjSymSet* preserved) { 428 for (u32 i = 0; i < o->finish_policy.npreserved_symbols; ++i) { 429 ObjSymId sym = o->finish_policy.preserved_symbols[i]; 430 if (sym != OBJ_SYM_NONE) (void)ObjSymSet_set(preserved, sym, 1); 431 } 432 } 433 434 static int opt_sym_must_stay_external(OptImpl* o, int module_has_asm, 435 const ObjSymSet* preserved, ObjSymId sym, 436 const ObjSym* s) { 437 if (!s || s->removed) return 1; 438 if (s->bind == SB_LOCAL) return 1; 439 if (o->finish_policy.output_kind != KIT_CG_OUTPUT_EXECUTABLE) return 1; 440 if (o->finish_policy.interposition_policy == 441 KIT_CG_INTERPOSITION_DEFAULT_VISIBILITY) 442 return 1; 443 if (ObjSymSet_get(preserved, sym)) return 1; 444 if (s->bind == SB_WEAK) return 1; 445 if (s->kind == SK_IFUNC) return 1; 446 if (s->flags & KIT_CG_SYM_USED) return 1; 447 if (module_has_asm) return 1; 448 if (opt_sym_in_preserved_section(o, sym, s)) return 1; 449 return 0; 450 } 451 452 static int opt_sym_internalizable(const ObjSym* s) { 453 if (!s || s->removed || s->bind == SB_LOCAL) return 0; 454 if (!symresolve_sym_is_def(s)) return 0; 455 switch ((SymKind)s->kind) { 456 case SK_FUNC: 457 case SK_OBJ: 458 case SK_TLS: 459 return 1; 460 default: 461 return 0; 462 } 463 } 464 465 static void opt_internalize_non_preserved(OptImpl* o, const CgIrModule* module, 466 const ObjSymSet* preserved) { 467 ObjSymIter* it; 468 ObjSymEntry ent; 469 int module_has_asm; 470 if (!o || !o->target || !o->target->obj) return; 471 if (o->finish_policy.output_kind != KIT_CG_OUTPUT_EXECUTABLE) return; 472 module_has_asm = opt_module_has_asm(module); 473 it = obj_symiter_new(o->target->obj); 474 while (it && obj_symiter_next(it, &ent)) { 475 const ObjSym* s = ent.sym; 476 if (!opt_sym_internalizable(s)) continue; 477 if (opt_sym_must_stay_external(o, module_has_asm, preserved, ent.id, s)) 478 continue; 479 obj_symbol_set_bind(o->target->obj, ent.id, SB_LOCAL); 480 obj_symbol_set_vis(o->target->obj, ent.id, SV_HIDDEN); 481 } 482 if (it) obj_symiter_free(it); 483 } 484 485 static int opt_func_is_root(OptImpl* o, const CgIrFunc* f) { 486 const ObjSym* s; 487 if (!f || f->removed || f->desc.sym == OBJ_SYM_NONE) return 0; 488 s = obj_symbol_get(o->target->obj, f->desc.sym); 489 if (!s || s->removed) return 0; 490 if (s->bind != SB_LOCAL) return 1; 491 if (s->flags & KIT_CG_SYM_USED) return 1; 492 return 0; 493 } 494 495 static SymAttrs opt_func_sym_attrs(OptImpl* o, const CgIrFunc* f) { 496 SymAttrs a; 497 const ObjSym* s; 498 memset(&a, 0, sizeof a); 499 if (!f) return a; 500 s = obj_symbol_get(o->target->obj, f->desc.sym); 501 a.bind = f->desc.sym_bind ? f->desc.sym_bind : (s ? s->bind : SB_GLOBAL); 502 a.kind = f->desc.sym_kind ? f->desc.sym_kind : SK_FUNC; 503 a.size = s ? s->size : 0; 504 a.common_align = 0; 505 a.in_comdat = 0; 506 return a; 507 } 508 509 static void opt_resolve_duplicate_funcs(OptImpl* o, const CgIrModule* module, 510 OptFuncIndex* index) { 511 for (u32 i = 0; i < module->nfuncs; ++i) { 512 CgIrFunc* incoming = module->funcs[i]; 513 u32* existing_idx; 514 if (!incoming || incoming->removed || incoming->desc.sym == OBJ_SYM_NONE) 515 continue; 516 existing_idx = OptFuncIndex_get(index, incoming->desc.sym); 517 if (!existing_idx) { 518 (void)OptFuncIndex_set(index, incoming->desc.sym, i); 519 continue; 520 } 521 { 522 CgIrFunc* existing = module->funcs[*existing_idx]; 523 SymMergeResult mr = symresolve_merge(opt_func_sym_attrs(o, existing), 524 opt_func_sym_attrs(o, incoming)); 525 switch (mr.kind) { 526 case SYM_MERGE_REPLACE: 527 if (existing) existing->removed = 1; 528 (void)OptFuncIndex_set(index, incoming->desc.sym, i); 529 if (incoming->desc.sym_bind) 530 obj_symbol_set_bind(o->target->obj, incoming->desc.sym, 531 (SymBind)incoming->desc.sym_bind); 532 break; 533 case SYM_MERGE_KEEP_EXISTING: 534 case SYM_MERGE_COMDAT_DISCARD: 535 incoming->removed = 1; 536 if (existing && existing->desc.sym_bind) 537 obj_symbol_set_bind(o->target->obj, incoming->desc.sym, 538 (SymBind)existing->desc.sym_bind); 539 break; 540 case SYM_MERGE_COMMON: 541 incoming->removed = 1; 542 break; 543 case SYM_MERGE_ODR_ERROR: 544 compiler_panic(o->c, incoming->desc.loc, 545 "duplicate definition of symbol"); 546 } 547 } 548 } 549 } 550 551 static void opt_mark_func(u8* reachable, u8* queued, u32* queue, u32* qtail, 552 u32 idx) { 553 if (reachable[idx]) return; 554 reachable[idx] = 1; 555 if (!queued[idx]) { 556 queued[idx] = 1; 557 queue[(*qtail)++] = idx; 558 } 559 } 560 561 static void opt_mark_sym(OptFuncIndex* index, u8* reachable, u8* queued, 562 u32* queue, u32* qtail, ObjSymId sym) { 563 u32* slot; 564 if (sym == OBJ_SYM_NONE) return; 565 slot = OptFuncIndex_get(index, sym); 566 if (slot) opt_mark_func(reachable, queued, queue, qtail, *slot); 567 } 568 569 static void opt_mark_symset(OptFuncIndex* index, u8* reachable, u8* queued, 570 u32* queue, u32* qtail, const ObjSymSet* refs) { 571 if (!refs || !refs->cap) return; 572 for (u32 i = 0; i < refs->cap; ++i) { 573 ObjSymId sym = refs->slots[i].k; 574 if (sym != OBJ_SYM_NONE) 575 opt_mark_sym(index, reachable, queued, queue, qtail, sym); 576 } 577 } 578 579 static int opt_sym_is_relocatable_data(const ObjSym* s) { 580 if (!s || s->removed || s->section_id == OBJ_SEC_NONE) return 0; 581 return s->kind == SK_OBJ || s->kind == SK_TLS || s->kind == SK_COMMON; 582 } 583 584 static int opt_reloc_inside_sym(const Reloc* r, const ObjSym* s) { 585 u64 begin, end; 586 if (!r || r->removed || !opt_sym_is_relocatable_data(s)) return 0; 587 if (r->section_id != s->section_id) return 0; 588 begin = s->value; 589 end = begin + s->size; 590 if (s->size == 0) end = begin + 1u; 591 return (u64)r->offset >= begin && (u64)r->offset < end; 592 } 593 594 static void opt_enqueue_data_sym(OptImpl* o, ObjSymSet* seen, ObjSymId* queue, 595 u32* qtail, ObjSymId sym) { 596 const ObjSym* s; 597 if (sym == OBJ_SYM_NONE || ObjSymSet_get(seen, sym)) return; 598 s = obj_symbol_get(o->target->obj, sym); 599 if (!opt_sym_is_relocatable_data(s)) return; 600 (void)ObjSymSet_set(seen, sym, 1); 601 queue[(*qtail)++] = sym; 602 } 603 604 static void opt_enqueue_data_symset(OptImpl* o, ObjSymSet* seen, 605 ObjSymId* queue, u32* qtail, 606 const ObjSymSet* refs) { 607 if (!refs || !refs->cap) return; 608 for (u32 i = 0; i < refs->cap; ++i) { 609 ObjSymId sym = refs->slots[i].k; 610 if (sym != OBJ_SYM_NONE) opt_enqueue_data_sym(o, seen, queue, qtail, sym); 611 } 612 } 613 614 static void opt_mark_data_reloc_graph(OptImpl* o, OptFuncIndex* index, 615 u8* reachable, u8* queued, 616 u32* func_queue, u32* func_qtail, 617 ObjSymSet* data_seen, 618 ObjSymId* data_queue, u32* data_qhead, 619 u32* data_qtail) { 620 u32 nrel = obj_reloc_total(o->target->obj); 621 while (*data_qhead < *data_qtail) { 622 ObjSymId data_sym = data_queue[(*data_qhead)++]; 623 const ObjSym* data = obj_symbol_get(o->target->obj, data_sym); 624 if (!opt_sym_is_relocatable_data(data)) continue; 625 for (u32 i = 0; i < nrel; ++i) { 626 const Reloc* r = obj_reloc_at(o->target->obj, i); 627 if (!opt_reloc_inside_sym(r, data)) continue; 628 opt_mark_sym(index, reachable, queued, func_queue, func_qtail, r->sym); 629 opt_enqueue_data_sym(o, data_seen, data_queue, data_qtail, r->sym); 630 } 631 } 632 } 633 634 static ObjSymId opt_data_reloc_exported_root_sym(OptImpl* o, const Reloc* r) { 635 ObjSymIter* it; 636 ObjSymEntry ent; 637 const Section* sec; 638 if (!r || r->removed || r->section_id == OBJ_SEC_NONE) return OBJ_SYM_NONE; 639 sec = obj_section_get(o->target->obj, r->section_id); 640 if (!sec || sec->removed || sec->kind == SEC_TEXT) return OBJ_SYM_NONE; 641 it = obj_symiter_new(o->target->obj); 642 if (!it) return OBJ_SYM_NONE; 643 while (obj_symiter_next(it, &ent)) { 644 const ObjSym* s = ent.sym; 645 if (!opt_reloc_inside_sym(r, s)) continue; 646 if (sec->flags & SF_RETAIN) { 647 obj_symiter_free(it); 648 return ent.id; 649 } 650 if (s->bind == SB_LOCAL && !(s->flags & KIT_CG_SYM_USED)) continue; 651 obj_symiter_free(it); 652 return ent.id; 653 } 654 obj_symiter_free(it); 655 return OBJ_SYM_NONE; 656 } 657 658 static void opt_root_exported_data_relocs(OptImpl* o, ObjSymSet* data_seen, 659 ObjSymId* data_queue, 660 u32* data_qtail) { 661 u32 nrel = obj_reloc_total(o->target->obj); 662 for (u32 i = 0; i < nrel; ++i) { 663 const Reloc* r = obj_reloc_at(o->target->obj, i); 664 ObjSymId sym = opt_data_reloc_exported_root_sym(o, r); 665 if (sym != OBJ_SYM_NONE) 666 opt_enqueue_data_sym(o, data_seen, data_queue, data_qtail, sym); 667 } 668 } 669 670 static void opt_root_aliases(OptImpl* o, const CgIrModule* module, 671 OptFuncIndex* index, u8* reachable, u8* queued, 672 u32* queue, u32* qtail) { 673 for (u32 i = 0; module && i < module->naliases; ++i) { 674 const CgIrAlias* a = &module->aliases[i]; 675 const ObjSym* s = obj_symbol_get(o->target->obj, a->alias_sym); 676 if (!s || s->removed) continue; 677 if (s->bind != SB_LOCAL || (s->flags & KIT_CG_SYM_USED)) 678 opt_mark_sym(index, reachable, queued, queue, qtail, a->target_sym); 679 } 680 } 681 682 static void opt_refresh_or_prune_aliases(OptImpl* o, const CgIrModule* module, 683 OptFuncIndex* index, 684 const u8* reachable) { 685 for (u32 i = 0; module && i < module->naliases; ++i) { 686 const CgIrAlias* a = &module->aliases[i]; 687 const ObjSym* ts; 688 const ObjSym* as; 689 u32* target_idx = OptFuncIndex_get(index, a->target_sym); 690 if (!target_idx || !reachable[*target_idx]) { 691 as = obj_symbol_get(o->target->obj, a->alias_sym); 692 if (as && as->bind == SB_LOCAL) 693 obj_symbol_remove(o->target->obj, a->alias_sym); 694 continue; 695 } 696 ts = obj_symbol_get(o->target->obj, a->target_sym); 697 if (ts && !ts->removed && ts->section_id != OBJ_SEC_NONE) 698 obj_symbol_define(o->target->obj, a->alias_sym, ts->section_id, ts->value, 699 ts->size); 700 } 701 } 702 703 static void opt_prune_debug(OptImpl* o) { 704 if (o->native && o->native->mc && o->native->mc->debug) 705 debug_prune_removed_funcs(o->native->mc->debug); 706 } 707 708 /* Whole-module finalize: seed roots, walk the call/use + data-reloc graph, 709 * remove unreachable local symbols, then lower + optimize + emit only the live 710 * set. Arch-independent — the ARM64 path has always finalized this way; -O2 now 711 * routes every arch through here (see opt_on_finalize). When `do_inline` is set 712 * the live functions are lowered into a FuncSet and run through the 713 * whole-program inliner before the per-function machinize/emit suffix. */ 714 static void opt_whole_module_finalize(OptImpl* o, const CgIrModule* module, 715 int do_inline) { 716 OptFuncIndex index; 717 ObjSymSet preserved; 718 ObjSymSet data_seen; 719 u8* reachable; 720 u8* queued; 721 u32* queue; 722 ObjSymId* data_queue; 723 u32 qhead = 0; 724 u32 qtail = 0; 725 u32 data_qhead = 0; 726 u32 data_qtail = 0; 727 u32 nsym = 1; 728 if (!module || !module->nfuncs) return; 729 OptFuncIndex_init_cap(&index, o->c->ctx->heap, 0); 730 ObjSymSet_init_cap(&preserved, o->c->ctx->heap, 0); 731 ObjSymSet_init_cap(&data_seen, o->c->ctx->heap, 0); 732 reachable = arena_zarray(o->c->tu, u8, module->nfuncs); 733 queued = arena_zarray(o->c->tu, u8, module->nfuncs); 734 queue = arena_array(o->c->tu, u32, module->nfuncs); 735 { 736 ObjSymIter* it = obj_symiter_new(o->target->obj); 737 ObjSymEntry ent; 738 while (it && obj_symiter_next(it, &ent)) ++nsym; 739 if (it) obj_symiter_free(it); 740 } 741 data_queue = arena_array(o->c->tu, ObjSymId, nsym); 742 opt_resolve_duplicate_funcs(o, module, &index); 743 opt_build_preserved_set(o, &preserved); 744 opt_internalize_non_preserved(o, module, &preserved); 745 for (u32 i = 0; i < module->nfuncs; ++i) { 746 if (module->funcs[i] && module->funcs[i]->removed) continue; 747 if (module->nfile_scope_asms || opt_func_is_root(o, module->funcs[i])) 748 opt_mark_func(reachable, queued, queue, &qtail, i); 749 } 750 opt_root_aliases(o, module, &index, reachable, queued, queue, &qtail); 751 opt_root_exported_data_relocs(o, &data_seen, data_queue, &data_qtail); 752 opt_mark_data_reloc_graph(o, &index, reachable, queued, queue, &qtail, 753 &data_seen, data_queue, &data_qhead, &data_qtail); 754 while (qhead < qtail) { 755 CgIrFunc* f = module->funcs[queue[qhead++]]; 756 opt_mark_symset(&index, reachable, queued, queue, &qtail, &f->call_refs); 757 opt_mark_symset(&index, reachable, queued, queue, &qtail, &f->global_refs); 758 opt_enqueue_data_symset(o, &data_seen, data_queue, &data_qtail, 759 &f->global_refs); 760 opt_mark_data_reloc_graph(o, &index, reachable, queued, queue, &qtail, 761 &data_seen, data_queue, &data_qhead, &data_qtail); 762 } 763 for (u32 i = 0; i < module->nfuncs; ++i) { 764 CgIrFunc* cg_func = module->funcs[i]; 765 if (cg_func && cg_func->removed) continue; 766 if (reachable[i]) continue; 767 if (cg_func && cg_func->desc.sym != OBJ_SYM_NONE) { 768 const ObjSym* s = obj_symbol_get(o->target->obj, cg_func->desc.sym); 769 if (s && s->bind == SB_LOCAL) 770 obj_symbol_remove(o->target->obj, cg_func->desc.sym); 771 } 772 } 773 opt_prune_debug(o); 774 if (!do_inline) { 775 /* Streaming emit: lower and run the full per-function pipeline in place. 776 * Preserves the historical ARM64 -O1 behavior exactly. */ 777 for (u32 i = 0; i < module->nfuncs; ++i) { 778 Func* f; 779 if (!reachable[i]) continue; 780 metrics_scope_begin(o->c, "opt.o1.cg_ir_lower"); 781 f = opt_func_from_cg_ir(o->c, module->funcs[i]); 782 metrics_scope_end(o->c, "opt.o1.cg_ir_lower"); 783 opt_run_o1_native(o, f); 784 opt_maybe_capture_interp(o, module->funcs[i]); 785 } 786 } else { 787 /* Whole-program inline: lower + CFG-prep every live function into one 788 * FuncSet so the inliner can resolve direct callees by symbol across the 789 * module, inline under the growth-gated cost model, then run the 790 * machinize/emit suffix on each (cfg_dirty=1 because opt_inline left the 791 * caller CFGs stale). Functions and their source CgIrFuncs are tracked in 792 * parallel so the interp-capture re-lowers the right body. */ 793 FuncSet fs; 794 CgIrFunc** cg_srcs; 795 u32 nlive = 0; 796 for (u32 i = 0; i < module->nfuncs; ++i) 797 if (reachable[i] && module->funcs[i] && !module->funcs[i]->removed) 798 ++nlive; 799 memset(&fs, 0, sizeof fs); 800 fs.c = o->c; 801 fs.arena = o->c->tu; 802 fs.funcs = arena_array(o->c->tu, Func*, nlive ? nlive : 1u); 803 fs.cap = nlive; 804 cg_srcs = arena_array(o->c->tu, CgIrFunc*, nlive ? nlive : 1u); 805 for (u32 i = 0; i < module->nfuncs; ++i) { 806 Func* f; 807 if (!reachable[i] || !module->funcs[i] || module->funcs[i]->removed) 808 continue; 809 metrics_scope_begin(o->c, "opt.o1.cg_ir_lower"); 810 f = opt_lower_for_inline(o, module->funcs[i]); 811 metrics_scope_end(o->c, "opt.o1.cg_ir_lower"); 812 opt_o1_native_prepare(o, f); 813 cg_srcs[fs.nfuncs] = module->funcs[i]; 814 fs.funcs[fs.nfuncs++] = f; 815 } 816 metrics_scope_begin(o->c, "opt.inline.total"); 817 opt_inline(&fs, OPT_WHOLE_PROGRAM_INLINE_ITERS); 818 metrics_scope_end(o->c, "opt.inline.total"); 819 for (u32 k = 0; k < fs.nfuncs; ++k) { 820 metrics_scope_begin(o->c, "opt.o1.total"); 821 opt_o1_native_finish(o, fs.funcs[k], /*cfg_dirty=*/1); 822 metrics_scope_end(o->c, "opt.o1.total"); 823 opt_maybe_capture_interp(o, cg_srcs[k]); 824 } 825 } 826 opt_refresh_or_prune_aliases(o, module, &index, reachable); 827 ObjSymSet_fini(&data_seen); 828 ObjSymSet_fini(&preserved); 829 OptFuncIndex_fini(&index); 830 } 831 832 static void opt_on_finalize(void* user, const CgIrModule* module) { 833 OptImpl* o = (OptImpl*)user; 834 /* File-scope asm blocks are captured during recording (no live target then) 835 * and replayed here, before finalize. Emission order relative to functions 836 * does not matter: each block selects its own sections (.data/.text/...). */ 837 if (o->native && o->native->file_scope_asm && module) { 838 for (u32 i = 0; i < module->nfile_scope_asms; ++i) 839 o->native->file_scope_asm(o->native, module->file_scope_asms[i].src, 840 module->file_scope_asms[i].len); 841 } 842 /* Whole-program mode finalizes through the module sweep — one path for every 843 * arch: GC + cross-function inlining over the full reachable set. If it were 844 * off, every arch would have emitted eagerly in opt_on_func and the sweep 845 * would find nothing, so it is skipped. */ 846 if (o->whole_program) 847 opt_whole_module_finalize(o, module, /*do_inline=*/o->whole_program); 848 if (o->native && o->native->finalize) o->native->finalize(o->native); 849 } 850 851 static void opt_on_destroy(void* user) { 852 OptImpl* o = (OptImpl*)user; 853 if (o->native && o->native->destroy) o->native->destroy(o->native); 854 (void)o; 855 } 856 857 static int opt_on_local_static_data_begin(void* user, 858 const CGLocalStaticDataDesc* desc) { 859 (void)desc; 860 OptImpl* o = (OptImpl*)user; 861 return o && o->target && o->target->local_static_data_begin && 862 o->target->local_static_data_write && 863 o->target->local_static_data_label_addr && 864 o->target->local_static_data_end; 865 } 866 867 static const char* opt_on_tail_call_unrealizable_reason( 868 void* user, const struct CGFuncDesc* caller, const CGCallDesc* call) { 869 OptImpl* o = (OptImpl*)user; 870 int callee_va = 0, caller_va = 0; 871 u32 callee_nparams = 0; 872 u32 outgoing, incoming; 873 if (!o || !o->native || !o->native->signature_stack_bytes) return NULL; 874 /* A realized sibling call reuses the area this function received for its own 875 * incoming stack arguments to hold the callee's outgoing stack arguments, so 876 * the callee's must fit. If they don't, the tail isn't realizable and CG 877 * falls back to an ordinary call + return. */ 878 outgoing = o->native->signature_stack_bytes(o->native, call->fn_type, 879 &callee_va, &callee_nparams); 880 incoming = o->native->signature_stack_bytes(o->native, caller->fn_type, 881 &caller_va, NULL); 882 /* A variadic caller's incoming layout includes the register save / overflow 883 * area, which isn't a simple reusable parameter slab — don't realize. And if 884 * the call actually passes variadic (beyond-fixed) arguments, their stack 885 * footprint isn't captured by the fixed-parameter sizing above, so be 886 * conservative and fall back. A variadic callee invoked with only its fixed 887 * arguments (e.g. `musttail sink(x)`) is fine and uses the byte comparison. 888 */ 889 if (caller_va) return "variadic caller cannot host a sibling call"; 890 if (callee_va && call->nargs > callee_nparams) 891 return "variadic tail call arguments are not realizable as a sibling call"; 892 if (outgoing > incoming) 893 return "tail call stack arguments exceed the caller's parameter area"; 894 return NULL; 895 } 896 897 static int opt_on_asm_is_reg_constraint(void* user, const char* constraint) { 898 OptImpl* o = (OptImpl*)user; 899 return native_asm_constraint_is_reg(o ? o->native : NULL, constraint); 900 } 901 902 CgTarget* opt_cgtarget_new(Compiler* c, CgTarget* target, int level) { 903 if (!target) 904 compiler_panic(c, (SrcLoc){0, 0, 0}, "opt_cgtarget_new: target is NULL"); 905 if (level < 1) 906 compiler_panic(c, (SrcLoc){0, 0, 0}, 907 "opt_cgtarget_new: level %d out of range", level); 908 OptImpl* o = arena_znew(c->tu, OptImpl); 909 o->c = c; 910 o->target = target; 911 o->native = native_direct_target_native(target); 912 o->level = level; 913 /* Whenever the optimizer is engaged (-O1 and above) we run whole-program 914 * optimization: deferred emission plus the module-wide reachability sweep and 915 * cross-function inliner. The optimizer recorder only exists at level >= 1 916 * (see kit_cg_begin), so this is effectively "on whenever optimizing". 917 * -O0 uses the single-pass direct target and never reaches this code. */ 918 o->whole_program = (level >= 1) ? 1 : 0; 919 920 CgIrRecorderConfig cfg; 921 memset(&cfg, 0, sizeof cfg); 922 cfg.func_recorded = opt_on_func; 923 cfg.finalize = opt_on_finalize; 924 cfg.destroy = opt_on_destroy; 925 cfg.local_static_data_begin = opt_on_local_static_data_begin; 926 cfg.tail_call_unrealizable_reason = opt_on_tail_call_unrealizable_reason; 927 cfg.asm_is_reg_constraint = opt_on_asm_is_reg_constraint; 928 cfg.user = o; 929 return cg_ir_recorder_new(c, target->obj, &cfg); 930 } 931 932 void opt_set_dump_writer(CgTarget* t, Writer* w) { 933 CgIrRecorder* rec = cg_ir_recorder_from_target(t); 934 OptImpl* o = rec ? (OptImpl*)cg_ir_recorder_user(rec) : NULL; 935 if (o) o->dump_writer = w; 936 } 937 938 void opt_set_finish_policy(CgTarget* t, const CgFinishPolicy* policy) { 939 CgIrRecorder* rec = cg_ir_recorder_from_target(t); 940 OptImpl* o = rec ? (OptImpl*)cg_ir_recorder_user(rec) : NULL; 941 if (!o) return; 942 memset(&o->finish_policy, 0, sizeof(o->finish_policy)); 943 if (policy) o->finish_policy = *policy; 944 }