kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 396793eaaaa6a778f2f6face8a960fd58a3438db
parent 77167e7c944869160b845e1a94f3a81a9105f435
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Fri, 22 May 2026 12:24:56 -0700

opt emit: dedup set_loc calls; cache hard-reg scan; gate verify on NDEBUG

pass_emit.c: skip redundant set_loc calls between consecutive instructions
that share the same source location (common case), keeping one forced update
per function for panic accuracy when debug info is off. Also cache the
collect_replayed_hard_regs result so plan_hard_regs and reserve_hard_regs
share one scan instead of two. Add metrics_scope instrumentation around each
major emit phase (setup, plan_hard_regs, func_begin, body, reserve_hard_regs,
func_end).

pass_analysis.c: guard verify-only helpers (opt_no_loc, opt_fail,
verify_stage_is_ssa, block_has_pred, fixed_terminator_succ_count,
verify_operand, and verify_def_use) under #ifndef NDEBUG; make opt_verify a
no-op in release builds via an early #ifdef NDEBUG return.

test/opt/opt_test.c: guard stale_verify_arg and the body of
opt_verify_catches_stale_def_use under #ifndef NDEBUG, since opt_verify
is a no-op in release builds and cannot catch the stale def-use case.

driver/env.c: on Mach-O hosts try the underscore-stripped name first in
driver_dlsym_resolver, avoiding a wasted dlsym call for every libc symbol
that arrives with a leading underscore.

Diffstat:
M driver/env.c | 14 +++++++++++---
M src/opt/pass_analysis.c | 14 ++++++++++++++
M src/opt/pass_emit.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------
M test/opt/opt_test.c | 7 +++++++
4 files changed, 106 insertions(+), 15 deletions(-)

diff --git a/driver/env.c b/driver/env.c @@ -1758,14 +1758,22 @@ void *driver_dlsym_resolver(void *user, const char *name) { (void)user; if (!name) return NULL; - p = dlsym(RTLD_DEFAULT, name); /* On Mach-O hosts the linker hands us C names with a leading underscore * (obj_format_c_mangle), but dlsym(RTLD_DEFAULT) expects the - * source-level name. Retry with the prefix stripped so JITed code can - * resolve libc symbols by their C name. */ + * source-level name. Try the stripped form first to avoid a wasted + * dlsym call per libc symbol. */ +#if defined(__APPLE__) + if (name[0] == '_' && name[1] != '\0') { + p = dlsym(RTLD_DEFAULT, name + 1); + if (p) return p; + } + return dlsym(RTLD_DEFAULT, name); +#else + p = dlsym(RTLD_DEFAULT, name); if (!p && name[0] == '_' && name[1] != '\0') p = dlsym(RTLD_DEFAULT, name + 1); return p; +#endif } int driver_read_line(char *buf, size_t cap) { diff --git a/src/opt/pass_analysis.c b/src/opt/pass_analysis.c @@ -6,11 +6,13 @@ #define OPT_BLK_NONE 0xffffffffu +#ifndef NDEBUG static SrcLoc opt_no_loc(void) { SrcLoc loc = {0, 0, 0}; return loc; } + static void opt_fail(Func* f, const char* stage, const char* msg, u32 a, u32 b) { compiler_panic(f->c, opt_no_loc(), "opt verify[%s]: %s (%u, %u)", @@ -21,6 +23,7 @@ static int verify_stage_is_ssa(const char* stage) { return stage && strstr(stage, "ssa") != NULL && strstr(stage, "pre-ssa") == NULL; } +#endif void opt_analysis_mark_valid(Func* f, u32 flags) { if (!f) return; @@ -162,6 +165,7 @@ int opt_analysis_dominates(const OptAnalysis* a, u32 dom, u32 node) { return 0; } +#ifndef NDEBUG static int block_has_pred(const Block* bl, u32 pred) { for (u32 i = 0; i < bl->npreds; ++i) if (bl->preds[i] == pred) return 1; @@ -201,6 +205,7 @@ static int fixed_terminator_succ_count(const Inst* in, u32* count_out) { return 0; } } +#endif /* !NDEBUG (verify-only helpers) */ static void opt_use_add(Func* f, Val v, u32 b, u32 i, u8 kind, u32 op_idx, u32 pred_idx, Operand* op) { @@ -335,6 +340,7 @@ 
void opt_rebuild_def_use(Func* f) { opt_analysis_mark_valid(f, OPT_ANALYSIS_DEF_USE); } +#ifndef NDEBUG static void verify_operand(Func* f, Inst* in, Operand* op, int is_def, void* arg) { (void)in; @@ -477,7 +483,14 @@ static void verify_def_use(Func* f, const char* stage) { } } +#endif /* !NDEBUG */ + void opt_verify(Func* f, const char* stage) { +#ifdef NDEBUG + (void)f; + (void)stage; + return; +#else if (!f) return; if (f->nblocks && f->entry >= f->nblocks) opt_fail(f, stage, "entry out of range", f->entry, f->nblocks); @@ -520,4 +533,5 @@ void opt_verify(Func* f, const char* stage) { } verify_values(f, stage); verify_def_use(f, stage); +#endif } diff --git a/src/opt/pass_emit.c b/src/opt/pass_emit.c @@ -3,6 +3,7 @@ #include "arch/regalloc.h" #include "core/arena.h" #include "core/core.h" +#include "core/metrics.h" #include "opt/ir.h" #include "opt/opt_internal.h" @@ -18,8 +19,25 @@ typedef struct ReplayCtx { u8* block_label_placed; u8 identity_regs; CGSimpleRegAlloc regalloc; + /* Cached hard-reg collection: filled once when identity_regs is set and + * reused by plan_hard_regs and reserve_hard_regs callbacks. */ + Reg used_hard_regs[OPT_REG_CLASSES][OPT_MAX_HARD_REGS]; + u32 nused_hard_regs[OPT_REG_CLASSES]; + u8 used_hard_regs_valid; + /* Last source location pushed to the target — used to skip redundant + * set_loc calls when consecutive insts share a loc (the common case). */ + SrcLoc last_loc; + u8 last_loc_valid; + /* When debug info isn't being emitted, set_loc only affects the panic + * loc — we set it once per function in func_begin and skip per-inst + * updates entirely. 
*/ + u8 wants_loc; } ReplayCtx; +static inline int srcloc_eq(SrcLoc a, SrcLoc b) { + return a.file_id == b.file_id && a.line == b.line && a.col == b.col; +} + static Reg val_to_target_reg(ReplayCtx* r, Val v) { Func* f = r->f; if (v == VAL_NONE) return REG_NONE; @@ -465,7 +483,25 @@ static void ensure_label_placed(ReplayCtx* r, u32 b) { static void replay_inst(ReplayCtx* r, u32 b, Inst* in) { CGTarget* w = r->tgt; - w->set_loc(w, in->loc); + /* set_loc serves two purposes (see arch/mc.c and the per-arch emit code): + * 1. error reporting via compiler_panic - needs some recent loc + * 2. DWARF line-info rows via debug_emit_row, gated on mc->debug + * When debug info isn't being emitted we still set the loc once (the + * first inst's check catches that via last_loc_valid=0), so panic + * messages still point at a real source location, but subsequent updates + * are skipped. When debug info IS being emitted we update on every + * change so line rows stay accurate. */ + if (r->wants_loc) { + if (!r->last_loc_valid || !srcloc_eq(r->last_loc, in->loc)) { + w->set_loc(w, in->loc); + r->last_loc = in->loc; + r->last_loc_valid = 1; + } + } else if (!r->last_loc_valid) { + w->set_loc(w, in->loc); + r->last_loc = in->loc; + r->last_loc_valid = 1; + } switch ((IROp)in->op) { case IR_NOP: @@ -1064,6 +1100,7 @@ static void collect_known_frame(Func* f, CGTarget* w, CGKnownFrameDesc* out) { static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) { ReplayCtx r; + metrics_scope_begin(c, "opt.emit.setup"); r.c = c; r.f = f; r.tgt = w; @@ -1081,16 +1118,34 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) { r.scope_map = arena_zarray(f->arena, CGScope, f->nscopes + 1u); for (u32 i = 0; i <= f->nscopes; ++i) r.scope_map[i] = CG_SCOPE_NONE; r.block_label_placed = arena_zarray(f->arena, u8, nb); + r.used_hard_regs_valid = 0; + r.last_loc_valid = 0; + /* If the target isn't emitting debug info, we only need to keep the + * panic loc 
accurate at function granularity. Set once at func entry + * (handled by the first replay_inst's dedup check) and skip the rest. */ + r.wants_loc = w->debug != NULL; + metrics_scope_end(c, "opt.emit.setup"); - if (identity && w->plan_hard_regs) { + metrics_scope_begin(c, "opt.emit.plan_hard_regs"); + if (identity && (w->plan_hard_regs || w->reserve_hard_regs)) { + /* Collect once; reuse for both plan_hard_regs (here) and + * reserve_hard_regs (after the body). The IR doesn't change between + * these two callbacks, so a second scan would compute the same data. */ for (u32 cidx = 0; cidx < OPT_REG_CLASSES; ++cidx) { - Reg used[OPT_MAX_HARD_REGS]; - u32 nused = collect_replayed_hard_regs(f, w, (RegClass)cidx, used, - OPT_MAX_HARD_REGS); - w->plan_hard_regs(w, (RegClass)cidx, used, nused); + r.nused_hard_regs[cidx] = collect_replayed_hard_regs( + f, w, (RegClass)cidx, r.used_hard_regs[cidx], OPT_MAX_HARD_REGS); + } + r.used_hard_regs_valid = 1; + if (w->plan_hard_regs) { + for (u32 cidx = 0; cidx < OPT_REG_CLASSES; ++cidx) { + w->plan_hard_regs(w, (RegClass)cidx, r.used_hard_regs[cidx], + r.nused_hard_regs[cidx]); + } } } + metrics_scope_end(c, "opt.emit.plan_hard_regs"); + metrics_scope_begin(c, "opt.emit.func_begin"); int known_frame = identity && w->func_begin_known_frame && w->call_stack_size; if (known_frame) { CGKnownFrameDesc frame; @@ -1153,27 +1208,31 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) { d.loc = p->loc; (void)w->param(w, &d); } + metrics_scope_end(c, "opt.emit.func_begin"); + metrics_scope_begin(c, "opt.emit.body"); /* Body in emit order — the order CG's emit cursor visited each * block. Block-creation order can differ when label_new precedes a * cmp_branch whose fallthrough block must physically follow. 
*/ for (u32 i = 0; i < f->emit_order_n; ++i) { replay_block(&r, f->emit_order[i]); } + metrics_scope_end(c, "opt.emit.body"); + metrics_scope_begin(c, "opt.emit.reserve_hard_regs"); /* At -O1, opt managed allocation and emitted hard regs directly, * bypassing backend-local allocation. Tell the backend which hard * regs are still visible in replay so it can save the right callee-saved - * subset in prologue/epilogue. + * subset in prologue/epilogue. Reuses the cached collection from the + * plan_hard_regs pass — the IR hasn't changed since. * * The backend records only callee-saved members of this set for * prologue/epilogue preservation. */ - if (r.identity_regs && w->reserve_hard_regs) { + if (r.identity_regs && w->reserve_hard_regs && r.used_hard_regs_valid) { for (u32 c = 0; c < OPT_REG_CLASSES; ++c) { - Reg used[OPT_MAX_HARD_REGS]; - u32 nused = collect_replayed_hard_regs(f, w, (RegClass)c, used, - OPT_MAX_HARD_REGS); - if (nused) w->reserve_hard_regs(w, (RegClass)c, used, nused); + if (r.nused_hard_regs[c]) + w->reserve_hard_regs(w, (RegClass)c, r.used_hard_regs[c], + r.nused_hard_regs[c]); } } else if (!r.identity_regs && w->reserve_hard_regs) { for (u32 c = 0; c < OPT_REG_CLASSES; ++c) { @@ -1184,7 +1243,10 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) { } } + metrics_scope_end(c, "opt.emit.reserve_hard_regs"); + metrics_scope_begin(c, "opt.emit.func_end"); w->func_end(w); + metrics_scope_end(c, "opt.emit.func_end"); } void opt_replay(Compiler* c, Func* f, CGTarget* target) { diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c @@ -2430,11 +2430,17 @@ static void opt_ssa_conventional_splits_critical_edge(void) { tc_fini(&tc); } +#ifndef NDEBUG static void stale_verify_arg(void* arg) { opt_verify((Func*)arg, "stale-def-use-test"); } +#endif static void opt_verify_catches_stale_def_use(void) { +#ifdef NDEBUG + /* opt_verify is a no-op under NDEBUG, so it cannot catch this. 
*/ + return; +#else TestCtx tc; tc_init(&tc); Func* f = new_func(&tc); @@ -2451,6 +2457,7 @@ static void opt_verify_catches_stale_def_use(void) { EXPECT(expect_panic(tc.c, stale_verify_arg, f), "verifier should catch stale cached def-use after mutation"); tc_fini(&tc); +#endif } static void opt_ssa_dce_removes_dead_defs_and_phi(void) {