kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit ab675f5697a19ecc6d59233c1acfb096b0656588
parent 5194cb9d9957c73acaf9b0aec66baee5efa53e7c
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Wed, 20 May 2026 20:43:03 -0700

Add jump-table lowering for CG switches

Diffstat:
Msrc/arch/arch.h | 3++-
Msrc/cg/control.c | 233+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
Msrc/cg/data.c | 80+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/cg/internal.h | 2++
Msrc/cg/session.c | 1+
Msrc/opt/pass_emit.c | 6++++++
6 files changed, 308 insertions(+), 17 deletions(-)

diff --git a/src/arch/arch.h b/src/arch/arch.h @@ -629,7 +629,8 @@ typedef struct CGSwitchDesc { const CGSwitchCase* cases; u32 ncases; u8 hint; /* CfreeCgSwitchHint */ - u8 pad[3]; + u8 opt_level; /* 0/1/2; reads policy in cg_lower_switch_default */ + u8 pad[2]; } CGSwitchDesc; typedef struct CGLocalStaticDataDesc { diff --git a/src/cg/control.c b/src/cg/control.c @@ -1,4 +1,5 @@ #include "cg/internal.h" +#include "core/metrics.h" CfreeCgLabel cfree_cg_label_new(CfreeCg* g) { if (!g) return CFREE_CG_LABEL_NONE; @@ -74,14 +75,12 @@ void cfree_cg_branch_false(CfreeCg* g, CfreeCgLabel label) { void cg_lower_switch_default(CGTarget* t, const CGSwitchDesc* d) { /* Cmp-and-branch chain: one cmp_branch per case, then jump to - * default (or fall through if LABEL_NONE). Fast to emit and the - * default policy at -O0. Density-driven jump-table conversion runs - * later as an opt-level rewrite over IR_SWITCH; the structured - * shape survives in IR until then. - * - * d->hint is currently advisory only here — cg does not rewrite - * into a jump table at lowering time, so JUMP_TABLE and - * BRANCH_CHAIN both produce the chain. */ + * default (or fall through if LABEL_NONE). The fallback shape; the + * frontend-facing cfree_cg_switch picks chain vs. jump-table up + * front (see cg_plan_switch) and routes here only for the chain + * case. Backend overrides (the C target's switch_) and opt's IR + * replay both reach this from outside the cg API, so the lowering + * stays target-only and uses just cmp_branch + jump. */ for (u32 i = 0; i < d->ncases; ++i) { Operand imm = api_op_imm((i64)d->cases[i].value, d->selector_type); t->cmp_branch(t, CMP_EQ, d->selector, imm, d->cases[i].label); @@ -91,23 +90,207 @@ void cg_lower_switch_default(CGTarget* t, const CGSwitchDesc* d) { } } +/* Density / sizing thresholds for the O1 jump-table heuristic. At O0 + * we only honor an explicit CFREE_CG_SWITCH_JUMP_TABLE hint — the policy + * is single-pass and tied to frontend intent rather than analysis. O1 + * runs a linear scan over the case list and picks a single global table + * when the cases are dense enough; clustering / multi-table layouts are + * out of scope here. */ +#define CG_SWITCH_TABLE_MIN_CASES_O1 4u +#define CG_SWITCH_TABLE_MAX_SPAN_O1 4096u +#define CG_SWITCH_TABLE_DENSITY_RECIP_O1 4u /* ncases * recip >= span */ + +typedef enum CGSwitchClass { + CG_SWITCH_CLS_CHAIN, + CG_SWITCH_CLS_TABLE, +} CGSwitchClass; + +typedef struct CGSwitchPlan { + CGSwitchClass cls; + i64 vmin; + u64 span; +} CGSwitchPlan; + +/* Single pass over the cases array: derive (vmin, vmax, span) in the + * selector type's signed interpretation. Frontends store case values as + * the u64 bit pattern of an i64 sign-extended from the selector's + * width, so api_sign_extend_width recovers the source ordering for + * both signed and unsigned selectors that fit in i64. Returns 0 if the + * selector type is unusable for our policy (>64 bits) or if span + * overflows u64. */ +static int cg_switch_extents(Compiler* c, const CGSwitchDesc* d, i64* out_vmin, + u64* out_span) { + u32 width; + i64 vmin; + i64 vmax; + u32 i; + width = cfree_cg_type_int_width((CfreeCompiler*)c, d->selector_type); + if (!width || width > 64u) return 0; + vmin = INT64_MAX; + vmax = INT64_MIN; + for (i = 0; i < d->ncases; ++i) { + i64 vi = (width == 64u) ? (i64)d->cases[i].value + : api_sign_extend_width(d->cases[i].value, width); + if (vi < vmin) vmin = vi; + if (vi > vmax) vmax = vi; + } + if (vmax < vmin) return 0; + { + u64 delta = (u64)vmax - (u64)vmin; + if (delta == UINT64_MAX) return 0; + *out_span = delta + 1u; + } + *out_vmin = vmin; + return 1; +} + +static CGSwitchPlan cg_plan_switch(CfreeCg* g, const CGSwitchDesc* d) { + CGSwitchPlan plan; + plan.cls = CG_SWITCH_CLS_CHAIN; + plan.vmin = 0; + plan.span = 0; + if (d->ncases == 0) return plan; + if (d->default_label == LABEL_NONE) return plan; + if (d->hint == CFREE_CG_SWITCH_BRANCH_CHAIN) return plan; + if (!cg_switch_extents(g->c, d, &plan.vmin, &plan.span)) return plan; + if (d->hint == CFREE_CG_SWITCH_JUMP_TABLE) { + /* Frontend explicitly opted in. Honor unless the span is wildly + * out of bounds — a forced hint shouldn't blow up code size on a + * misshapen switch. */ + if (plan.span > CG_SWITCH_TABLE_MAX_SPAN_O1) return plan; + plan.cls = CG_SWITCH_CLS_TABLE; + return plan; + } + /* TARGET_DEFAULT: O0 keeps the chain; O1+ runs the density check. */ + if (d->opt_level == 0) return plan; + if (d->ncases < CG_SWITCH_TABLE_MIN_CASES_O1) return plan; + if (plan.span > CG_SWITCH_TABLE_MAX_SPAN_O1) return plan; + if (plan.span > (u64)d->ncases * CG_SWITCH_TABLE_DENSITY_RECIP_O1) return plan; + plan.cls = CG_SWITCH_CLS_TABLE; + return plan; +} + +/* Emit a dense jump-table dispatch using cg-API ops. The selector value + * is still on the value stack on entry. Routing through the cg API + * means the same primitives work for direct CG (lowered to machine + * ops) and for the opt wrapper (recorded as IR_BINOP / IR_CMP_BRANCH / + * IR_LOAD / IR_INDIRECT_BRANCH, which pass_emit already lowers + * natively without ever materializing IR_SWITCH). */ +static void cg_emit_switch_table(CfreeCg* g, const CGSwitchDesc* d, + const CGSwitchPlan* plan) { + Compiler* c; + Heap* h; + CfreeCgTypeId sel_ty; + u32 sel_w; + CfreeCgTypeId i64_ty; + CfreeCgTypeId void_ptr_ty; + CfreeCgTypeId arr_ty; + Label* labels; + CfreeCgLabel* targets; + ObjSymId table_sym; + CfreeCgDecl decl; + CfreeCgMemAccess acc; + u64 i; + u32 width; + c = g->c; + h = (Heap*)c->ctx->heap; + sel_ty = d->selector_type; + sel_w = cfree_cg_type_int_width((CfreeCompiler*)c, sel_ty); + i64_ty = builtin_id(CFREE_CG_BUILTIN_I64); + void_ptr_ty = cg_type_ptr_to(c, builtin_id(CFREE_CG_BUILTIN_VOID)); + + /* 1. idx = sel - vmin (selector_type wraparound is what we want; + * the unsigned bounds check below catches both negative-underflow + * and positive-overflow cases). */ + cfree_cg_push_int(g, (uint64_t)plan->vmin, sel_ty); + cfree_cg_int_binop(g, CFREE_CG_INT_SUB, 0); /* [idx] */ + + /* 2. Bounds check: branch to default when idx u> span-1. */ + cfree_cg_dup(g); /* [idx, idx] */ + cfree_cg_push_int(g, plan->span - 1u, sel_ty); + cfree_cg_int_cmp(g, CFREE_CG_INT_GT_U); /* [idx, cond] */ + cfree_cg_branch_true(g, (CfreeCgLabel)d->default_label); /* [idx] */ + + /* 3. Widen idx to i64 so the subsequent index multiply runs at + * pointer width regardless of selector signedness. The bounds + * check above already established 0 <= idx < span, so zext is + * value-preserving. */ + if (sel_w < 64u) { + cfree_cg_zext(g, i64_ty); + } + + /* 4. Build the dense label[] slot array and emit the rodata table. */ + labels = (Label*)h->alloc(h, (size_t)plan->span * sizeof *labels, + _Alignof(Label)); + if (!labels) compiler_panic(c, g->cur_loc, "cfree_cg_switch: oom"); + for (i = 0; i < plan->span; ++i) labels[i] = d->default_label; + width = sel_w; + for (i = 0; i < d->ncases; ++i) { + i64 vi = (width == 64u) ? (i64)d->cases[i].value + : api_sign_extend_width(d->cases[i].value, width); + u64 slot = (u64)(vi - plan->vmin); + labels[slot] = d->cases[i].label; + } + table_sym = api_emit_label_table(g, labels, (u32)plan->span); + h->free(h, labels, (size_t)plan->span * sizeof *labels); + if (table_sym == OBJ_SYM_NONE) { + /* api_emit_label_table panics on real failure; this only fires if + * a future caller asks for a 0-slot table (which cg_plan_switch + * already rules out). */ + compiler_panic(c, g->cur_loc, "cfree_cg_switch: table emission failed"); + return; + } + arr_ty = cfree_cg_type_array((CfreeCompiler*)c, void_ptr_ty, plan->span); + memset(&decl, 0, sizeof decl); + decl.kind = CFREE_CG_DECL_OBJECT; + decl.sym.bind = CFREE_SB_LOCAL; + decl.sym.visibility = CFREE_CG_VIS_DEFAULT; + decl.as.object.flags = CFREE_CG_OBJ_READONLY; + api_remember_sym(g, table_sym, arr_ty, decl); + + /* 5. Compute &table[idx] and load the label address. */ + cfree_cg_push_symbol_lvalue(g, (CfreeCgSym)table_sym, 0); /* [idx, table_lv] */ + cfree_cg_swap(g); /* [table_lv, idx] */ + cfree_cg_index(g, 0); /* [&table[idx]] */ + memset(&acc, 0, sizeof acc); + acc.type = void_ptr_ty; + acc.align = (uint32_t)c->target.ptr_align; + cfree_cg_load(g, acc); /* [label_addr] */ + + /* 6. Indirect branch with the full closed target set (every case + + * default), so backends doing branch-target hardening (BTI/IBT/CFG) + * can stamp landing pads on every reachable label. */ + targets = (CfreeCgLabel*)h->alloc(h, (d->ncases + 1u) * sizeof *targets, + _Alignof(CfreeCgLabel)); + if (!targets) compiler_panic(c, g->cur_loc, "cfree_cg_switch: oom"); + for (i = 0; i < d->ncases; ++i) { + targets[i] = (CfreeCgLabel)d->cases[i].label; + } + targets[d->ncases] = (CfreeCgLabel)d->default_label; + cfree_cg_computed_goto(g, targets, d->ncases + 1u); + h->free(h, targets, (d->ncases + 1u) * sizeof *targets); +} + void cfree_cg_switch(CfreeCg* g, CfreeCgSwitch sw) { ApiSValue selector; CGSwitchDesc desc; Heap* h; CGSwitchCase* cases = NULL; + CGSwitchPlan plan; + int native_switch_override; if (!g) return; if (g->sp == 0) return; api_local_const_control_boundary(g); - selector = api_pop(g); memset(&desc, 0, sizeof desc); desc.selector_type = resolve_type(g->c, sw.selector_type); - if (!desc.selector_type) desc.selector_type = api_sv_type(&selector); - desc.selector = - api_force_reg_unless_imm(g, &selector, desc.selector_type); + if (!desc.selector_type) { + ApiSValue tmp = g->stack[g->sp - 1u]; + desc.selector_type = api_sv_type(&tmp); + } desc.default_label = (Label)sw.default_label; desc.ncases = sw.ncases; desc.hint = (u8)sw.hint; + desc.opt_level = (u8)g->opt_level; if (sw.ncases) { h = g->c->ctx->heap; cases = (CGSwitchCase*)h->alloc(h, sw.ncases * sizeof(CGSwitchCase), @@ -119,16 +302,34 @@ void cfree_cg_switch(CfreeCg* g, CfreeCgSwitch sw) { } desc.cases = cases; } - if (g->target->switch_) { - g->target->switch_(g->target, &desc); + + /* The C-source target overrides switch_ to emit a native `switch` + * statement and forces opt_level=0; route everything to it + * unchanged. Native + opt targets both flow through the planner. */ + native_switch_override = (g->target->switch_ && g->opt_level == 0); + plan = native_switch_override ? (CGSwitchPlan){CG_SWITCH_CLS_CHAIN, 0, 0} + : cg_plan_switch(g, &desc); + + if (plan.cls == CG_SWITCH_CLS_TABLE) { + /* Selector stays on the value stack; cg_emit_switch_table consumes + * it via cg-API ops so the path also records cleanly under opt. */ + metrics_count(g->c, "cg.switch.table", 1); + cg_emit_switch_table(g, &desc, &plan); } else { - cg_lower_switch_default(g->target, &desc); + metrics_count(g->c, "cg.switch.chain", 1); + selector = api_pop(g); + desc.selector = api_force_reg_unless_imm(g, &selector, desc.selector_type); + if (g->target->switch_) { + g->target->switch_(g->target, &desc); + } else { + cg_lower_switch_default(g->target, &desc); + } + api_release(g, &selector); } if (cases) { h = g->c->ctx->heap; h->free(h, cases, sw.ncases * sizeof(CGSwitchCase)); } - api_release(g, &selector); } void cfree_cg_push_label_addr(CfreeCg* g, CfreeCgLabel label, diff --git a/src/cg/data.c b/src/cg/data.c @@ -589,3 +589,83 @@ void cfree_cg_data_end(CfreeCg* g) { g->data_base = 0; g->data_size = 0; } + +/* Emit a function-local jump-table of `n` label addresses into .rodata + * and return the anonymous ObjSymId backing it. Each slot is one pointer + * wide. Used by api_cg_switch_lower_table to materialize the dispatch + * table inline during cg recording — under direct CG the reloc resolves + * immediately when the label is later placed; under opt, the + * label-data reloc is queued on the MCEmitter and resolved at replay + * time once the wrapped backend's func_begin has set cur_func_sym. The + * helper does not register the symbol with CfreeCg's sym table; + * callers wire its CfreeCg type via api_remember_sym so subsequent + * cfree_cg_push_symbol_lvalue / cfree_cg_index can address it as an + * array of pointers. */ +ObjSymId api_emit_label_table(CfreeCg* g, const Label* labels, u32 n) { + Compiler* c; + ObjBuilder* ob; + MCEmitter* mc; + CGTarget* T; + u32 ptrsz; + u32 ptral; + Sym sec_name; + ObjSecId sec; + u32 base; + RelocKind rk; + u8 pad[8]; + char name_buf[40]; + Sym anon; + ObjSymId sym; + u32 i; + if (!g || !labels || !n) return OBJ_SYM_NONE; + c = g->c; + ob = g->obj; + mc = g->mc; + T = g->target; + if (!mc) { + compiler_panic(c, g->cur_loc, + "api_emit_label_table: requires MCEmitter " + "(unsupported with --emit=c)"); + return OBJ_SYM_NONE; + } + ptrsz = (u32)c->target.ptr_size; + ptral = (u32)c->target.ptr_align; + if (ptrsz != 4u && ptrsz != 8u) { + compiler_panic(c, g->cur_loc, + "api_emit_label_table: unsupported ptr_size %u", + (unsigned)ptrsz); + return OBJ_SYM_NONE; + } + rk = api_data_reloc_kind(/*pcrel=*/0, ptrsz); + if (rk == R_NONE) { + compiler_panic(c, g->cur_loc, + "api_emit_label_table: unsupported reloc width %u", + (unsigned)ptrsz); + return OBJ_SYM_NONE; + } + sec_name = pool_intern_cstr(c->global, ".rodata"); + sec = obj_section(ob, sec_name, SEC_RODATA, SF_ALLOC, ptral); + base = obj_align_to(ob, sec, ptral); + /* Write `n` placeholder slots first; emit_label_data_reloc later + * patches each one inline (via obj_patch) at label-place time. */ + memset(pad, 0, sizeof pad); + for (i = 0; i < n; ++i) { + obj_write(ob, sec, pad, ptrsz); + } + for (i = 0; i < n; ++i) { + MCLabel ml = T->cg_label_to_mc_label + ? T->cg_label_to_mc_label(T, labels[i]) + : (MCLabel)labels[i]; + if (ml == MC_LABEL_NONE) { + compiler_panic(c, g->cur_loc, + "api_emit_label_table: label has no MCLabel"); + return OBJ_SYM_NONE; + } + mc->emit_label_data_reloc(mc, sec, base + i * ptrsz, ml, rk, ptrsz, + /*extra_addend=*/0); + } + snprintf(name_buf, sizeof name_buf, ".Lcfree_jt.%u", g->rodata_counter++); + anon = pool_intern_cstr(c->global, name_buf); + sym = obj_symbol(ob, anon, SB_LOCAL, SK_OBJ, sec, base, (u64)n * ptrsz); + return sym; +} diff --git a/src/cg/internal.h b/src/cg/internal.h @@ -155,6 +155,7 @@ struct CfreeCg { u32 scope_generation; u32 rodata_counter; + int opt_level; ObjSecId data_sec; ObjSymId data_sym; @@ -318,6 +319,7 @@ void cfree_cg_data_pcrel(CfreeCg* g, CfreeCgSym target, int64_t addend, void cfree_cg_data_symdiff(CfreeCg* g, CfreeCgSym lhs, CfreeCgSym rhs, int64_t addend, uint32_t width); void cfree_cg_data_end(CfreeCg* g); +ObjSymId api_emit_label_table(CfreeCg* g, const Label* labels, u32 n); DebugTypeId api_debug_type(CfreeCg* g, CfreeCgTypeId id); int api_source_flags_addr_taken(u32 flags); int api_local_requires_memory(CfreeCg* g, CfreeCgTypeId ty, diff --git a/src/cg/session.c b/src/cg/session.c @@ -74,6 +74,7 @@ CfreeStatus cfree_cg_new(CfreeCompiler* c, CfreeObjBuilder* out, g->target = target; g->mc = mc; g->debug = debug; + g->opt_level = opt_level; *cg_out = g; return CFREE_OK; } diff --git a/src/opt/pass_emit.c b/src/opt/pass_emit.c @@ -617,6 +617,12 @@ static void replay_inst(ReplayCtx* r, u32 b, Inst* in) { d.default_label = ensure_label(r, aux->default_block); d.ncases = aux->ncases; d.hint = aux->hint; + /* opt only invokes pass_emit at level >= 1; cfree_cg_switch + * already routed dense/forced-table switches through + * cg_emit_switch_table, so anything that survives in IR_SWITCH + * is chain by construction. Set the field anyway to keep the + * desc fully populated. */ + d.opt_level = 1u; if (aux->ncases) { cases = arena_array(r->f->arena, CGSwitchCase, aux->ncases); for (u32 i = 0; i < aux->ncases; ++i) {