kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit addc00d93e9b67a981889840a231276aa207e5a8
parent 2a9bbeef597e44132e18c68c5f44eaaf679e643b
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 28 May 2026 13:01:14 -0700

cg/ndt: O(cached) flush list, cached dispatch/class info, thin NativeOps

NDT throughput/cleanliness rework (Phases 1-3, 5):

- Phase 1: replace the O(nlocals) nd_flush_all scan with an intrusive
  doubly-linked list of currently-cached locals (cache_next/cache_prev
  indices on NativeDirectLocal; cache_head/cache_tail/ncached on the
  target). Tail insertion preserves ascending flush order, so codegen
  stays byte-identical. Flush is now O(cached) on every control-flow /
  barrier op.
- Phase 2: resolve reg_info and per-class NativeAllocClassInfo once at
  construction and cache them, replacing the per-acquire reg_info
  re-resolve + linear ri->classes scan with an O(1) lookup.
- Phase 3: thin NativeOps to a semantic-only adapter -- drop the pure
  pass-through hooks (reg_info, func_begin/func_end, alloc_frame_slot,
  class_for_type, addr_legal); NDT calls d->native->X directly. ops is
  now mandatory on the NDT path.
- Phase 5: add fixed on-struct transient buffers (argbuf/retbuf/lblbuf)
  for per-op location/label arrays, with TU-arena fallback on overflow,
  removing permanent per-op arena growth on the common call.

Output byte-identical across the -O0 corpus.

Diffstat:
Msrc/cg/native_direct_target.c | 149++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------
Msrc/cg/native_direct_target.h | 46+++++++++++++++++++++++++++++++++++++---------
2 files changed, 136 insertions(+), 59 deletions(-)

diff --git a/src/cg/native_direct_target.c b/src/cg/native_direct_target.c @@ -34,6 +34,15 @@ static void* nd_arena(NativeDirectTarget* d, size_t size, size_t align) { return p; } +/* Pick a transient NativeLoc array for one op: the on-struct buffer when N + * fits, else a one-shot arena allocation. N == 0 yields NULL. */ +static NativeLoc* nd_loc_buf(NativeDirectTarget* d, NativeLoc* buf, u32 cap, + u32 n) { + if (!n) return NULL; + if (n <= cap) return buf; + return nd_arena(d, sizeof(NativeLoc) * n, _Alignof(NativeLoc)); +} + static void nd_grow_locals(NativeDirectTarget* d, u32 want) { NativeDirectLocal* next; u32 cap; @@ -78,7 +87,6 @@ static NativeDirectLocal* nd_local(NativeDirectTarget* d, CGLocal local) { static NativeAllocClass nd_class_for_type(NativeDirectTarget* d, CfreeCgTypeId type) { - if (d->ops && d->ops->class_for_type) return d->ops->class_for_type(d, type); if (d->native && d->native->class_for_type) return d->native->class_for_type(d->native, type); return NATIVE_REG_INT; @@ -86,13 +94,10 @@ static NativeAllocClass nd_class_for_type(NativeDirectTarget* d, static const NativeAllocClassInfo* nd_class_info(NativeDirectTarget* d, NativeAllocClass cls) { - const NativeRegInfo* ri = - d->ops && d->ops->reg_info ? d->ops->reg_info(d) : NULL; - if (!ri && d->native) ri = d->native->regs; - if (!ri) nd_panic(d, "target has no register info"); - for (u32 i = 0; i < ri->nclasses; ++i) - if ((NativeAllocClass)ri->classes[i].cls == cls) return &ri->classes[i]; - nd_panic(d, "target has no requested register class"); + const NativeAllocClassInfo* ci = + (u32)cls < 3u ? d->class_info[cls] : NULL; + if (!ci) nd_panic(d, "target has no requested register class"); + return ci; } static NativeLoc nd_reg_loc(Reg reg, NativeAllocClass cls, CfreeCgTypeId type) { @@ -150,9 +155,7 @@ static void nd_scratch_release(NativeDirectTarget* d, NativeAllocClass cls, static NativeFrameSlot nd_alloc_frame_slot(NativeDirectTarget* d, const NativeFrameSlotDesc* desc) { NativeFrameSlot slot = NATIVE_FRAME_SLOT_NONE; - if (d->ops && d->ops->alloc_frame_slot) - slot = d->ops->alloc_frame_slot(d, desc); - else if (d->native && d->native->frame_slot) + if (d->native && d->native->frame_slot) slot = d->native->frame_slot(d->native, desc); else nd_panic(d, "target does not allocate frame slots"); @@ -271,19 +274,21 @@ static NativeLoc nd_loc_operand(NativeDirectTarget* d, Operand op) { case OPK_GLOBAL: return nd_loc_global(op.v.global.sym, op.v.global.addend, op.type); case OPK_INDIRECT: { + NativeDirectLocal* bl = nd_local(d, op.v.ind.base); NativeLoc out; memset(&out, 0, sizeof out); out.kind = NATIVE_LOC_ADDR; out.type = op.type; out.v.addr.base_kind = NATIVE_ADDR_BASE_FRAME_VALUE; - out.v.addr.base.frame = nd_local(d, op.v.ind.base)->home; - out.v.addr.cls = nd_local(d, op.v.ind.base)->cls; - out.v.addr.base_type = nd_local(d, op.v.ind.base)->type; + out.v.addr.base.frame = bl->home; + out.v.addr.cls = bl->cls; + out.v.addr.base_type = bl->type; if (op.v.ind.index != CG_LOCAL_NONE) { + NativeDirectLocal* il = nd_local(d, op.v.ind.index); out.v.addr.index_kind = NATIVE_ADDR_INDEX_FRAME_VALUE; - out.v.addr.index.frame = nd_local(d, op.v.ind.index)->home; - out.v.addr.index_cls = nd_local(d, op.v.ind.index)->cls; - out.v.addr.index_type = nd_local(d, op.v.ind.index)->type; + out.v.addr.index.frame = il->home; + out.v.addr.index_cls = il->cls; + out.v.addr.index_type = il->type; } out.v.addr.log2_scale = op.v.ind.log2_scale; out.v.addr.offset = op.v.ind.ofs; @@ -298,17 +303,20 @@ static NativeAddr nd_addr_storage(NativeDirectTarget* d, Operand op) { NativeAddr out; memset(&out, 0, sizeof out); switch ((OpKind)op.kind) { - case OPK_LOCAL: + case OPK_LOCAL: { /* The local's home is addressed directly (a memory access reads/writes the * frame slot itself, e.g. by-value aggregate field extraction). This is * not pointer aliasing, but it does read the home, so a cached value must * be made current: spill if dirty and drop the entry. */ + NativeDirectLocal* l; nd_flush_local(d, op.v.local); + l = nd_local(d, op.v.local); out.base_kind = NATIVE_ADDR_BASE_FRAME; - out.base.frame = nd_local(d, op.v.local)->home; - out.cls = nd_local(d, op.v.local)->cls; - out.base_type = nd_local(d, op.v.local)->type; + out.base.frame = l->home; + out.cls = l->cls; + out.base_type = l->type; return out; + } case OPK_GLOBAL: out.base_kind = NATIVE_ADDR_BASE_GLOBAL; out.base.global.sym = op.v.global.sym; @@ -541,6 +549,37 @@ static Reg nd_cache_alloc(NativeDirectTarget* d, NativeAllocClass cls) { return REG_NONE; } +/* Append LOCAL to the tail of the cached-locals list (O(1)). Called only on the + * REG_NONE -> reg transition in nd_dst_reg. */ +static void nd_cache_link(NativeDirectTarget* d, CGLocal local) { + i32 idx = (i32)(local - 1u); + i32 prev = d->cache_tail; + d->locals[idx].cache_next = -1; + d->locals[idx].cache_prev = prev; + if (prev >= 0) + d->locals[prev].cache_next = idx; + else + d->cache_head = idx; + d->cache_tail = idx; + d->ncached++; +} + +/* Remove LOCAL (which must currently be cached) from the cached-locals list. */ +static void nd_cache_unlink(NativeDirectTarget* d, CGLocal local) { + i32 idx = (i32)(local - 1u); + i32 next = d->locals[idx].cache_next; + i32 prev = d->locals[idx].cache_prev; + if (next >= 0) + d->locals[next].cache_prev = prev; + else + d->cache_tail = prev; + if (prev >= 0) + d->locals[prev].cache_next = next; + else + d->cache_head = next; + d->ncached--; +} + /* Write a cached local back to its home (if dirty) and drop the entry. Safe to * call on an uncached local. */ static void nd_flush_local(NativeDirectTarget* d, CGLocal local) { @@ -549,6 +588,7 @@ static void nd_flush_local(NativeDirectTarget* d, CGLocal local) { if (l->dirty) nd_store_reg_to_frame(d, l->home, l->type, nd_reg_loc(l->reg, (NativeAllocClass)l->cls, l->type)); + nd_cache_unlink(d, local); d->reg_owner[l->cls][l->reg] = CG_LOCAL_NONE; l->reg = REG_NONE; l->dirty = 0; @@ -559,15 +599,16 @@ static void nd_flush_local(NativeDirectTarget* d, CGLocal local) { static void nd_invalidate_local(NativeDirectTarget* d, CGLocal local) { NativeDirectLocal* l = nd_local(d, local); if (l->reg == REG_NONE) return; + nd_cache_unlink(d, local); d->reg_owner[l->cls][l->reg] = CG_LOCAL_NONE; l->reg = REG_NONE; l->dirty = 0; } -/* Spill the whole cache to memory and empty it. */ +/* Spill the whole cache to memory and empty it. The list is sorted ascending, + * so this spills in the same order as the former O(nlocals) index scan. */ static void nd_flush_all(NativeDirectTarget* d) { - for (u32 i = 0; i < d->nlocals; ++i) - if (d->locals[i].reg != REG_NONE) nd_flush_local(d, i + 1u); + while (d->cache_head >= 0) nd_flush_local(d, (CGLocal)(d->cache_head + 1)); } static NativeAddr nd_addr_materialize(NativeDirectTarget* d, NativeAddr in, @@ -608,9 +649,8 @@ static NativeAddr nd_addr_materialize(NativeDirectTarget* d, NativeAddr in, temps->index = r; temps->index_cls = cls; } - if ((d->ops && d->ops->addr_legal && !d->ops->addr_legal(d, &out, mem)) || - ((!d->ops || !d->ops->addr_legal) && d->native && d->native->addr_legal && - !d->native->addr_legal(d->native, &out, mem))) { + if (d->native && d->native->addr_legal && + !d->native->addr_legal(d->native, &out, mem)) { NativeAllocClass cls = NATIVE_REG_INT; Reg r = nd_scratch_acquire(d, cls); NativeLoc dst = nd_reg_loc( @@ -628,9 +668,8 @@ static NativeAddr nd_addr_materialize(NativeDirectTarget* d, NativeAddr in, out.base.reg = r; out.cls = (u8)cls; out.base_type = dst.type; - if ((d->ops && d->ops->addr_legal && !d->ops->addr_legal(d, &out, mem)) || - ((!d->ops || !d->ops->addr_legal) && d->native && - d->native->addr_legal && !d->native->addr_legal(d->native, &out, mem))) + if (d->native && d->native->addr_legal && + !d->native->addr_legal(d->native, &out, mem)) nd_panic(d, "native address is not legal"); } return out; @@ -805,6 +844,7 @@ static NativeLoc nd_dst_reg(NativeDirectTarget* d, Operand dst) { if (r != REG_NONE) { d->reg_owner[l->cls][r] = dst.v.local; l->reg = r; + nd_cache_link(d, dst.v.local); } } if (r != REG_NONE) { @@ -842,9 +882,11 @@ static void nd_store_operand_from_reg(NativeDirectTarget* d, Operand dst, * supersedes it. Drop without storing; storing back would clobber the new * home value. Runs after SRC is produced, so a dst that was its own address * base has already been consumed. */ - if (nd_local(d, dst.v.local)->reg != REG_NONE) - nd_invalidate_local(d, dst.v.local); - nd_store_reg_to_frame(d, nd_local(d, dst.v.local)->home, dst.type, src); + { + NativeDirectLocal* l = nd_local(d, dst.v.local); + if (l->reg != REG_NONE) nd_invalidate_local(d, dst.v.local); + nd_store_reg_to_frame(d, l->home, dst.type, src); + } } static void nd_func_begin(CgTarget* t, const CGFuncDesc* fd) { @@ -855,9 +897,11 @@ static void nd_func_begin(CgTarget* t, const CGFuncDesc* fd) { d->nscopes = 0; d->max_outgoing = 0; d->use_tick = 0; + d->cache_head = -1; + d->cache_tail = -1; + d->ncached = 0; memset(d->scratch_used, 0, sizeof d->scratch_used); memset(d->reg_owner, 0, sizeof d->reg_owner); - if (d->ops && d->ops->func_begin) d->ops->func_begin(d, fd); if (d->native && d->native->func_begin) d->native->func_begin(d->native, fd); } @@ -869,7 +913,6 @@ static void nd_func_end(CgTarget* t) { if (d->native && d->native->note_frame_state) d->native->note_frame_state(d->native, &frame); if (d->native && d->native->patch_apply) d->native->patch_apply(d->native); - if (d->ops && d->ops->func_end) d->ops->func_end(d); if (d->native && d->native->func_end) d->native->func_end(d->native); d->func = NULL; } @@ -977,9 +1020,11 @@ static void nd_indirect_branch(CgTarget* t, Operand addr, addr_reg = nd_materialize_operand(d, addr); ND_REQUIRE_NATIVE(d, indirect_branch, "target does not emit indirect branches"); - native_targets = ntargets ? nd_arena(d, sizeof(*native_targets) * ntargets, - _Alignof(MCLabel)) - : NULL; + native_targets = ntargets == 0 ? NULL + : ntargets <= ND_LBL_BUF + ? d->lblbuf + : nd_arena(d, sizeof(*native_targets) * ntargets, + _Alignof(MCLabel)); for (u32 i = 0; i < ntargets; ++i) native_targets[i] = nd_mc_label(d, valid_targets[i]); d->native->indirect_branch(d->native, addr_reg, native_targets, ntargets); @@ -1431,12 +1476,8 @@ static void nd_call(CgTarget* t, const CGCallDesc* desc) { nd_barrier(d, NATIVE_DIRECT_BARRIER_CALL | NATIVE_DIRECT_BARRIER_MEMORY); memset(&plan, 0, sizeof plan); memset(&nd, 0, sizeof nd); - args = desc->nargs - ? nd_arena(d, sizeof(*args) * desc->nargs, _Alignof(NativeLoc)) - : NULL; - results = desc->nresults ? nd_arena(d, sizeof(*results) * desc->nresults, - _Alignof(NativeLoc)) - : NULL; + args = nd_loc_buf(d, d->argbuf, ND_ARG_BUF, desc->nargs); + results = nd_loc_buf(d, d->retbuf, ND_RET_BUF, desc->nresults); for (u32 i = 0; i < desc->nargs; ++i) args[i] = nd_loc_frame(d, desc->args[i], 0); for (u32 i = 0; i < desc->nresults; ++i) @@ -1497,8 +1538,7 @@ static void nd_ret(CgTarget* t, const CGLocal* values, u32 nvalues) { d->ops->emit_ret(d, values, nvalues); return; } - locs = nvalues ? nd_arena(d, sizeof(*locs) * nvalues, _Alignof(NativeLoc)) - : NULL; + locs = nd_loc_buf(d, d->argbuf, ND_ARG_BUF, nvalues); for (u32 i = 0; i < nvalues; ++i) locs[i] = nd_loc_frame(d, values[i], 0); ND_REQUIRE_NATIVE(d, plan_ret, "target does not plan returns"); d->native->plan_ret(d->native, d->func, locs, nvalues, &rets, &nrets); @@ -1636,10 +1676,8 @@ static void nd_fence(CgTarget* t, MemOrder order) { static void nd_intrinsic(CgTarget* t, IntrinKind kind, Operand* dsts, u32 ndst, const Operand* args, u32 narg) { NativeDirectTarget* d = nd_of(t); - NativeLoc* ndsts = - ndst ? nd_arena(d, sizeof(*ndsts) * ndst, _Alignof(NativeLoc)) : NULL; - NativeLoc* nargs = - narg ? nd_arena(d, sizeof(*nargs) * narg, _Alignof(NativeLoc)) : NULL; + NativeLoc* ndsts = nd_loc_buf(d, d->retbuf, ND_RET_BUF, ndst); + NativeLoc* nargs = nd_loc_buf(d, d->argbuf, ND_ARG_BUF, narg); nd_flush_all(d); ND_REQUIRE_NATIVE(d, intrinsic, "target does not emit compiler intrinsics"); for (u32 i = 0; i < ndst; ++i) ndsts[i] = nd_dst_scratch(d, dsts[i]); @@ -1709,6 +1747,17 @@ CgTarget* native_direct_target_new(Compiler* c, ObjBuilder* obj, d->ops = cfg->ops; d->user = cfg->user; + /* Resolve register/class info once; it is constant for the program. */ + d->reg_info = cfg->native ? cfg->native->regs : NULL; + for (u32 i = 0; i < 3u; ++i) d->class_info[i] = NULL; + if (d->reg_info) { + const NativeRegInfo* ri = d->reg_info; + for (u32 i = 0; i < ri->nclasses; ++i) { + u32 cls = ri->classes[i].cls; + if (cls < 3u) d->class_info[cls] = &ri->classes[i]; + } + } + d->base.func_begin = nd_func_begin; d->base.func_end = nd_func_end; d->base.alias = nd_alias; diff --git a/src/cg/native_direct_target.h b/src/cg/native_direct_target.h @@ -43,6 +43,12 @@ typedef struct NativeDirectLocal { u8 address_taken; u8 memory_required; u32 last_use; /* d->use_tick at the most recent cache touch (LRU victim key) */ + /* Intrusive doubly-linked list of currently-cached locals, in insertion + * (caching) order so nd_cache_link is O(1) tail insertion. Values are 0-based + * indices into NativeDirectTarget.locals (or -1); only valid while this local + * is cached (l->reg != REG_NONE). */ + i32 cache_next; + i32 cache_prev; } NativeDirectLocal; typedef enum NativeDirectAddrLegality { @@ -52,20 +58,16 @@ typedef enum NativeDirectAddrLegality { } NativeDirectAddrLegality; typedef struct NativeOps NativeOps; +/* Semantic-only adapter the arch supplies on the NDT path. `ops` is mandatory: + * aa64_backend_make and aa64_semantic_target_new always pass it. Pure + * pass-throughs to NativeTarget (reg_info, func_begin/func_end, frame-slot + * allocation, class_for_type, addr_legal) are not mirrored here -- NDT calls + * d->native->X directly for those. */ struct NativeOps { - const NativeRegInfo* (*reg_info)(NativeDirectTarget*); - - void (*func_begin)(NativeDirectTarget*, const CGFuncDesc*); - void (*func_end)(NativeDirectTarget*); - - NativeFrameSlot (*alloc_frame_slot)(NativeDirectTarget*, - const NativeFrameSlotDesc*); void (*bind_param)(NativeDirectTarget*, const CGParamDesc*, CGLocal, NativeDirectLocal*); - NativeAllocClass (*class_for_type)(NativeDirectTarget*, CfreeCgTypeId); int (*operand_legal)(NativeDirectTarget*, const Operand*, NativeAllocClass); - int (*addr_legal)(NativeDirectTarget*, const NativeAddr*, MemAccess); NativeDirectAddrLegality (*semantic_addr_legal)(NativeDirectTarget*, Operand addr, MemAccess); @@ -91,6 +93,13 @@ struct NativeOps { void (*barrier)(NativeDirectTarget*, u32 flags); }; +/* Fixed transient buffers for per-op location/label arrays (call args/results, + * return values, intrinsic operands, indirect-branch targets). The common case + * fits these and allocates nothing; overflow falls back to the TU arena. */ +#define ND_ARG_BUF 16u +#define ND_RET_BUF 8u +#define ND_LBL_BUF 16u + typedef struct NativeDirectTargetConfig { NativeTarget* native; const NativeOps* ops; @@ -105,6 +114,12 @@ struct NativeDirectTarget { const NativeOps* ops; void* user; + /* Register info and per-class info resolved once at construction (constant for + * the program), so scratch acquire / cache alloc / evict do an O(1) lookup + * instead of re-resolving reg_info and linearly scanning ri->classes. */ + const NativeRegInfo* reg_info; + const NativeAllocClassInfo* class_info[3]; + const CGFuncDesc* func; SrcLoc loc; @@ -128,8 +143,21 @@ struct NativeDirectTarget { * NativeDirectLocal. See doc/CGTARGET.md "local register cache". */ CGLocal reg_owner[3][32]; u32 use_tick; /* monotonic counter stamped onto NativeDirectLocal.last_use */ + /* Head/tail of the intrusive cached-locals list (in caching order), -1 when + * empty; ncached is its length. Lets nd_flush_all run in O(cached) instead of + * scanning all nlocals on every control-flow / barrier op; cache_tail makes + * nd_cache_link an O(1) tail insertion. */ + i32 cache_head; + i32 cache_tail; + u32 ncached; u32 max_outgoing; + /* Transient per-op buffers; see ND_*_BUF. Inputs use argbuf, outputs retbuf; + * never live across more than one op, so reuse between ops is safe. */ + NativeLoc argbuf[ND_ARG_BUF]; + NativeLoc retbuf[ND_RET_BUF]; + MCLabel lblbuf[ND_LBL_BUF]; + ObjSecId local_static_sec; ObjSymId local_static_sym; u32 local_static_base;