commit addc00d93e9b67a981889840a231276aa207e5a8
parent 2a9bbeef597e44132e18c68c5f44eaaf679e643b
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 28 May 2026 13:01:14 -0700
cg/ndt: O(cached) flush list, cached dispatch/class info, thin NativeOps
NDT throughput/cleanliness rework (Phases 1-3, 5):
- Phase 1: replace the O(nlocals) nd_flush_all scan with an intrusive
doubly-linked list of currently-cached locals (cache_next/cache_prev
indices on NativeDirectLocal; cache_head/cache_tail/ncached on the
target). Tail insertion preserves ascending flush order, so codegen
stays byte-identical. Flush is now O(cached) on every control-flow /
barrier op.
- Phase 2: resolve reg_info and per-class NativeAllocClassInfo once at
construction and cache them, replacing the per-acquire reg_info
re-resolve + linear ri->classes scan with an O(1) lookup.
- Phase 3: thin NativeOps to a semantic-only adapter -- drop the pure
pass-through hooks (reg_info, func_begin/func_end, alloc_frame_slot,
class_for_type, addr_legal); NDT calls d->native->X directly. ops is
now mandatory on the NDT path.
- Phase 5: add fixed on-struct transient buffers (argbuf/retbuf/lblbuf)
for per-op location/label arrays, with TU-arena fallback on overflow,
removing permanent per-op arena growth on the common call.
Output byte-identical across the -O0 corpus.
Diffstat:
2 files changed, 136 insertions(+), 59 deletions(-)
diff --git a/src/cg/native_direct_target.c b/src/cg/native_direct_target.c
@@ -34,6 +34,15 @@ static void* nd_arena(NativeDirectTarget* d, size_t size, size_t align) {
return p;
}
+/* Pick a transient NativeLoc array for one op: the on-struct buffer when N
+ * fits, else a one-shot arena allocation. N == 0 yields NULL. */
+static NativeLoc* nd_loc_buf(NativeDirectTarget* d, NativeLoc* buf, u32 cap,
+ u32 n) {
+ if (!n) return NULL;
+ if (n <= cap) return buf;
+ return nd_arena(d, sizeof(NativeLoc) * n, _Alignof(NativeLoc));
+}
+
static void nd_grow_locals(NativeDirectTarget* d, u32 want) {
NativeDirectLocal* next;
u32 cap;
@@ -78,7 +87,6 @@ static NativeDirectLocal* nd_local(NativeDirectTarget* d, CGLocal local) {
static NativeAllocClass nd_class_for_type(NativeDirectTarget* d,
CfreeCgTypeId type) {
- if (d->ops && d->ops->class_for_type) return d->ops->class_for_type(d, type);
if (d->native && d->native->class_for_type)
return d->native->class_for_type(d->native, type);
return NATIVE_REG_INT;
@@ -86,13 +94,10 @@ static NativeAllocClass nd_class_for_type(NativeDirectTarget* d,
static const NativeAllocClassInfo* nd_class_info(NativeDirectTarget* d,
NativeAllocClass cls) {
- const NativeRegInfo* ri =
- d->ops && d->ops->reg_info ? d->ops->reg_info(d) : NULL;
- if (!ri && d->native) ri = d->native->regs;
- if (!ri) nd_panic(d, "target has no register info");
- for (u32 i = 0; i < ri->nclasses; ++i)
- if ((NativeAllocClass)ri->classes[i].cls == cls) return &ri->classes[i];
- nd_panic(d, "target has no requested register class");
+ const NativeAllocClassInfo* ci =
+ (u32)cls < 3u ? d->class_info[cls] : NULL;
+ if (!ci) nd_panic(d, "target has no requested register class");
+ return ci;
}
static NativeLoc nd_reg_loc(Reg reg, NativeAllocClass cls, CfreeCgTypeId type) {
@@ -150,9 +155,7 @@ static void nd_scratch_release(NativeDirectTarget* d, NativeAllocClass cls,
static NativeFrameSlot nd_alloc_frame_slot(NativeDirectTarget* d,
const NativeFrameSlotDesc* desc) {
NativeFrameSlot slot = NATIVE_FRAME_SLOT_NONE;
- if (d->ops && d->ops->alloc_frame_slot)
- slot = d->ops->alloc_frame_slot(d, desc);
- else if (d->native && d->native->frame_slot)
+ if (d->native && d->native->frame_slot)
slot = d->native->frame_slot(d->native, desc);
else
nd_panic(d, "target does not allocate frame slots");
@@ -271,19 +274,21 @@ static NativeLoc nd_loc_operand(NativeDirectTarget* d, Operand op) {
case OPK_GLOBAL:
return nd_loc_global(op.v.global.sym, op.v.global.addend, op.type);
case OPK_INDIRECT: {
+ NativeDirectLocal* bl = nd_local(d, op.v.ind.base);
NativeLoc out;
memset(&out, 0, sizeof out);
out.kind = NATIVE_LOC_ADDR;
out.type = op.type;
out.v.addr.base_kind = NATIVE_ADDR_BASE_FRAME_VALUE;
- out.v.addr.base.frame = nd_local(d, op.v.ind.base)->home;
- out.v.addr.cls = nd_local(d, op.v.ind.base)->cls;
- out.v.addr.base_type = nd_local(d, op.v.ind.base)->type;
+ out.v.addr.base.frame = bl->home;
+ out.v.addr.cls = bl->cls;
+ out.v.addr.base_type = bl->type;
if (op.v.ind.index != CG_LOCAL_NONE) {
+ NativeDirectLocal* il = nd_local(d, op.v.ind.index);
out.v.addr.index_kind = NATIVE_ADDR_INDEX_FRAME_VALUE;
- out.v.addr.index.frame = nd_local(d, op.v.ind.index)->home;
- out.v.addr.index_cls = nd_local(d, op.v.ind.index)->cls;
- out.v.addr.index_type = nd_local(d, op.v.ind.index)->type;
+ out.v.addr.index.frame = il->home;
+ out.v.addr.index_cls = il->cls;
+ out.v.addr.index_type = il->type;
}
out.v.addr.log2_scale = op.v.ind.log2_scale;
out.v.addr.offset = op.v.ind.ofs;
@@ -298,17 +303,20 @@ static NativeAddr nd_addr_storage(NativeDirectTarget* d, Operand op) {
NativeAddr out;
memset(&out, 0, sizeof out);
switch ((OpKind)op.kind) {
- case OPK_LOCAL:
+ case OPK_LOCAL: {
/* The local's home is addressed directly (a memory access reads/writes the
* frame slot itself, e.g. by-value aggregate field extraction). This is
* not pointer aliasing, but it does read the home, so a cached value must
* be made current: spill if dirty and drop the entry. */
+ NativeDirectLocal* l;
nd_flush_local(d, op.v.local);
+ l = nd_local(d, op.v.local);
out.base_kind = NATIVE_ADDR_BASE_FRAME;
- out.base.frame = nd_local(d, op.v.local)->home;
- out.cls = nd_local(d, op.v.local)->cls;
- out.base_type = nd_local(d, op.v.local)->type;
+ out.base.frame = l->home;
+ out.cls = l->cls;
+ out.base_type = l->type;
return out;
+ }
case OPK_GLOBAL:
out.base_kind = NATIVE_ADDR_BASE_GLOBAL;
out.base.global.sym = op.v.global.sym;
@@ -541,6 +549,37 @@ static Reg nd_cache_alloc(NativeDirectTarget* d, NativeAllocClass cls) {
return REG_NONE;
}
+/* Append LOCAL to the tail of the cached-locals list (O(1)). Called only on the
+ * REG_NONE -> reg transition in nd_dst_reg. */
+static void nd_cache_link(NativeDirectTarget* d, CGLocal local) {
+ i32 idx = (i32)(local - 1u);
+ i32 prev = d->cache_tail;
+ d->locals[idx].cache_next = -1;
+ d->locals[idx].cache_prev = prev;
+ if (prev >= 0)
+ d->locals[prev].cache_next = idx;
+ else
+ d->cache_head = idx;
+ d->cache_tail = idx;
+ d->ncached++;
+}
+
+/* Remove LOCAL (which must currently be cached) from the cached-locals list. */
+static void nd_cache_unlink(NativeDirectTarget* d, CGLocal local) {
+ i32 idx = (i32)(local - 1u);
+ i32 next = d->locals[idx].cache_next;
+ i32 prev = d->locals[idx].cache_prev;
+ if (next >= 0)
+ d->locals[next].cache_prev = prev;
+ else
+ d->cache_tail = prev;
+ if (prev >= 0)
+ d->locals[prev].cache_next = next;
+ else
+ d->cache_head = next;
+ d->ncached--;
+}
+
/* Write a cached local back to its home (if dirty) and drop the entry. Safe to
* call on an uncached local. */
static void nd_flush_local(NativeDirectTarget* d, CGLocal local) {
@@ -549,6 +588,7 @@ static void nd_flush_local(NativeDirectTarget* d, CGLocal local) {
if (l->dirty)
nd_store_reg_to_frame(d, l->home, l->type,
nd_reg_loc(l->reg, (NativeAllocClass)l->cls, l->type));
+ nd_cache_unlink(d, local);
d->reg_owner[l->cls][l->reg] = CG_LOCAL_NONE;
l->reg = REG_NONE;
l->dirty = 0;
@@ -559,15 +599,16 @@ static void nd_flush_local(NativeDirectTarget* d, CGLocal local) {
static void nd_invalidate_local(NativeDirectTarget* d, CGLocal local) {
NativeDirectLocal* l = nd_local(d, local);
if (l->reg == REG_NONE) return;
+ nd_cache_unlink(d, local);
d->reg_owner[l->cls][l->reg] = CG_LOCAL_NONE;
l->reg = REG_NONE;
l->dirty = 0;
}
-/* Spill the whole cache to memory and empty it. */
+/* Spill the whole cache to memory and empty it. The list is sorted ascending,
+ * so this spills in the same order as the former O(nlocals) index scan. */
static void nd_flush_all(NativeDirectTarget* d) {
- for (u32 i = 0; i < d->nlocals; ++i)
- if (d->locals[i].reg != REG_NONE) nd_flush_local(d, i + 1u);
+ while (d->cache_head >= 0) nd_flush_local(d, (CGLocal)(d->cache_head + 1));
}
static NativeAddr nd_addr_materialize(NativeDirectTarget* d, NativeAddr in,
@@ -608,9 +649,8 @@ static NativeAddr nd_addr_materialize(NativeDirectTarget* d, NativeAddr in,
temps->index = r;
temps->index_cls = cls;
}
- if ((d->ops && d->ops->addr_legal && !d->ops->addr_legal(d, &out, mem)) ||
- ((!d->ops || !d->ops->addr_legal) && d->native && d->native->addr_legal &&
- !d->native->addr_legal(d->native, &out, mem))) {
+ if (d->native && d->native->addr_legal &&
+ !d->native->addr_legal(d->native, &out, mem)) {
NativeAllocClass cls = NATIVE_REG_INT;
Reg r = nd_scratch_acquire(d, cls);
NativeLoc dst = nd_reg_loc(
@@ -628,9 +668,8 @@ static NativeAddr nd_addr_materialize(NativeDirectTarget* d, NativeAddr in,
out.base.reg = r;
out.cls = (u8)cls;
out.base_type = dst.type;
- if ((d->ops && d->ops->addr_legal && !d->ops->addr_legal(d, &out, mem)) ||
- ((!d->ops || !d->ops->addr_legal) && d->native &&
- d->native->addr_legal && !d->native->addr_legal(d->native, &out, mem)))
+ if (d->native && d->native->addr_legal &&
+ !d->native->addr_legal(d->native, &out, mem))
nd_panic(d, "native address is not legal");
}
return out;
@@ -805,6 +844,7 @@ static NativeLoc nd_dst_reg(NativeDirectTarget* d, Operand dst) {
if (r != REG_NONE) {
d->reg_owner[l->cls][r] = dst.v.local;
l->reg = r;
+ nd_cache_link(d, dst.v.local);
}
}
if (r != REG_NONE) {
@@ -842,9 +882,11 @@ static void nd_store_operand_from_reg(NativeDirectTarget* d, Operand dst,
* supersedes it. Drop without storing; storing back would clobber the new
* home value. Runs after SRC is produced, so a dst that was its own address
* base has already been consumed. */
- if (nd_local(d, dst.v.local)->reg != REG_NONE)
- nd_invalidate_local(d, dst.v.local);
- nd_store_reg_to_frame(d, nd_local(d, dst.v.local)->home, dst.type, src);
+ {
+ NativeDirectLocal* l = nd_local(d, dst.v.local);
+ if (l->reg != REG_NONE) nd_invalidate_local(d, dst.v.local);
+ nd_store_reg_to_frame(d, l->home, dst.type, src);
+ }
}
static void nd_func_begin(CgTarget* t, const CGFuncDesc* fd) {
@@ -855,9 +897,11 @@ static void nd_func_begin(CgTarget* t, const CGFuncDesc* fd) {
d->nscopes = 0;
d->max_outgoing = 0;
d->use_tick = 0;
+ d->cache_head = -1;
+ d->cache_tail = -1;
+ d->ncached = 0;
memset(d->scratch_used, 0, sizeof d->scratch_used);
memset(d->reg_owner, 0, sizeof d->reg_owner);
- if (d->ops && d->ops->func_begin) d->ops->func_begin(d, fd);
if (d->native && d->native->func_begin) d->native->func_begin(d->native, fd);
}
@@ -869,7 +913,6 @@ static void nd_func_end(CgTarget* t) {
if (d->native && d->native->note_frame_state)
d->native->note_frame_state(d->native, &frame);
if (d->native && d->native->patch_apply) d->native->patch_apply(d->native);
- if (d->ops && d->ops->func_end) d->ops->func_end(d);
if (d->native && d->native->func_end) d->native->func_end(d->native);
d->func = NULL;
}
@@ -977,9 +1020,11 @@ static void nd_indirect_branch(CgTarget* t, Operand addr,
addr_reg = nd_materialize_operand(d, addr);
ND_REQUIRE_NATIVE(d, indirect_branch,
"target does not emit indirect branches");
- native_targets = ntargets ? nd_arena(d, sizeof(*native_targets) * ntargets,
- _Alignof(MCLabel))
- : NULL;
+ native_targets = ntargets == 0 ? NULL
+ : ntargets <= ND_LBL_BUF
+ ? d->lblbuf
+ : nd_arena(d, sizeof(*native_targets) * ntargets,
+ _Alignof(MCLabel));
for (u32 i = 0; i < ntargets; ++i)
native_targets[i] = nd_mc_label(d, valid_targets[i]);
d->native->indirect_branch(d->native, addr_reg, native_targets, ntargets);
@@ -1431,12 +1476,8 @@ static void nd_call(CgTarget* t, const CGCallDesc* desc) {
nd_barrier(d, NATIVE_DIRECT_BARRIER_CALL | NATIVE_DIRECT_BARRIER_MEMORY);
memset(&plan, 0, sizeof plan);
memset(&nd, 0, sizeof nd);
- args = desc->nargs
- ? nd_arena(d, sizeof(*args) * desc->nargs, _Alignof(NativeLoc))
- : NULL;
- results = desc->nresults ? nd_arena(d, sizeof(*results) * desc->nresults,
- _Alignof(NativeLoc))
- : NULL;
+ args = nd_loc_buf(d, d->argbuf, ND_ARG_BUF, desc->nargs);
+ results = nd_loc_buf(d, d->retbuf, ND_RET_BUF, desc->nresults);
for (u32 i = 0; i < desc->nargs; ++i)
args[i] = nd_loc_frame(d, desc->args[i], 0);
for (u32 i = 0; i < desc->nresults; ++i)
@@ -1497,8 +1538,7 @@ static void nd_ret(CgTarget* t, const CGLocal* values, u32 nvalues) {
d->ops->emit_ret(d, values, nvalues);
return;
}
- locs = nvalues ? nd_arena(d, sizeof(*locs) * nvalues, _Alignof(NativeLoc))
- : NULL;
+ locs = nd_loc_buf(d, d->argbuf, ND_ARG_BUF, nvalues);
for (u32 i = 0; i < nvalues; ++i) locs[i] = nd_loc_frame(d, values[i], 0);
ND_REQUIRE_NATIVE(d, plan_ret, "target does not plan returns");
d->native->plan_ret(d->native, d->func, locs, nvalues, &rets, &nrets);
@@ -1636,10 +1676,8 @@ static void nd_fence(CgTarget* t, MemOrder order) {
static void nd_intrinsic(CgTarget* t, IntrinKind kind, Operand* dsts, u32 ndst,
const Operand* args, u32 narg) {
NativeDirectTarget* d = nd_of(t);
- NativeLoc* ndsts =
- ndst ? nd_arena(d, sizeof(*ndsts) * ndst, _Alignof(NativeLoc)) : NULL;
- NativeLoc* nargs =
- narg ? nd_arena(d, sizeof(*nargs) * narg, _Alignof(NativeLoc)) : NULL;
+ NativeLoc* ndsts = nd_loc_buf(d, d->retbuf, ND_RET_BUF, ndst);
+ NativeLoc* nargs = nd_loc_buf(d, d->argbuf, ND_ARG_BUF, narg);
nd_flush_all(d);
ND_REQUIRE_NATIVE(d, intrinsic, "target does not emit compiler intrinsics");
for (u32 i = 0; i < ndst; ++i) ndsts[i] = nd_dst_scratch(d, dsts[i]);
@@ -1709,6 +1747,17 @@ CgTarget* native_direct_target_new(Compiler* c, ObjBuilder* obj,
d->ops = cfg->ops;
d->user = cfg->user;
+ /* Resolve register/class info once; it is constant for the program. */
+ d->reg_info = cfg->native ? cfg->native->regs : NULL;
+ for (u32 i = 0; i < 3u; ++i) d->class_info[i] = NULL;
+ if (d->reg_info) {
+ const NativeRegInfo* ri = d->reg_info;
+ for (u32 i = 0; i < ri->nclasses; ++i) {
+ u32 cls = ri->classes[i].cls;
+ if (cls < 3u) d->class_info[cls] = &ri->classes[i];
+ }
+ }
+
d->base.func_begin = nd_func_begin;
d->base.func_end = nd_func_end;
d->base.alias = nd_alias;
diff --git a/src/cg/native_direct_target.h b/src/cg/native_direct_target.h
@@ -43,6 +43,12 @@ typedef struct NativeDirectLocal {
u8 address_taken;
u8 memory_required;
u32 last_use; /* d->use_tick at the most recent cache touch (LRU victim key) */
+ /* Intrusive doubly-linked list of currently-cached locals, in insertion
+ * (caching) order so nd_cache_link is O(1) tail insertion. Values are 0-based
+ * indices into NativeDirectTarget.locals (or -1); only valid while this local
+ * is cached (l->reg != REG_NONE). */
+ i32 cache_next;
+ i32 cache_prev;
} NativeDirectLocal;
typedef enum NativeDirectAddrLegality {
@@ -52,20 +58,16 @@ typedef enum NativeDirectAddrLegality {
} NativeDirectAddrLegality;
typedef struct NativeOps NativeOps;
+/* Semantic-only adapter the arch supplies on the NDT path. `ops` is mandatory:
+ * aa64_backend_make and aa64_semantic_target_new always pass it. Pure
+ * pass-throughs to NativeTarget (reg_info, func_begin/func_end, frame-slot
+ * allocation, class_for_type, addr_legal) are not mirrored here -- NDT calls
+ * d->native->X directly for those. */
struct NativeOps {
- const NativeRegInfo* (*reg_info)(NativeDirectTarget*);
-
- void (*func_begin)(NativeDirectTarget*, const CGFuncDesc*);
- void (*func_end)(NativeDirectTarget*);
-
- NativeFrameSlot (*alloc_frame_slot)(NativeDirectTarget*,
- const NativeFrameSlotDesc*);
void (*bind_param)(NativeDirectTarget*, const CGParamDesc*, CGLocal,
NativeDirectLocal*);
- NativeAllocClass (*class_for_type)(NativeDirectTarget*, CfreeCgTypeId);
int (*operand_legal)(NativeDirectTarget*, const Operand*, NativeAllocClass);
- int (*addr_legal)(NativeDirectTarget*, const NativeAddr*, MemAccess);
NativeDirectAddrLegality (*semantic_addr_legal)(NativeDirectTarget*,
Operand addr, MemAccess);
@@ -91,6 +93,13 @@ struct NativeOps {
void (*barrier)(NativeDirectTarget*, u32 flags);
};
+/* Fixed transient buffers for per-op location/label arrays (call args/results,
+ * return values, intrinsic operands, indirect-branch targets). The common case
+ * fits these and allocates nothing; overflow falls back to the TU arena. */
+#define ND_ARG_BUF 16u
+#define ND_RET_BUF 8u
+#define ND_LBL_BUF 16u
+
typedef struct NativeDirectTargetConfig {
NativeTarget* native;
const NativeOps* ops;
@@ -105,6 +114,12 @@ struct NativeDirectTarget {
const NativeOps* ops;
void* user;
+ /* Register info and per-class info resolved once at construction (constant for
+ * the program), so scratch acquire / cache alloc / evict do an O(1) lookup
+ * instead of re-resolving reg_info and linearly scanning ri->classes. */
+ const NativeRegInfo* reg_info;
+ const NativeAllocClassInfo* class_info[3];
+
const CGFuncDesc* func;
SrcLoc loc;
@@ -128,8 +143,21 @@ struct NativeDirectTarget {
* NativeDirectLocal. See doc/CGTARGET.md "local register cache". */
CGLocal reg_owner[3][32];
u32 use_tick; /* monotonic counter stamped onto NativeDirectLocal.last_use */
+ /* Head/tail of the intrusive cached-locals list (in caching order), -1 when
+ * empty; ncached is its length. Lets nd_flush_all run in O(cached) instead of
+ * scanning all nlocals on every control-flow / barrier op; cache_tail makes
+ * nd_cache_link an O(1) tail insertion. */
+ i32 cache_head;
+ i32 cache_tail;
+ u32 ncached;
u32 max_outgoing;
+ /* Transient per-op buffers; see ND_*_BUF. Inputs use argbuf, outputs retbuf;
+ * never live across more than one op, so reuse between ops is safe. */
+ NativeLoc argbuf[ND_ARG_BUF];
+ NativeLoc retbuf[ND_RET_BUF];
+ MCLabel lblbuf[ND_LBL_BUF];
+
ObjSecId local_static_sec;
ObjSymId local_static_sym;
u32 local_static_base;