commit 283f35cc35b3d8f28773a7820d1513081ea28862
parent ef28221b5e51646d11fd54686a23349b93007d02
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Wed, 27 May 2026 11:29:50 -0700
opt: implement Design B of NativeDirectTarget local register cache
Keep the register cache live across non-clobbering memory ops instead of
tearing it down on every load/store (Design A). Builds incrementally on A:
- Cache-aware addressing: an INDIRECT/pointer base or index local that is
cached is dereferenced straight from its live register (BASE_REG/INDEX_REG,
pinned); nd_addr_materialize records the reg so temps release just unpins it.
- Drop nd_flush_all from non-clobbering memory ops (load/store/copy/copy_bytes/
set_bytes/bitfield_*) on the escape argument; volatile keeps flush+barrier.
- Centralize load-result dst invalidation in nd_store_operand_from_reg so a
home write supersedes any stale cache entry (covers load/addr_of/local_addr).
- Targeted address-of flush: nd_local_addr/nd_addr_of flush only the addressed
local and mark it address_taken, not the whole cache.
- Approximate-LRU victim policy (use_tick/last_use + nd_pick_cache_victim) in
both nd_cache_alloc and nd_scratch_acquire; B raises register pressure.
Refinement beyond the original escape argument: a load/store can address a
non-escaped local's home directly via an OPK_LOCAL operand (by-value aggregate
field extraction), not only through a pointer. nd_addr_storage/nd_addr_pointer
now flush that one local when building a BASE_FRAME address for it.
Add B unit tests (cache-aware addressing, cache-survives-store, volatile
flush, call flush) and fix a stale Design-A EV_LOAD assertion.
Diffstat:
4 files changed, 378 insertions(+), 55 deletions(-)
diff --git a/doc/NATIVE_DIRECT_CACHE.md b/doc/NATIVE_DIRECT_CACHE.md
@@ -4,8 +4,8 @@
backends (currently aarch64). The baseline lowers every semantic op to
load-operands / compute / store-result against frame homes, so a local round
trips through memory on every use. This document describes the local register
-cache that avoids those round trips. **Design A** is implemented; **Design B**
-is the planned follow-on.
+cache that avoids those round trips. **Design A** and **Design B** are both
+implemented; B builds incrementally on A.
See also `doc/CGTARGET.md` ("local register cache") for the original sketch.
@@ -68,7 +68,7 @@ load/store/call, so memory-dense code degrades to the baseline. It is never
worse than baseline: each cached local is stored at most once per boundary,
versus once per definition in the baseline.
-## Design B — escape-aware cache with cache-aware addressing (planned)
+## Design B — escape-aware cache with cache-aware addressing (implemented)
Design B keeps the same block-local invariant but stops tearing the cache down
on memory operations, by reasoning about escape and by making address
@@ -76,6 +76,18 @@ construction consult the cache. It is strictly incremental over A: A with (a)
the memory ops' `nd_flush_all` removed on escape grounds, (b) addressing taught
to use live cache registers, and (c) a real spill victim selector.
+> **Refinement found during implementation — direct frame addressing.** The
+> escape argument below assumes a memory op reaches a local only *through a
+> pointer*. But a `load`/`store` can also address a non-escaped local's frame
+> home *directly* via an `OPK_LOCAL` storage operand (`nd_addr_storage` →
+> `NATIVE_ADDR_BASE_FRAME`) — the frontend does this for by-value aggregate
+> field extraction on a scalar-sized struct temp. That is not pointer aliasing,
+> but it does read/write the home, so cache and home can disagree. The fix is
+> targeted, not a `nd_flush_all`: when `nd_addr_storage`/`nd_addr_pointer` build
+> a `BASE_FRAME` address for an `OPK_LOCAL`, they `nd_flush_local` that one local
+> first (spill if dirty, drop the entry). Pointer-based accesses (`BASE_REG` /
+> `BASE_FRAME_VALUE`) are unaffected, so the common case stays fully cached.
+
### 1. Escape-based aliasing replaces flush-on-memory-op
Address-taken / `memory_required` locals are never cached. Therefore a pointer
@@ -136,12 +148,23 @@ CFG/liveness.
B must therefore either (i) confine read-entry creation to compute ops, or
(ii) invalidate (clean entries need no store) after any clobbering op. Option
(i) — only compute ops may create cache entries — is simplest and preserves
- the "cache only grows across compute runs" property.
-- **Real eviction/spill.** Pressure is higher because values survive across
- more ops, so "don't cache when full" is insufficient; `nd_cache_alloc` and
- `nd_scratch_acquire` must pick a non-pinned, non-destination victim and
- `nd_flush_local` it. A victim policy (e.g. round-robin or oldest) is needed;
- pinned source/dst/address regs are never victims.
+ the "cache only grows across compute runs" property. **Implemented: option
+ (i).** A `load`/`bitfield_load`/`addr_of` result is written to the dst local's
+ frame home, bypassing the cache, and must drop any stale entry for that local
+ (the home write supersedes it). This is centralized in
+ `nd_store_operand_from_reg` — the single choke point that writes a
+ freshly-computed scratch value to a local's home — which `nd_invalidate_local`s
+ the dst entry before the store. Because it runs *after* the value reg is
+ produced, a dst that was its own address base has already been consumed.
+- **Real eviction/spill (implemented as approximate-LRU).** Pressure is higher
+ because values survive across more ops, so "don't cache when full" is
+ insufficient. Each cache touch (def in `nd_dst_reg`, read
+ hit in `nd_materialize_operand`, addressing use in `nd_cache_reg_for`) stamps
+ `NativeDirectLocal.last_use` from a monotonic `NativeDirectTarget.use_tick`.
+ `nd_pick_cache_victim` returns the least-recently-used non-pinned owned
+ register; `nd_cache_alloc` and `nd_scratch_acquire` route their eviction
+ through it and `nd_flush_local` the victim. Pinned source/dst/address regs are
+ never victims.
- **Pin discipline through addressing.** Base/index cache regs must be pinned
for the duration of a memory op and unpinned (never invalidated) afterward,
alongside the existing source/dst/temp pins.
diff --git a/src/cg/native_direct_target.c b/src/cg/native_direct_target.c
@@ -106,6 +106,9 @@ static NativeLoc nd_reg_loc(Reg reg, NativeAllocClass cls, CfreeCgTypeId type) {
}
static void nd_flush_local(NativeDirectTarget* d, CGLocal local);
+static Reg nd_cache_reg_for(NativeDirectTarget* d, CGLocal local,
+ CfreeCgTypeId access_type);
+static Reg nd_pick_cache_victim(NativeDirectTarget* d, NativeAllocClass cls);
static Reg nd_scratch_acquire(NativeDirectTarget* d, NativeAllocClass cls) {
const NativeAllocClassInfo* ci = nd_class_info(d, cls);
@@ -126,13 +129,11 @@ static Reg nd_scratch_acquire(NativeDirectTarget* d, NativeAllocClass cls) {
regs = ci->allocable;
nregs = ci->nallocable;
}
- /* Under pressure, evict a non-pinned cached local (spilling it to its home)
- * and reuse its register as a scratch temporary. */
- for (u32 i = 0; i < ci->nallocable; ++i) {
- Reg r = ci->allocable[i];
- if (r >= 32u) continue;
- if ((d->scratch_used[cls] & (1u << r)) == 0 &&
- d->reg_owner[cls][r] != CG_LOCAL_NONE) {
+ /* Under pressure, evict the LRU non-pinned cached local (spilling it to its
+ * home) and reuse its register as a scratch temporary. */
+ {
+ Reg r = nd_pick_cache_victim(d, cls);
+ if (r != REG_NONE) {
nd_flush_local(d, d->reg_owner[cls][r]);
d->scratch_used[cls] |= 1u << r;
return r;
@@ -298,6 +299,11 @@ static NativeAddr nd_addr_storage(NativeDirectTarget* d, Operand op) {
memset(&out, 0, sizeof out);
switch ((OpKind)op.kind) {
case OPK_LOCAL:
+ /* The local's home is addressed directly (a memory access reads/writes the
+ * frame slot itself, e.g. by-value aggregate field extraction). This is
+ * not pointer aliasing, but it does access the home, so the home must be
+ * made current first: spill the entry if dirty, then drop it. */
+ nd_flush_local(d, op.v.local);
out.base_kind = NATIVE_ADDR_BASE_FRAME;
out.base.frame = nd_local(d, op.v.local)->home;
out.cls = nd_local(d, op.v.local)->cls;
@@ -309,20 +315,37 @@ static NativeAddr nd_addr_storage(NativeDirectTarget* d, Operand op) {
out.base.global.addend = op.v.global.addend;
out.base_type = op.type;
return out;
- case OPK_INDIRECT:
- out.base_kind = NATIVE_ADDR_BASE_FRAME_VALUE;
- out.base.frame = nd_local(d, op.v.ind.base)->home;
- out.cls = nd_local(d, op.v.ind.base)->cls;
- out.base_type = nd_local(d, op.v.ind.base)->type;
+ case OPK_INDIRECT: {
+ NativeDirectLocal* bl = nd_local(d, op.v.ind.base);
+ Reg br = nd_cache_reg_for(d, op.v.ind.base, bl->type);
+ out.cls = bl->cls;
+ out.base_type = bl->type;
+ if (br != REG_NONE) {
+ out.base_kind = NATIVE_ADDR_BASE_REG;
+ out.base.reg = br;
+ d->scratch_used[bl->cls] |= 1u << br; /* pin; unpinned at temps release */
+ } else {
+ out.base_kind = NATIVE_ADDR_BASE_FRAME_VALUE;
+ out.base.frame = bl->home;
+ }
if (op.v.ind.index != CG_LOCAL_NONE) {
- out.index_kind = NATIVE_ADDR_INDEX_FRAME_VALUE;
- out.index.frame = nd_local(d, op.v.ind.index)->home;
- out.index_cls = nd_local(d, op.v.ind.index)->cls;
- out.index_type = nd_local(d, op.v.ind.index)->type;
+ NativeDirectLocal* il = nd_local(d, op.v.ind.index);
+ Reg ir = nd_cache_reg_for(d, op.v.ind.index, il->type);
+ out.index_cls = il->cls;
+ out.index_type = il->type;
+ if (ir != REG_NONE) {
+ out.index_kind = NATIVE_ADDR_INDEX_REG;
+ out.index.reg = ir;
+ d->scratch_used[il->cls] |= 1u << ir;
+ } else {
+ out.index_kind = NATIVE_ADDR_INDEX_FRAME_VALUE;
+ out.index.frame = il->home;
+ }
}
out.log2_scale = op.v.ind.log2_scale;
out.offset = op.v.ind.ofs;
return out;
+ }
default:
nd_panic(d, "operand is not addressable storage");
}
@@ -332,19 +355,31 @@ static NativeAddr nd_addr_pointer(NativeDirectTarget* d, Operand op) {
NativeAddr out;
memset(&out, 0, sizeof out);
switch ((OpKind)op.kind) {
- case OPK_LOCAL:
+ case OPK_LOCAL: {
+ NativeDirectLocal* l = nd_local(d, op.v.local);
+ out.cls = l->cls;
+ out.base_type = l->type;
if (cg_type_is_ptr(d->base.c, op.type)) {
- out.base_kind = NATIVE_ADDR_BASE_FRAME_VALUE;
- out.base.frame = nd_local(d, op.v.local)->home;
- out.cls = nd_local(d, op.v.local)->cls;
- out.base_type = nd_local(d, op.v.local)->type;
+ /* Pointer value lives in the local: use its live register if cached
+ * (a dirty cached pointer is a valid base), else load from the home. */
+ Reg r = nd_cache_reg_for(d, op.v.local, l->type);
+ if (r != REG_NONE) {
+ out.base_kind = NATIVE_ADDR_BASE_REG;
+ out.base.reg = r;
+ d->scratch_used[l->cls] |= 1u << r;
+ } else {
+ out.base_kind = NATIVE_ADDR_BASE_FRAME_VALUE;
+ out.base.frame = l->home;
+ }
} else {
+ /* The local's home is addressed directly; make it current first (see
+ * nd_addr_storage OPK_LOCAL). */
+ nd_flush_local(d, op.v.local);
out.base_kind = NATIVE_ADDR_BASE_FRAME;
- out.base.frame = nd_local(d, op.v.local)->home;
- out.cls = nd_local(d, op.v.local)->cls;
- out.base_type = nd_local(d, op.v.local)->type;
+ out.base.frame = l->home;
}
return out;
+ }
case OPK_GLOBAL:
out.base_kind = NATIVE_ADDR_BASE_GLOBAL;
out.base.global.sym = op.v.global.sym;
@@ -440,11 +475,57 @@ static int nd_local_cacheable(NativeDirectTarget* d, const NativeDirectLocal* l)
l->size <= (u32)d->base.c->target.ptr_size;
}
+/* LRU bookkeeping for the local register cache (see nd_pick_cache_victim). */
+/* Stamp a cache touch (def/read/addressing use) for LRU victim selection. */
+static void nd_touch_local(NativeDirectTarget* d, NativeDirectLocal* l) {
+ l->last_use = ++d->use_tick;
+}
+/* Return LOCAL's live cache register, or REG_NONE when it is not cached, not
+ * cacheable, or ACCESS_TYPE is nonzero and differs from the local's type. A hit
+ * is stamped as an LRU touch. The address builders use this to base an address
+ * on a base/index local's live register instead of reading a possibly-stale
+ * frame home; they pass the local's own type, so the type check always holds
+ * there. The value-read width hazard is handled in nd_materialize_operand. */
+static Reg nd_cache_reg_for(NativeDirectTarget* d, CGLocal local,
+ CfreeCgTypeId access_type) {
+ NativeDirectLocal* l = nd_local(d, local);
+ if (l->reg == REG_NONE) return REG_NONE;
+ if (!nd_local_cacheable(d, l)) return REG_NONE;
+ if (access_type && access_type != l->type) return REG_NONE;
+ nd_touch_local(d, l);
+ return l->reg;
+}
+
+/* Pick the least-recently-used non-pinned cached local in CLS as a spill victim
+ * and return its register; the caller flushes the owner before reusing it.
+ * Returns REG_NONE if no register in CLS is owned or every owned one is pinned.
+ * Design B keeps entries alive across memory ops, so LRU spares hot locals. */
+static Reg nd_pick_cache_victim(NativeDirectTarget* d, NativeAllocClass cls) {
+ const NativeAllocClassInfo* ci = nd_class_info(d, cls);
+ Reg best = REG_NONE;
+ u32 best_use = 0;
+ for (u32 i = 0; i < ci->nallocable; ++i) {
+ Reg r = ci->allocable[i];
+ CGLocal owner;
+ if (r >= 32u) continue; /* owner table and pin mask only cover regs 0..31 */
+ owner = d->reg_owner[cls][r];
+ if (owner == CG_LOCAL_NONE) continue; /* not caching any local */
+ if (d->scratch_used[cls] & (1u << r)) continue; /* pinned: never a victim */
+ if (best == REG_NONE || nd_local(d, owner)->last_use < best_use) {
+ best = r; /* strict '<': on a tie the earlier allocable reg is kept */
+ best_use = nd_local(d, owner)->last_use;
+ }
+ }
+ return best;
+}
+
/* Pick a caller-saved allocable register to cache a local in: a free one, else
- * evict a non-pinned cached local. REG_NONE means use the frame-only path. */
+ * evict the LRU non-pinned cached local. REG_NONE means use the frame-only
+ * path. */
static Reg nd_cache_alloc(NativeDirectTarget* d, NativeAllocClass cls) {
const NativeAllocClassInfo* ci = nd_class_info(d, cls);
u32 caller = ci->caller_saved_mask;
+ Reg victim;
for (u32 i = 0; i < ci->nallocable; ++i) {
Reg r = ci->allocable[i];
if (r >= 32u) continue;
@@ -452,14 +533,10 @@ static Reg nd_cache_alloc(NativeDirectTarget* d, NativeAllocClass cls) {
(d->scratch_used[cls] & (1u << r)) == 0)
return r;
}
- for (u32 i = 0; i < ci->nallocable; ++i) {
- Reg r = ci->allocable[i];
- if (r >= 32u) continue;
- if ((caller & (1u << r)) && d->reg_owner[cls][r] != CG_LOCAL_NONE &&
- (d->scratch_used[cls] & (1u << r)) == 0) {
- nd_flush_local(d, d->reg_owner[cls][r]);
- return r;
- }
+ victim = nd_pick_cache_victim(d, cls);
+ if (victim != REG_NONE && (caller & (1u << victim))) {
+ nd_flush_local(d, d->reg_owner[cls][victim]);
+ return victim;
}
return REG_NONE;
}
@@ -499,6 +576,18 @@ static NativeAddr nd_addr_materialize(NativeDirectTarget* d, NativeAddr in,
memset(temps, 0, sizeof *temps);
temps->base = REG_NONE;
temps->index = REG_NONE;
+ /* A base/index that arrives already in a register is a pinned live cache reg
+ * (the addr builders are the only producers of REG-kind storage addresses).
+ * Record it so the temps release unpins it afterward — without storing or
+ * invalidating, leaving the cache entry intact. */
+ if (out.base_kind == NATIVE_ADDR_BASE_REG) {
+ temps->base = out.base.reg;
+ temps->base_cls = (NativeAllocClass)out.cls;
+ }
+ if (out.index_kind == NATIVE_ADDR_INDEX_REG) {
+ temps->index = out.index.reg;
+ temps->index_cls = (NativeAllocClass)out.index_cls;
+ }
if (out.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) {
NativeAllocClass cls = (NativeAllocClass)out.cls;
Reg r = nd_scratch_acquire(d, cls);
@@ -685,6 +774,7 @@ static NativeLoc nd_materialize_operand(NativeDirectTarget* d, Operand op) {
if (l->reg != REG_NONE && op.type == l->type && nd_local_cacheable(d, l)) {
/* Cache hit: pin and reuse the live register, no reload. */
d->scratch_used[l->cls] |= 1u << l->reg;
+ nd_touch_local(d, l);
return nd_reg_loc(l->reg, (NativeAllocClass)l->cls, op.type);
}
/* A live entry under a different access width must reach memory before we
@@ -719,6 +809,7 @@ static NativeLoc nd_dst_reg(NativeDirectTarget* d, Operand dst) {
}
if (r != REG_NONE) {
d->scratch_used[l->cls] |= 1u << r; /* pin for the instruction */
+ nd_touch_local(d, l);
return nd_reg_loc(r, (NativeAllocClass)l->cls, dst.type);
}
}
@@ -745,6 +836,14 @@ static void nd_dst_writeback(NativeDirectTarget* d, Operand dst, NativeLoc dr) {
static void nd_store_operand_from_reg(NativeDirectTarget* d, Operand dst,
NativeLoc src) {
if (dst.kind != OPK_LOCAL) nd_panic(d, "destination is not a semantic local");
+ /* This writes SRC to the local's frame home, bypassing the value cache (the
+ * result was produced in a scratch reg, e.g. a load / address-of). Any live
+ * cache entry for the local is now stale and must be dropped — the home write
+ * supersedes it. Drop without storing; storing back would clobber the new
+ * home value. Runs after SRC is produced, so a dst that was its own address
+ * base has already been consumed. */
+ if (nd_local(d, dst.v.local)->reg != REG_NONE)
+ nd_invalidate_local(d, dst.v.local);
nd_store_reg_to_frame(d, nd_local(d, dst.v.local)->home, dst.type, src);
}
@@ -755,6 +854,7 @@ static void nd_func_begin(CgTarget* t, const CGFuncDesc* fd) {
d->nlabels = 0;
d->nscopes = 0;
d->max_outgoing = 0;
+ d->use_tick = 0;
memset(d->scratch_used, 0, sizeof d->scratch_used);
memset(d->reg_owner, 0, sizeof d->reg_owner);
if (d->ops && d->ops->func_begin) d->ops->func_begin(d, fd);
@@ -792,7 +892,10 @@ static void nd_local_addr(CgTarget* t, Operand dst, const CGLocalDesc* desc,
NativeDirectLocal* l = nd_local(d, local);
Operand lv;
(void)desc;
- nd_flush_all(d);
+ /* Targeted flush: only this local escapes. Spill+drop its entry so the home
+ * is authoritative for the address computation, then mark it uncacheable. The
+ * rest of the cache is unaffected (other cached locals stay non-escaped). */
+ nd_flush_local(d, local);
l->address_taken = 1;
l->flags |= CG_LOCAL_ADDR_TAKEN;
memset(&lv, 0, sizeof lv);
@@ -1068,7 +1171,8 @@ static void nd_copy(CgTarget* t, Operand dst, Operand src) {
if (size > (u64)t->c->target.ptr_size) {
NdAddrTemps dt, st;
AggregateAccess access;
- nd_flush_all(d);
+ /* Aggregate copy: addresses are built cache-aware (a directly-addressed
+ * cached local is flushed in nd_addr_storage), so no whole-cache flush. */
memset(&access, 0, sizeof access);
access.type = dst.type;
access.size = (u32)size;
@@ -1099,10 +1203,14 @@ static void nd_load(CgTarget* t, Operand dst, Operand addr, MemAccess mem) {
NdAddrTemps temps;
u64 size =
mem.size ? mem.size : (mem.type ? cg_type_size(t->c, mem.type) : 0);
- nd_flush_all(d);
- if (mem.flags & MF_VOLATILE)
+ /* No value-cache flush: only escaped (address-taken / memory-required) locals
+ * can be aliased through a pointer, and those are never cached. A volatile
+ * access may observe memory and needs the cache made authoritative first. */
+ if (mem.flags & MF_VOLATILE) {
+ nd_flush_all(d);
nd_barrier(d,
NATIVE_DIRECT_BARRIER_MEMORY | NATIVE_DIRECT_BARRIER_VOLATILE);
+ }
NativeAddr naddr =
nd_addr_materialize(d, nd_addr_storage(d, addr), &temps, mem);
if (size > (u64)t->c->target.ptr_size) {
@@ -1133,10 +1241,14 @@ static void nd_store(CgTarget* t, Operand addr, Operand src, MemAccess mem) {
NdAddrTemps temps;
u64 size =
mem.size ? mem.size : (mem.type ? cg_type_size(t->c, mem.type) : 0);
- nd_flush_all(d);
- if (mem.flags & MF_VOLATILE)
+ /* No value-cache flush (see nd_load): a store through a pointer cannot alias a
+ * cached non-escaped local, and a directly addressed OPK_LOCAL target has its
+ * entry flushed by nd_addr_storage. SRC is read via nd_materialize_operand. */
+ if (mem.flags & MF_VOLATILE) {
+ nd_flush_all(d);
nd_barrier(d,
NATIVE_DIRECT_BARRIER_MEMORY | NATIVE_DIRECT_BARRIER_VOLATILE);
+ }
NativeAddr naddr =
nd_addr_materialize(d, nd_addr_storage(d, addr), &temps, mem);
if (size > (u64)t->c->target.ptr_size) {
@@ -1167,7 +1279,17 @@ static void nd_addr_of(CgTarget* t, Operand dst, Operand lv) {
MemAccess mem = nd_scalar_mem(dst.type, d->base.c->target.ptr_size,
d->base.c->target.ptr_align);
NativeAddr naddr;
- nd_flush_all(d);
+ /* Targeted: only an OPK_LOCAL lvalue escapes here — flush+mark just that local
+ * (its home becomes the authoritative address source). An INDIRECT lvalue's
+ * address is computed from base/index, which nd_addr_storage now reads from
+ * the cache directly; a GLOBAL needs nothing. The dst home write is handled by
+ * nd_store_operand_from_reg's invalidation. */
+ if (lv.kind == OPK_LOCAL) {
+ NativeDirectLocal* l = nd_local(d, lv.v.local);
+ nd_flush_local(d, lv.v.local);
+ l->address_taken = 1;
+ l->flags |= CG_LOCAL_ADDR_TAKEN;
+ }
naddr = nd_addr_materialize(d, nd_addr_storage(d, lv), &temps, mem);
NativeLoc reg = nd_dst_scratch(d, dst);
ND_REQUIRE_NATIVE(d, load_addr, "target does not materialize addresses");
@@ -1194,7 +1316,7 @@ static void nd_copy_bytes(CgTarget* t, Operand dst_addr, Operand src_addr,
NativeDirectTarget* d = nd_of(t);
NdAddrTemps dt, st;
NativeAddr dst;
- nd_flush_all(d);
+ /* Pointer-target memory; addresses are cache-aware. No whole-cache flush. */
dst = nd_addr_materialize(d, nd_addr_pointer(d, dst_addr), &dt, access.mem);
NativeAddr src =
nd_addr_materialize(d, nd_addr_pointer(d, src_addr), &st, access.mem);
@@ -1210,7 +1332,7 @@ static void nd_set_bytes(CgTarget* t, Operand dst_addr, Operand byte_value,
NdAddrTemps temps;
NativeAddr dst;
NativeLoc byte;
- nd_flush_all(d);
+ /* Pointer-target memory; addresses are cache-aware. No whole-cache flush. */
dst = nd_addr_materialize(d, nd_addr_pointer(d, dst_addr), &temps, access.mem);
byte = nd_materialize_operand(d, byte_value);
ND_REQUIRE_NATIVE(d, set_bytes, "target does not set bytes");
@@ -1225,7 +1347,8 @@ static void nd_bitfield_load(CgTarget* t, Operand dst, Operand record_addr,
NdAddrTemps temps;
NativeAddr addr;
NativeLoc reg;
- nd_flush_all(d);
+ /* Record (pointer-target) memory; addresses are cache-aware. The dst home
+ * write is handled by nd_store_operand_from_reg's invalidation. */
addr = nd_addr_materialize(d, nd_addr_storage(d, record_addr), &temps,
access.storage);
reg = nd_dst_scratch(d, dst);
@@ -1242,7 +1365,8 @@ static void nd_bitfield_store(CgTarget* t, Operand record_addr, Operand src,
NdAddrTemps temps;
NativeAddr addr;
NativeLoc val;
- nd_flush_all(d);
+ /* Record (pointer-target) memory; addresses are cache-aware, SRC reads the
+ * cache. No whole-cache flush. */
addr = nd_addr_materialize(d, nd_addr_storage(d, record_addr), &temps,
access.storage);
val = nd_materialize_operand(d, src);
diff --git a/src/cg/native_direct_target.h b/src/cg/native_direct_target.h
@@ -42,6 +42,7 @@ typedef struct NativeDirectLocal {
u8 dirty;
u8 address_taken;
u8 memory_required;
+ u32 last_use; /* d->use_tick at the most recent cache touch (LRU victim key) */
} NativeDirectLocal;
typedef enum NativeDirectAddrLegality {
@@ -126,6 +127,7 @@ struct NativeDirectTarget {
* instruction" mask. Per-local cache state (reg/cls/dirty) lives on
* NativeDirectLocal. See doc/CGTARGET.md "local register cache". */
CGLocal reg_owner[3][32];
+ u32 use_tick; /* monotonic counter stamped onto NativeDirectLocal.last_use */
u32 max_outgoing;
ObjSecId local_static_sec;
diff --git a/test/cg/native_direct_target_test.c b/test/cg/native_direct_target_test.c
@@ -365,6 +365,43 @@ static CGLocal local_new(CgTarget* t, CfreeCgTypeId type) {
return t->local(t, &d);
}
+static CGLocal local_new_ptr(CgTarget* t, CfreeCgTypeId type) {
+ CGLocalDesc d;
+ memset(&d, 0, sizeof d);
+ d.type = type;
+ d.size = 8; /* fixed 8-byte slot; callers pass tc.ptr (assumes 8-byte pointers) */
+ d.align = 8;
+ return t->local(t, &d);
+}
+
+static Operand op_indirect(CGLocal base, i32 ofs, CfreeCgTypeId type) {
+ Operand o;
+ memset(&o, 0, sizeof o); /* fields not set below stay zero */
+ o.kind = OPK_INDIRECT;
+ o.type = type;
+ o.v.ind.base = base;
+ o.v.ind.index = CG_LOCAL_NONE; /* base-only form: no index local */
+ o.v.ind.ofs = ofs;
+ return o;
+}
+
+static MemAccess mem_scalar(CfreeCgTypeId type, u16 flags) {
+ MemAccess m;
+ memset(&m, 0, sizeof m);
+ m.type = type;
+ m.size = 4; /* fixed 4-byte access whatever TYPE is; callers pass tc.i32 */
+ m.align = 4;
+ m.flags = flags;
+ return m;
+}
+
+/* Index of the first event of KIND at or after `from`, or -1. */
+static int event_index(const MockNative* m, MockEventKind kind, u32 from) {
+ for (u32 i = from; i < m->nevents; ++i)
+ if (m->events[i].kind == kind) return (int)i;
+ return -1;
+}
+
static CGFuncDesc fn_desc(TestCtx* tc) {
CGFuncDesc fd;
CfreeCgFuncSig sig;
@@ -423,8 +460,12 @@ static void test_frame_locals_scratch_storeback_and_branches(void) {
EXPECT(count_event(&native, EV_LOAD_IMM) == 3,
"two explicit immediates plus cmp imm materialization expected");
- EXPECT(count_event(&native, EV_LOAD) >= 3,
- "frame locals should be reloaded before arithmetic/return");
+ /* With the local register cache, a/b/sum live in registers across the
+ * straight-line compute run: the binop reloads neither operand. They are
+ * spilled at the branch flush (EV_STORE), and sum is reloaded only after that
+ * flush — once for the cmp_branch and once for the return. */
+ EXPECT(count_event(&native, EV_LOAD) == 2,
+ "sum is reloaded from its home for the cmp_branch and the return");
EXPECT(count_event(&native, EV_STORE) >= 3,
"results should store back to frame homes");
EXPECT(count_event(&native, EV_BINOP) == 1, "expected one native binop");
@@ -482,9 +523,142 @@ static void test_call_barrier_storeback_and_max_outgoing(void) {
tc_fini(&tc);
}
+/* Design B: a cached pointer base is dereferenced straight from its register —
+ * no spill, no reload of the base from its home. The discriminator is the
+ * EV_LOAD count: an uncached base would emit a separate BASE_FRAME load to read
+ * the pointer, plus the dereference. A cached base emits only the dereference. */
+static void test_b_cached_pointer_base_not_reloaded(void) {
+ TestCtx tc;
+ MockNative native;
+ CgTarget* t;
+ CGFuncDesc fd;
+ CGLocal p, d;
+ tc_init(&tc);
+ t = make_target(&tc, &native);
+ fd = fn_desc(&tc);
+ t->func_begin(t, &fd);
+ p = local_new_ptr(t, tc.ptr);
+ d = local_new(t, tc.i32);
+ t->load_imm(t, op_local(p, tc.ptr), 0x1000); /* p computed -> cached, dirty */
+ t->load(t, op_local(d, tc.i32), op_indirect(p, 0, tc.i32),
+ mem_scalar(tc.i32, 0));
+
+ EXPECT(count_event(&native, EV_LOAD) == 1,
+ "only the dereference loads; the cached base p is not reloaded");
+ {
+ int li = event_index(&native, EV_LOAD, 0);
+ EXPECT(li >= 0 && native.events[li].a == NATIVE_ADDR_BASE_REG,
+ "dereference addresses the live cache register for p");
+ }
+ EXPECT(count_event(&native, EV_STORE) == 1,
+ "p is not spilled; only the load result is written to d's home");
+ t->func_end(t);
+ tc_fini(&tc);
+}
+
+/* Design B: the cache survives across a store through a pointer. Neither the
+ * base p nor the stored value a is spilled/reloaded, and a later use of a hits
+ * the cache. Under Design A the store's flush_all would force both to memory. */
+static void test_b_cache_survives_store(void) {
+ TestCtx tc;
+ MockNative native;
+ CgTarget* t;
+ CGFuncDesc fd;
+ CGLocal p, a, b;
+ tc_init(&tc);
+ t = make_target(&tc, &native);
+ fd = fn_desc(&tc);
+ t->func_begin(t, &fd);
+ p = local_new_ptr(t, tc.ptr);
+ a = local_new(t, tc.i32);
+ b = local_new(t, tc.i32);
+ t->load_imm(t, op_local(p, tc.ptr), 0x2000); /* p cached */
+ t->load_imm(t, op_local(a, tc.i32), 5); /* a cached, dirty */
+ t->store(t, op_indirect(p, 0, tc.i32), op_local(a, tc.i32),
+ mem_scalar(tc.i32, 0));
+ t->binop(t, BO_IADD, op_local(b, tc.i32), op_local(a, tc.i32),
+ op_local(a, tc.i32)); /* a is a cache hit */
+
+ EXPECT(count_event(&native, EV_LOAD) == 0,
+ "no home reloads: p and a are read from registers across the store");
+ {
+ int si = event_index(&native, EV_STORE, 0);
+ EXPECT(si >= 0 && native.events[si].a == NATIVE_ADDR_BASE_REG,
+ "the store dereferences p from its live register");
+ }
+ EXPECT(count_event(&native, EV_STORE) == 1,
+ "only the user store is emitted; nothing is spilled");
+ t->func_end(t);
+ tc_fini(&tc);
+}
+
+/* A volatile access must still flush the cache (it may observe memory) and emit
+ * the volatile barrier — the escape argument does not apply. */
+static void test_b_volatile_load_flushes(void) {
+ TestCtx tc;
+ MockNative native;
+ CgTarget* t;
+ CGFuncDesc fd;
+ CGLocal p, a, d;
+ tc_init(&tc);
+ t = make_target(&tc, &native);
+ fd = fn_desc(&tc);
+ t->func_begin(t, &fd);
+ p = local_new_ptr(t, tc.ptr);
+ a = local_new(t, tc.i32);
+ d = local_new(t, tc.i32);
+ t->load_imm(t, op_local(p, tc.ptr), 0x3000); /* p cached, dirty */
+ t->load_imm(t, op_local(a, tc.i32), 9); /* a cached, dirty */
+ t->load(t, op_local(d, tc.i32), op_indirect(p, 0, tc.i32),
+ mem_scalar(tc.i32, MF_VOLATILE));
+
+ EXPECT((native.barrier_flags & NATIVE_DIRECT_BARRIER_VOLATILE) != 0,
+ "volatile access emits a volatile barrier");
+ EXPECT(count_event(&native, EV_STORE) >= 3, /* 2 spills + d's result store */
+ "volatile flush spills dirty p and a; the load result adds d's home store");
+ t->func_end(t);
+ tc_fini(&tc);
+}
+
+/* A call still flushes the whole cache: a dirty cached local is spilled before
+ * the call is emitted (caller-saved registers die across the call). */
+static void test_b_call_still_flushes(void) {
+ TestCtx tc;
+ MockNative native;
+ CgTarget* t;
+ CGFuncDesc fd;
+ CGLocal a, fnptr;
+ CGCallDesc call;
+ tc_init(&tc);
+ t = make_target(&tc, &native);
+ fd = fn_desc(&tc);
+ t->func_begin(t, &fd);
+ a = local_new(t, tc.i32);
+ fnptr = local_new_ptr(t, tc.ptr);
+ t->load_imm(t, op_local(a, tc.i32), 5); /* a cached, dirty */
+ t->load_imm(t, op_local(fnptr, tc.ptr), 0x1000);
+ memset(&call, 0, sizeof call);
+ call.fn_type = fd.fn_type;
+ call.callee = op_local(fnptr, tc.ptr);
+ t->call(t, &call);
+
+ {
+ int ci = event_index(&native, EV_EMIT_CALL, 0);
+ int si = event_index(&native, EV_STORE, 0);
+ EXPECT(ci >= 0 && si >= 0 && si < ci,
+ "dirty cached locals are spilled before the call is emitted");
+ }
+ t->func_end(t);
+ tc_fini(&tc);
+}
+
int main(void) {
test_frame_locals_scratch_storeback_and_branches();
test_call_barrier_storeback_and_max_outgoing();
+ test_b_cached_pointer_base_not_reloaded();
+ test_b_cache_survives_store();
+ test_b_volatile_load_flushes();
+ test_b_call_still_flushes();
if (g_fails) {
fprintf(stderr, "%d/%d checks failed\n", g_fails, g_checks);
return 1;