commit 2ebefab9c911e0a289223a435d8debe7c9240449
parent 607986aef63ad1b10404361bf82862b70cb8ba65
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Wed, 27 May 2026 10:01:00 -0700
opt: add write-back local register cache to NativeDirectTarget (Design A)
Cache scalar, non-address-taken locals in caller-saved allocable registers
on the shared -O0 direct path. Entries are created only by pure-compute
destinations (always dirty); reads hit a live entry or load to a scratch
temporary without creating one. nd_flush_all spills and empties the cache at
the top of every non-compute op, so the cache survives only across
straight-line runs of compute ops -- no aliasing, address-base-staleness, or
control-flow-merge reasoning is needed.
Caller-saved-only caching means no callee-save prologue work and the
conservative flush-before-call covers ABI clobbering. nd_scratch_acquire now
skips cached registers and evicts (spills) one under pressure.
Eliminates the load/compute/store memory round trip for temp/expression
chains on the aarch64 direct path. doc/NATIVE_DIRECT_CACHE.md also sketches
the planned Design B (escape-aware cache with cache-aware addressing).
Toy suite green at -O0: R 254/0, L 257/0, C 152/0/2.
Diffstat:
3 files changed, 398 insertions(+), 40 deletions(-)
diff --git a/doc/NATIVE_DIRECT_CACHE.md b/doc/NATIVE_DIRECT_CACHE.md
@@ -0,0 +1,166 @@
+# NativeDirectTarget local register cache
+
+`NativeDirectTarget` is the single-pass `-O0` lowering shared by the native
+backends (currently aarch64). The baseline lowers every semantic op to
+load-operands / compute / store-result against frame homes, so a local round
+trips through memory on every use. This document describes the local register
+cache that avoids those round trips. **Design A** is implemented; **Design B**
+is the planned follow-on.
+
+See also `doc/CGTARGET.md` ("local register cache") for the original sketch.
+
+## Invariants shared by both designs
+
+- **What is cached.** Only scalar locals (`size <= ptr_size`) that are neither
+ address-taken nor `memory_required`. Aggregates and escaped locals stay
+ frame-only. A cache access is keyed on the local's storage type; an access
+ under any other type bypasses (and flushes) the entry.
+- **Where it is cached.** Only **caller-saved allocable** registers
+ (`caller_saved_mask & allocable`). Consequences:
+ - No prologue/epilogue work — the direct path never reports clobbered
+ callee-saved registers (`reserve_callee_saves` is unused on this path), so
+ using callee-saved regs would be unsound. Caller-saved sidesteps it.
+ - The conservative flush before every call/barrier (which spills the whole
+ cache) fully covers ABI clobbering across calls.
+- **Basic-block scope.** Without CFG/liveness, the cache cannot survive a
+ control-flow edge or a join. Both designs spill+empty the cache at every
+ branch, label placement, and `ret`. `func_begin` starts with an empty cache.
+- **State.** `reg_owner[cls][reg]` names the local cached in a physical
+ register (or `CG_LOCAL_NONE`); `scratch_used[cls]` doubles as the
+ "pinned for the current instruction" mask; per-local `reg`/`cls`/`dirty` live
+ on `NativeDirectLocal`.
+
+## Design A — write-back, compute-only cache (implemented)
+
+The simplest correct form. Cache entries are created **only** by pure-compute
+destinations (`binop`, `unop`, `cmp`, `convert`, `load_imm`, `load_const`,
+scalar `copy`) and are therefore always dirty. A read returns a live entry
+(hit) or loads into a scratch temporary **without** creating an entry. The
+cache thus only survives across a straight-line run of pure-compute ops.
+
+`nd_flush_all` (spill all dirty entries to their homes, then empty) runs at the
+**top of every non-compute op**: control-flow/labels (block boundaries), calls
+and barriers (clobbering / observable memory), and every memory or address op
+(so frame homes are authoritative before an address base is read from a frame).
+
+This uniform flush is what makes Design A obviously correct: at the start of any
+op that could observe memory, alias a local, read an address base from a frame,
+or merge control flow, the cache is already empty and memory is authoritative.
+No escape analysis, alias reasoning, or address-base tracking is required.
+
+Key helpers (`src/cg/native_direct_target.c`):
+
+```
+nd_local_cacheable(d, l) scalar (0 < size <= ptr_size), !address_taken, !memory_required
+nd_cache_alloc(d, cls) free caller-saved allocable reg, else evict one
+nd_flush_local(d, local) store if dirty; drop entry
+nd_invalidate_local(d, local) drop entry without storing (store supersedes)
+nd_flush_all(d) flush every live entry
+nd_dst_reg / nd_dst_writeback compute-op result reg; mark dirty without storing
+```
+
+`nd_scratch_acquire` skips owned registers and, under pressure, evicts a
+non-pinned cached local (spilling it) to reuse its register as a temporary.
+
+**Wins:** expression/temp chains (`t1 = a+b; t2 = t1*c; t3 = t2+d`) never touch
+memory between definition and use. **Limits:** the cache collapses on every
+load/store/call, so memory-dense code degrades to the baseline. It never
+stores more than the baseline: a cached definition is written back at most
+once (at the next flush or eviction), versus always in the baseline.
+
+## Design B — escape-aware cache with cache-aware addressing (planned)
+
+Design B keeps the same block-local invariant but stops tearing the cache down
+on memory operations, by reasoning about escape and by making address
+construction consult the cache. It is strictly incremental over A: A with (a)
+the memory ops' `nd_flush_all` removed on escape grounds, (b) addressing taught
+to use live cache registers, and (c) a real spill victim selector.
+
+### 1. Escape-based aliasing replaces flush-on-memory-op
+
+Address-taken / `memory_required` locals are never cached. Therefore a pointer
+`load`/`store`/`copy_bytes`/`set_bytes`/`bitfield_*`/non-clobbering atomic can
+only alias an **escaped** local — which is never in a register. So these ops
+need **no value-cache flush at all** for aliasing correctness; they keep the
+cache live across the access.
+
+This is the central change: A flushes before a memory op to make the frame home
+current; B instead never lets a non-escaped local's home be aliased, so there is
+nothing to make current.
+
+### 2. Cache-aware addressing
+
+Today `nd_addr_storage` / `nd_addr_pointer` emit `NATIVE_ADDR_BASE_FRAME_VALUE`
+and `nd_addr_materialize` *loads* the pointer/index from the frame. When the
+base or index local is currently cached, B instead points the address at the
+live register:
+
+```
+building an INDIRECT/pointer address whose base/index local L is cached:
+ base_kind = NATIVE_ADDR_BASE_REG, base.reg = L->reg (pin for the op)
+ else:
+ NATIVE_ADDR_BASE_FRAME_VALUE (load from home, as today)
+...after the memory op: unpin the base/index regs; do NOT invalidate and do NOT
+ store even if dirty — the value was read from the live register, so the stale
+ home is irrelevant.
+```
+
+This simultaneously removes the reload and fixes the staleness hazard that A
+sidesteps by flushing. A dirty pointer local can serve as an address base
+directly from its register.
+
+### 3. Calls / barriers still flush + invalidate
+
+Caller-saved registers die across a call, and a memory-clobber barrier
+(volatile, atomic with clobber, inline asm with `memory`) may observe
+everything, so `nd_flush_all` still runs before `call`/`atomic`/volatile/`asm`.
+A future refinement could use a per-call save-set to shrink this; flush is the
+simple correct version.
+
+### 4. Branches / labels / ret still flush
+
+The block-local invariant is unchanged from A and is unavoidable without
+CFG/liveness.
+
+### 5. Address-taking is targeted, not global
+
+`addr_of` / `local_addr` on a cached local does `nd_flush_local(L)` and marks it
+`address_taken` (uncacheable thereafter) — a targeted flush, not `nd_flush_all`.
+
+### Extra bookkeeping B requires over A
+
+- **Read entries.** B benefits from caching read-only locals (loop variables,
+ reused operands), not just compute results. Read-created entries are clean
+ (not dirty). This reintroduces the hazard A avoids: an entry created during a
+ non-compute op that emits clobbering code (intrinsic/call) must not survive.
+ B must therefore either (i) confine read-entry creation to compute ops, or
+ (ii) invalidate (clean entries need no store) after any clobbering op. Option
+ (i) — only compute ops may create cache entries — is simplest and preserves
+ the "cache only grows across compute runs" property.
+- **Real eviction/spill.** Pressure is higher because values survive across
+ more ops, so "don't cache when full" is insufficient; `nd_cache_alloc` and
+ `nd_scratch_acquire` must pick a non-pinned, non-destination victim and
+ `nd_flush_local` it. A victim policy (e.g. round-robin or oldest) is needed;
+ pinned source/dst/address regs are never victims.
+- **Pin discipline through addressing.** Base/index cache regs must be pinned
+ for the duration of a memory op and unpinned (never invalidated) afterward,
+ alongside the existing source/dst/temp pins.
+- **INDIRECT compute operands.** Keep A's `nd_flush_operand_addr_locals` guard
+ for the rare compute op that receives an `OPK_INDIRECT` operand directly,
+ unless addressing in such operands is also made cache-aware.
+
+### Correctness rests on
+
+- The escape argument: non-escaped locals are never aliasable through a pointer,
+ so memory ops need not flush them.
+- The addressing intercept being exact: every frame-value base/index read of a
+ cached local must instead use the live register.
+- Clean read entries never surviving a clobbering op.
+- The block-local invariant (flush at every edge/label/ret) being complete.
+
+### Why B is worth it
+
+A wins only on arithmetic-dense straight-line code. B keeps the cache alive
+across loads, stores, and pointer-base reuse, so it wins on realistic mixed code
+— `p->a + p->b`, pointer-walking loops, struct-field math — while remaining
+strictly single-pass with no lookahead and no liveness.
diff --git a/src/cg/native_direct_target.c b/src/cg/native_direct_target.c
@@ -105,15 +105,20 @@ static NativeLoc nd_reg_loc(Reg reg, NativeAllocClass cls, CfreeCgTypeId type) {
return out;
}
+static void nd_flush_local(NativeDirectTarget* d, CGLocal local);
+
static Reg nd_scratch_acquire(NativeDirectTarget* d, NativeAllocClass cls) {
const NativeAllocClassInfo* ci = nd_class_info(d, cls);
const Reg* regs = ci->scratch;
u32 nregs = ci->nscratch;
+ /* Prefer a register that is neither pinned (scratch_used) nor caching a live
+ * local (reg_owner). */
for (u32 pass = 0; pass < 2u; ++pass) {
for (u32 i = 0; i < nregs; ++i) {
Reg r = regs[i];
if (r >= 32u) continue;
- if ((d->scratch_used[cls] & (1u << r)) == 0) {
+ if ((d->scratch_used[cls] & (1u << r)) == 0 &&
+ d->reg_owner[cls][r] == CG_LOCAL_NONE) {
d->scratch_used[cls] |= 1u << r;
return r;
}
@@ -121,6 +126,18 @@ static Reg nd_scratch_acquire(NativeDirectTarget* d, NativeAllocClass cls) {
regs = ci->allocable;
nregs = ci->nallocable;
}
+ /* Under pressure, evict a non-pinned cached local (spilling it to its home)
+ * and reuse its register as a scratch temporary. */
+ for (u32 i = 0; i < ci->nallocable; ++i) {
+ Reg r = ci->allocable[i];
+ if (r >= 32u) continue;
+ if ((d->scratch_used[cls] & (1u << r)) == 0 &&
+ d->reg_owner[cls][r] != CG_LOCAL_NONE) {
+ nd_flush_local(d, d->reg_owner[cls][r]);
+ d->scratch_used[cls] |= 1u << r;
+ return r;
+ }
+ }
nd_panic(d, "out of scratch registers");
}
@@ -405,6 +422,76 @@ static void nd_store_reg_to_frame(NativeDirectTarget* d, NativeFrameSlot frame,
static void nd_copy_to_reg(NativeDirectTarget* d, NativeLoc dst, NativeLoc src);
static void nd_release_materialized(NativeDirectTarget* d, NativeLoc loc);
+static void nd_store_operand_from_reg(NativeDirectTarget* d, Operand dst,
+ NativeLoc src);
+
+/* --- Local register cache (write-back, basic-block-scoped) ---------------- *
+ * Only scalar, non-address-taken locals are cached, and only in caller-saved
+ * allocable registers. Entries are created solely by pure-compute destinations
+ * (nd_dst_reg/nd_dst_writeback) and are always dirty; reads hit a live entry or
+ * fall back to a frame load without creating one. nd_flush_all spills and
+ * empties the cache at the top of every non-pure-compute op, so the cache only
+ * survives across straight-line runs of compute ops. Caching only caller-saved
+ * registers means that conservative flush fully covers ABI clobbering across
+ * calls, and no callee-save prologue/epilogue work is required. */
+
+static int nd_local_cacheable(NativeDirectTarget* d, const NativeDirectLocal* l) {
+  if (l->address_taken || l->memory_required || l->size == 0) return 0;
+  return l->size <= (u32)d->base.c->target.ptr_size; /* scalar-sized only */
+}
+
+/* Find a caller-saved allocable register to hold a cached local. An unowned,
+ * unpinned register is preferred; failing that, an unpinned cached local is
+ * flushed and its register reused. REG_NONE means use the frame-only path. */
+static Reg nd_cache_alloc(NativeDirectTarget* d, NativeAllocClass cls) {
+  const NativeAllocClassInfo* ci = nd_class_info(d, cls);
+  /* Pass 0 takes the first unowned register; pass 1 spills a victim. */
+  for (u32 pass = 0; pass < 2u; ++pass) {
+    for (u32 i = 0; i < ci->nallocable; ++i) {
+      Reg r = ci->allocable[i];
+      u32 bit;
+      CGLocal owner;
+      if (r >= 32u) continue;
+      bit = 1u << r;
+      if ((ci->caller_saved_mask & bit) == 0) continue; /* not caller-saved */
+      if ((d->scratch_used[cls] & bit) != 0) continue;  /* pinned */
+      owner = d->reg_owner[cls][r];
+      if ((owner == CG_LOCAL_NONE) != (pass == 0u)) continue; /* wrong pass */
+      if (owner != CG_LOCAL_NONE) nd_flush_local(d, owner);
+      return r;
+    }
+  }
+  return REG_NONE;
+}
+
+/* Evict `local` from the register cache: if the entry is dirty, store the
+ * register to the local's frame home; then drop the entry. No-op if uncached. */
+static void nd_flush_local(NativeDirectTarget* d, CGLocal local) {
+ NativeDirectLocal* l = nd_local(d, local);
+ if (l->reg == REG_NONE) return; /* not cached */
+ if (l->dirty)
+ nd_store_reg_to_frame(d, l->home, l->type,
+ nd_reg_loc(l->reg, (NativeAllocClass)l->cls, l->type));
+ d->reg_owner[l->cls][l->reg] = CG_LOCAL_NONE; /* register no longer owned */
+ l->reg = REG_NONE;
+ l->dirty = 0;
+}
+
+/* Forget `local`'s cache entry without storing it, for callers whose own store
+ * supersedes the cached value. Does nothing when `local` is not cached. */
+static void nd_invalidate_local(NativeDirectTarget* d, CGLocal local) {
+  NativeDirectLocal* ent = nd_local(d, local);
+  if (ent->reg == REG_NONE) return; /* nothing cached */
+  d->reg_owner[ent->cls][ent->reg] = CG_LOCAL_NONE;
+  ent->dirty = 0;
+  ent->reg = REG_NONE;
+}
+
+/* Write back every dirty cached local and leave the cache empty. */
+static void nd_flush_all(NativeDirectTarget* d) {
+ for (u32 i = 0; i < d->nlocals; ++i)
+ if (d->locals[i].reg != REG_NONE) nd_flush_local(d, i + 1u); /* id is i + 1 */
+}
static NativeAddr nd_addr_materialize(NativeDirectTarget* d, NativeAddr in,
NdAddrTemps* temps, MemAccess mem) {
@@ -582,8 +669,29 @@ static void nd_release_materialized(NativeDirectTarget* d, NativeLoc loc) {
nd_scratch_release(d, (NativeAllocClass)loc.cls, loc.v.reg);
}
+/* Spill cached locals that back an INDIRECT operand's address before it is read
+ * from their frame homes. Compute ops normally receive only LOCAL/IMM/GLOBAL
+ * operands; this keeps the rare INDIRECT case correct without flushing all. */
+static void nd_flush_operand_addr_locals(NativeDirectTarget* d, Operand op) {
+ if (op.kind != OPK_INDIRECT) return; /* no address locals to spill */
+ nd_flush_local(d, op.v.ind.base); /* NOTE(review): assumes base is always set */
+ if (op.v.ind.index != CG_LOCAL_NONE) nd_flush_local(d, op.v.ind.index);
+}
+
static NativeLoc nd_materialize_operand(NativeDirectTarget* d, Operand op) {
NativeAllocClass cls = nd_class_for_type(d, op.type);
+ if (op.kind == OPK_LOCAL) {
+ NativeDirectLocal* l = nd_local(d, op.v.local);
+ if (l->reg != REG_NONE && op.type == l->type && nd_local_cacheable(d, l)) {
+ /* Cache hit: pin and reuse the live register, no reload. */
+ d->scratch_used[l->cls] |= 1u << l->reg;
+ return nd_reg_loc(l->reg, (NativeAllocClass)l->cls, op.type);
+ }
+ /* A live entry accessed under a different type (or no longer cacheable) must
+ * reach memory before we bypass the cache for this access. */
+ if (l->reg != REG_NONE) nd_flush_local(d, op.v.local);
+ }
+ nd_flush_operand_addr_locals(d, op); /* only acts on OPK_INDIRECT operands */
return nd_materialize_loc(d, nd_loc_operand(d, op), cls, op.type);
}
@@ -593,6 +701,47 @@ static NativeLoc nd_dst_scratch(NativeDirectTarget* d, Operand dst) {
return nd_reg_loc(r, cls, dst.type);
}
+/* Choose the register a pure-compute op writes its result into. A cacheable
+ * local written at its storage type gets its cache register -- the live one,
+ * or a fresh one from nd_cache_alloc -- pinned for the instruction so that
+ * nd_dst_writeback can mark it dirty without storing. Every other destination
+ * gets a scratch temporary that nd_dst_writeback spills to the frame home. */
+static NativeLoc nd_dst_reg(NativeDirectTarget* d, Operand dst) {
+  NativeDirectLocal* l;
+  Reg r;
+  if (dst.kind != OPK_LOCAL) return nd_dst_scratch(d, dst);
+  l = nd_local(d, dst.v.local);
+  if (dst.type != l->type || !nd_local_cacheable(d, l))
+    return nd_dst_scratch(d, dst); /* foreign type or frame-only local */
+  r = l->reg;
+  if (r == REG_NONE) {
+    /* No entry yet: claim a register and record the new owner. */
+    r = nd_cache_alloc(d, (NativeAllocClass)l->cls);
+    if (r == REG_NONE) return nd_dst_scratch(d, dst); /* none available */
+    d->reg_owner[l->cls][r] = dst.v.local;
+    l->reg = r;
+  }
+  /* Pin so later allocations in this instruction cannot take the register. */
+  d->scratch_used[l->cls] |= 1u << r;
+  return nd_reg_loc(r, (NativeAllocClass)l->cls, dst.type);
+}
+
+static void nd_dst_writeback(NativeDirectTarget* d, Operand dst, NativeLoc dr) {
+  if (dst.kind == OPK_LOCAL) {
+    NativeDirectLocal* l = nd_local(d, dst.v.local);
+    if (dr.kind == NATIVE_LOC_REG && l->reg == dr.v.reg && dst.type == l->type &&
+        nd_local_cacheable(d, l)) {
+      l->dirty = 1; /* result stays in the cache register; home is stale */
+      d->scratch_used[l->cls] &= ~(1u << dr.v.reg); /* unpin, keep cached */
+      return;
+    }
+    if (dst.type != l->type) nd_flush_local(d, dst.v.local); /* may be partial */
+    else nd_invalidate_local(d, dst.v.local); /* store supersedes the entry */
+  }
+  nd_store_operand_from_reg(d, dst, dr);
+  nd_release_materialized(d, dr);
+}
+
static void nd_store_operand_from_reg(NativeDirectTarget* d, Operand dst,
NativeLoc src) {
if (dst.kind != OPK_LOCAL) nd_panic(d, "destination is not a semantic local");
@@ -607,6 +756,7 @@ static void nd_func_begin(CgTarget* t, const CGFuncDesc* fd) {
d->nscopes = 0;
d->max_outgoing = 0;
memset(d->scratch_used, 0, sizeof d->scratch_used);
+ memset(d->reg_owner, 0, sizeof d->reg_owner);
if (d->ops && d->ops->func_begin) d->ops->func_begin(d, fd);
if (d->native && d->native->func_begin) d->native->func_begin(d->native, fd);
}
@@ -642,6 +792,7 @@ static void nd_local_addr(CgTarget* t, Operand dst, const CGLocalDesc* desc,
NativeDirectLocal* l = nd_local(d, local);
Operand lv;
(void)desc;
+ nd_flush_all(d);
l->address_taken = 1;
l->flags |= CG_LOCAL_ADDR_TAKEN;
memset(&lv, 0, sizeof lv);
@@ -684,12 +835,14 @@ static Label nd_label_new(CgTarget* t) { return nd_label_new_raw(nd_of(t)); }
static void nd_label_place(CgTarget* t, Label label) {
NativeDirectTarget* d = nd_of(t);
+ nd_flush_all(d); /* block boundary: nothing stays cached across a label */
ND_REQUIRE_NATIVE(d, label_place, "target does not place labels");
d->native->label_place(d->native, nd_mc_label(d, label));
}
static void nd_jump(CgTarget* t, Label label) {
NativeDirectTarget* d = nd_of(t);
+ nd_flush_all(d); /* spill cached locals before control leaves the block */
ND_REQUIRE_NATIVE(d, jump, "target does not emit jumps");
d->native->jump(d->native, nd_mc_label(d, label));
}
@@ -697,8 +850,10 @@ static void nd_jump(CgTarget* t, Label label) {
static void nd_cmp_branch(CgTarget* t, CmpOp op, Operand a, Operand b,
Label label) {
NativeDirectTarget* d = nd_of(t);
- NativeLoc ar = nd_materialize_operand(d, a);
- NativeLoc br = nd_materialize_operand(d, b);
+ NativeLoc ar, br;
+ nd_flush_all(d);
+ ar = nd_materialize_operand(d, a);
+ br = nd_materialize_operand(d, b);
ND_REQUIRE_NATIVE(d, cmp_branch, "target does not emit compare branches");
d->native->cmp_branch(d->native, op, ar, br, nd_mc_label(d, label));
nd_release_materialized(d, br);
@@ -706,6 +861,7 @@ static void nd_cmp_branch(CgTarget* t, CmpOp op, Operand a, Operand b,
}
static void nd_switch(CgTarget* t, const CGSwitchDesc* desc) {
+ nd_flush_all(nd_of(t)); /* empty the cache before the switch is lowered */
cg_lower_switch_default(t, desc);
}
@@ -713,7 +869,9 @@ static void nd_indirect_branch(CgTarget* t, Operand addr,
const Label* valid_targets, u32 ntargets) {
NativeDirectTarget* d = nd_of(t);
MCLabel* native_targets;
- NativeLoc addr_reg = nd_materialize_operand(d, addr);
+ NativeLoc addr_reg;
+ nd_flush_all(d);
+ addr_reg = nd_materialize_operand(d, addr);
ND_REQUIRE_NATIVE(d, indirect_branch,
"target does not emit indirect branches");
native_targets = ntargets ? nd_arena(d, sizeof(*native_targets) * ntargets,
@@ -727,7 +885,9 @@ static void nd_indirect_branch(CgTarget* t, Operand addr,
static void nd_load_label_addr(CgTarget* t, Operand dst, Label label) {
NativeDirectTarget* d = nd_of(t);
- NativeLoc reg = nd_dst_scratch(d, dst);
+ NativeLoc reg;
+ nd_flush_all(d);
+ reg = nd_dst_scratch(d, dst);
ND_REQUIRE_NATIVE(d, load_label_addr,
"target does not materialize label addresses");
d->native->load_label_addr(d->native, reg, nd_mc_label(d, label));
@@ -888,20 +1048,18 @@ static void nd_continue_to(CgTarget* t, CGScope scope) {
static void nd_load_imm(CgTarget* t, Operand dst, i64 imm) {
NativeDirectTarget* d = nd_of(t);
- NativeLoc reg = nd_dst_scratch(d, dst);
+ NativeLoc reg = nd_dst_reg(d, dst); /* cache register, else a scratch */
ND_REQUIRE_NATIVE(d, load_imm, "target does not emit immediates");
d->native->load_imm(d->native, reg, imm);
- nd_store_operand_from_reg(d, dst, reg);
- nd_release_materialized(d, reg);
+ nd_dst_writeback(d, dst, reg); /* mark dirty, or store and release */
}
static void nd_load_const(CgTarget* t, Operand dst, ConstBytes cbytes) {
NativeDirectTarget* d = nd_of(t);
- NativeLoc reg = nd_dst_scratch(d, dst);
+ NativeLoc reg = nd_dst_reg(d, dst); /* cache register, else a scratch */
ND_REQUIRE_NATIVE(d, load_const, "target does not emit byte constants");
d->native->load_const(d->native, reg, cbytes);
- nd_store_operand_from_reg(d, dst, reg);
- nd_release_materialized(d, reg);
+ nd_dst_writeback(d, dst, reg); /* mark dirty, or store and release */
}
static void nd_copy(CgTarget* t, Operand dst, Operand src) {
@@ -910,6 +1068,7 @@ static void nd_copy(CgTarget* t, Operand dst, Operand src) {
if (size > (u64)t->c->target.ptr_size) {
NdAddrTemps dt, st;
AggregateAccess access;
+ nd_flush_all(d);
memset(&access, 0, sizeof access);
access.type = dst.type;
access.size = (u32)size;
@@ -929,7 +1088,9 @@ static void nd_copy(CgTarget* t, Operand dst, Operand src) {
return;
}
NativeLoc val = nd_materialize_operand(d, src);
- nd_store_operand_from_reg(d, dst, val);
+ NativeLoc dr = nd_dst_reg(d, dst);
+ nd_copy_to_reg(d, dr, val);
+ nd_dst_writeback(d, dst, dr);
nd_release_materialized(d, val);
}
@@ -938,6 +1099,7 @@ static void nd_load(CgTarget* t, Operand dst, Operand addr, MemAccess mem) {
NdAddrTemps temps;
u64 size =
mem.size ? mem.size : (mem.type ? cg_type_size(t->c, mem.type) : 0);
+ nd_flush_all(d);
if (mem.flags & MF_VOLATILE)
nd_barrier(d,
NATIVE_DIRECT_BARRIER_MEMORY | NATIVE_DIRECT_BARRIER_VOLATILE);
@@ -971,6 +1133,7 @@ static void nd_store(CgTarget* t, Operand addr, Operand src, MemAccess mem) {
NdAddrTemps temps;
u64 size =
mem.size ? mem.size : (mem.type ? cg_type_size(t->c, mem.type) : 0);
+ nd_flush_all(d);
if (mem.flags & MF_VOLATILE)
nd_barrier(d,
NATIVE_DIRECT_BARRIER_MEMORY | NATIVE_DIRECT_BARRIER_VOLATILE);
@@ -1003,8 +1166,9 @@ static void nd_addr_of(CgTarget* t, Operand dst, Operand lv) {
NdAddrTemps temps;
MemAccess mem = nd_scalar_mem(dst.type, d->base.c->target.ptr_size,
d->base.c->target.ptr_align);
- NativeAddr naddr =
- nd_addr_materialize(d, nd_addr_storage(d, lv), &temps, mem);
+ NativeAddr naddr;
+ nd_flush_all(d);
+ naddr = nd_addr_materialize(d, nd_addr_storage(d, lv), &temps, mem);
NativeLoc reg = nd_dst_scratch(d, dst);
ND_REQUIRE_NATIVE(d, load_addr, "target does not materialize addresses");
d->native->load_addr(d->native, reg, naddr);
@@ -1015,7 +1179,9 @@ static void nd_addr_of(CgTarget* t, Operand dst, Operand lv) {
static void nd_tls_addr_of(CgTarget* t, Operand dst, ObjSymId sym, i64 addend) {
NativeDirectTarget* d = nd_of(t);
- NativeLoc reg = nd_dst_scratch(d, dst);
+ NativeLoc reg;
+ nd_flush_all(d);
+ reg = nd_dst_scratch(d, dst);
ND_REQUIRE_NATIVE(d, tls_addr_of,
"target does not materialize TLS addresses");
d->native->tls_addr_of(d->native, reg, sym, addend);
@@ -1027,8 +1193,9 @@ static void nd_copy_bytes(CgTarget* t, Operand dst_addr, Operand src_addr,
AggregateAccess access) {
NativeDirectTarget* d = nd_of(t);
NdAddrTemps dt, st;
- NativeAddr dst =
- nd_addr_materialize(d, nd_addr_pointer(d, dst_addr), &dt, access.mem);
+ NativeAddr dst;
+ nd_flush_all(d);
+ dst = nd_addr_materialize(d, nd_addr_pointer(d, dst_addr), &dt, access.mem);
NativeAddr src =
nd_addr_materialize(d, nd_addr_pointer(d, src_addr), &st, access.mem);
ND_REQUIRE_NATIVE(d, copy_bytes, "target does not copy bytes");
@@ -1041,9 +1208,11 @@ static void nd_set_bytes(CgTarget* t, Operand dst_addr, Operand byte_value,
AggregateAccess access) {
NativeDirectTarget* d = nd_of(t);
NdAddrTemps temps;
- NativeAddr dst =
- nd_addr_materialize(d, nd_addr_pointer(d, dst_addr), &temps, access.mem);
- NativeLoc byte = nd_materialize_operand(d, byte_value);
+ NativeAddr dst;
+ NativeLoc byte;
+ nd_flush_all(d);
+ dst = nd_addr_materialize(d, nd_addr_pointer(d, dst_addr), &temps, access.mem);
+ byte = nd_materialize_operand(d, byte_value);
ND_REQUIRE_NATIVE(d, set_bytes, "target does not set bytes");
d->native->set_bytes(d->native, dst, byte, access);
nd_release_materialized(d, byte);
@@ -1054,9 +1223,12 @@ static void nd_bitfield_load(CgTarget* t, Operand dst, Operand record_addr,
BitFieldAccess access) {
NativeDirectTarget* d = nd_of(t);
NdAddrTemps temps;
- NativeAddr addr = nd_addr_materialize(d, nd_addr_storage(d, record_addr),
- &temps, access.storage);
- NativeLoc reg = nd_dst_scratch(d, dst);
+ NativeAddr addr;
+ NativeLoc reg;
+ nd_flush_all(d);
+ addr = nd_addr_materialize(d, nd_addr_storage(d, record_addr), &temps,
+ access.storage);
+ reg = nd_dst_scratch(d, dst);
ND_REQUIRE_NATIVE(d, bitfield_load, "target does not load bitfields");
d->native->bitfield_load(d->native, reg, addr, access);
nd_store_operand_from_reg(d, dst, reg);
@@ -1068,9 +1240,12 @@ static void nd_bitfield_store(CgTarget* t, Operand record_addr, Operand src,
BitFieldAccess access) {
NativeDirectTarget* d = nd_of(t);
NdAddrTemps temps;
- NativeAddr addr = nd_addr_materialize(d, nd_addr_storage(d, record_addr),
- &temps, access.storage);
- NativeLoc val = nd_materialize_operand(d, src);
+ NativeAddr addr;
+ NativeLoc val;
+ nd_flush_all(d);
+ addr = nd_addr_materialize(d, nd_addr_storage(d, record_addr), &temps,
+ access.storage);
+ val = nd_materialize_operand(d, src);
ND_REQUIRE_NATIVE(d, bitfield_store, "target does not store bitfields");
d->native->bitfield_store(d->native, addr, val, access);
nd_release_materialized(d, val);
@@ -1081,11 +1256,10 @@ static void nd_binop(CgTarget* t, BinOp op, Operand dst, Operand a, Operand b) {
NativeDirectTarget* d = nd_of(t);
NativeLoc ar = nd_materialize_operand(d, a);
NativeLoc br = nd_materialize_operand(d, b);
- NativeLoc dr = nd_dst_scratch(d, dst);
+ NativeLoc dr = nd_dst_reg(d, dst);
ND_REQUIRE_NATIVE(d, binop, "target does not emit binary ops");
d->native->binop(d->native, op, dr, ar, br);
- nd_store_operand_from_reg(d, dst, dr);
- nd_release_materialized(d, dr);
+ nd_dst_writeback(d, dst, dr);
nd_release_materialized(d, br);
nd_release_materialized(d, ar);
}
@@ -1093,11 +1267,10 @@ static void nd_binop(CgTarget* t, BinOp op, Operand dst, Operand a, Operand b) {
static void nd_unop(CgTarget* t, UnOp op, Operand dst, Operand a) {
NativeDirectTarget* d = nd_of(t);
NativeLoc ar = nd_materialize_operand(d, a);
- NativeLoc dr = nd_dst_scratch(d, dst);
+ NativeLoc dr = nd_dst_reg(d, dst); /* cache register, else a scratch */
ND_REQUIRE_NATIVE(d, unop, "target does not emit unary ops");
d->native->unop(d->native, op, dr, ar);
- nd_store_operand_from_reg(d, dst, dr);
- nd_release_materialized(d, dr);
+ nd_dst_writeback(d, dst, dr); /* mark dirty, or store and release */
nd_release_materialized(d, ar);
}
@@ -1105,11 +1278,10 @@ static void nd_cmp(CgTarget* t, CmpOp op, Operand dst, Operand a, Operand b) {
NativeDirectTarget* d = nd_of(t);
NativeLoc ar = nd_materialize_operand(d, a);
NativeLoc br = nd_materialize_operand(d, b);
- NativeLoc dr = nd_dst_scratch(d, dst);
+ NativeLoc dr = nd_dst_reg(d, dst);
ND_REQUIRE_NATIVE(d, cmp, "target does not emit compares");
d->native->cmp(d->native, op, dr, ar, br);
- nd_store_operand_from_reg(d, dst, dr);
- nd_release_materialized(d, dr);
+ nd_dst_writeback(d, dst, dr);
nd_release_materialized(d, br);
nd_release_materialized(d, ar);
}
@@ -1117,11 +1289,10 @@ static void nd_cmp(CgTarget* t, CmpOp op, Operand dst, Operand a, Operand b) {
static void nd_convert(CgTarget* t, ConvKind op, Operand dst, Operand src) {
NativeDirectTarget* d = nd_of(t);
NativeLoc sr = nd_materialize_operand(d, src);
- NativeLoc dr = nd_dst_scratch(d, dst);
+ NativeLoc dr = nd_dst_reg(d, dst); /* cache register, else a scratch */
ND_REQUIRE_NATIVE(d, convert, "target does not emit converts");
d->native->convert(d->native, op, dr, sr);
- nd_store_operand_from_reg(d, dst, dr);
- nd_release_materialized(d, dr);
+ nd_dst_writeback(d, dst, dr); /* mark dirty, or store and release */
nd_release_materialized(d, sr);
}
@@ -1132,6 +1303,7 @@ static void nd_call(CgTarget* t, const CGCallDesc* desc) {
NativeLoc* args;
NativeLoc* results;
int release_callee = 0;
+ nd_flush_all(d);
nd_barrier(d, NATIVE_DIRECT_BARRIER_CALL | NATIVE_DIRECT_BARRIER_MEMORY);
memset(&plan, 0, sizeof plan);
memset(&nd, 0, sizeof nd);
@@ -1196,6 +1368,7 @@ static void nd_ret(CgTarget* t, const CGLocal* values, u32 nvalues) {
NativeLoc* locs = NULL;
NativeCallPlanRet* rets = NULL;
u32 nrets = 0;
+ nd_flush_all(d);
if (d->ops && d->ops->emit_ret) {
d->ops->emit_ret(d, values, nvalues);
return;
@@ -1213,8 +1386,10 @@ static void nd_ret(CgTarget* t, const CGLocal* values, u32 nvalues) {
static void nd_alloca(CgTarget* t, Operand dst, Operand size, u32 align) {
NativeDirectTarget* d = nd_of(t);
- NativeLoc sr = nd_materialize_operand(d, size);
- NativeLoc dr = nd_dst_scratch(d, dst);
+ NativeLoc sr, dr;
+ nd_flush_all(d);
+ sr = nd_materialize_operand(d, size);
+ dr = nd_dst_scratch(d, dst);
ND_REQUIRE_NATIVE(d, alloca_, "target does not emit alloca");
d->native->alloca_(d->native, dr, sr, align);
nd_store_operand_from_reg(d, dst, dr);
@@ -1224,6 +1399,7 @@ static void nd_alloca(CgTarget* t, Operand dst, Operand size, u32 align) {
static void nd_va_start(CgTarget* t, Operand ap_addr) {
NativeDirectTarget* d = nd_of(t);
+ nd_flush_all(d);
if (!d->ops || !d->ops->va_start_)
nd_panic(d, "target does not emit va_start");
d->ops->va_start_(d, ap_addr);
@@ -1232,18 +1408,21 @@ static void nd_va_start(CgTarget* t, Operand ap_addr) {
static void nd_va_arg(CgTarget* t, Operand dst, Operand ap_addr,
CfreeCgTypeId type) {
NativeDirectTarget* d = nd_of(t);
+ nd_flush_all(d); /* not a pure-compute op: spill and empty the cache first */
if (!d->ops || !d->ops->va_arg_) nd_panic(d, "target does not emit va_arg");
d->ops->va_arg_(d, dst, ap_addr, type);
}
static void nd_va_end(CgTarget* t, Operand ap_addr) {
NativeDirectTarget* d = nd_of(t);
+ nd_flush_all(d); /* not a pure-compute op: spill and empty the cache first */
if (!d->ops || !d->ops->va_end_) nd_panic(d, "target does not emit va_end");
d->ops->va_end_(d, ap_addr);
}
static void nd_va_copy(CgTarget* t, Operand dst_ap_addr, Operand src_ap_addr) {
NativeDirectTarget* d = nd_of(t);
+ nd_flush_all(d); /* not a pure-compute op: spill and empty the cache first */
if (!d->ops || !d->ops->va_copy_) nd_panic(d, "target does not emit va_copy");
d->ops->va_copy_(d, dst_ap_addr, src_ap_addr);
}
@@ -1252,6 +1431,7 @@ static void nd_atomic_load(CgTarget* t, Operand dst, Operand addr,
MemAccess mem, MemOrder order) {
NativeDirectTarget* d = nd_of(t);
NdAddrTemps temps;
+ nd_flush_all(d);
nd_barrier(d, NATIVE_DIRECT_BARRIER_MEMORY | NATIVE_DIRECT_BARRIER_ATOMIC);
NativeAddr naddr =
nd_addr_materialize(d, nd_addr_pointer(d, addr), &temps, mem);
@@ -1267,6 +1447,7 @@ static void nd_atomic_store(CgTarget* t, Operand addr, Operand src,
MemAccess mem, MemOrder order) {
NativeDirectTarget* d = nd_of(t);
NdAddrTemps temps;
+ nd_flush_all(d);
nd_barrier(d, NATIVE_DIRECT_BARRIER_MEMORY | NATIVE_DIRECT_BARRIER_ATOMIC);
NativeAddr naddr =
nd_addr_materialize(d, nd_addr_pointer(d, addr), &temps, mem);
@@ -1281,6 +1462,7 @@ static void nd_atomic_rmw(CgTarget* t, AtomicOp op, Operand dst, Operand addr,
Operand val, MemAccess mem, MemOrder order) {
NativeDirectTarget* d = nd_of(t);
NdAddrTemps temps;
+ nd_flush_all(d);
nd_barrier(d, NATIVE_DIRECT_BARRIER_MEMORY | NATIVE_DIRECT_BARRIER_ATOMIC);
NativeAddr naddr =
nd_addr_materialize(d, nd_addr_pointer(d, addr), &temps, mem);
@@ -1299,6 +1481,7 @@ static void nd_atomic_cas(CgTarget* t, Operand prior, Operand ok, Operand addr,
MemOrder success, MemOrder failure) {
NativeDirectTarget* d = nd_of(t);
NdAddrTemps temps;
+ nd_flush_all(d);
nd_barrier(d, NATIVE_DIRECT_BARRIER_MEMORY | NATIVE_DIRECT_BARRIER_ATOMIC);
NativeAddr naddr =
nd_addr_materialize(d, nd_addr_pointer(d, addr), &temps, mem);
@@ -1321,6 +1504,7 @@ static void nd_atomic_cas(CgTarget* t, Operand prior, Operand ok, Operand addr,
static void nd_fence(CgTarget* t, MemOrder order) {
NativeDirectTarget* d = nd_of(t);
+ nd_flush_all(d); /* barrier: spill and empty the cache before the fence */
ND_REQUIRE_NATIVE(d, fence, "target does not emit fences");
d->native->fence(d->native, order);
}
@@ -1332,6 +1516,7 @@ static void nd_intrinsic(CgTarget* t, IntrinKind kind, Operand* dsts, u32 ndst,
ndst ? nd_arena(d, sizeof(*ndsts) * ndst, _Alignof(NativeLoc)) : NULL;
NativeLoc* nargs =
narg ? nd_arena(d, sizeof(*nargs) * narg, _Alignof(NativeLoc)) : NULL;
+ nd_flush_all(d);
ND_REQUIRE_NATIVE(d, intrinsic, "target does not emit compiler intrinsics");
for (u32 i = 0; i < ndst; ++i) ndsts[i] = nd_dst_scratch(d, dsts[i]);
for (u32 i = 0; i < narg; ++i) {
@@ -1352,6 +1537,7 @@ static void nd_asm_block(CgTarget* t, const char* tmpl,
const Operand* in_ops, const Sym* clobbers,
u32 nclob) {
NativeDirectTarget* d = nd_of(t);
+ nd_flush_all(d);
nd_barrier(d,
NATIVE_DIRECT_BARRIER_INLINE_ASM | NATIVE_DIRECT_BARRIER_MEMORY);
if (d->ops && d->ops->asm_block) {
diff --git a/src/cg/native_direct_target.h b/src/cg/native_direct_target.h
@@ -120,6 +120,12 @@ struct NativeDirectTarget {
u32 scopes_cap;
u32 scratch_used[3];
+ /* Local register cache (write-back, basic-block-scoped). reg_owner[cls][reg]
+ * names the semantic local currently cached in that physical register, or
+ * CG_LOCAL_NONE. scratch_used doubles as the per-class "pinned for the current
+ * instruction" mask. Per-local cache state (reg/cls/dirty) lives on
+ * NativeDirectLocal. See doc/NATIVE_DIRECT_CACHE.md and doc/CGTARGET.md. */
+ CGLocal reg_owner[3][32];
u32 max_outgoing;
ObjSecId local_static_sec;