opt: record-replay wrapper + tape printer + constfold (phases 0-2) - kit

commit cc3abed6052eb7fdcc58fc6ccba94bbd8fc8a5ff
parent 9747c24c5a4aa056f0f2c3d89498be8edb971c47
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Sun, 10 May 2026 05:23:34 -0700

opt: record-replay wrapper + tape printer + constfold (phases 0-2)

Implements doc/OPT.md phases 0-2. opt_cgtarget_new now returns a
CGTarget that records each function as a tape of CGTarget calls,
hands out wrapper-local virtual ids for alloc_reg/frame_slot/
label_new/scope_begin, and replays the tape onto the wrapped target
at func_end (with vid -> target-id translation). A simple integer
constfold peephole rewrites LOAD_IMM+LOAD_IMM+BINOP{IADD,ISUB,IMUL}
to a single LOAD_IMM, chaining transitively. opt_set_dump_writer
exposes a textual tape dump for debugging, wired through cg-runner
--opt-level N --dump-tape NAME. The cg corpus now runs at every
level in CFREE_OPT_LEVELS (default "0 1") so D/R/E/J equivalence is
checked end-to-end; W stays at level 0.

Diffstat:
M src/api/stubs.c  | 10 +---------
A src/opt/opt.c  | 1875 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/opt/opt.h  | 6 ++++++
M test/cg/harness/cg_runner.c  | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
M test/cg/run.sh  | 312 +++++++++++++++++++++++++++++++++++++++++++-------------------------------------

5 files changed, 2164 insertions(+), 158 deletions(-)
diff --git a/src/api/stubs.c b/src/api/stubs.c
@@ -54,15 +54,7 @@ void parse_asm(Compiler* c, Lexer* l, MCEmitter* m) {
  * cgtarget_new / cgtarget_finalize / cgtarget_free live in src/arch/<target>.c
  * (dispatched through src/arch/arch.c). */
 
-/* ============================================================
- * Optimizer
- * ============================================================ */
-
-CGTarget* opt_cgtarget_new(Compiler* c, CGTarget* t, int level) {
-  (void)t;
-  (void)level;
-  unimplemented(c, "opt");
-}
+/* Optimizer (opt_cgtarget_new) lives in src/opt/opt.c. */
 
 /* Debug info producer lives in src/debug/. */
 
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -0,0 +1,1875 @@
+/* opt — CGTarget wrapper that records each function as a tape of
+ * CGTarget calls, then replays them onto the wrapped target on
+ * func_end. See doc/OPT.md for the phased plan.
+ *
+ * Phase 1 (current): record every emit-side call into a per-function
+ *   tape; alloc_reg / frame_slot / label_new / scope_begin hand out
+ *   wrapper-local virtual ids. On func_end the tape is replayed
+ *   linearly: each live entry produces exactly one wrapped target call,
+ *   with virtual ids translated to target-side ids on the fly. This
+ *   preserves doc/DESIGN.md §8's "function-at-a-time" streaming
+ *   guarantee at -O1.
+ *
+ * Phase 2 (current): a small, safe peephole pass runs over the tape
+ *   between recording and replay. See try_peephole_constfold.
+ *
+ * Phase 3+ (deferred): build CFG and SSA from the tape, run
+ *   intra-procedural passes, lower through machinize → regalloc →
+ *   emit. Until that lands, level 2 is functionally identical to
+ *   level 1 (per-function record + replay).
+ *
+ * Methods the wrapper rejects or ignores:
+ *   - clobbers / spill_reg / reload_reg are CG -O0 register-pressure
+ *     mechanics. CG never invokes them on real backends in v1, and
+ *     they're meaningless for opt's vreg space — calling them is a
+ *     wiring bug, so we panic loudly.
+ *   - asm_block is not supported by the wrapper yet and panics too.
+ *   - free_reg is documented as a hint and is silently ignored. */
+
+#include "opt/opt.h"
+
+#include <string.h>
+
+#include "core/arena.h"
+#include "core/core.h"
+
+/* ---- tape op tags ---- */
+
+/* One tag per recordable CGTarget call. The tag is stored in
+ * TapeEntry.op (a u8) and selects which member of TapeEntry.u is
+ * meaningful for that entry. */
+typedef enum {
+  TOP_FUNC_BEGIN,
+  TOP_FUNC_END,
+  TOP_ALLOC_REG,
+  TOP_FRAME_SLOT,
+  TOP_PARAM,
+  TOP_LABEL_NEW,
+  TOP_LABEL_PLACE,
+  TOP_JUMP,
+  TOP_CMP_BRANCH,
+  TOP_SCOPE_BEGIN,
+  TOP_SCOPE_ELSE,
+  TOP_SCOPE_END,
+  TOP_BREAK_TO,
+  TOP_CONTINUE_TO,
+  TOP_LOAD_IMM,
+  TOP_LOAD_CONST,
+  TOP_COPY,
+  TOP_LOAD,
+  TOP_STORE,
+  TOP_ADDR_OF,
+  TOP_TLS_ADDR_OF,
+  TOP_COPY_BYTES,
+  TOP_SET_BYTES,
+  TOP_BITFIELD_LOAD,
+  TOP_BITFIELD_STORE,
+  TOP_BINOP,
+  TOP_UNOP,
+  TOP_CMP,
+  TOP_CONVERT,
+  TOP_CALL,
+  TOP_RET,
+  TOP_ALLOCA,
+  TOP_VA_START,
+  TOP_VA_ARG,
+  TOP_VA_END,
+  TOP_VA_COPY,
+  TOP_SETJMP,
+  TOP_LONGJMP,
+  TOP_ATOMIC_LOAD,
+  TOP_ATOMIC_STORE,
+  TOP_ATOMIC_RMW,
+  TOP_ATOMIC_CAS,
+  TOP_FENCE,
+  TOP_INTRINSIC,
+  TOP_SET_LOC,
+} TapeOpKind;
+
+/* TapeEntry: one recorded CGTarget call. The tagged union is wide; we
+ * pay arena bytes for clarity. `op` holds a TapeOpKind and names the
+ * union member in use; the comment on each member lists the tag(s)
+ * that store their payload there. */
+typedef struct TapeEntry {
+  u8 op;       /* TapeOpKind */
+  u8 dead;     /* set by peepholes; replay skips dead entries */
+  u16 padding;
+  SrcLoc loc;  /* pending source location at the time of recording */
+  union {
+    /* TOP_FUNC_BEGIN: deep-copied descriptor. The caller's CGFuncDesc
+     * may be stack-allocated, so we copy by value into our arena.
+     * params[] is also copied; field shapes inside (Type*, ABIArgInfo*,
+     * incoming pointer) are TU-lifetime and shared. */
+    struct {
+      CGFuncDesc desc;
+      CGParamDesc* params; /* arena copy of fd.params */
+    } func_begin;
+
+    /* TOP_ALLOC_REG: returns a vreg, indexed into reg_map at replay. */
+    struct {
+      RegClass cls;
+      const Type* ty;
+      Reg vreg;
+    } alloc_reg;
+
+    /* TOP_FRAME_SLOT */
+    struct {
+      FrameSlotDesc desc;
+      FrameSlot vslot;
+    } frame_slot;
+
+    /* TOP_PARAM */
+    struct {
+      CGParamDesc desc;
+    } param;
+
+    /* TOP_LABEL_NEW */
+    struct {
+      Label vlabel;
+    } label_new;
+
+    /* TOP_LABEL_PLACE / TOP_JUMP */
+    struct {
+      Label vlabel;
+    } label_op;
+
+    /* TOP_CMP_BRANCH */
+    struct {
+      CmpOp op;
+      Operand a, b;
+      Label vlabel;
+    } cmp_branch;
+
+    /* TOP_SCOPE_BEGIN */
+    struct {
+      CGScopeDesc desc;
+      CGScope vscope;
+    } scope_begin;
+
+    /* TOP_SCOPE_ELSE / TOP_SCOPE_END / TOP_BREAK_TO / TOP_CONTINUE_TO */
+    struct {
+      CGScope vscope;
+    } scope_op;
+
+    /* TOP_LOAD_IMM */
+    struct {
+      Operand dst;
+      i64 imm;
+    } load_imm;
+
+    /* TOP_LOAD_CONST */
+    struct {
+      Operand dst;
+      ConstBytes cb;
+    } load_const;
+
+    /* TOP_COPY / TOP_ADDR_OF / TOP_VA_COPY */
+    struct {
+      Operand dst;
+      Operand src;
+    } copy;
+
+    /* TOP_LOAD */
+    struct {
+      Operand dst;
+      Operand addr;
+      MemAccess mem;
+    } load;
+
+    /* TOP_STORE */
+    struct {
+      Operand addr;
+      Operand src;
+      MemAccess mem;
+    } store;
+
+    /* TOP_TLS_ADDR_OF */
+    struct {
+      Operand dst;
+      ObjSymId sym;
+      i64 addend;
+    } tls_addr_of;
+
+    /* TOP_COPY_BYTES / TOP_SET_BYTES: a is the destination; b is the
+     * source (copy) or the fill byte (set). */
+    struct {
+      Operand a;
+      Operand b;
+      AggregateAccess agg;
+    } agg;
+
+    /* TOP_BITFIELD_LOAD */
+    struct {
+      Operand dst;
+      Operand record;
+      BitFieldAccess bf;
+    } bitfield_load;
+
+    /* TOP_BITFIELD_STORE */
+    struct {
+      Operand record;
+      Operand src;
+      BitFieldAccess bf;
+    } bitfield_store;
+
+    /* TOP_BINOP */
+    struct {
+      BinOp op;
+      Operand dst, a, b;
+    } binop;
+
+    /* TOP_UNOP */
+    struct {
+      UnOp op;
+      Operand dst, a;
+    } unop;
+
+    /* TOP_CMP */
+    struct {
+      CmpOp op;
+      Operand dst, a, b;
+    } cmp;
+
+    /* TOP_CONVERT */
+    struct {
+      ConvKind kind;
+      Operand dst, src;
+    } convert;
+
+    /* TOP_CALL: deep-copied descriptor and inner arrays. */
+    struct {
+      CGCallDesc desc;
+      CGABIValue* args;       /* len = desc.nargs */
+      CGABIPart* ret_parts;   /* len = desc.ret.nparts; NULL if 0 */
+      CGABIPart** arg_parts;  /* per-arg parts arrays; entry is NULL if 0 */
+    } call;
+
+    /* TOP_RET: present == 1 means there is a CGABIValue; otherwise a
+     * void return. parts is deep-copied. */
+    struct {
+      u8 present;
+      CGABIValue val;
+      CGABIPart* parts; /* len = val.nparts */
+    } ret;
+
+    /* TOP_ALLOCA */
+    struct {
+      Operand dst;
+      Operand size;
+      u32 align;
+    } alloca_;
+
+    /* TOP_VA_START / TOP_VA_END */
+    struct {
+      Operand ap;
+    } va_se;
+
+    /* TOP_VA_ARG */
+    struct {
+      Operand dst;
+      Operand ap;
+      const Type* ty;
+    } va_arg_;
+
+    /* TOP_SETJMP */
+    struct {
+      Operand dst;
+      Operand buf;
+    } setjmp_;
+
+    /* TOP_LONGJMP */
+    struct {
+      Operand buf;
+      Operand val;
+    } longjmp_;
+
+    /* TOP_ATOMIC_LOAD */
+    struct {
+      Operand dst;
+      Operand addr;
+      MemAccess mem;
+      MemOrder mo;
+    } atomic_load;
+
+    /* TOP_ATOMIC_STORE */
+    struct {
+      Operand addr;
+      Operand src;
+      MemAccess mem;
+      MemOrder mo;
+    } atomic_store;
+
+    /* TOP_ATOMIC_RMW */
+    struct {
+      AtomicOp op;
+      Operand dst;
+      Operand addr;
+      Operand val;
+      MemAccess mem;
+      MemOrder mo;
+    } atomic_rmw;
+
+    /* TOP_ATOMIC_CAS */
+    struct {
+      Operand prior;
+      Operand ok;
+      Operand addr;
+      Operand expected;
+      Operand desired;
+      MemAccess mem;
+      MemOrder success;
+      MemOrder failure;
+    } atomic_cas;
+
+    /* TOP_FENCE */
+    struct {
+      MemOrder mo;
+    } fence;
+
+    /* TOP_INTRINSIC */
+    struct {
+      IntrinKind kind;
+      Operand* dsts; /* deep-copied */
+      u32 ndst;
+      Operand* args; /* deep-copied */
+      u32 narg;
+    } intrinsic;
+
+    /* TOP_SET_LOC */
+    struct {
+      SrcLoc loc;
+    } set_loc;
+  } u;
+} TapeEntry;
+
+/* ---- wrapper state ---- */
+
+/* State of one opt wrapper. `base` must stay the first member: impl_of
+ * casts a CGTarget* straight to OptImpl*. */
+typedef struct OptImpl {
+  CGTarget base;
+  CGTarget* target; /* wrapped */
+  int level;
+  Compiler* c;
+
+  /* Tape: per-function, reset on func_begin. Allocated from c->tu so
+   * the buffer survives panic via compiler_defer cleanups. */
+  TapeEntry* tape;
+  u32 ntape, tape_cap; /* entries in use / entries allocated */
+
+  /* Wrapper-local virtual id counters. 1-based; 0 reserved as NONE.
+   * Reset on each func_begin. */
+  Reg next_vreg;
+  FrameSlot next_vslot;
+  Label next_vlabel;
+  CGScope next_vscope;
+
+  /* Replay-time translation tables. Index by virtual id; entry 0 is
+   * the NONE sentinel and never referenced. Allocated lazily on first
+   * replay so peak size matches the largest function. */
+  Reg* reg_map;
+  u32 reg_map_cap;
+  FrameSlot* slot_map;
+  u32 slot_map_cap;
+  Label* label_map;
+  u32 label_map_cap;
+  CGScope* scope_map;
+  u32 scope_map_cap;
+
+  SrcLoc pending_loc; /* most recent set_loc; stamped onto each entry */
+
+  /* If non-NULL, dump the tape to this writer on each func_end (before
+   * replay). Used by cg-runner --dump-tape and ad-hoc debugging. */
+  Writer* dump_writer;
+} OptImpl;
+
+/* Downcast a CGTarget handle to the wrapper state. `base` is the first
+ * member of OptImpl, so both pointers share an address; the caller is
+ * assumed to pass a CGTarget embedded in an OptImpl. */
+static OptImpl* impl_of(CGTarget* t) {
+  OptImpl* self = (OptImpl*)t;
+  return self;
+}
+
+/* Raise a compiler panic (with an all-zero source location) reporting
+ * that the CGTarget method named by `what` was called on the wrapper.
+ * Never returns. */
+static _Noreturn void panic_unsupported(OptImpl* o, const char* what) {
+  compiler_panic(o->c, (SrcLoc){0, 0, 0},
+                 "opt_cgtarget: %s called under unbounded virtuals", what);
+}
+
+/* ---- tape append ---- */
+
+/* Append a zeroed entry tagged `op` to the current tape and return it.
+ * The entry is stamped with the pending source location. When the tape
+ * is full it is moved to a new arena buffer of twice the capacity (64
+ * entries for the first allocation). */
+static TapeEntry* tape_append(OptImpl* o, TapeOpKind op) {
+  if (o->ntape == o->tape_cap) {
+    u32 grown_cap = o->tape_cap ? o->tape_cap * 2u : 64u;
+    TapeEntry* grown = arena_array(o->c->tu, TapeEntry, grown_cap);
+    if (o->tape) memcpy(grown, o->tape, sizeof(TapeEntry) * o->ntape);
+    o->tape_cap = grown_cap;
+    o->tape = grown;
+  }
+
+  TapeEntry* rec = &o->tape[o->ntape];
+  o->ntape += 1;
+  memset(rec, 0, sizeof *rec);
+  rec->loc = o->pending_loc;
+  rec->op = (u8)op;
+  return rec;
+}
+
+/* ---- deep-copy helpers ---- */
+
+/* Duplicate `n` CGParamDesc values from `src` into the TU arena.
+ * Returns NULL when n is 0. */
+static CGParamDesc* copy_params(Compiler* c, const CGParamDesc* src, u32 n) {
+  CGParamDesc* out = NULL;
+  if (n) {
+    out = arena_array(c->tu, CGParamDesc, n);
+    memcpy(out, src, sizeof(CGParamDesc) * n);
+  }
+  return out;
+}
+
+/* Duplicate `n` CGABIPart values from `src` into the TU arena.
+ * Returns NULL when n is 0. */
+static CGABIPart* copy_parts(Compiler* c, const CGABIPart* src, u32 n) {
+  CGABIPart* out = NULL;
+  if (n) {
+    out = arena_array(c->tu, CGABIPart, n);
+    memcpy(out, src, sizeof(CGABIPart) * n);
+  }
+  return out;
+}
+
+/* Duplicate `n` Operand values from `src` into the TU arena.
+ * Returns NULL when n is 0. */
+static Operand* copy_operands(Compiler* c, const Operand* src, u32 n) {
+  Operand* out = NULL;
+  if (n) {
+    out = arena_array(c->tu, Operand, n);
+    memcpy(out, src, sizeof(Operand) * n);
+  }
+  return out;
+}
+
+/* ---- map helpers (replay-time) ----
+ * The maps are direct-indexed by the 1-based virtual id; entry 0 is
+ * the NONE sentinel. */
+
+/* Ensure reg_map holds at least `needed` entries. Capacity starts at
+ * 16 and doubles until it fits; existing entries are carried over and
+ * the new tail is filled with REG_NONE. No-op when already big enough. */
+static void map_reg_grow(OptImpl* o, u32 needed) {
+  if (o->reg_map_cap >= needed) return;
+
+  u32 old_cap = o->reg_map_cap;
+  u32 new_cap = old_cap ? old_cap : 16u;
+  while (new_cap < needed) new_cap *= 2u;
+
+  Reg* grown = arena_array(o->c->tu, Reg, new_cap);
+  if (o->reg_map) memcpy(grown, o->reg_map, sizeof(Reg) * old_cap);
+  for (u32 k = old_cap; k < new_cap; ++k) grown[k] = REG_NONE;
+
+  o->reg_map_cap = new_cap;
+  o->reg_map = grown;
+}
+
+/* Ensure slot_map holds at least `needed` entries. Capacity starts at
+ * 16 and doubles until it fits; existing entries are carried over and
+ * the new tail is filled with FRAME_SLOT_NONE. */
+static void map_slot_grow(OptImpl* o, u32 needed) {
+  if (o->slot_map_cap >= needed) return;
+
+  u32 old_cap = o->slot_map_cap;
+  u32 new_cap = old_cap ? old_cap : 16u;
+  while (new_cap < needed) new_cap *= 2u;
+
+  FrameSlot* grown = arena_array(o->c->tu, FrameSlot, new_cap);
+  if (o->slot_map) memcpy(grown, o->slot_map, sizeof(FrameSlot) * old_cap);
+  for (u32 k = old_cap; k < new_cap; ++k) grown[k] = FRAME_SLOT_NONE;
+
+  o->slot_map_cap = new_cap;
+  o->slot_map = grown;
+}
+
+/* Ensure label_map holds at least `needed` entries. Capacity starts at
+ * 16 and doubles until it fits; existing entries are carried over and
+ * the new tail is filled with LABEL_NONE. */
+static void map_label_grow(OptImpl* o, u32 needed) {
+  if (o->label_map_cap >= needed) return;
+
+  u32 old_cap = o->label_map_cap;
+  u32 new_cap = old_cap ? old_cap : 16u;
+  while (new_cap < needed) new_cap *= 2u;
+
+  Label* grown = arena_array(o->c->tu, Label, new_cap);
+  if (o->label_map) memcpy(grown, o->label_map, sizeof(Label) * old_cap);
+  for (u32 k = old_cap; k < new_cap; ++k) grown[k] = LABEL_NONE;
+
+  o->label_map_cap = new_cap;
+  o->label_map = grown;
+}
+
+/* Ensure scope_map holds at least `needed` entries. Capacity starts at
+ * 8 and doubles until it fits; existing entries are carried over and
+ * the new tail is filled with CG_SCOPE_NONE. */
+static void map_scope_grow(OptImpl* o, u32 needed) {
+  if (o->scope_map_cap >= needed) return;
+
+  u32 old_cap = o->scope_map_cap;
+  u32 new_cap = old_cap ? old_cap : 8u;
+  while (new_cap < needed) new_cap *= 2u;
+
+  CGScope* grown = arena_array(o->c->tu, CGScope, new_cap);
+  if (o->scope_map) memcpy(grown, o->scope_map, sizeof(CGScope) * old_cap);
+  for (u32 k = old_cap; k < new_cap; ++k) grown[k] = CG_SCOPE_NONE;
+
+  o->scope_map_cap = new_cap;
+  o->scope_map = grown;
+}
+
+/* ---- recording: every emit-side method records a tape entry.
+ *
+ * Allocator methods (alloc_reg, frame_slot, label_new, scope_begin)
+ * additionally hand back a wrapper-local virtual id; the underlying
+ * target is not consulted until replay. */
+
+/* Begin recording a function. Clears the per-function wrapper state
+ * (tape, virtual id counters, pending source location, translation
+ * maps) and appends a TOP_FUNC_BEGIN entry carrying a copy of *fd. */
+static void w_func_begin(CGTarget* t, const CGFuncDesc* fd) {
+  OptImpl* self = impl_of(t);
+
+  /* Start from an empty tape. NOTE(review): the previous buffer is
+   * dropped rather than rewound, so each function allocates a new tape
+   * from the arena. */
+  self->tape = NULL;
+  self->tape_cap = 0;
+  self->ntape = 0;
+
+  /* Virtual ids restart at 1 for every function. */
+  self->next_vreg = 1;
+  self->next_vslot = 1;
+  self->next_vlabel = 1;
+  self->next_vscope = 1;
+  self->pending_loc = (SrcLoc){0, 0, 0};
+
+  /* Blank every translation-map entry; the buffers and their
+   * capacities are kept so later functions can reuse them. */
+  for (u32 k = 0; k < self->reg_map_cap; ++k) self->reg_map[k] = REG_NONE;
+  for (u32 k = 0; k < self->slot_map_cap; ++k)
+    self->slot_map[k] = FRAME_SLOT_NONE;
+  for (u32 k = 0; k < self->label_map_cap; ++k)
+    self->label_map[k] = LABEL_NONE;
+  for (u32 k = 0; k < self->scope_map_cap; ++k)
+    self->scope_map[k] = CG_SCOPE_NONE;
+
+  /* The descriptor is copied by value and params[] is duplicated into
+   * the arena, so changes the caller makes to its own params array
+   * after this call do not reach the tape. Replay passes each copied
+   * param's slot through the slot translation. */
+  TapeEntry* rec = tape_append(self, TOP_FUNC_BEGIN);
+  CGParamDesc* params = copy_params(self->c, fd->params, fd->nparams);
+  rec->u.func_begin.desc = *fd;
+  rec->u.func_begin.desc.params = params;
+  rec->u.func_begin.params = params;
+}
+
+static void w_func_end(CGTarget* t);
+
+/* Record a TOP_ALLOC_REG entry and return the next wrapper-local vreg.
+ * The wrapped target is not called here. */
+static Reg w_alloc_reg(CGTarget* t, RegClass cls, const Type* ty) {
+  OptImpl* self = impl_of(t);
+  TapeEntry* rec = tape_append(self, TOP_ALLOC_REG);
+  rec->u.alloc_reg.cls = cls;
+  rec->u.alloc_reg.ty = ty;
+  rec->u.alloc_reg.vreg = self->next_vreg;
+  self->next_vreg++;
+  return rec->u.alloc_reg.vreg;
+}
+
+/* free_reg is only a hint. The wrapper's vregs are unbounded and are
+ * never returned to a pool, so nothing is recorded and both arguments
+ * are ignored. */
+static void w_free_reg(CGTarget* t, Reg r) {
+  (void)r;
+  (void)t;
+}
+
+/* Record a TOP_FRAME_SLOT entry holding a copy of *d and return the
+ * next wrapper-local virtual slot id. */
+static FrameSlot w_frame_slot(CGTarget* t, const FrameSlotDesc* d) {
+  OptImpl* self = impl_of(t);
+  TapeEntry* rec = tape_append(self, TOP_FRAME_SLOT);
+  rec->u.frame_slot.desc = *d;
+  rec->u.frame_slot.vslot = self->next_vslot;
+  self->next_vslot++;
+  return rec->u.frame_slot.vslot;
+}
+
+/* Record a TOP_PARAM entry holding a by-value copy of *d. */
+static void w_param(CGTarget* t, const CGParamDesc* d) {
+  tape_append(impl_of(t), TOP_PARAM)->u.param.desc = *d;
+}
+
+/* Rejected by the wrapper: always panics, never returns a value. */
+static const Reg* w_clobbers(CGTarget* t, RegClass cls, u32* nregs) {
+  (void)nregs;
+  (void)cls;
+  panic_unsupported(impl_of(t), "clobbers");
+}
+/* Rejected by the wrapper: always panics. */
+static void w_spill_reg(CGTarget* t, Operand src, FrameSlot s, MemAccess m) {
+  (void)m;
+  (void)s;
+  (void)src;
+  panic_unsupported(impl_of(t), "spill_reg");
+}
+/* Rejected by the wrapper: always panics. */
+static void w_reload_reg(CGTarget* t, Operand dst, FrameSlot s, MemAccess m) {
+  (void)m;
+  (void)s;
+  (void)dst;
+  panic_unsupported(impl_of(t), "reload_reg");
+}
+
+/* Record a TOP_LABEL_NEW entry and return the next wrapper-local
+ * virtual label. */
+static Label w_label_new(CGTarget* t) {
+  OptImpl* self = impl_of(t);
+  TapeEntry* rec = tape_append(self, TOP_LABEL_NEW);
+  rec->u.label_new.vlabel = self->next_vlabel;
+  self->next_vlabel++;
+  return rec->u.label_new.vlabel;
+}
+
+/* Record a TOP_LABEL_PLACE entry for virtual label `l`. */
+static void w_label_place(CGTarget* t, Label l) {
+  tape_append(impl_of(t), TOP_LABEL_PLACE)->u.label_op.vlabel = l;
+}
+/* Record a TOP_JUMP entry targeting virtual label `l`. */
+static void w_jump(CGTarget* t, Label l) {
+  tape_append(impl_of(t), TOP_JUMP)->u.label_op.vlabel = l;
+}
+/* Record a TOP_CMP_BRANCH entry: comparison `op` on a and b, with
+ * virtual label `l` as the branch target. */
+static void w_cmp_branch(CGTarget* t, CmpOp op, Operand a, Operand b, Label l) {
+  TapeEntry* rec = tape_append(impl_of(t), TOP_CMP_BRANCH);
+  rec->u.cmp_branch.vlabel = l;
+  rec->u.cmp_branch.op = op;
+  rec->u.cmp_branch.a = a;
+  rec->u.cmp_branch.b = b;
+}
+
+/* Record a TOP_SCOPE_BEGIN entry holding a copy of *d and return the
+ * next wrapper-local virtual scope id. */
+static CGScope w_scope_begin(CGTarget* t, const CGScopeDesc* d) {
+  OptImpl* self = impl_of(t);
+  TapeEntry* rec = tape_append(self, TOP_SCOPE_BEGIN);
+  rec->u.scope_begin.desc = *d;
+  rec->u.scope_begin.vscope = self->next_vscope;
+  self->next_vscope++;
+  return rec->u.scope_begin.vscope;
+}
+/* Record a TOP_SCOPE_ELSE entry for virtual scope `s`. */
+static void w_scope_else(CGTarget* t, CGScope s) {
+  tape_append(impl_of(t), TOP_SCOPE_ELSE)->u.scope_op.vscope = s;
+}
+/* Record a TOP_SCOPE_END entry for virtual scope `s`. */
+static void w_scope_end(CGTarget* t, CGScope s) {
+  tape_append(impl_of(t), TOP_SCOPE_END)->u.scope_op.vscope = s;
+}
+/* Record a TOP_BREAK_TO entry for virtual scope `s`. */
+static void w_break_to(CGTarget* t, CGScope s) {
+  tape_append(impl_of(t), TOP_BREAK_TO)->u.scope_op.vscope = s;
+}
+/* Record a TOP_CONTINUE_TO entry for virtual scope `s`. */
+static void w_continue_to(CGTarget* t, CGScope s) {
+  tape_append(impl_of(t), TOP_CONTINUE_TO)->u.scope_op.vscope = s;
+}
+
+/* Record a TOP_LOAD_IMM entry: dst receives the immediate `imm`. */
+static void w_load_imm(CGTarget* t, Operand dst, i64 imm) {
+  TapeEntry* rec = tape_append(impl_of(t), TOP_LOAD_IMM);
+  rec->u.load_imm.imm = imm;
+  rec->u.load_imm.dst = dst;
+}
+/* Record a TOP_LOAD_CONST entry. `cb` is stored by value; anything it
+ * points at is not copied here. */
+static void w_load_const(CGTarget* t, Operand dst, ConstBytes cb) {
+  TapeEntry* rec = tape_append(impl_of(t), TOP_LOAD_CONST);
+  rec->u.load_const.cb = cb;
+  rec->u.load_const.dst = dst;
+}
+/* Record a TOP_COPY entry (dst, src). */
+static void w_copy(CGTarget* t, Operand dst, Operand src) {
+  TapeEntry* rec = tape_append(impl_of(t), TOP_COPY);
+  rec->u.copy.src = src;
+  rec->u.copy.dst = dst;
+}
+/* Record a TOP_LOAD entry (dst, addr, memory-access descriptor). */
+static void w_load(CGTarget* t, Operand dst, Operand addr, MemAccess m) {
+  TapeEntry* rec = tape_append(impl_of(t), TOP_LOAD);
+  rec->u.load.mem = m;
+  rec->u.load.addr = addr;
+  rec->u.load.dst = dst;
+}
+/* Record a TOP_STORE entry (addr, src, memory-access descriptor). */
+static void w_store(CGTarget* t, Operand addr, Operand src, MemAccess m) {
+  TapeEntry* rec = tape_append(impl_of(t), TOP_STORE);
+  rec->u.store.mem = m;
+  rec->u.store.src = src;
+  rec->u.store.addr = addr;
+}
+/* Record a TOP_ADDR_OF entry. The payload shares the `copy` union
+ * member: dst is the destination, src holds the lvalue `lv`. */
+static void w_addr_of(CGTarget* t, Operand dst, Operand lv) {
+  TapeEntry* rec = tape_append(impl_of(t), TOP_ADDR_OF);
+  rec->u.copy.src = lv;
+  rec->u.copy.dst = dst;
+}
+/* Record a TOP_TLS_ADDR_OF entry (dst, symbol id, addend). */
+static void w_tls_addr_of(CGTarget* t, Operand dst, ObjSymId sym, i64 addend) {
+  TapeEntry* rec = tape_append(impl_of(t), TOP_TLS_ADDR_OF);
+  rec->u.tls_addr_of.addend = addend;
+  rec->u.tls_addr_of.sym = sym;
+  rec->u.tls_addr_of.dst = dst;
+}
+/* Record a TOP_COPY_BYTES entry. In the shared `agg` payload, a is the
+ * destination and b is the source. */
+static void w_copy_bytes(CGTarget* t, Operand dst, Operand src,
+                         AggregateAccess agg) {
+  TapeEntry* rec = tape_append(impl_of(t), TOP_COPY_BYTES);
+  rec->u.agg.agg = agg;
+  rec->u.agg.b = src;
+  rec->u.agg.a = dst;
+}
+/* Record a TOP_SET_BYTES entry. In the shared `agg` payload, a is the
+ * destination and b is the `byte` operand. */
+static void w_set_bytes(CGTarget* t, Operand dst, Operand byte,
+                        AggregateAccess agg) {
+  TapeEntry* rec = tape_append(impl_of(t), TOP_SET_BYTES);
+  rec->u.agg.agg = agg;
+  rec->u.agg.b = byte;
+  rec->u.agg.a = dst;
+}
+/* Record a TOP_BITFIELD_LOAD entry (dst, record, bit-field access). */
+static void w_bitfield_load(CGTarget* t, Operand dst, Operand record,
+                            BitFieldAccess bf) {
+  TapeEntry* rec = tape_append(impl_of(t), TOP_BITFIELD_LOAD);
+  rec->u.bitfield_load.bf = bf;
+  rec->u.bitfield_load.record = record;
+  rec->u.bitfield_load.dst = dst;
+}
+/* Record a TOP_BITFIELD_STORE entry (record, src, bit-field access). */
+static void w_bitfield_store(CGTarget* t, Operand record, Operand src,
+                             BitFieldAccess bf) {
+  TapeEntry* rec = tape_append(impl_of(t), TOP_BITFIELD_STORE);
+  rec->u.bitfield_store.bf = bf;
+  rec->u.bitfield_store.src = src;
+  rec->u.bitfield_store.record = record;
+}
+
+/* Record a TOP_BINOP entry: dst = a <op> b. */
+static void w_binop(CGTarget* t, BinOp op, Operand dst, Operand a, Operand b) {
+  TapeEntry* rec = tape_append(impl_of(t), TOP_BINOP);
+  rec->u.binop.dst = dst;
+  rec->u.binop.op = op;
+  rec->u.binop.a = a;
+  rec->u.binop.b = b;
+}
+/* Record a TOP_UNOP entry: dst = <op> a. */
+static void w_unop(CGTarget* t, UnOp op, Operand dst, Operand a) {
+  TapeEntry* rec = tape_append(impl_of(t), TOP_UNOP);
+  rec->u.unop.dst = dst;
+  rec->u.unop.op = op;
+  rec->u.unop.a = a;
+}
+/* Record a TOP_CMP entry: dst receives the result of comparing a and b
+ * with `op`. */
+static void w_cmp(CGTarget* t, CmpOp op, Operand dst, Operand a, Operand b) {
+  TapeEntry* rec = tape_append(impl_of(t), TOP_CMP);
+  rec->u.cmp.dst = dst;
+  rec->u.cmp.op = op;
+  rec->u.cmp.a = a;
+  rec->u.cmp.b = b;
+}
+/* Record a TOP_CONVERT entry (conversion kind, dst, src). */
+static void w_convert(CGTarget* t, ConvKind k, Operand dst, Operand src) {
+  TapeEntry* rec = tape_append(impl_of(t), TOP_CONVERT);
+  rec->u.convert.src = src;
+  rec->u.convert.dst = dst;
+  rec->u.convert.kind = k;
+}
+
+/* Record a TOP_CALL entry. The descriptor is copied by value, and the
+ * args array, each argument's parts array, and the return parts array
+ * are duplicated into the TU arena, because *d and the arrays it points
+ * at may live on the caller's stack. The stored descriptor is
+ * re-pointed at the arena copies. */
+static void w_call(CGTarget* t, const CGCallDesc* d) {
+  OptImpl* self = impl_of(t);
+  TapeEntry* rec = tape_append(self, TOP_CALL);
+  CGABIValue* args = NULL;
+  CGABIPart** per_arg_parts = NULL;
+
+  if (d->nargs) {
+    args = arena_array(self->c->tu, CGABIValue, d->nargs);
+    per_arg_parts = arena_array(self->c->tu, CGABIPart*, d->nargs);
+    for (u32 k = 0; k < d->nargs; ++k) {
+      const CGABIValue* from = &d->args[k];
+      CGABIPart* parts = copy_parts(self->c, from->parts, from->nparts);
+      args[k] = *from;
+      args[k].parts = parts;
+      per_arg_parts[k] = parts;
+    }
+  }
+  CGABIPart* ret_parts = copy_parts(self->c, d->ret.parts, d->ret.nparts);
+
+  rec->u.call.desc = *d;
+  rec->u.call.desc.ret.parts = ret_parts;
+  rec->u.call.desc.args = args;
+  rec->u.call.ret_parts = ret_parts;
+  rec->u.call.arg_parts = per_arg_parts;
+  rec->u.call.args = args;
+}
+
+/* Record a TOP_RET entry. A NULL `v` is recorded as a void return
+ * (present = 0). Otherwise *v is copied by value and its parts array is
+ * duplicated into the TU arena. */
+static void w_ret(CGTarget* t, const CGABIValue* v) {
+  OptImpl* self = impl_of(t);
+  TapeEntry* rec = tape_append(self, TOP_RET);
+  if (v) {
+    CGABIPart* parts = copy_parts(self->c, v->parts, v->nparts);
+    rec->u.ret.present = 1;
+    rec->u.ret.val = *v;
+    rec->u.ret.val.parts = parts;
+    rec->u.ret.parts = parts;
+  } else {
+    rec->u.ret.present = 0;
+  }
+}
+
+/* Record a TOP_ALLOCA entry (dst, size operand, alignment). */
+static void w_alloca_(CGTarget* t, Operand dst, Operand size, u32 align) {
+  TapeEntry* rec = tape_append(impl_of(t), TOP_ALLOCA);
+  rec->u.alloca_.align = align;
+  rec->u.alloca_.size = size;
+  rec->u.alloca_.dst = dst;
+}
+
+/* Record a TOP_VA_START entry for the va_list operand `ap`. */
+static void w_va_start_(CGTarget* t, Operand ap) {
+  tape_append(impl_of(t), TOP_VA_START)->u.va_se.ap = ap;
+}
+/* Record a TOP_VA_ARG entry (dst, va_list operand, fetched type). */
+static void w_va_arg_(CGTarget* t, Operand dst, Operand ap, const Type* ty) {
+  TapeEntry* rec = tape_append(impl_of(t), TOP_VA_ARG);
+  rec->u.va_arg_.ty = ty;
+  rec->u.va_arg_.ap = ap;
+  rec->u.va_arg_.dst = dst;
+}
+/* Record a TOP_VA_END entry for the va_list operand `ap`. */
+static void w_va_end_(CGTarget* t, Operand ap) {
+  tape_append(impl_of(t), TOP_VA_END)->u.va_se.ap = ap;
+}
+/* Record a TOP_VA_COPY entry; the payload shares the `copy` union
+ * member (dst, src). */
+static void w_va_copy_(CGTarget* t, Operand dst, Operand src) {
+  TapeEntry* rec = tape_append(impl_of(t), TOP_VA_COPY);
+  rec->u.copy.src = src;
+  rec->u.copy.dst = dst;
+}
+
+/* Record a TOP_SETJMP entry (dst, jump buffer operand). */
+static void w_setjmp_(CGTarget* t, Operand dst, Operand buf) {
+  TapeEntry* rec = tape_append(impl_of(t), TOP_SETJMP);
+  rec->u.setjmp_.buf = buf;
+  rec->u.setjmp_.dst = dst;
+}
+/* Record a TOP_LONGJMP entry (jump buffer operand, value operand). */
+static void w_longjmp_(CGTarget* t, Operand buf, Operand val) {
+  TapeEntry* rec = tape_append(impl_of(t), TOP_LONGJMP);
+  rec->u.longjmp_.val = val;
+  rec->u.longjmp_.buf = buf;
+}
+
+/* Record a TOP_ATOMIC_LOAD entry (dst, addr, access, memory order). */
+static void w_atomic_load(CGTarget* t, Operand dst, Operand addr, MemAccess m,
+                          MemOrder mo) {
+  TapeEntry* rec = tape_append(impl_of(t), TOP_ATOMIC_LOAD);
+  rec->u.atomic_load.mo = mo;
+  rec->u.atomic_load.mem = m;
+  rec->u.atomic_load.addr = addr;
+  rec->u.atomic_load.dst = dst;
+}
+/* Record a TOP_ATOMIC_STORE entry (addr, src, access, memory order). */
+static void w_atomic_store(CGTarget* t, Operand addr, Operand src, MemAccess m,
+                           MemOrder mo) {
+  TapeEntry* rec = tape_append(impl_of(t), TOP_ATOMIC_STORE);
+  rec->u.atomic_store.mo = mo;
+  rec->u.atomic_store.mem = m;
+  rec->u.atomic_store.src = src;
+  rec->u.atomic_store.addr = addr;
+}
+/* Record a TOP_ATOMIC_RMW entry (op, dst, addr, val, access, order). */
+static void w_atomic_rmw(CGTarget* t, AtomicOp op, Operand dst, Operand addr,
+                         Operand val, MemAccess m, MemOrder mo) {
+  TapeEntry* rec = tape_append(impl_of(t), TOP_ATOMIC_RMW);
+  rec->u.atomic_rmw.mo = mo;
+  rec->u.atomic_rmw.mem = m;
+  rec->u.atomic_rmw.val = val;
+  rec->u.atomic_rmw.addr = addr;
+  rec->u.atomic_rmw.dst = dst;
+  rec->u.atomic_rmw.op = op;
+}
+/* Record a TOP_ATOMIC_CAS entry. `s` and `f` are stored as the success
+ * and failure memory orders respectively. */
+static void w_atomic_cas(CGTarget* t, Operand prior, Operand ok, Operand addr,
+                         Operand expected, Operand desired, MemAccess m,
+                         MemOrder s, MemOrder f) {
+  TapeEntry* rec = tape_append(impl_of(t), TOP_ATOMIC_CAS);
+  rec->u.atomic_cas.failure = f;
+  rec->u.atomic_cas.success = s;
+  rec->u.atomic_cas.mem = m;
+  rec->u.atomic_cas.desired = desired;
+  rec->u.atomic_cas.expected = expected;
+  rec->u.atomic_cas.addr = addr;
+  rec->u.atomic_cas.ok = ok;
+  rec->u.atomic_cas.prior = prior;
+}
+/* Record a TOP_FENCE entry with memory order `mo`. */
+static void w_fence(CGTarget* t, MemOrder mo) {
+  tape_append(impl_of(t), TOP_FENCE)->u.fence.mo = mo;
+}
+
+/* Record a TOP_INTRINSIC entry. Both operand arrays are duplicated into
+ * the TU arena (a zero count yields a NULL array). */
+static void w_intrinsic(CGTarget* t, IntrinKind k, Operand* dsts, u32 nd,
+                        const Operand* args, u32 na) {
+  OptImpl* self = impl_of(t);
+  TapeEntry* rec = tape_append(self, TOP_INTRINSIC);
+  rec->u.intrinsic.kind = k;
+  rec->u.intrinsic.narg = na;
+  rec->u.intrinsic.ndst = nd;
+  rec->u.intrinsic.dsts = copy_operands(self->c, dsts, nd);
+  rec->u.intrinsic.args = copy_operands(self->c, args, na);
+}
+
+/* Inline asm (corpus group M) is deferred and the wrapper does not
+ * support it yet: every argument is ignored and the call panics. */
+static void w_asm_block(CGTarget* t, const char* tmpl,
+                        const AsmConstraint* outs, u32 nout, Operand* out_ops,
+                        const AsmConstraint* ins, u32 nin,
+                        const Operand* in_ops, const Sym* clobbers, u32 nclob) {
+  (void)nclob;
+  (void)clobbers;
+  (void)in_ops;
+  (void)nin;
+  (void)ins;
+  (void)out_ops;
+  (void)nout;
+  (void)outs;
+  (void)tmpl;
+  panic_unsupported(impl_of(t), "asm_block");
+}
+
+/* Make `loc` the pending source location, then record a TOP_SET_LOC
+ * entry carrying it. The pending location is updated first, so the
+ * entry itself is stamped with the new location. */
+static void w_set_loc(CGTarget* t, SrcLoc loc) {
+  OptImpl* self = impl_of(t);
+  self->pending_loc = loc;
+  tape_append(self, TOP_SET_LOC)->u.set_loc.loc = loc;
+}
+
+/* ---- replay-time translation ---- */
+
+/* Translate a virtual register to the wrapped target's register.
+ * REG_NONE and 0 pass through unchanged. Panics when the vreg lies
+ * outside the map or its entry is still REG_NONE. */
+static Reg xlat_reg(OptImpl* o, Reg vreg) {
+  Reg mapped = REG_NONE;
+  if (vreg == 0u || vreg == REG_NONE) return vreg;
+  if (vreg < o->reg_map_cap) mapped = o->reg_map[vreg];
+  if (mapped == REG_NONE) {
+    compiler_panic(o->c, (SrcLoc){0, 0, 0}, "opt replay: unmapped vreg %u",
+                   (unsigned)vreg);
+  }
+  return mapped;
+}
+
+/* Translate a virtual frame slot to the wrapped target's slot.
+ * FRAME_SLOT_NONE passes through. Panics when the vslot lies outside
+ * the map or its entry is still FRAME_SLOT_NONE. */
+static FrameSlot xlat_slot(OptImpl* o, FrameSlot vs) {
+  FrameSlot mapped = FRAME_SLOT_NONE;
+  if (vs == FRAME_SLOT_NONE) return FRAME_SLOT_NONE;
+  if (vs < o->slot_map_cap) mapped = o->slot_map[vs];
+  if (mapped == FRAME_SLOT_NONE) {
+    compiler_panic(o->c, (SrcLoc){0, 0, 0}, "opt replay: unmapped vslot %u",
+                   (unsigned)vs);
+  }
+  return mapped;
+}
+
+/* Translate a virtual label to the wrapped target's label. LABEL_NONE
+ * passes through. Panics when the vlabel lies outside the map or its
+ * entry is still LABEL_NONE. */
+static Label xlat_label(OptImpl* o, Label vl) {
+  Label mapped = LABEL_NONE;
+  if (vl == LABEL_NONE) return LABEL_NONE;
+  if (vl < o->label_map_cap) mapped = o->label_map[vl];
+  if (mapped == LABEL_NONE) {
+    compiler_panic(o->c, (SrcLoc){0, 0, 0}, "opt replay: unmapped vlabel %u",
+                   (unsigned)vl);
+  }
+  return mapped;
+}
+
+/* Translate a virtual scope id to the wrapped target's scope.
+ * CG_SCOPE_NONE passes through. Panics when the vscope lies outside
+ * the map or its entry is still CG_SCOPE_NONE. */
+static CGScope xlat_scope(OptImpl* o, CGScope vs) {
+  CGScope mapped = CG_SCOPE_NONE;
+  if (vs == CG_SCOPE_NONE) return CG_SCOPE_NONE;
+  if (vs < o->scope_map_cap) mapped = o->scope_map[vs];
+  if (mapped == CG_SCOPE_NONE) {
+    compiler_panic(o->c, (SrcLoc){0, 0, 0}, "opt replay: unmapped vscope %u",
+                   (unsigned)vs);
+  }
+  return mapped;
+}
+
+/* Return a copy of `op` with any virtual id replaced by the wrapped
+ * target's id: the register of an OPK_REG, the frame slot of an
+ * OPK_LOCAL, the base register of an OPK_INDIRECT. OPK_IMM and
+ * OPK_GLOBAL operands, and any kind not listed, come back unchanged. */
+static Operand xlat_op(OptImpl* o, Operand op) {
+  switch ((OpKind)op.kind) {
+    case OPK_REG:
+      op.v.reg = xlat_reg(o, op.v.reg);
+      break;
+    case OPK_LOCAL:
+      op.v.frame_slot = xlat_slot(o, op.v.frame_slot);
+      break;
+    case OPK_INDIRECT:
+      op.v.ind.base = xlat_reg(o, op.v.ind.base);
+      break;
+    case OPK_IMM:
+    case OPK_GLOBAL:
+      break;
+  }
+  return op;
+}
+
+/* Return a copy of *in with its storage operand translated. When the
+ * value has parts and `parts_out` is non-NULL, each part is copied into
+ * parts_out with its operand translated, and the result points at
+ * parts_out. Otherwise the result keeps in->parts as-is (untranslated).
+ * parts_out is assumed to have room for in->nparts elements. */
+static CGABIValue xlat_abivalue(OptImpl* o, const CGABIValue* in,
+                                CGABIPart* parts_out) {
+  CGABIValue res = *in;
+  res.storage = xlat_op(o, in->storage);
+  if (!parts_out || !in->nparts) return res;
+
+  for (u32 k = 0; k < in->nparts; ++k) {
+    CGABIPart part = in->parts[k];
+    part.op = xlat_op(o, part.op);
+    parts_out[k] = part;
+  }
+  res.parts = parts_out;
+  return res;
+}
+
+/* ---- replay ---- */
+
+/* Replays the recorded tape of the current function onto the wrapped
+ * target `o->target`, in tape order, skipping entries marked dead.
+ *
+ * Entries that allocate an id (alloc_reg, frame_slot, label_new,
+ * scope_begin) call the wrapped target and store the id it returns in the
+ * matching map (reg_map / slot_map / label_map / scope_map), indexed by
+ * the id recorded in the tape entry. Every other entry passes its
+ * operands, labels, slots and scopes through the xlat_* helpers and then
+ * makes the corresponding call on the wrapped target.
+ *
+ * Translated parameter, argument, ABI-part and operand arrays are
+ * allocated from the o->c->tu arena; nothing is released here. */
+static void replay(OptImpl* o) {
+  CGTarget* w = o->target;
+
+  /* Pre-size the maps to the high-water mark for this function. */
+  if (o->next_vreg > 1) map_reg_grow(o, o->next_vreg);
+  if (o->next_vslot > 1) map_slot_grow(o, o->next_vslot);
+  if (o->next_vlabel > 1) map_label_grow(o, o->next_vlabel);
+  if (o->next_vscope > 1) map_scope_grow(o, o->next_vscope);
+
+  for (u32 i = 0; i < o->ntape; ++i) {
+    TapeEntry* e = &o->tape[i];
+    if (e->dead) continue;
+    switch ((TapeOpKind)e->op) {
+      case TOP_FUNC_BEGIN: {
+        /* Build a fresh CGFuncDesc with translated param slots. */
+        CGFuncDesc fd = e->u.func_begin.desc;
+        if (fd.nparams) {
+          CGParamDesc* params = arena_array(o->c->tu, CGParamDesc, fd.nparams);
+          for (u32 k = 0; k < fd.nparams; ++k) {
+            params[k] = e->u.func_begin.params[k];
+            params[k].slot = xlat_slot(o, e->u.func_begin.params[k].slot);
+          }
+          fd.params = params;
+        }
+        w->func_begin(w, &fd);
+        break;
+      }
+      case TOP_FUNC_END:
+        w->func_end(w);
+        break;
+      case TOP_ALLOC_REG: {
+        /* Allocate on the wrapped target, then remember recorded -> real. */
+        Reg r =
+            w->alloc_reg(w, e->u.alloc_reg.cls, e->u.alloc_reg.ty);
+        Reg v = e->u.alloc_reg.vreg;
+        if (v >= o->reg_map_cap) map_reg_grow(o, v + 1);
+        o->reg_map[v] = r;
+        break;
+      }
+      case TOP_FRAME_SLOT: {
+        FrameSlot s = w->frame_slot(w, &e->u.frame_slot.desc);
+        FrameSlot v = e->u.frame_slot.vslot;
+        if (v >= o->slot_map_cap) map_slot_grow(o, v + 1);
+        o->slot_map[v] = s;
+        break;
+      }
+      case TOP_PARAM: {
+        CGParamDesc d = e->u.param.desc;
+        d.slot = xlat_slot(o, d.slot);
+        w->param(w, &d);
+        break;
+      }
+      case TOP_LABEL_NEW: {
+        Label l = w->label_new(w);
+        Label v = e->u.label_new.vlabel;
+        if (v >= o->label_map_cap) map_label_grow(o, v + 1);
+        o->label_map[v] = l;
+        break;
+      }
+      case TOP_LABEL_PLACE:
+        w->label_place(w, xlat_label(o, e->u.label_op.vlabel));
+        break;
+      case TOP_JUMP:
+        w->jump(w, xlat_label(o, e->u.label_op.vlabel));
+        break;
+      case TOP_CMP_BRANCH:
+        w->cmp_branch(w, e->u.cmp_branch.op, xlat_op(o, e->u.cmp_branch.a),
+                      xlat_op(o, e->u.cmp_branch.b),
+                      xlat_label(o, e->u.cmp_branch.vlabel));
+        break;
+      case TOP_SCOPE_BEGIN: {
+        /* Translate the descriptor's operand and labels before opening the
+         * scope, then map the recorded scope id to the returned one. */
+        CGScopeDesc d = e->u.scope_begin.desc;
+        d.cond = xlat_op(o, d.cond);
+        d.break_label = xlat_label(o, d.break_label);
+        d.continue_label = xlat_label(o, d.continue_label);
+        CGScope s = w->scope_begin(w, &d);
+        CGScope v = e->u.scope_begin.vscope;
+        if (v >= o->scope_map_cap) map_scope_grow(o, v + 1);
+        o->scope_map[v] = s;
+        break;
+      }
+      case TOP_SCOPE_ELSE:
+        w->scope_else(w, xlat_scope(o, e->u.scope_op.vscope));
+        break;
+      case TOP_SCOPE_END:
+        w->scope_end(w, xlat_scope(o, e->u.scope_op.vscope));
+        break;
+      case TOP_BREAK_TO:
+        w->break_to(w, xlat_scope(o, e->u.scope_op.vscope));
+        break;
+      case TOP_CONTINUE_TO:
+        w->continue_to(w, xlat_scope(o, e->u.scope_op.vscope));
+        break;
+      case TOP_LOAD_IMM:
+        w->load_imm(w, xlat_op(o, e->u.load_imm.dst), e->u.load_imm.imm);
+        break;
+      case TOP_LOAD_CONST:
+        w->load_const(w, xlat_op(o, e->u.load_const.dst), e->u.load_const.cb);
+        break;
+      case TOP_COPY:
+        w->copy(w, xlat_op(o, e->u.copy.dst), xlat_op(o, e->u.copy.src));
+        break;
+      case TOP_LOAD:
+        w->load(w, xlat_op(o, e->u.load.dst), xlat_op(o, e->u.load.addr),
+                e->u.load.mem);
+        break;
+      case TOP_STORE:
+        w->store(w, xlat_op(o, e->u.store.addr), xlat_op(o, e->u.store.src),
+                 e->u.store.mem);
+        break;
+      case TOP_ADDR_OF:
+        /* addr_of entries are read from the `copy` payload (dst, src). */
+        w->addr_of(w, xlat_op(o, e->u.copy.dst), xlat_op(o, e->u.copy.src));
+        break;
+      case TOP_TLS_ADDR_OF:
+        w->tls_addr_of(w, xlat_op(o, e->u.tls_addr_of.dst),
+                       e->u.tls_addr_of.sym, e->u.tls_addr_of.addend);
+        break;
+      case TOP_COPY_BYTES:
+        w->copy_bytes(w, xlat_op(o, e->u.agg.a), xlat_op(o, e->u.agg.b),
+                      e->u.agg.agg);
+        break;
+      case TOP_SET_BYTES:
+        w->set_bytes(w, xlat_op(o, e->u.agg.a), xlat_op(o, e->u.agg.b),
+                     e->u.agg.agg);
+        break;
+      case TOP_BITFIELD_LOAD:
+        w->bitfield_load(w, xlat_op(o, e->u.bitfield_load.dst),
+                         xlat_op(o, e->u.bitfield_load.record),
+                         e->u.bitfield_load.bf);
+        break;
+      case TOP_BITFIELD_STORE:
+        w->bitfield_store(w, xlat_op(o, e->u.bitfield_store.record),
+                          xlat_op(o, e->u.bitfield_store.src),
+                          e->u.bitfield_store.bf);
+        break;
+      case TOP_BINOP:
+        w->binop(w, e->u.binop.op, xlat_op(o, e->u.binop.dst),
+                 xlat_op(o, e->u.binop.a), xlat_op(o, e->u.binop.b));
+        break;
+      case TOP_UNOP:
+        w->unop(w, e->u.unop.op, xlat_op(o, e->u.unop.dst),
+                xlat_op(o, e->u.unop.a));
+        break;
+      case TOP_CMP:
+        w->cmp(w, e->u.cmp.op, xlat_op(o, e->u.cmp.dst),
+               xlat_op(o, e->u.cmp.a), xlat_op(o, e->u.cmp.b));
+        break;
+      case TOP_CONVERT:
+        w->convert(w, e->u.convert.kind, xlat_op(o, e->u.convert.dst),
+                   xlat_op(o, e->u.convert.src));
+        break;
+      case TOP_CALL: {
+        /* Rebuild the call descriptor: callee, each argument (with its own
+         * parts array when it has parts) and the return value are all
+         * translated into fresh arena storage. */
+        CGCallDesc cd = e->u.call.desc;
+        cd.callee = xlat_op(o, cd.callee);
+        CGABIValue* args = NULL;
+        if (cd.nargs) {
+          args = arena_array(o->c->tu, CGABIValue, cd.nargs);
+          for (u32 k = 0; k < cd.nargs; ++k) {
+            CGABIPart* parts =
+                e->u.call.args[k].nparts
+                    ? arena_array(o->c->tu, CGABIPart,
+                                  e->u.call.args[k].nparts)
+                    : NULL;
+            args[k] = xlat_abivalue(o, &e->u.call.args[k], parts);
+          }
+          cd.args = args;
+        } else {
+          cd.args = NULL;
+        }
+        CGABIPart* ret_parts =
+            cd.ret.nparts
+                ? arena_array(o->c->tu, CGABIPart, cd.ret.nparts)
+                : NULL;
+        cd.ret = xlat_abivalue(o, &e->u.call.desc.ret, ret_parts);
+        w->call(w, &cd);
+        break;
+      }
+      case TOP_RET: {
+        /* A ret recorded without a value is forwarded as ret(NULL). */
+        if (!e->u.ret.present) {
+          w->ret(w, NULL);
+          break;
+        }
+        CGABIPart* parts =
+            e->u.ret.val.nparts
+                ? arena_array(o->c->tu, CGABIPart, e->u.ret.val.nparts)
+                : NULL;
+        CGABIValue v = xlat_abivalue(o, &e->u.ret.val, parts);
+        w->ret(w, &v);
+        break;
+      }
+      case TOP_ALLOCA:
+        w->alloca_(w, xlat_op(o, e->u.alloca_.dst),
+                   xlat_op(o, e->u.alloca_.size), e->u.alloca_.align);
+        break;
+      case TOP_VA_START:
+        /* va_start and va_end both read the `va_se` payload. */
+        w->va_start_(w, xlat_op(o, e->u.va_se.ap));
+        break;
+      case TOP_VA_ARG:
+        w->va_arg_(w, xlat_op(o, e->u.va_arg_.dst),
+                   xlat_op(o, e->u.va_arg_.ap), e->u.va_arg_.ty);
+        break;
+      case TOP_VA_END:
+        w->va_end_(w, xlat_op(o, e->u.va_se.ap));
+        break;
+      case TOP_VA_COPY:
+        /* va_copy entries are read from the `copy` payload (dst, src). */
+        w->va_copy_(w, xlat_op(o, e->u.copy.dst), xlat_op(o, e->u.copy.src));
+        break;
+      case TOP_SETJMP:
+        w->setjmp_(w, xlat_op(o, e->u.setjmp_.dst),
+                   xlat_op(o, e->u.setjmp_.buf));
+        break;
+      case TOP_LONGJMP:
+        w->longjmp_(w, xlat_op(o, e->u.longjmp_.buf),
+                    xlat_op(o, e->u.longjmp_.val));
+        break;
+      case TOP_ATOMIC_LOAD:
+        w->atomic_load(w, xlat_op(o, e->u.atomic_load.dst),
+                       xlat_op(o, e->u.atomic_load.addr),
+                       e->u.atomic_load.mem, e->u.atomic_load.mo);
+        break;
+      case TOP_ATOMIC_STORE:
+        w->atomic_store(w, xlat_op(o, e->u.atomic_store.addr),
+                        xlat_op(o, e->u.atomic_store.src),
+                        e->u.atomic_store.mem, e->u.atomic_store.mo);
+        break;
+      case TOP_ATOMIC_RMW:
+        w->atomic_rmw(w, e->u.atomic_rmw.op, xlat_op(o, e->u.atomic_rmw.dst),
+                      xlat_op(o, e->u.atomic_rmw.addr),
+                      xlat_op(o, e->u.atomic_rmw.val), e->u.atomic_rmw.mem,
+                      e->u.atomic_rmw.mo);
+        break;
+      case TOP_ATOMIC_CAS:
+        w->atomic_cas(w, xlat_op(o, e->u.atomic_cas.prior),
+                      xlat_op(o, e->u.atomic_cas.ok),
+                      xlat_op(o, e->u.atomic_cas.addr),
+                      xlat_op(o, e->u.atomic_cas.expected),
+                      xlat_op(o, e->u.atomic_cas.desired),
+                      e->u.atomic_cas.mem, e->u.atomic_cas.success,
+                      e->u.atomic_cas.failure);
+        break;
+      case TOP_FENCE:
+        w->fence(w, e->u.fence.mo);
+        break;
+      case TOP_INTRINSIC: {
+        /* Destination and argument operand lists are translated into
+         * fresh arrays; an empty list is passed as NULL with count 0. */
+        Operand* dsts = NULL;
+        Operand* args = NULL;
+        if (e->u.intrinsic.ndst) {
+          dsts = arena_array(o->c->tu, Operand, e->u.intrinsic.ndst);
+          for (u32 k = 0; k < e->u.intrinsic.ndst; ++k) {
+            dsts[k] = xlat_op(o, e->u.intrinsic.dsts[k]);
+          }
+        }
+        if (e->u.intrinsic.narg) {
+          args = arena_array(o->c->tu, Operand, e->u.intrinsic.narg);
+          for (u32 k = 0; k < e->u.intrinsic.narg; ++k) {
+            args[k] = xlat_op(o, e->u.intrinsic.args[k]);
+          }
+        }
+        w->intrinsic(w, e->u.intrinsic.kind, dsts, e->u.intrinsic.ndst, args,
+                     e->u.intrinsic.narg);
+        break;
+      }
+      case TOP_SET_LOC:
+        w->set_loc(w, e->u.set_loc.loc);
+        break;
+      /* NOTE(review): there is no default case, so a tape kind without a
+       * case above is dropped silently. opt_cgtarget_new also installs
+       * free_reg, clobbers, spill_reg, reload_reg and asm_block hooks, none
+       * of which is replayed here — confirm they never record an entry. */
+    }
+  }
+}
+
+/* ---- printer ---- */
+
+/* Sends the NUL-terminated string `s` to `w` in a single write call.
+ * An empty string produces no call at all. */
+static void wstr(Writer* w, const char* s) {
+  const char* end = s;
+  while (*end) ++end;
+  if (end == s) return;
+  w->write(w, s, (size_t)(end - s));
+}
+
+/* Formats `v` as a NUL-terminated decimal string in `out`, with a leading
+ * '-' for negative values. The longest result (INT64_MIN) needs 21 bytes
+ * including the terminator; callers in this file pass a 32-byte buffer.
+ * Nothing is returned — the caller hands the buffer on to wstr. */
+static void fmt_i64(i64 v, char* out) {
+  char digits[32];
+  u32 len = 0;
+  const int negative = v < 0;
+  /* Magnitude as u64. -(v + 1) + 1 avoids negating INT64_MIN directly,
+   * which would overflow. */
+  u64 mag = negative ? (u64)(-(v + 1)) + 1u : (u64)v;
+
+  /* Produce digits least-significant first; zero still yields one digit. */
+  for (;;) {
+    digits[len++] = (char)('0' + (mag % 10u));
+    mag /= 10u;
+    if (!mag) break;
+  }
+  if (negative) digits[len++] = '-';
+
+  /* Copy out back-to-front, then terminate. */
+  char* p = out;
+  while (len) *p++ = digits[--len];
+  *p = 0;
+}
+
+/* Writes `v` to `w` as decimal text. */
+static void wint(Writer* w, i64 v) {
+  char text[32]; /* scratch for fmt_i64's NUL-terminated output */
+
+  fmt_i64(v, text);
+  wstr(w, text);
+}
+
+/* Maps a BinOp to the lowercase mnemonic used in tape dumps. Values not
+ * listed below yield "?binop". */
+static const char* binop_name(BinOp op) {
+  const char* name = "?binop";
+  switch (op) {
+    case BO_IADD:  name = "iadd";  break;
+    case BO_ISUB:  name = "isub";  break;
+    case BO_IMUL:  name = "imul";  break;
+    case BO_SDIV:  name = "sdiv";  break;
+    case BO_UDIV:  name = "udiv";  break;
+    case BO_SREM:  name = "srem";  break;
+    case BO_UREM:  name = "urem";  break;
+    case BO_FADD:  name = "fadd";  break;
+    case BO_FSUB:  name = "fsub";  break;
+    case BO_FMUL:  name = "fmul";  break;
+    case BO_FDIV:  name = "fdiv";  break;
+    case BO_AND:   name = "and";   break;
+    case BO_OR:    name = "or";    break;
+    case BO_XOR:   name = "xor";   break;
+    case BO_SHL:   name = "shl";   break;
+    case BO_SHR_S: name = "shr_s"; break;
+    case BO_SHR_U: name = "shr_u"; break;
+  }
+  return name;
+}
+
+/* Maps a UnOp to the lowercase mnemonic used in tape dumps. Values not
+ * listed below yield "?unop". */
+static const char* unop_name(UnOp op) {
+  const char* name = "?unop";
+  switch (op) {
+    case UO_NEG:  name = "neg";  break;
+    case UO_NOT:  name = "not";  break;
+    case UO_BNOT: name = "bnot"; break;
+  }
+  return name;
+}
+
+/* Maps a CmpOp to the lowercase mnemonic used in tape dumps. Values not
+ * listed below yield "?cmp". */
+static const char* cmp_name(CmpOp op) {
+  const char* name = "?cmp";
+  switch (op) {
+    case CMP_EQ:   name = "eq";   break;
+    case CMP_NE:   name = "ne";   break;
+    case CMP_LT_S: name = "lt_s"; break;
+    case CMP_LE_S: name = "le_s"; break;
+    case CMP_GT_S: name = "gt_s"; break;
+    case CMP_GE_S: name = "ge_s"; break;
+    case CMP_LT_U: name = "lt_u"; break;
+    case CMP_LE_U: name = "le_u"; break;
+    case CMP_GT_U: name = "gt_u"; break;
+    case CMP_GE_U: name = "ge_u"; break;
+    case CMP_LT_F: name = "lt_f"; break;
+    case CMP_LE_F: name = "le_f"; break;
+    case CMP_GT_F: name = "gt_f"; break;
+    case CMP_GE_F: name = "ge_f"; break;
+  }
+  return name;
+}
+
+/* Writes a compact textual form of `op` to `w`:
+ *   immediate  -> imm:<value>
+ *   register   -> v<id>
+ *   local      -> fs<slot>
+ *   global     -> sym<id>, followed by +<addend> when the addend is nonzero
+ *   indirect   -> [v<base>] or [v<base>+<ofs>] when the offset is nonzero
+ * Any other operand kind is written as "?op". */
+static void print_operand(Writer* w, const Operand* op) {
+  const OpKind kind = (OpKind)op->kind;
+
+  if (kind == OPK_IMM) {
+    wstr(w, "imm:");
+    wint(w, op->v.imm);
+  } else if (kind == OPK_REG) {
+    wstr(w, "v");
+    wint(w, (i64)op->v.reg);
+  } else if (kind == OPK_LOCAL) {
+    wstr(w, "fs");
+    wint(w, (i64)op->v.frame_slot);
+  } else if (kind == OPK_GLOBAL) {
+    wstr(w, "sym");
+    wint(w, (i64)op->v.global.sym);
+    if (op->v.global.addend) {
+      wstr(w, "+");
+      wint(w, op->v.global.addend);
+    }
+  } else if (kind == OPK_INDIRECT) {
+    wstr(w, "[v");
+    wint(w, (i64)op->v.ind.base);
+    if (op->v.ind.ofs) {
+      wstr(w, "+");
+      wint(w, op->v.ind.ofs);
+    }
+    wstr(w, "]");
+  } else {
+    wstr(w, "?op");
+  }
+}
+
+/* Writes a line-oriented textual dump of the current tape to `w`.
+ *
+ * Each live entry becomes one line: two spaces of indent, the operation
+ * name, the subset of its fields chosen below, then "\n". Entries marked
+ * dead are written as "  ; dead" instead. Ids are printed exactly as
+ * stored in the tape entries. Several fields are deliberately left out
+ * (for example load/store `mem`, call arguments and return value, atomic
+ * memory orders, intrinsic operands).
+ *
+ * The switch has no default case, so an entry kind without a case below
+ * prints only the indent and the newline. */
+static void print_tape(OptImpl* o, Writer* w) {
+  for (u32 i = 0; i < o->ntape; ++i) {
+    TapeEntry* e = &o->tape[i];
+    if (e->dead) {
+      wstr(w, "  ; dead\n");
+      continue;
+    }
+    wstr(w, "  ");
+    switch ((TapeOpKind)e->op) {
+      case TOP_FUNC_BEGIN:
+        wstr(w, "func_begin sym=");
+        wint(w, (i64)e->u.func_begin.desc.sym);
+        wstr(w, " nparams=");
+        wint(w, (i64)e->u.func_begin.desc.nparams);
+        break;
+      case TOP_FUNC_END:
+        wstr(w, "func_end");
+        break;
+      case TOP_ALLOC_REG:
+        wstr(w, "alloc_reg v");
+        wint(w, (i64)e->u.alloc_reg.vreg);
+        wstr(w, " cls=");
+        wint(w, (i64)e->u.alloc_reg.cls);
+        break;
+      case TOP_FRAME_SLOT:
+        wstr(w, "frame_slot fs");
+        wint(w, (i64)e->u.frame_slot.vslot);
+        wstr(w, " size=");
+        wint(w, (i64)e->u.frame_slot.desc.size);
+        wstr(w, " kind=");
+        wint(w, (i64)e->u.frame_slot.desc.kind);
+        break;
+      case TOP_PARAM:
+        wstr(w, "param idx=");
+        wint(w, (i64)e->u.param.desc.index);
+        wstr(w, " fs=");
+        wint(w, (i64)e->u.param.desc.slot);
+        break;
+      case TOP_LABEL_NEW:
+        wstr(w, "label_new L");
+        wint(w, (i64)e->u.label_new.vlabel);
+        break;
+      case TOP_LABEL_PLACE:
+        wstr(w, "label_place L");
+        wint(w, (i64)e->u.label_op.vlabel);
+        break;
+      case TOP_JUMP:
+        wstr(w, "jump L");
+        wint(w, (i64)e->u.label_op.vlabel);
+        break;
+      case TOP_CMP_BRANCH:
+        wstr(w, "cmp_branch ");
+        wstr(w, cmp_name(e->u.cmp_branch.op));
+        wstr(w, " ");
+        print_operand(w, &e->u.cmp_branch.a);
+        wstr(w, ", ");
+        print_operand(w, &e->u.cmp_branch.b);
+        wstr(w, " -> L");
+        wint(w, (i64)e->u.cmp_branch.vlabel);
+        break;
+      case TOP_SCOPE_BEGIN:
+        wstr(w, "scope_begin S");
+        wint(w, (i64)e->u.scope_begin.vscope);
+        wstr(w, " kind=");
+        wint(w, (i64)e->u.scope_begin.desc.kind);
+        break;
+      case TOP_SCOPE_ELSE:
+        wstr(w, "scope_else S");
+        wint(w, (i64)e->u.scope_op.vscope);
+        break;
+      case TOP_SCOPE_END:
+        wstr(w, "scope_end S");
+        wint(w, (i64)e->u.scope_op.vscope);
+        break;
+      case TOP_BREAK_TO:
+        wstr(w, "break_to S");
+        wint(w, (i64)e->u.scope_op.vscope);
+        break;
+      case TOP_CONTINUE_TO:
+        wstr(w, "continue_to S");
+        wint(w, (i64)e->u.scope_op.vscope);
+        break;
+      case TOP_LOAD_IMM:
+        wstr(w, "load_imm ");
+        print_operand(w, &e->u.load_imm.dst);
+        wstr(w, ", ");
+        wint(w, e->u.load_imm.imm);
+        break;
+      case TOP_LOAD_CONST:
+        /* Only the byte count of the constant is shown, not its contents. */
+        wstr(w, "load_const ");
+        print_operand(w, &e->u.load_const.dst);
+        wstr(w, ", <bytes:");
+        wint(w, (i64)e->u.load_const.cb.size);
+        wstr(w, ">");
+        break;
+      case TOP_COPY:
+        wstr(w, "copy ");
+        print_operand(w, &e->u.copy.dst);
+        wstr(w, ", ");
+        print_operand(w, &e->u.copy.src);
+        break;
+      case TOP_LOAD:
+        wstr(w, "load ");
+        print_operand(w, &e->u.load.dst);
+        wstr(w, ", ");
+        print_operand(w, &e->u.load.addr);
+        break;
+      case TOP_STORE:
+        wstr(w, "store ");
+        print_operand(w, &e->u.store.addr);
+        wstr(w, ", ");
+        print_operand(w, &e->u.store.src);
+        break;
+      case TOP_ADDR_OF:
+        /* addr_of entries are read from the `copy` payload (dst, src). */
+        wstr(w, "addr_of ");
+        print_operand(w, &e->u.copy.dst);
+        wstr(w, ", ");
+        print_operand(w, &e->u.copy.src);
+        break;
+      case TOP_TLS_ADDR_OF:
+        wstr(w, "tls_addr_of ");
+        print_operand(w, &e->u.tls_addr_of.dst);
+        wstr(w, ", sym");
+        wint(w, (i64)e->u.tls_addr_of.sym);
+        break;
+      case TOP_COPY_BYTES:
+        wstr(w, "copy_bytes ");
+        print_operand(w, &e->u.agg.a);
+        wstr(w, ", ");
+        print_operand(w, &e->u.agg.b);
+        wstr(w, " size=");
+        wint(w, (i64)e->u.agg.agg.size);
+        break;
+      case TOP_SET_BYTES:
+        wstr(w, "set_bytes ");
+        print_operand(w, &e->u.agg.a);
+        wstr(w, ", ");
+        print_operand(w, &e->u.agg.b);
+        wstr(w, " size=");
+        wint(w, (i64)e->u.agg.agg.size);
+        break;
+      case TOP_BITFIELD_LOAD:
+        wstr(w, "bitfield_load ");
+        print_operand(w, &e->u.bitfield_load.dst);
+        wstr(w, ", ");
+        print_operand(w, &e->u.bitfield_load.record);
+        break;
+      case TOP_BITFIELD_STORE:
+        wstr(w, "bitfield_store ");
+        print_operand(w, &e->u.bitfield_store.record);
+        wstr(w, ", ");
+        print_operand(w, &e->u.bitfield_store.src);
+        break;
+      case TOP_BINOP:
+        /* Binary and unary ops use their mnemonic as the line's op name. */
+        wstr(w, binop_name(e->u.binop.op));
+        wstr(w, " ");
+        print_operand(w, &e->u.binop.dst);
+        wstr(w, ", ");
+        print_operand(w, &e->u.binop.a);
+        wstr(w, ", ");
+        print_operand(w, &e->u.binop.b);
+        break;
+      case TOP_UNOP:
+        wstr(w, unop_name(e->u.unop.op));
+        wstr(w, " ");
+        print_operand(w, &e->u.unop.dst);
+        wstr(w, ", ");
+        print_operand(w, &e->u.unop.a);
+        break;
+      case TOP_CMP:
+        wstr(w, "cmp.");
+        wstr(w, cmp_name(e->u.cmp.op));
+        wstr(w, " ");
+        print_operand(w, &e->u.cmp.dst);
+        wstr(w, ", ");
+        print_operand(w, &e->u.cmp.a);
+        wstr(w, ", ");
+        print_operand(w, &e->u.cmp.b);
+        break;
+      case TOP_CONVERT:
+        wstr(w, "convert ");
+        print_operand(w, &e->u.convert.dst);
+        wstr(w, ", ");
+        print_operand(w, &e->u.convert.src);
+        wstr(w, " kind=");
+        wint(w, (i64)e->u.convert.kind);
+        break;
+      case TOP_CALL:
+        wstr(w, "call ");
+        print_operand(w, &e->u.call.desc.callee);
+        wstr(w, " nargs=");
+        wint(w, (i64)e->u.call.desc.nargs);
+        break;
+      case TOP_RET:
+        /* Only the value's `storage` operand is shown, and only when the
+         * ret carries a value. */
+        wstr(w, "ret");
+        if (e->u.ret.present) {
+          wstr(w, " ");
+          print_operand(w, &e->u.ret.val.storage);
+        }
+        break;
+      case TOP_ALLOCA:
+        wstr(w, "alloca ");
+        print_operand(w, &e->u.alloca_.dst);
+        wstr(w, ", ");
+        print_operand(w, &e->u.alloca_.size);
+        break;
+      case TOP_VA_START:
+        wstr(w, "va_start ");
+        print_operand(w, &e->u.va_se.ap);
+        break;
+      case TOP_VA_ARG:
+        wstr(w, "va_arg ");
+        print_operand(w, &e->u.va_arg_.dst);
+        wstr(w, ", ");
+        print_operand(w, &e->u.va_arg_.ap);
+        break;
+      case TOP_VA_END:
+        wstr(w, "va_end ");
+        print_operand(w, &e->u.va_se.ap);
+        break;
+      case TOP_VA_COPY:
+        wstr(w, "va_copy ");
+        print_operand(w, &e->u.copy.dst);
+        wstr(w, ", ");
+        print_operand(w, &e->u.copy.src);
+        break;
+      case TOP_SETJMP:
+        wstr(w, "setjmp ");
+        print_operand(w, &e->u.setjmp_.dst);
+        wstr(w, ", ");
+        print_operand(w, &e->u.setjmp_.buf);
+        break;
+      case TOP_LONGJMP:
+        wstr(w, "longjmp ");
+        print_operand(w, &e->u.longjmp_.buf);
+        wstr(w, ", ");
+        print_operand(w, &e->u.longjmp_.val);
+        break;
+      case TOP_ATOMIC_LOAD:
+        wstr(w, "atomic_load ");
+        print_operand(w, &e->u.atomic_load.dst);
+        wstr(w, ", ");
+        print_operand(w, &e->u.atomic_load.addr);
+        break;
+      case TOP_ATOMIC_STORE:
+        wstr(w, "atomic_store ");
+        print_operand(w, &e->u.atomic_store.addr);
+        wstr(w, ", ");
+        print_operand(w, &e->u.atomic_store.src);
+        break;
+      case TOP_ATOMIC_RMW:
+        /* The RMW operation is printed as its numeric enum value. */
+        wstr(w, "atomic_rmw op=");
+        wint(w, (i64)e->u.atomic_rmw.op);
+        wstr(w, " ");
+        print_operand(w, &e->u.atomic_rmw.dst);
+        wstr(w, ", ");
+        print_operand(w, &e->u.atomic_rmw.addr);
+        wstr(w, ", ");
+        print_operand(w, &e->u.atomic_rmw.val);
+        break;
+      case TOP_ATOMIC_CAS:
+        /* expected/desired operands are not part of the dump line. */
+        wstr(w, "atomic_cas prior=");
+        print_operand(w, &e->u.atomic_cas.prior);
+        wstr(w, " ok=");
+        print_operand(w, &e->u.atomic_cas.ok);
+        wstr(w, " addr=");
+        print_operand(w, &e->u.atomic_cas.addr);
+        break;
+      case TOP_FENCE:
+        wstr(w, "fence mo=");
+        wint(w, (i64)e->u.fence.mo);
+        break;
+      case TOP_INTRINSIC:
+        wstr(w, "intrinsic kind=");
+        wint(w, (i64)e->u.intrinsic.kind);
+        wstr(w, " ndst=");
+        wint(w, (i64)e->u.intrinsic.ndst);
+        wstr(w, " narg=");
+        wint(w, (i64)e->u.intrinsic.narg);
+        break;
+      case TOP_SET_LOC:
+        wstr(w, "set_loc ");
+        wint(w, (i64)e->u.set_loc.loc.line);
+        wstr(w, ":");
+        wint(w, (i64)e->u.set_loc.loc.col);
+        break;
+    }
+    wstr(w, "\n");
+  }
+}
+
+/* ---- Phase 2 peephole: integer constant folding ----
+ *
+ * Pattern: LOAD_IMM(V_a, k_a); LOAD_IMM(V_b, k_b); BINOP(op, V_d, V_a, V_b)
+ *           with op ∈ {IADD, ISUB, IMUL}.
+ * After:   the BINOP is rewritten to LOAD_IMM(V_d, k_a OP k_b).
+ *
+ * Both operands must be OPK_REG vregs whose current value is known to be
+ * a LOAD_IMM constant (or the result of an earlier fold) at the BINOP.
+ * The pass is one forward scan with no control-flow graph, so that
+ * knowledge is tracked conservatively:
+ *   - LOAD_IMM into a vreg and every folded BINOP record a constant;
+ *   - the other value-producing entries listed in the switch forget
+ *     their destination vreg, so a redefined vreg is never folded with
+ *     a stale constant;
+ *   - id-allocation and set_loc entries leave the table untouched;
+ *   - every other entry (label placement, branches, scopes, calls,
+ *     atomics, intrinsics, ...) forgets all constants, because the tape
+ *     position after it may be reached along a different path, or the
+ *     entry may write vregs this pass does not model.
+ * The intermediate LOAD_IMMs are left in place — they may have other
+ * uses, and DCE is a Phase 3 concern.
+ *
+ * Folding is done in 64-bit unsigned arithmetic, stored as i64, and
+ * truncated by the target's load_imm based on the destination type.
+ * C11 §6.5p5 leaves signed overflow undefined; cfree instead defines
+ * two's-complement wraparound for signed and unsigned integer types
+ * alike (no-UB stance — see doc/DESIGN.md §9). For add/sub/mul the low
+ * bits of the wrapped 64-bit result equal the wrapped narrow result, so
+ * chaining folds through the untruncated value is sound. */
+
+/* Per-vreg constant knowledge. `val` is meaningful only while `gen`
+ * equals the table's current generation. */
+typedef struct ImmInfo {
+  i64 val;
+  u32 gen;
+} ImmInfo;
+
+/* Constant table for one run of the pass. Bumping `gen` forgets every
+ * vreg at once without touching `info`. */
+typedef struct ImmTable {
+  ImmInfo* info; /* indexed by vreg id; `cap` entries */
+  u32 cap;
+  u32 gen;       /* current generation; never 0 (0 marks "unknown") */
+} ImmTable;
+
+/* Returns nonzero and stores the constant in *out when `op` is a vreg
+ * inside the table that currently holds a known constant. */
+static int cfold_lookup(const ImmTable* t, const Operand* op, i64* out) {
+  if (op->kind != OPK_REG || op->v.reg >= t->cap) return 0;
+  if (t->info[op->v.reg].gen != t->gen) return 0;
+  *out = t->info[op->v.reg].val;
+  return 1;
+}
+
+/* Records that `dst` now holds `val`. Non-vreg and out-of-table
+ * destinations are ignored. */
+static void cfold_record(ImmTable* t, const Operand* dst, i64 val) {
+  if (dst->kind != OPK_REG || dst->v.reg >= t->cap) return;
+  t->info[dst->v.reg].val = val;
+  t->info[dst->v.reg].gen = t->gen;
+}
+
+/* Forgets whatever was known about `dst`; called when an entry writes a
+ * value the pass does not track. */
+static void cfold_forget(ImmTable* t, const Operand* dst) {
+  if (dst->kind != OPK_REG || dst->v.reg >= t->cap) return;
+  t->info[dst->v.reg].gen = 0;
+}
+
+/* Forgets every constant by moving to a fresh generation. */
+static void cfold_forget_all(ImmTable* t) {
+  if (++t->gen == 0) {
+    /* Generation counter wrapped: wipe the stale stamps and restart. */
+    memset(t->info, 0, (size_t)t->cap * sizeof *t->info);
+    t->gen = 1;
+  }
+}
+
+/* Runs the constant-folding peephole over the current tape in place.
+ * Foldable BINOP entries are overwritten with LOAD_IMM entries; no entry
+ * is added, removed or marked dead. Does nothing when the function
+ * allocated no vregs (next_vreg <= 1). */
+static void peephole_constfold(OptImpl* o) {
+  ImmTable t;
+
+  if (o->next_vreg <= 1) return;
+  t.cap = o->next_vreg;
+  t.info = arena_zarray(o->c->tu, ImmInfo, t.cap);
+  t.gen = 1; /* zeroed entries carry gen 0, i.e. "unknown" */
+
+  for (u32 i = 0; i < o->ntape; ++i) {
+    TapeEntry* e = &o->tape[i];
+    if (e->dead) continue;
+    switch ((TapeOpKind)e->op) {
+      case TOP_LOAD_IMM:
+        cfold_record(&t, &e->u.load_imm.dst, e->u.load_imm.imm);
+        break;
+
+      case TOP_BINOP: {
+        BinOp op = e->u.binop.op;
+        Operand dst = e->u.binop.dst;
+        i64 av = 0;
+        i64 bv = 0;
+        u64 folded;
+
+        if ((op != BO_IADD && op != BO_ISUB && op != BO_IMUL) ||
+            !cfold_lookup(&t, &e->u.binop.a, &av) ||
+            !cfold_lookup(&t, &e->u.binop.b, &bv)) {
+          /* Left as a BINOP: dst now holds a value we do not track. */
+          cfold_forget(&t, &dst);
+          break;
+        }
+
+        /* Compute in u64 to make wraparound deterministic, then cast
+         * back (signed overflow would be UB in the host compiler). */
+        if (op == BO_IADD) {
+          folded = (u64)av + (u64)bv;
+        } else if (op == BO_ISUB) {
+          folded = (u64)av - (u64)bv;
+        } else {
+          folded = (u64)av * (u64)bv;
+        }
+
+        memset(&e->u, 0, sizeof e->u);
+        e->op = (u8)TOP_LOAD_IMM;
+        e->u.load_imm.dst = dst;
+        e->u.load_imm.imm = (i64)folded;
+        /* Record the result so later BINOPs can chain off it. */
+        cfold_record(&t, &dst, (i64)folded);
+        break;
+      }
+
+      /* Straight-line entries that write one destination operand: only
+       * that vreg's constant (if any) becomes stale. */
+      case TOP_LOAD_CONST:
+        cfold_forget(&t, &e->u.load_const.dst);
+        break;
+      case TOP_COPY:
+      case TOP_ADDR_OF: /* stored in the `copy` payload */
+        cfold_forget(&t, &e->u.copy.dst);
+        break;
+      case TOP_LOAD:
+        cfold_forget(&t, &e->u.load.dst);
+        break;
+      case TOP_TLS_ADDR_OF:
+        cfold_forget(&t, &e->u.tls_addr_of.dst);
+        break;
+      case TOP_BITFIELD_LOAD:
+        cfold_forget(&t, &e->u.bitfield_load.dst);
+        break;
+      case TOP_UNOP:
+        cfold_forget(&t, &e->u.unop.dst);
+        break;
+      case TOP_CMP:
+        cfold_forget(&t, &e->u.cmp.dst);
+        break;
+      case TOP_CONVERT:
+        cfold_forget(&t, &e->u.convert.dst);
+        break;
+
+      /* Id allocation and source-location stamps write no vreg value. */
+      case TOP_ALLOC_REG:
+      case TOP_FRAME_SLOT:
+      case TOP_LABEL_NEW:
+      case TOP_SET_LOC:
+        break;
+
+      /* Control flow, calls and anything else not modelled above: assume
+       * any vreg may hold a different value from here on. */
+      default:
+        cfold_forget_all(&t);
+        break;
+    }
+  }
+}
+
+/* ---- func_end: append TOP_FUNC_END, run peepholes, replay ---- */
+
+/* func_end hook of the wrapper: closes the recorded tape with a
+ * TOP_FUNC_END entry, runs the constant-folding peephole over it, dumps
+ * the (already folded) tape when a dump writer is installed, and finally
+ * replays the tape onto the wrapped target. */
+static void w_func_end(CGTarget* t) {
+  OptImpl* impl = impl_of(t);
+
+  tape_append(impl, TOP_FUNC_END);
+  peephole_constfold(impl);
+
+  Writer* dump = impl->dump_writer;
+  if (dump != NULL) {
+    print_tape(impl, dump);
+  }
+
+  replay(impl);
+}
+
+/* ---- public API: dump writer ---- */
+
+/* Installs `w` as the tape-dump writer of `t` (NULL clears it).
+ *
+ * A target counts as one of ours only when its func_begin slot points at
+ * w_func_begin; for a NULL or foreign CGTarget the call silently does
+ * nothing. */
+void opt_set_dump_writer(CGTarget* t, Writer* w) {
+  if (t != NULL && t->func_begin == w_func_begin) {
+    impl_of(t)->dump_writer = w;
+  }
+}
+
+/* ---- end-of-TU and destruction ---- */
+
+/* finalize hook: forwards to the wrapped target's finalize when it has
+ * one; the wrapper does no end-of-TU work of its own here. */
+static void w_finalize(CGTarget* t) {
+  CGTarget* inner = impl_of(t)->target;
+  if (inner->finalize != NULL) {
+    inner->finalize(inner);
+  }
+}
+
+/* destroy hook: forwards to the wrapped target's destroy when it has
+ * one; nothing belonging to the wrapper itself is released here. */
+static void w_destroy(CGTarget* t) {
+  CGTarget* inner = impl_of(t)->target;
+  if (inner->destroy != NULL) {
+    inner->destroy(inner);
+  }
+}
+
+/* ---- construction ---- */
+
+/* Creates the recording wrapper around `target` and returns it as a
+ * CGTarget.
+ *
+ * Panics through compiler_panic when `target` is NULL or `level` is
+ * outside [1, 2]. The wrapper state is allocated from the c->tu arena and
+ * zero-initialised; obj, mc and debug are copied from `target`. Every
+ * hook is pointed at its w_* counterpart, except setjmp_ / longjmp_,
+ * which stay NULL when the wrapped target does not provide them. */
+CGTarget* opt_cgtarget_new(Compiler* c, CGTarget* target, int level) {
+  SrcLoc no_loc = {0, 0, 0};
+
+  if (target == NULL) {
+    compiler_panic(c, no_loc, "opt_cgtarget_new: target is NULL");
+  }
+  if (!(level >= 1 && level <= 2)) {
+    compiler_panic(c, no_loc,
+                   "opt_cgtarget_new: level %d out of range [1, 2]", level);
+  }
+
+  OptImpl* impl = arena_new(c->tu, OptImpl);
+  memset(impl, 0, sizeof *impl);
+  impl->c = c;
+  impl->target = target;
+  impl->level = level;
+
+  /* The CGTarget handed back to the caller is embedded in the impl. */
+  CGTarget* wrap = &impl->base;
+  wrap->c = c;
+  wrap->obj = target->obj;
+  wrap->mc = target->mc;
+  wrap->debug = target->debug;
+
+  /* Function boundaries. */
+  wrap->func_begin = w_func_begin;
+  wrap->func_end = w_func_end;
+
+  /* Registers, frame slots and parameters. */
+  wrap->alloc_reg = w_alloc_reg;
+  wrap->free_reg = w_free_reg;
+  wrap->frame_slot = w_frame_slot;
+  wrap->param = w_param;
+  wrap->clobbers = w_clobbers;
+  wrap->spill_reg = w_spill_reg;
+  wrap->reload_reg = w_reload_reg;
+
+  /* Labels and branches. */
+  wrap->label_new = w_label_new;
+  wrap->label_place = w_label_place;
+  wrap->jump = w_jump;
+  wrap->cmp_branch = w_cmp_branch;
+
+  /* Structured scopes. */
+  wrap->scope_begin = w_scope_begin;
+  wrap->scope_else = w_scope_else;
+  wrap->scope_end = w_scope_end;
+  wrap->break_to = w_break_to;
+  wrap->continue_to = w_continue_to;
+
+  /* Data movement. */
+  wrap->load_imm = w_load_imm;
+  wrap->load_const = w_load_const;
+  wrap->copy = w_copy;
+  wrap->load = w_load;
+  wrap->store = w_store;
+  wrap->addr_of = w_addr_of;
+  wrap->tls_addr_of = w_tls_addr_of;
+  wrap->copy_bytes = w_copy_bytes;
+  wrap->set_bytes = w_set_bytes;
+  wrap->bitfield_load = w_bitfield_load;
+  wrap->bitfield_store = w_bitfield_store;
+
+  /* Arithmetic, comparison and conversion. */
+  wrap->binop = w_binop;
+  wrap->unop = w_unop;
+  wrap->cmp = w_cmp;
+  wrap->convert = w_convert;
+
+  /* Calls and returns. */
+  wrap->call = w_call;
+  wrap->ret = w_ret;
+
+  /* alloca and varargs. */
+  wrap->alloca_ = w_alloca_;
+  wrap->va_start_ = w_va_start_;
+  wrap->va_arg_ = w_va_arg_;
+  wrap->va_end_ = w_va_end_;
+  wrap->va_copy_ = w_va_copy_;
+
+  /* setjmp/longjmp are exposed only when the wrapped target has them. */
+  wrap->setjmp_ = target->setjmp_ ? w_setjmp_ : NULL;
+  wrap->longjmp_ = target->longjmp_ ? w_longjmp_ : NULL;
+
+  /* Atomics. */
+  wrap->atomic_load = w_atomic_load;
+  wrap->atomic_store = w_atomic_store;
+  wrap->atomic_rmw = w_atomic_rmw;
+  wrap->atomic_cas = w_atomic_cas;
+  wrap->fence = w_fence;
+
+  /* Intrinsics and inline asm. */
+  wrap->intrinsic = w_intrinsic;
+  wrap->asm_block = w_asm_block;
+
+  /* Source locations and lifetime. */
+  wrap->set_loc = w_set_loc;
+  wrap->finalize = w_finalize;
+  wrap->destroy = w_destroy;
+
+  return wrap;
+}
diff --git a/src/opt/opt.h b/src/opt/opt.h
@@ -77,4 +77,10 @@ void opt_dce(Func*);     /* post-RA DCE */
  * needs. Stamps each emitted insn's SrcLoc onto target via CGTarget.set_loc. */
 void opt_emit(Compiler*, Func*, CGTarget* target);
 
+/* When set, the wrapper writes a textual dump of each function's recorded
+ * tape to `w` on func_end, immediately before replay. Pass `w == NULL` to
+ * disable. The format is line-oriented and stable enough for golden-file
+ * diffs but otherwise unspecified. No-op if `t` is not an opt_cgtarget. */
+void opt_set_dump_writer(CGTarget* t, Writer* w);
+
 #endif
diff --git a/test/cg/harness/cg_runner.c b/test/cg/harness/cg_runner.c
@@ -31,8 +31,15 @@
 #include "debug/debug.h"
 #include "link/link.h"
 #include "obj/obj.h"
+#include "opt/opt.h"
 #include "type/type.h"
 
+/* --opt-level N: wrap the constructed CGTarget with opt_cgtarget_new(level)
+ * before each case runs. 0 (default) drives the backend directly; 1 / 2
+ * exercise the opt pipeline. The corpus is the equivalence oracle — every
+ * case's exit code at level 0 must match levels 1 / 2. */
+static int g_opt_level = 0;
+
 /* ---- env ---- */
 
 static void* h_alloc(CfreeHeap* h, size_t n, size_t a) {
@@ -265,6 +272,9 @@ static int build_case(BuildState* st, const CgCase* cc) {
 
   if (cc->kind != CG_CASE_MC_ONLY) {
     st->target = cgtarget_new(c, st->ob, st->mc);
+    if (g_opt_level > 0) {
+      st->target = opt_cgtarget_new(c, st->target, g_opt_level);
+    }
   } else {
     st->target = NULL;
   }
@@ -328,6 +338,96 @@ static int mode_expected(const char* name) {
   return 0;
 }
 
+/* CfreeWriter that wraps stdout; used by --dump-tape. The stream is not
+ * seekable through this writer: seek is ignored and tell always reports
+ * 0. Write failures are surfaced through sw_error. */
+typedef struct StdoutWriter {
+  CfreeWriter base;
+} StdoutWriter;
+
+/* Writes `n` bytes from `data` to stdout. A failed or short write sets
+ * stdout's error indicator, which sw_error reports afterwards. */
+static void sw_write(CfreeWriter* w, const void* data, size_t n) {
+  (void)w;
+  fwrite(data, 1, n, stdout);
+}
+/* Seeking is not supported on stdout; the request is ignored. */
+static void sw_seek(CfreeWriter* w, uint64_t off) {
+  (void)w;
+  (void)off;
+}
+/* No position is tracked; always reports 0. */
+static uint64_t sw_tell(CfreeWriter* w) {
+  (void)w;
+  return 0;
+}
+/* Returns 1 once any write to stdout has failed, 0 otherwise, instead of
+ * unconditionally claiming success. */
+static int sw_error(CfreeWriter* w) {
+  (void)w;
+  return ferror(stdout) != 0;
+}
+/* stdout is owned by the process, so closing the writer does nothing. */
+static void sw_close(CfreeWriter* w) { (void)w; }
+
+/* Initialiser order follows the original: write, seek, tell, error, close. */
+static StdoutWriter g_stdout_writer = {{sw_write, sw_seek, sw_tell, sw_error,
+                                        sw_close}};
+
+/* --dump-tape NAME — build the case at the current --opt-level (must be
+ * >= 1) and print each function's recorded tape to stdout instead of
+ * just running the equivalence path. Useful for ad-hoc inspection and
+ * golden-file diffs.
+ *
+ * Returns 2 when NAME is unknown, when --opt-level is below 1, or when
+ * the compiler cannot be created; 1 when a compiler panic longjmps back
+ * into this function; 0 otherwise. */
+static int mode_dump_tape(const char* name) {
+  const CgCase* cc = find_case(name);
+  if (!cc) {
+    fprintf(stderr, "cg-runner: unknown case '%s'\n", name);
+    return 2;
+  }
+  if (g_opt_level < 1) {
+    fprintf(stderr, "cg-runner: --dump-tape requires --opt-level >= 1\n");
+    return 2;
+  }
+
+  /* The target description is always the one target_aarch64_linux fills
+   * in; the environment uses the harness-wide heap/diag/execmem. */
+  CfreeTarget target;
+  target_aarch64_linux(&target);
+  CfreeEnv env;
+  memset(&env, 0, sizeof env);
+  env.heap = &g_heap;
+  env.diag = &g_diag;
+  env.execmem = &g_execmem;
+  env.now = -1;
+
+  CfreeCompiler* cc_ = cfree_compiler_new(target, &env);
+  if (!cc_) return 2;
+
+  BuildState st;
+  memset(&st, 0, sizeof st);
+  st.c = (Compiler*)cc_;
+
+  /* Pre-empt build_case so we can install the dump writer before the
+   * case runs through func_begin/func_end. */
+  Compiler* c = st.c;
+  /* Panic landing pad: everything below runs with this jmp_buf armed, so
+   * a panic (including one raised by opt_cgtarget_new for an unsupported
+   * level) ends up here, runs cleanups, frees the compiler and exits 1. */
+  if (setjmp(c->panic)) {
+    compiler_run_cleanups(c);
+    cfree_compiler_free(cc_);
+    return 1;
+  }
+  st.ob = obj_new(c);
+  st.mc = mc_new(c, st.ob);
+  /* Unlike build_case, the target is created and wrapped unconditionally,
+   * without checking the case kind. */
+  st.target = cgtarget_new(c, st.ob, st.mc);
+  st.target = opt_cgtarget_new(c, st.target, g_opt_level);
+  opt_set_dump_writer(st.target, &g_stdout_writer.base);
+
+  Sym text_name = pool_intern_cstr(c->global, ".text");
+  ObjSecId text_sec =
+      obj_section(st.ob, text_name, SEC_TEXT, SF_ALLOC | SF_EXEC, 4);
+
+  /* Hand-built case context; no debug info is attached in this mode. */
+  st.ctx.c = c;
+  st.ctx.ob = st.ob;
+  st.ctx.mc = st.mc;
+  st.ctx.target = st.target;
+  st.ctx.text_sec = text_sec;
+  st.ctx.pool = c->global;
+  st.ctx.debug = NULL;
+  st.mc->set_section(st.mc, text_sec);
+  /* Running the case drives func_begin/func_end on the wrapper, which is
+   * what emits the dump. */
+  cc->build(&st.ctx);
+  cgtarget_finalize(st.target);
+
+  cfree_compiler_free(cc_);
+  return 0;
+}
+
 /* --dwarf-checks NAME — print the W-path directive blob registered for
  * NAME, or nothing if the case has no DWARF checks. The shell harness
  * pipes this into cg_check_dwarf <obj>. */
@@ -502,11 +602,12 @@ static int mode_jit(const char* name) {
 
 static int usage(void) {
   fprintf(stderr,
-          "usage: cg-runner --list\n"
-          "       cg-runner --expected NAME\n"
-          "       cg-runner --dwarf-checks NAME\n"
-          "       cg-runner --emit NAME OUT.o\n"
-          "       cg-runner --jit  NAME\n");
+          "usage: cg-runner [--opt-level N] --list\n"
+          "       cg-runner [--opt-level N] --expected NAME\n"
+          "       cg-runner [--opt-level N] --dwarf-checks NAME\n"
+          "       cg-runner [--opt-level N] --emit NAME OUT.o\n"
+          "       cg-runner [--opt-level N] --jit  NAME\n"
+          "       cg-runner --opt-level N --dump-tape NAME\n");
   return 2;
 }
 
@@ -515,6 +616,12 @@ int main(int argc, char** argv) {
     long ps = sysconf(_SC_PAGESIZE);
     if (ps > 0) g_execmem.page_size = (size_t)ps;
   }
+  /* Optional leading --opt-level N flag. */
+  if (argc >= 3 && !strcmp(argv[1], "--opt-level")) {
+    g_opt_level = atoi(argv[2]);
+    argc -= 2;
+    argv += 2;
+  }
   if (argc < 2) return usage();
   if (!strcmp(argv[1], "--list"))
     return mode_list();
@@ -526,5 +633,7 @@ int main(int argc, char** argv) {
     return mode_emit(argv[2], argv[3]);
   else if (!strcmp(argv[1], "--jit") && argc == 3)
     return mode_jit(argv[2]);
+  else if (!strcmp(argv[1], "--dump-tape") && argc == 3)
+    return mode_dump_tape(argv[2]);
   return usage();
 }
diff --git a/test/cg/run.sh b/test/cg/run.sh
@@ -51,8 +51,15 @@ ALLOW_SKIP="${CFREE_TEST_ALLOW_SKIP:-0}"
 # Filters (env vars or positional args; args win):
 #   $1 / CFREE_TEST_FILTER — substring match against case name
 #   $2 / CFREE_TEST_PATHS  — subset of "DREJ" (default "DREJ")
+#   CFREE_OPT_LEVELS — space-separated opt levels to exercise. Default "0 1"
+#                      so every case is built twice: directly against the
+#                      backend (level 0) and through the opt_cgtarget
+#                      wrapper (level 1). Path W (DWARF) only runs at
+#                      level 0 — opt-level DWARF equivalence is a later
+#                      phase concern.
 FILTER="${1:-${CFREE_TEST_FILTER:-}}"
 PATHS="${2:-${CFREE_TEST_PATHS:-DREJW}}"
+OPT_LEVELS="${CFREE_OPT_LEVELS:-0 1}"
 case "$PATHS" in *D*) RUN_D=1;; *) RUN_D=0;; esac
 case "$PATHS" in *R*) RUN_R=1;; *) RUN_R=0;; esac
 case "$PATHS" in *E*) RUN_E=1;; *) RUN_E=0;; esac
@@ -221,178 +228,195 @@ if [ $have_clang_cross -eq 1 ]; then
     fi
 fi
 
-printf 'Running cases...\n'
+CASES="$($CG_RUNNER --list)"
+
+# Level 0 runs cg-runner bare (AArch64 backend driven directly); level >0
+# passes --opt-level N, which inserts opt_cgtarget. Cases are tagged with
+# /L<N> in the output when level>0 so failures localize to the level.
+for OPT_LEVEL in $OPT_LEVELS; do
+    if [ "$OPT_LEVEL" = "0" ]; then
+        CG_RUN=("$CG_RUNNER")
+        TAG=""
+        WORK_SUB="cg"
+    else
+        CG_RUN=("$CG_RUNNER" "--opt-level" "$OPT_LEVEL")
+        TAG="/L${OPT_LEVEL}"
+        WORK_SUB="cg-L${OPT_LEVEL}"
+    fi
 
-# ---- per-case loop ---------------------------------------------------------
+    printf 'Running cases (opt-level %s)...\n' "$OPT_LEVEL"
 
-CASES="$($CG_RUNNER --list)"
+    # Path E result bookkeeping (per level — flushed at end of this iteration).
+    E_NAMES=()
+    E_WORK=()
+    E_LINK_MS=()
+    E_EXPECTED=()
+
+    for name in $CASES; do
+        [ -n "$FILTER" ] && [[ "$name" != *"$FILTER"* ]] && continue
+        work="$BUILD_DIR/$WORK_SUB/$name"
+        mkdir -p "$work"
 
-# Path E result bookkeeping. We queue exes during the main loop and verify
-# after a single batched podman flush.
-E_NAMES=()
-E_WORK=()
-E_LINK_MS=()
-E_EXPECTED=()
-
-for name in $CASES; do
-    [ -n "$FILTER" ] && [[ "$name" != *"$FILTER"* ]] && continue
-    work="$BUILD_DIR/cg/$name"
-    mkdir -p "$work"
-
-    expected="$($CG_RUNNER --expected "$name" 2>/dev/null)"
-    expected="${expected:-0}"
-    # Exit codes are mod 256 on POSIX; mask the expected the same way so
-    # negative-return cases compare correctly.
-    expected_byte=$(( expected & 0xff ))
-
-    # ---- Path D: in-process JIT (only on aarch64) ------------------------
-    if [ $RUN_D -eq 1 ]; then
-        if [ $is_aarch64 -eq 1 ]; then
-            t0=$(now_ms)
-            "$CG_RUNNER" --jit "$name" >"$work/d.out" 2>"$work/d.err"
-            d_rc=$?
-            dt=$(( $(now_ms) - t0 )); T_D=$(( T_D + dt ))
-            if [ "$d_rc" -eq "$expected_byte" ]; then
-                note_pass "$name/D (${dt}ms)"
+        expected="$("${CG_RUN[@]}" --expected "$name" 2>/dev/null)"
+        expected="${expected:-0}"
+        # Exit codes are mod 256 on POSIX; mask the expected the same way so
+        # negative-return cases compare correctly.
+        expected_byte=$(( expected & 0xff ))
+
+        # ---- Path D: in-process JIT (only on aarch64) ------------------------
+        if [ $RUN_D -eq 1 ]; then
+            if [ $is_aarch64 -eq 1 ]; then
+                t0=$(now_ms)
+                "${CG_RUN[@]}" --jit "$name" >"$work/d.out" 2>"$work/d.err"
+                d_rc=$?
+                dt=$(( $(now_ms) - t0 )); T_D=$(( T_D + dt ))
+                if [ "$d_rc" -eq "$expected_byte" ]; then
+                    note_pass "$name/D${TAG} (${dt}ms)"
+                else
+                    note_fail "$name/D${TAG} (expected $expected_byte got $d_rc, ${dt}ms)"
+                fi
             else
-                note_fail "$name/D (expected $expected_byte got $d_rc, ${dt}ms)"
+                note_skip "$name/D${TAG}" "not on aarch64 host"
             fi
-        else
-            note_skip "$name/D" "not on aarch64 host"
         fi
-    fi
-
-    # ---- emit (needed by R/E/J/W) -----------------------------------------
-    obj="$work/$name.o"
-    if [ $RUN_R -eq 1 ] || [ $RUN_E -eq 1 ] || [ $RUN_J -eq 1 ] \
-            || [ $RUN_W -eq 1 ]; then
-        if ! "$CG_RUNNER" --emit "$name" "$obj" 2>"$work/emit.err"; then
-            note_fail "$name/emit (cg-runner --emit failed; see $work/emit.err)"
-            continue
-        fi
-    fi
 
-    # ---- Path R: ELF roundtrip --------------------------------------------
-    if [ $RUN_R -eq 1 ]; then
-        if [ $have_roundtrip -eq 1 ] && [ $have_readelf -eq 1 ] && [ $have_python3 -eq 1 ]; then
-            t0=$(now_ms)
-            rt="$work/$name.rt.o"
-            r_ok=1; r_msg=""
-            if ! "$ROUNDTRIP_BIN" "$obj" "$rt" 2>"$work/rt.err"; then
-                r_ok=0; r_msg=" (roundtrip failed)"
-            else
-                "$READELF_BIN" -aW "$obj" | python3 "$NORMALIZE" >"$work/golden.norm" 2>/dev/null
-                "$READELF_BIN" -aW "$rt"  | python3 "$NORMALIZE" >"$work/rt.norm"     2>/dev/null
-                diff -u "$work/golden.norm" "$work/rt.norm" >"$work/r.diff" 2>&1 || r_ok=0
+        # ---- emit (needed by R/E/J/W) -----------------------------------------
+        obj="$work/$name.o"
+        if [ $RUN_R -eq 1 ] || [ $RUN_E -eq 1 ] || [ $RUN_J -eq 1 ] \
+                || [ $RUN_W -eq 1 ]; then
+            if ! "${CG_RUN[@]}" --emit "$name" "$obj" 2>"$work/emit.err"; then
+                note_fail "$name/emit${TAG} (cg-runner --emit failed; see $work/emit.err)"
+                continue
             fi
-            dt=$(( $(now_ms) - t0 )); T_R=$(( T_R + dt ))
-            if [ $r_ok -eq 1 ]; then note_pass "$name/R (${dt}ms)"
-            else note_fail "$name/R${r_msg} (${dt}ms)"; fi
-        else
-            note_skip "$name/R" "missing roundtrip/readelf/python3"
         fi
-    fi
 
-    # ---- Path E: link + (batched) qemu/podman ------------------------------
-    # Link now (per case); the run is queued for the post-loop flush.
-    if [ $RUN_E -eq 1 ]; then
-        if [ $have_exe_runner -eq 1 ] && [ $have_clang_cross -eq 1 ] \
-                && [ $have_start_obj -eq 1 ]; then
-            t0=$(now_ms)
-            exe="$work/linked.exe"
-            if ! "$LINK_EXE_RUNNER" -o "$exe" "$obj" "$START_OBJ" \
-                    >"$work/exec_link.out" 2>"$work/exec_link.err"; then
-                dt=$(( $(now_ms) - t0 )); T_E=$(( T_E + dt ))
-                note_fail "$name/E (link failed, ${dt}ms)"
-            elif [ $have_runner -eq 1 ]; then
-                link_dt=$(( $(now_ms) - t0 )); T_E=$(( T_E + link_dt ))
-                E_NAMES+=("$name")
-                E_WORK+=("$work")
-                E_LINK_MS+=("$link_dt")
-                E_EXPECTED+=("$expected_byte")
-                exec_aarch64_queue "$name" "$exe" \
-                    "$work/exec.out" "$work/exec.err" "$work/exec.rc"
+        # ---- Path R: ELF roundtrip --------------------------------------------
+        if [ $RUN_R -eq 1 ]; then
+            if [ $have_roundtrip -eq 1 ] && [ $have_readelf -eq 1 ] && [ $have_python3 -eq 1 ]; then
+                t0=$(now_ms)
+                rt="$work/$name.rt.o"
+                r_ok=1; r_msg=""
+                if ! "$ROUNDTRIP_BIN" "$obj" "$rt" 2>"$work/rt.err"; then
+                    r_ok=0; r_msg=" (roundtrip failed)"
+                else
+                    "$READELF_BIN" -aW "$obj" | python3 "$NORMALIZE" >"$work/golden.norm" 2>/dev/null
+                    "$READELF_BIN" -aW "$rt"  | python3 "$NORMALIZE" >"$work/rt.norm"     2>/dev/null
+                    diff -u "$work/golden.norm" "$work/rt.norm" >"$work/r.diff" 2>&1 || r_ok=0
+                fi
+                dt=$(( $(now_ms) - t0 )); T_R=$(( T_R + dt ))
+                if [ $r_ok -eq 1 ]; then note_pass "$name/R${TAG} (${dt}ms)"
+                else note_fail "$name/R${TAG}${r_msg} (${dt}ms)"; fi
             else
-                note_skip "$name/E" "no qemu/podman"
+                note_skip "$name/R${TAG}" "missing roundtrip/readelf/python3"
             fi
-        else
-            note_skip "$name/E" "no link-exe-runner, aarch64 clang, or start.o"
         fi
-    fi
 
-    # ---- Path J: jit-via-file ---------------------------------------------
-    if [ $RUN_J -eq 1 ]; then
-        if [ $have_jit_runner -eq 1 ]; then
-            t0=$(now_ms)
-            "$JIT_RUNNER" "$obj" >"$work/jit.out" 2>"$work/jit.err"
-            j_rc=$?
-            dt=$(( $(now_ms) - t0 )); T_J=$(( T_J + dt ))
-            if [ "$j_rc" -eq "$expected_byte" ]; then
-                note_pass "$name/J (${dt}ms)"
+        # ---- Path E: link + (batched) qemu/podman ------------------------------
+        # Link now (per case); the run is queued for this level's batched flush.
+        if [ $RUN_E -eq 1 ]; then
+            if [ $have_exe_runner -eq 1 ] && [ $have_clang_cross -eq 1 ] \
+                    && [ $have_start_obj -eq 1 ]; then
+                t0=$(now_ms)
+                exe="$work/linked.exe"
+                if ! "$LINK_EXE_RUNNER" -o "$exe" "$obj" "$START_OBJ" \
+                        >"$work/exec_link.out" 2>"$work/exec_link.err"; then
+                    dt=$(( $(now_ms) - t0 )); T_E=$(( T_E + dt ))
+                    note_fail "$name/E${TAG} (link failed, ${dt}ms)"
+                elif [ $have_runner -eq 1 ]; then
+                    link_dt=$(( $(now_ms) - t0 )); T_E=$(( T_E + link_dt ))
+                    E_NAMES+=("$name")
+                    E_WORK+=("$work")
+                    E_LINK_MS+=("$link_dt")
+                    E_EXPECTED+=("$expected_byte")
+                    # Queue with a level-tagged key so cases at different
+                    # opt levels don't collide in the batched runner.
+                    exec_aarch64_queue "L${OPT_LEVEL}_${name}" "$exe" \
+                        "$work/exec.out" "$work/exec.err" "$work/exec.rc"
+                else
+                    note_skip "$name/E${TAG}" "no qemu/podman"
+                fi
             else
-                note_fail "$name/J (expected $expected_byte got $j_rc, ${dt}ms)"
+                note_skip "$name/E${TAG}" "no link-exe-runner, aarch64 clang, or start.o"
             fi
-        else
-            note_skip "$name/J" "no jit-runner (not aarch64 host)"
         fi
-    fi
 
-    # ---- Path W: DWARF check ----------------------------------------------
-    # Cases that don't register directives produce empty stdout from
-    # --dwarf-checks; we silently skip those (no SKIP entry, since W is
-    # opt-in per case rather than per host).
-    if [ $RUN_W -eq 1 ]; then
-        "$CG_RUNNER" --dwarf-checks "$name" >"$work/w.directives" \
-            2>"$work/w.dc.err"
-        if [ -s "$work/w.directives" ]; then
-            if [ $have_dwarf_check -eq 1 ]; then
+        # ---- Path J: jit-via-file ---------------------------------------------
+        if [ $RUN_J -eq 1 ]; then
+            if [ $have_jit_runner -eq 1 ]; then
                 t0=$(now_ms)
-                "$DWARF_CHECK" "$obj" <"$work/w.directives" \
-                    >"$work/w.out" 2>"$work/w.err"
-                w_rc=$?
-                dt=$(( $(now_ms) - t0 )); T_W=$(( T_W + dt ))
-                if [ "$w_rc" -eq 0 ]; then
-                    note_pass "$name/W (${dt}ms)"
+                "$JIT_RUNNER" "$obj" >"$work/jit.out" 2>"$work/jit.err"
+                j_rc=$?
+                dt=$(( $(now_ms) - t0 )); T_J=$(( T_J + dt ))
+                if [ "$j_rc" -eq "$expected_byte" ]; then
+                    note_pass "$name/J${TAG} (${dt}ms)"
                 else
-                    note_fail "$name/W (see $work/w.out, $work/w.err; ${dt}ms)"
+                    note_fail "$name/J${TAG} (expected $expected_byte got $j_rc, ${dt}ms)"
                 fi
             else
-                note_skip "$name/W" "no cg-check-dwarf"
+                note_skip "$name/J${TAG}" "no jit-runner (not aarch64 host)"
             fi
         fi
-    fi
-done
 
-# ---- batched path-E flush + verification -----------------------------------
-# Run every queued case in a single podman invocation, then iterate the
-# queue to read each exit code and emit PASS/FAIL.
-
-T_E_BATCH=0
-if [ "$(exec_aarch64_queue_size)" -gt 0 ]; then
-    printf 'Running path E (%d cases batched)...\n' "$(exec_aarch64_queue_size)"
-    t0=$(now_ms)
-    exec_aarch64_flush
-    T_E_BATCH=$(( $(now_ms) - t0 )); T_E=$(( T_E + T_E_BATCH ))
-
-    i=0
-    while [ $i -lt ${#E_NAMES[@]} ]; do
-        name="${E_NAMES[$i]}"
-        work="${E_WORK[$i]}"
-        link_dt="${E_LINK_MS[$i]}"
-        expected_byte="${E_EXPECTED[$i]}"
-        if [ ! -f "$work/exec.rc" ]; then
-            note_fail "$name/E (no rc; podman batch did not produce results)"
-        else
-            RUN_RC="$(cat "$work/exec.rc")"
-            if [ "$RUN_RC" -eq "$expected_byte" ]; then
-                note_pass "$name/E (link ${link_dt}ms)"
-            else
-                note_fail "$name/E (expected $expected_byte got $RUN_RC, link ${link_dt}ms)"
+        # ---- Path W: DWARF check ----------------------------------------------
+        # Cases that don't register directives produce empty stdout from
+        # --dwarf-checks; we silently skip those (no SKIP entry, since W is
+        # opt-in per case rather than per host). DWARF / opt-level
+        # equivalence is a Phase 5+ concern, so skip W when level > 0.
+        if [ $RUN_W -eq 1 ] && [ "$OPT_LEVEL" = "0" ]; then
+            "${CG_RUN[@]}" --dwarf-checks "$name" >"$work/w.directives" \
+                2>"$work/w.dc.err"
+            if [ -s "$work/w.directives" ]; then
+                if [ $have_dwarf_check -eq 1 ]; then
+                    t0=$(now_ms)
+                    "$DWARF_CHECK" "$obj" <"$work/w.directives" \
+                        >"$work/w.out" 2>"$work/w.err"
+                    w_rc=$?
+                    dt=$(( $(now_ms) - t0 )); T_W=$(( T_W + dt ))
+                    if [ "$w_rc" -eq 0 ]; then
+                        note_pass "$name/W (${dt}ms)"
+                    else
+                        note_fail "$name/W (see $work/w.out, $work/w.err; ${dt}ms)"
+                    fi
+                else
+                    note_skip "$name/W" "no cg-check-dwarf"
+                fi
             fi
         fi
-        i=$((i+1))
     done
-fi
+
+    # ---- batched path-E flush + verification (per level) -------------------
+    # Run every queued case in a single podman invocation, then iterate the
+    # queue to read each exit code and emit PASS/FAIL.
+    if [ "$(exec_aarch64_queue_size)" -gt 0 ]; then
+        printf 'Running path E%s (%d cases batched)...\n' \
+            "$TAG" "$(exec_aarch64_queue_size)"
+        t0=$(now_ms)
+        exec_aarch64_flush
+        DELTA=$(( $(now_ms) - t0 ))
+        T_E_BATCH=$(( ${T_E_BATCH:-0} + DELTA )); T_E=$(( T_E + DELTA ))
+
+        i=0
+        while [ $i -lt ${#E_NAMES[@]} ]; do
+            name="${E_NAMES[$i]}"
+            work="${E_WORK[$i]}"
+            link_dt="${E_LINK_MS[$i]}"
+            expected_byte="${E_EXPECTED[$i]}"
+            if [ ! -f "$work/exec.rc" ]; then
+                note_fail "$name/E${TAG} (no rc; podman batch did not produce results)"
+            else
+                RUN_RC="$(cat "$work/exec.rc")"
+                if [ "$RUN_RC" -eq "$expected_byte" ]; then
+                    note_pass "$name/E${TAG} (link ${link_dt}ms)"
+                else
+                    note_fail "$name/E${TAG} (expected $expected_byte got $RUN_RC, link ${link_dt}ms)"
+                fi
+            fi
+            i=$((i+1))
+        done
+    fi
+done
+
+T_E_BATCH=${T_E_BATCH:-0}
 
 # ---- summary ---------------------------------------------------------------

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	src/api/stubs.c	\|	10	+---------
A	src/opt/opt.c	\|	1875	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	src/opt/opt.h	\|	6	++++++
M	test/cg/harness/cg_runner.c	\|	119	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
M	test/cg/run.sh	\|	312	+++++++++++++++++++++++++++++++++++++++++++-------------------------------------