kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit cc3abed6052eb7fdcc58fc6ccba94bbd8fc8a5ff
parent 9747c24c5a4aa056f0f2c3d89498be8edb971c47
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Sun, 10 May 2026 05:23:34 -0700

opt: record-replay wrapper + tape printer + constfold (phases 0-2)

Implements doc/OPT.md phases 0-2. opt_cgtarget_new now returns a
CGTarget that records each function as a tape of CGTarget calls,
hands out wrapper-local virtual ids for alloc_reg/frame_slot/
label_new/scope_begin, and replays the tape onto the wrapped target
at func_end (with virtual-id -> target-id translation). A simple integer
constfold peephole rewrites LOAD_IMM+LOAD_IMM+BINOP{IADD,ISUB,IMUL}
to a single LOAD_IMM, chaining transitively. opt_set_dump_writer
exposes a textual tape dump for debugging, wired through cg-runner
--opt-level N --dump-tape NAME. The cg corpus now runs at every
level in CFREE_OPT_LEVELS (default "0 1") so D/R/E/J equivalence is
checked end-to-end; W stays at level 0.

Diffstat:
M src/api/stubs.c | 10 +---------
A src/opt/opt.c | 1875 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/opt/opt.h | 6 ++++++
M test/cg/harness/cg_runner.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
M test/cg/run.sh | 312 +++++++++++++++++++++++++++++++++++++++++++-------------------------------------
5 files changed, 2164 insertions(+), 158 deletions(-)

diff --git a/src/api/stubs.c b/src/api/stubs.c @@ -54,15 +54,7 @@ void parse_asm(Compiler* c, Lexer* l, MCEmitter* m) { * cgtarget_new / cgtarget_finalize / cgtarget_free live in src/arch/<target>.c * (dispatched through src/arch/arch.c). */ -/* ============================================================ - * Optimizer - * ============================================================ */ - -CGTarget* opt_cgtarget_new(Compiler* c, CGTarget* t, int level) { - (void)t; - (void)level; - unimplemented(c, "opt"); -} +/* Optimizer (opt_cgtarget_new) lives in src/opt/opt.c. */ /* Debug info producer lives in src/debug/. */ diff --git a/src/opt/opt.c b/src/opt/opt.c @@ -0,0 +1,1875 @@ +/* opt — CGTarget wrapper that records each function as a tape of + * CGTarget calls, then replays them onto the wrapped target on + * func_end. See doc/OPT.md for the phased plan. + * + * Phase 1 (current): record every emit-side call into a per-function + * tape; alloc_reg / frame_slot / label_new / scope_begin hand out + * wrapper-local virtual ids. On func_end the tape is replayed + * linearly: each entry produces exactly one wrapped target call, + * with virtual ids translated to target-side ids on the fly. This + * preserves doc/DESIGN.md §8's "function-at-a-time" streaming + * guarantee at -O1. + * + * Phase 2 (current): a small, safe peephole pass runs over the tape + * between recording and replay. See try_peephole_constfold. + * + * Phase 3+ (deferred): build CFG and SSA from the tape, run + * intra-procedural passes, lower through machinize → regalloc → + * emit. Until that lands, level 2 is functionally identical to + * level 1 (per-function record + replay). + * + * Methods the wrapper rejects under unbounded virtuals: + * - clobbers / spill_reg / reload_reg are CG -O0 register-pressure + * mechanics. CG never invokes them on real backends in v1, and + * they're meaningless for opt's vreg space — calling them is a + * wiring bug, so we panic loudly. 
+ * - free_reg is documented as a hint and is silently ignored. */ + +#include "opt/opt.h" + +#include <string.h> + +#include "core/arena.h" +#include "core/core.h" + +/* ---- tape op tags ---- */ + +typedef enum { + TOP_FUNC_BEGIN, + TOP_FUNC_END, + TOP_ALLOC_REG, + TOP_FRAME_SLOT, + TOP_PARAM, + TOP_LABEL_NEW, + TOP_LABEL_PLACE, + TOP_JUMP, + TOP_CMP_BRANCH, + TOP_SCOPE_BEGIN, + TOP_SCOPE_ELSE, + TOP_SCOPE_END, + TOP_BREAK_TO, + TOP_CONTINUE_TO, + TOP_LOAD_IMM, + TOP_LOAD_CONST, + TOP_COPY, + TOP_LOAD, + TOP_STORE, + TOP_ADDR_OF, + TOP_TLS_ADDR_OF, + TOP_COPY_BYTES, + TOP_SET_BYTES, + TOP_BITFIELD_LOAD, + TOP_BITFIELD_STORE, + TOP_BINOP, + TOP_UNOP, + TOP_CMP, + TOP_CONVERT, + TOP_CALL, + TOP_RET, + TOP_ALLOCA, + TOP_VA_START, + TOP_VA_ARG, + TOP_VA_END, + TOP_VA_COPY, + TOP_SETJMP, + TOP_LONGJMP, + TOP_ATOMIC_LOAD, + TOP_ATOMIC_STORE, + TOP_ATOMIC_RMW, + TOP_ATOMIC_CAS, + TOP_FENCE, + TOP_INTRINSIC, + TOP_SET_LOC, +} TapeOpKind; + +/* TapeEntry: one recorded CGTarget call. The tagged union is wide; we + * pay arena bytes for clarity. */ +typedef struct TapeEntry { + u8 op; /* TapeOpKind */ + u8 dead; /* set by peepholes; replay skips dead entries */ + u16 padding; + SrcLoc loc; + union { + /* WOP_FUNC_BEGIN: deep-copied descriptor. The caller's CGFuncDesc + * may be stack-allocated, so we copy by value into our arena. + * params[] is also copied; field shapes inside (Type*, ABIArgInfo*, + * incoming pointer) are TU-lifetime and shared. */ + struct { + CGFuncDesc desc; + CGParamDesc* params; /* arena copy of fd.params */ + } func_begin; + + /* WOP_ALLOC_REG: returns a vreg, indexed into reg_map at replay. 
*/ + struct { + RegClass cls; + const Type* ty; + Reg vreg; + } alloc_reg; + + /* WOP_FRAME_SLOT */ + struct { + FrameSlotDesc desc; + FrameSlot vslot; + } frame_slot; + + /* WOP_PARAM */ + struct { + CGParamDesc desc; + } param; + + /* WOP_LABEL_NEW */ + struct { + Label vlabel; + } label_new; + + /* WOP_LABEL_PLACE / WOP_JUMP */ + struct { + Label vlabel; + } label_op; + + /* WOP_CMP_BRANCH */ + struct { + CmpOp op; + Operand a, b; + Label vlabel; + } cmp_branch; + + /* WOP_SCOPE_BEGIN */ + struct { + CGScopeDesc desc; + CGScope vscope; + } scope_begin; + + /* WOP_SCOPE_ELSE / WOP_SCOPE_END / WOP_BREAK_TO / WOP_CONTINUE_TO */ + struct { + CGScope vscope; + } scope_op; + + /* WOP_LOAD_IMM */ + struct { + Operand dst; + i64 imm; + } load_imm; + + /* WOP_LOAD_CONST */ + struct { + Operand dst; + ConstBytes cb; + } load_const; + + /* WOP_COPY / WOP_ADDR_OF / WOP_VA_COPY */ + struct { + Operand dst; + Operand src; + } copy; + + /* WOP_LOAD */ + struct { + Operand dst; + Operand addr; + MemAccess mem; + } load; + + /* WOP_STORE */ + struct { + Operand addr; + Operand src; + MemAccess mem; + } store; + + /* WOP_TLS_ADDR_OF */ + struct { + Operand dst; + ObjSymId sym; + i64 addend; + } tls_addr_of; + + /* WOP_COPY_BYTES / WOP_SET_BYTES */ + struct { + Operand a; + Operand b; + AggregateAccess agg; + } agg; + + /* WOP_BITFIELD_LOAD */ + struct { + Operand dst; + Operand record; + BitFieldAccess bf; + } bitfield_load; + + /* WOP_BITFIELD_STORE */ + struct { + Operand record; + Operand src; + BitFieldAccess bf; + } bitfield_store; + + /* WOP_BINOP */ + struct { + BinOp op; + Operand dst, a, b; + } binop; + + /* WOP_UNOP */ + struct { + UnOp op; + Operand dst, a; + } unop; + + /* WOP_CMP */ + struct { + CmpOp op; + Operand dst, a, b; + } cmp; + + /* WOP_CONVERT */ + struct { + ConvKind kind; + Operand dst, src; + } convert; + + /* WOP_CALL: deep-copied descriptor and inner arrays. 
*/ + struct { + CGCallDesc desc; + CGABIValue* args; /* len = desc.nargs */ + CGABIPart* ret_parts; /* len = desc.ret.nparts; NULL if 0 */ + CGABIPart** arg_parts; /* per-arg parts arrays; entry is NULL if 0 */ + } call; + + /* WOP_RET: present == 1 means there is a CGABIValue; otherwise a + * void return. parts is deep-copied. */ + struct { + u8 present; + CGABIValue val; + CGABIPart* parts; /* len = val.nparts */ + } ret; + + /* WOP_ALLOCA */ + struct { + Operand dst; + Operand size; + u32 align; + } alloca_; + + /* WOP_VA_START / WOP_VA_END */ + struct { + Operand ap; + } va_se; + + /* WOP_VA_ARG */ + struct { + Operand dst; + Operand ap; + const Type* ty; + } va_arg_; + + /* WOP_SETJMP */ + struct { + Operand dst; + Operand buf; + } setjmp_; + + /* WOP_LONGJMP */ + struct { + Operand buf; + Operand val; + } longjmp_; + + /* WOP_ATOMIC_LOAD */ + struct { + Operand dst; + Operand addr; + MemAccess mem; + MemOrder mo; + } atomic_load; + + /* WOP_ATOMIC_STORE */ + struct { + Operand addr; + Operand src; + MemAccess mem; + MemOrder mo; + } atomic_store; + + /* WOP_ATOMIC_RMW */ + struct { + AtomicOp op; + Operand dst; + Operand addr; + Operand val; + MemAccess mem; + MemOrder mo; + } atomic_rmw; + + /* WOP_ATOMIC_CAS */ + struct { + Operand prior; + Operand ok; + Operand addr; + Operand expected; + Operand desired; + MemAccess mem; + MemOrder success; + MemOrder failure; + } atomic_cas; + + /* WOP_FENCE */ + struct { + MemOrder mo; + } fence; + + /* WOP_INTRINSIC */ + struct { + IntrinKind kind; + Operand* dsts; /* deep-copied */ + u32 ndst; + Operand* args; /* deep-copied */ + u32 narg; + } intrinsic; + + /* WOP_SET_LOC */ + struct { + SrcLoc loc; + } set_loc; + } u; +} TapeEntry; + +/* ---- wrapper state ---- */ + +typedef struct OptImpl { + CGTarget base; + CGTarget* target; /* wrapped */ + int level; + Compiler* c; + + /* Tape: per-function, reset on func_begin. Allocated from c->tu so + * the buffer survives panic via compiler_defer cleanups. 
*/ + TapeEntry* tape; + u32 ntape, tape_cap; + + /* Wrapper-local virtual id counters. 1-based; 0 reserved as NONE. + * Reset on each func_begin. */ + Reg next_vreg; + FrameSlot next_vslot; + Label next_vlabel; + CGScope next_vscope; + + /* Replay-time translation tables. Index by virtual id; entry 0 is + * the NONE sentinel and never referenced. Allocated lazily on first + * replay so peak size matches the largest function. */ + Reg* reg_map; + u32 reg_map_cap; + FrameSlot* slot_map; + u32 slot_map_cap; + Label* label_map; + u32 label_map_cap; + CGScope* scope_map; + u32 scope_map_cap; + + SrcLoc pending_loc; /* most recent set_loc; stamped onto each entry */ + + /* If non-NULL, dump the tape to this writer on each func_end (before + * replay). Used by cg-runner --dump-tape and ad-hoc debugging. */ + Writer* dump_writer; +} OptImpl; + +static OptImpl* impl_of(CGTarget* t) { return (OptImpl*)t; } + +static _Noreturn void panic_unsupported(OptImpl* o, const char* what) { + SrcLoc loc = {0, 0, 0}; + compiler_panic(o->c, loc, "opt_cgtarget: %s called under unbounded virtuals", + what); +} + +/* ---- tape append ---- */ + +static TapeEntry* tape_append(OptImpl* o, TapeOpKind op) { + TapeEntry* e; + if (o->ntape == o->tape_cap) { + u32 ncap = o->tape_cap ? 
o->tape_cap * 2u : 64u; + TapeEntry* nb = arena_array(o->c->tu, TapeEntry, ncap); + if (o->tape) memcpy(nb, o->tape, sizeof(TapeEntry) * o->ntape); + o->tape = nb; + o->tape_cap = ncap; + } + e = &o->tape[o->ntape++]; + memset(e, 0, sizeof *e); + e->op = (u8)op; + e->loc = o->pending_loc; + return e; +} + +/* ---- deep-copy helpers ---- */ + +static CGParamDesc* copy_params(Compiler* c, const CGParamDesc* src, u32 n) { + CGParamDesc* dst; + if (!n) return NULL; + dst = arena_array(c->tu, CGParamDesc, n); + memcpy(dst, src, sizeof(CGParamDesc) * n); + return dst; +} + +static CGABIPart* copy_parts(Compiler* c, const CGABIPart* src, u32 n) { + CGABIPart* dst; + if (!n) return NULL; + dst = arena_array(c->tu, CGABIPart, n); + memcpy(dst, src, sizeof(CGABIPart) * n); + return dst; +} + +static Operand* copy_operands(Compiler* c, const Operand* src, u32 n) { + Operand* dst; + if (!n) return NULL; + dst = arena_array(c->tu, Operand, n); + memcpy(dst, src, sizeof(Operand) * n); + return dst; +} + +/* ---- map helpers (replay-time) ---- + * The maps are direct-indexed by the 1-based virtual id; entry 0 is + * the NONE sentinel. */ + +static void map_reg_grow(OptImpl* o, u32 needed) { + u32 ncap; + Reg* nb; + if (needed <= o->reg_map_cap) return; + ncap = o->reg_map_cap ? o->reg_map_cap : 16u; + while (ncap < needed) ncap *= 2u; + nb = arena_array(o->c->tu, Reg, ncap); + if (o->reg_map) memcpy(nb, o->reg_map, sizeof(Reg) * o->reg_map_cap); + /* New slots default to REG_NONE (0xffffffff). */ + for (u32 i = o->reg_map_cap; i < ncap; ++i) nb[i] = REG_NONE; + o->reg_map = nb; + o->reg_map_cap = ncap; +} + +static void map_slot_grow(OptImpl* o, u32 needed) { + u32 ncap; + FrameSlot* nb; + if (needed <= o->slot_map_cap) return; + ncap = o->slot_map_cap ? 
o->slot_map_cap : 16u; + while (ncap < needed) ncap *= 2u; + nb = arena_array(o->c->tu, FrameSlot, ncap); + if (o->slot_map) memcpy(nb, o->slot_map, sizeof(FrameSlot) * o->slot_map_cap); + for (u32 i = o->slot_map_cap; i < ncap; ++i) nb[i] = FRAME_SLOT_NONE; + o->slot_map = nb; + o->slot_map_cap = ncap; +} + +static void map_label_grow(OptImpl* o, u32 needed) { + u32 ncap; + Label* nb; + if (needed <= o->label_map_cap) return; + ncap = o->label_map_cap ? o->label_map_cap : 16u; + while (ncap < needed) ncap *= 2u; + nb = arena_array(o->c->tu, Label, ncap); + if (o->label_map) memcpy(nb, o->label_map, sizeof(Label) * o->label_map_cap); + for (u32 i = o->label_map_cap; i < ncap; ++i) nb[i] = LABEL_NONE; + o->label_map = nb; + o->label_map_cap = ncap; +} + +static void map_scope_grow(OptImpl* o, u32 needed) { + u32 ncap; + CGScope* nb; + if (needed <= o->scope_map_cap) return; + ncap = o->scope_map_cap ? o->scope_map_cap : 8u; + while (ncap < needed) ncap *= 2u; + nb = arena_array(o->c->tu, CGScope, ncap); + if (o->scope_map) memcpy(nb, o->scope_map, sizeof(CGScope) * o->scope_map_cap); + for (u32 i = o->scope_map_cap; i < ncap; ++i) nb[i] = CG_SCOPE_NONE; + o->scope_map = nb; + o->scope_map_cap = ncap; +} + +/* ---- recording: every emit-side method records a tape entry. + * + * Allocator methods (alloc_reg, frame_slot, label_new, scope_begin) + * additionally hand back a wrapper-local virtual id; the underlying + * target is not consulted until replay. */ + +static void w_func_begin(CGTarget* t, const CGFuncDesc* fd) { + OptImpl* o = impl_of(t); + TapeEntry* e; + + /* Reset per-function state. */ + o->tape = NULL; + o->ntape = 0; + o->tape_cap = 0; + o->next_vreg = 1; + o->next_vslot = 1; + o->next_vlabel = 1; + o->next_vscope = 1; + o->pending_loc = (SrcLoc){0, 0, 0}; + /* Reset translation maps; capacities are kept for amortization. 
*/ + for (u32 i = 0; i < o->reg_map_cap; ++i) o->reg_map[i] = REG_NONE; + for (u32 i = 0; i < o->slot_map_cap; ++i) o->slot_map[i] = FRAME_SLOT_NONE; + for (u32 i = 0; i < o->label_map_cap; ++i) o->label_map[i] = LABEL_NONE; + for (u32 i = 0; i < o->scope_map_cap; ++i) o->scope_map[i] = CG_SCOPE_NONE; + + e = tape_append(o, TOP_FUNC_BEGIN); + /* Shallow-copy the descriptor by value, then deep-copy the params + * array — the harness mutates pds[i].slot AFTER func_begin returns, + * so we can't rely on pointer-shallow-copy for that field. The slots + * we record here are wrapper vslots (allocated by w_frame_slot in the + * subsequent param-setup loop); replay translates them. */ + e->u.func_begin.desc = *fd; + e->u.func_begin.params = copy_params(o->c, fd->params, fd->nparams); + e->u.func_begin.desc.params = e->u.func_begin.params; +} + +static void w_func_end(CGTarget* t); + +static Reg w_alloc_reg(CGTarget* t, RegClass cls, const Type* ty) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_ALLOC_REG); + Reg vreg = o->next_vreg++; + e->u.alloc_reg.cls = cls; + e->u.alloc_reg.ty = ty; + e->u.alloc_reg.vreg = vreg; + return vreg; +} + +static void w_free_reg(CGTarget* t, Reg r) { + /* Hint; opt_cgtarget ignores. The wrapper's vregs are unbounded — + * there is no pool to return to. 
*/ + (void)t; + (void)r; +} + +static FrameSlot w_frame_slot(CGTarget* t, const FrameSlotDesc* d) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_FRAME_SLOT); + FrameSlot vslot = o->next_vslot++; + e->u.frame_slot.desc = *d; + e->u.frame_slot.vslot = vslot; + return vslot; +} + +static void w_param(CGTarget* t, const CGParamDesc* d) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_PARAM); + e->u.param.desc = *d; +} + +static const Reg* w_clobbers(CGTarget* t, RegClass cls, u32* nregs) { + (void)cls; + (void)nregs; + panic_unsupported(impl_of(t), "clobbers"); +} +static void w_spill_reg(CGTarget* t, Operand src, FrameSlot s, MemAccess m) { + (void)src; + (void)s; + (void)m; + panic_unsupported(impl_of(t), "spill_reg"); +} +static void w_reload_reg(CGTarget* t, Operand dst, FrameSlot s, MemAccess m) { + (void)dst; + (void)s; + (void)m; + panic_unsupported(impl_of(t), "reload_reg"); +} + +static Label w_label_new(CGTarget* t) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_LABEL_NEW); + Label v = o->next_vlabel++; + e->u.label_new.vlabel = v; + return v; +} + +static void w_label_place(CGTarget* t, Label l) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_LABEL_PLACE); + e->u.label_op.vlabel = l; +} +static void w_jump(CGTarget* t, Label l) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_JUMP); + e->u.label_op.vlabel = l; +} +static void w_cmp_branch(CGTarget* t, CmpOp op, Operand a, Operand b, Label l) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_CMP_BRANCH); + e->u.cmp_branch.op = op; + e->u.cmp_branch.a = a; + e->u.cmp_branch.b = b; + e->u.cmp_branch.vlabel = l; +} + +static CGScope w_scope_begin(CGTarget* t, const CGScopeDesc* d) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_SCOPE_BEGIN); + CGScope v = o->next_vscope++; + e->u.scope_begin.desc = *d; + e->u.scope_begin.vscope = v; + return v; +} +static void w_scope_else(CGTarget* t, CGScope s) 
{ + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_SCOPE_ELSE); + e->u.scope_op.vscope = s; +} +static void w_scope_end(CGTarget* t, CGScope s) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_SCOPE_END); + e->u.scope_op.vscope = s; +} +static void w_break_to(CGTarget* t, CGScope s) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_BREAK_TO); + e->u.scope_op.vscope = s; +} +static void w_continue_to(CGTarget* t, CGScope s) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_CONTINUE_TO); + e->u.scope_op.vscope = s; +} + +static void w_load_imm(CGTarget* t, Operand dst, i64 imm) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_LOAD_IMM); + e->u.load_imm.dst = dst; + e->u.load_imm.imm = imm; +} +static void w_load_const(CGTarget* t, Operand dst, ConstBytes cb) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_LOAD_CONST); + e->u.load_const.dst = dst; + e->u.load_const.cb = cb; +} +static void w_copy(CGTarget* t, Operand dst, Operand src) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_COPY); + e->u.copy.dst = dst; + e->u.copy.src = src; +} +static void w_load(CGTarget* t, Operand dst, Operand addr, MemAccess m) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_LOAD); + e->u.load.dst = dst; + e->u.load.addr = addr; + e->u.load.mem = m; +} +static void w_store(CGTarget* t, Operand addr, Operand src, MemAccess m) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_STORE); + e->u.store.addr = addr; + e->u.store.src = src; + e->u.store.mem = m; +} +static void w_addr_of(CGTarget* t, Operand dst, Operand lv) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_ADDR_OF); + e->u.copy.dst = dst; + e->u.copy.src = lv; +} +static void w_tls_addr_of(CGTarget* t, Operand dst, ObjSymId sym, i64 addend) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_TLS_ADDR_OF); + e->u.tls_addr_of.dst = dst; + e->u.tls_addr_of.sym = 
sym; + e->u.tls_addr_of.addend = addend; +} +static void w_copy_bytes(CGTarget* t, Operand dst, Operand src, + AggregateAccess agg) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_COPY_BYTES); + e->u.agg.a = dst; + e->u.agg.b = src; + e->u.agg.agg = agg; +} +static void w_set_bytes(CGTarget* t, Operand dst, Operand byte, + AggregateAccess agg) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_SET_BYTES); + e->u.agg.a = dst; + e->u.agg.b = byte; + e->u.agg.agg = agg; +} +static void w_bitfield_load(CGTarget* t, Operand dst, Operand record, + BitFieldAccess bf) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_BITFIELD_LOAD); + e->u.bitfield_load.dst = dst; + e->u.bitfield_load.record = record; + e->u.bitfield_load.bf = bf; +} +static void w_bitfield_store(CGTarget* t, Operand record, Operand src, + BitFieldAccess bf) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_BITFIELD_STORE); + e->u.bitfield_store.record = record; + e->u.bitfield_store.src = src; + e->u.bitfield_store.bf = bf; +} + +static void w_binop(CGTarget* t, BinOp op, Operand dst, Operand a, Operand b) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_BINOP); + e->u.binop.op = op; + e->u.binop.dst = dst; + e->u.binop.a = a; + e->u.binop.b = b; +} +static void w_unop(CGTarget* t, UnOp op, Operand dst, Operand a) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_UNOP); + e->u.unop.op = op; + e->u.unop.dst = dst; + e->u.unop.a = a; +} +static void w_cmp(CGTarget* t, CmpOp op, Operand dst, Operand a, Operand b) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_CMP); + e->u.cmp.op = op; + e->u.cmp.dst = dst; + e->u.cmp.a = a; + e->u.cmp.b = b; +} +static void w_convert(CGTarget* t, ConvKind k, Operand dst, Operand src) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_CONVERT); + e->u.convert.kind = k; + e->u.convert.dst = dst; + e->u.convert.src = src; +} + +static void w_call(CGTarget* 
t, const CGCallDesc* d) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_CALL); + CGABIValue* args_copy = NULL; + CGABIPart** arg_parts_copy = NULL; + CGABIPart* ret_parts_copy = NULL; + u32 i; + + /* Deep-copy the argv. Caller-owned d may be on the stack, and + * args[i].parts may be too. */ + if (d->nargs) { + args_copy = arena_array(o->c->tu, CGABIValue, d->nargs); + arg_parts_copy = arena_array(o->c->tu, CGABIPart*, d->nargs); + for (i = 0; i < d->nargs; ++i) { + args_copy[i] = d->args[i]; + arg_parts_copy[i] = + copy_parts(o->c, d->args[i].parts, d->args[i].nparts); + args_copy[i].parts = arg_parts_copy[i]; + } + } + ret_parts_copy = copy_parts(o->c, d->ret.parts, d->ret.nparts); + + e->u.call.desc = *d; + e->u.call.desc.args = args_copy; + e->u.call.desc.ret.parts = ret_parts_copy; + e->u.call.args = args_copy; + e->u.call.arg_parts = arg_parts_copy; + e->u.call.ret_parts = ret_parts_copy; +} + +static void w_ret(CGTarget* t, const CGABIValue* v) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_RET); + if (!v) { + e->u.ret.present = 0; + return; + } + e->u.ret.present = 1; + e->u.ret.val = *v; + e->u.ret.parts = copy_parts(o->c, v->parts, v->nparts); + e->u.ret.val.parts = e->u.ret.parts; +} + +static void w_alloca_(CGTarget* t, Operand dst, Operand size, u32 align) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_ALLOCA); + e->u.alloca_.dst = dst; + e->u.alloca_.size = size; + e->u.alloca_.align = align; +} + +static void w_va_start_(CGTarget* t, Operand ap) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_VA_START); + e->u.va_se.ap = ap; +} +static void w_va_arg_(CGTarget* t, Operand dst, Operand ap, const Type* ty) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_VA_ARG); + e->u.va_arg_.dst = dst; + e->u.va_arg_.ap = ap; + e->u.va_arg_.ty = ty; +} +static void w_va_end_(CGTarget* t, Operand ap) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_VA_END); + 
e->u.va_se.ap = ap; +} +static void w_va_copy_(CGTarget* t, Operand dst, Operand src) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_VA_COPY); + e->u.copy.dst = dst; + e->u.copy.src = src; +} + +static void w_setjmp_(CGTarget* t, Operand dst, Operand buf) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_SETJMP); + e->u.setjmp_.dst = dst; + e->u.setjmp_.buf = buf; +} +static void w_longjmp_(CGTarget* t, Operand buf, Operand val) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_LONGJMP); + e->u.longjmp_.buf = buf; + e->u.longjmp_.val = val; +} + +static void w_atomic_load(CGTarget* t, Operand dst, Operand addr, MemAccess m, + MemOrder mo) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_ATOMIC_LOAD); + e->u.atomic_load.dst = dst; + e->u.atomic_load.addr = addr; + e->u.atomic_load.mem = m; + e->u.atomic_load.mo = mo; +} +static void w_atomic_store(CGTarget* t, Operand addr, Operand src, MemAccess m, + MemOrder mo) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_ATOMIC_STORE); + e->u.atomic_store.addr = addr; + e->u.atomic_store.src = src; + e->u.atomic_store.mem = m; + e->u.atomic_store.mo = mo; +} +static void w_atomic_rmw(CGTarget* t, AtomicOp op, Operand dst, Operand addr, + Operand val, MemAccess m, MemOrder mo) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_ATOMIC_RMW); + e->u.atomic_rmw.op = op; + e->u.atomic_rmw.dst = dst; + e->u.atomic_rmw.addr = addr; + e->u.atomic_rmw.val = val; + e->u.atomic_rmw.mem = m; + e->u.atomic_rmw.mo = mo; +} +static void w_atomic_cas(CGTarget* t, Operand prior, Operand ok, Operand addr, + Operand expected, Operand desired, MemAccess m, + MemOrder s, MemOrder f) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_ATOMIC_CAS); + e->u.atomic_cas.prior = prior; + e->u.atomic_cas.ok = ok; + e->u.atomic_cas.addr = addr; + e->u.atomic_cas.expected = expected; + e->u.atomic_cas.desired = desired; + e->u.atomic_cas.mem = m; + 
e->u.atomic_cas.success = s; + e->u.atomic_cas.failure = f; +} +static void w_fence(CGTarget* t, MemOrder mo) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_FENCE); + e->u.fence.mo = mo; +} + +static void w_intrinsic(CGTarget* t, IntrinKind k, Operand* dsts, u32 nd, + const Operand* args, u32 na) { + OptImpl* o = impl_of(t); + TapeEntry* e = tape_append(o, TOP_INTRINSIC); + e->u.intrinsic.kind = k; + e->u.intrinsic.ndst = nd; + e->u.intrinsic.narg = na; + e->u.intrinsic.dsts = copy_operands(o->c, dsts, nd); + e->u.intrinsic.args = copy_operands(o->c, args, na); +} + +static void w_asm_block(CGTarget* t, const char* tmpl, + const AsmConstraint* outs, u32 nout, Operand* out_ops, + const AsmConstraint* ins, u32 nin, + const Operand* in_ops, const Sym* clobbers, u32 nclob) { + (void)tmpl; + (void)outs; + (void)nout; + (void)out_ops; + (void)ins; + (void)nin; + (void)in_ops; + (void)clobbers; + (void)nclob; + /* Group M (inline asm) is deferred in the corpus; the wrapper does + * not yet support it. 
*/ + panic_unsupported(impl_of(t), "asm_block"); +} + +static void w_set_loc(CGTarget* t, SrcLoc loc) { + OptImpl* o = impl_of(t); + TapeEntry* e; + o->pending_loc = loc; + e = tape_append(o, TOP_SET_LOC); + e->u.set_loc.loc = loc; +} + +/* ---- replay-time translation ---- */ + +static Reg xlat_reg(OptImpl* o, Reg vreg) { + if (vreg == REG_NONE || vreg == 0u) return vreg; + if (vreg >= o->reg_map_cap || o->reg_map[vreg] == REG_NONE) { + SrcLoc loc = {0, 0, 0}; + compiler_panic(o->c, loc, "opt replay: unmapped vreg %u", (unsigned)vreg); + } + return o->reg_map[vreg]; +} + +static FrameSlot xlat_slot(OptImpl* o, FrameSlot vs) { + if (vs == FRAME_SLOT_NONE) return FRAME_SLOT_NONE; + if (vs >= o->slot_map_cap || o->slot_map[vs] == FRAME_SLOT_NONE) { + SrcLoc loc = {0, 0, 0}; + compiler_panic(o->c, loc, "opt replay: unmapped vslot %u", (unsigned)vs); + } + return o->slot_map[vs]; +} + +static Label xlat_label(OptImpl* o, Label vl) { + if (vl == LABEL_NONE) return LABEL_NONE; + if (vl >= o->label_map_cap || o->label_map[vl] == LABEL_NONE) { + SrcLoc loc = {0, 0, 0}; + compiler_panic(o->c, loc, "opt replay: unmapped vlabel %u", (unsigned)vl); + } + return o->label_map[vl]; +} + +static CGScope xlat_scope(OptImpl* o, CGScope vs) { + if (vs == CG_SCOPE_NONE) return CG_SCOPE_NONE; + if (vs >= o->scope_map_cap || o->scope_map[vs] == CG_SCOPE_NONE) { + SrcLoc loc = {0, 0, 0}; + compiler_panic(o->c, loc, "opt replay: unmapped vscope %u", (unsigned)vs); + } + return o->scope_map[vs]; +} + +static Operand xlat_op(OptImpl* o, Operand op) { + switch ((OpKind)op.kind) { + case OPK_IMM: + case OPK_GLOBAL: + return op; + case OPK_REG: + op.v.reg = xlat_reg(o, op.v.reg); + return op; + case OPK_LOCAL: + op.v.frame_slot = xlat_slot(o, op.v.frame_slot); + return op; + case OPK_INDIRECT: + op.v.ind.base = xlat_reg(o, op.v.ind.base); + return op; + } + /* unreachable */ + return op; +} + +static CGABIValue xlat_abivalue(OptImpl* o, const CGABIValue* in, + CGABIPart* parts_out) { + 
CGABIValue out = *in; + out.storage = xlat_op(o, in->storage); + if (in->nparts && parts_out) { + for (u32 i = 0; i < in->nparts; ++i) { + parts_out[i] = in->parts[i]; + parts_out[i].op = xlat_op(o, in->parts[i].op); + } + out.parts = parts_out; + } + return out; +} + +/* ---- replay ---- */ + +static void replay(OptImpl* o) { + CGTarget* w = o->target; + + /* Pre-size the maps to the high-water mark for this function. */ + if (o->next_vreg > 1) map_reg_grow(o, o->next_vreg); + if (o->next_vslot > 1) map_slot_grow(o, o->next_vslot); + if (o->next_vlabel > 1) map_label_grow(o, o->next_vlabel); + if (o->next_vscope > 1) map_scope_grow(o, o->next_vscope); + + for (u32 i = 0; i < o->ntape; ++i) { + TapeEntry* e = &o->tape[i]; + if (e->dead) continue; + switch ((TapeOpKind)e->op) { + case TOP_FUNC_BEGIN: { + /* Build a fresh CGFuncDesc with translated param slots. */ + CGFuncDesc fd = e->u.func_begin.desc; + if (fd.nparams) { + CGParamDesc* params = arena_array(o->c->tu, CGParamDesc, fd.nparams); + for (u32 k = 0; k < fd.nparams; ++k) { + params[k] = e->u.func_begin.params[k]; + params[k].slot = xlat_slot(o, e->u.func_begin.params[k].slot); + } + fd.params = params; + } + w->func_begin(w, &fd); + break; + } + case TOP_FUNC_END: + w->func_end(w); + break; + case TOP_ALLOC_REG: { + Reg r = + w->alloc_reg(w, e->u.alloc_reg.cls, e->u.alloc_reg.ty); + Reg v = e->u.alloc_reg.vreg; + if (v >= o->reg_map_cap) map_reg_grow(o, v + 1); + o->reg_map[v] = r; + break; + } + case TOP_FRAME_SLOT: { + FrameSlot s = w->frame_slot(w, &e->u.frame_slot.desc); + FrameSlot v = e->u.frame_slot.vslot; + if (v >= o->slot_map_cap) map_slot_grow(o, v + 1); + o->slot_map[v] = s; + break; + } + case TOP_PARAM: { + CGParamDesc d = e->u.param.desc; + d.slot = xlat_slot(o, d.slot); + w->param(w, &d); + break; + } + case TOP_LABEL_NEW: { + Label l = w->label_new(w); + Label v = e->u.label_new.vlabel; + if (v >= o->label_map_cap) map_label_grow(o, v + 1); + o->label_map[v] = l; + break; + } + case 
TOP_LABEL_PLACE: + w->label_place(w, xlat_label(o, e->u.label_op.vlabel)); + break; + case TOP_JUMP: + w->jump(w, xlat_label(o, e->u.label_op.vlabel)); + break; + case TOP_CMP_BRANCH: + w->cmp_branch(w, e->u.cmp_branch.op, xlat_op(o, e->u.cmp_branch.a), + xlat_op(o, e->u.cmp_branch.b), + xlat_label(o, e->u.cmp_branch.vlabel)); + break; + case TOP_SCOPE_BEGIN: { + CGScopeDesc d = e->u.scope_begin.desc; + d.cond = xlat_op(o, d.cond); + d.break_label = xlat_label(o, d.break_label); + d.continue_label = xlat_label(o, d.continue_label); + CGScope s = w->scope_begin(w, &d); + CGScope v = e->u.scope_begin.vscope; + if (v >= o->scope_map_cap) map_scope_grow(o, v + 1); + o->scope_map[v] = s; + break; + } + case TOP_SCOPE_ELSE: + w->scope_else(w, xlat_scope(o, e->u.scope_op.vscope)); + break; + case TOP_SCOPE_END: + w->scope_end(w, xlat_scope(o, e->u.scope_op.vscope)); + break; + case TOP_BREAK_TO: + w->break_to(w, xlat_scope(o, e->u.scope_op.vscope)); + break; + case TOP_CONTINUE_TO: + w->continue_to(w, xlat_scope(o, e->u.scope_op.vscope)); + break; + case TOP_LOAD_IMM: + w->load_imm(w, xlat_op(o, e->u.load_imm.dst), e->u.load_imm.imm); + break; + case TOP_LOAD_CONST: + w->load_const(w, xlat_op(o, e->u.load_const.dst), e->u.load_const.cb); + break; + case TOP_COPY: + w->copy(w, xlat_op(o, e->u.copy.dst), xlat_op(o, e->u.copy.src)); + break; + case TOP_LOAD: + w->load(w, xlat_op(o, e->u.load.dst), xlat_op(o, e->u.load.addr), + e->u.load.mem); + break; + case TOP_STORE: + w->store(w, xlat_op(o, e->u.store.addr), xlat_op(o, e->u.store.src), + e->u.store.mem); + break; + case TOP_ADDR_OF: + w->addr_of(w, xlat_op(o, e->u.copy.dst), xlat_op(o, e->u.copy.src)); + break; + case TOP_TLS_ADDR_OF: + w->tls_addr_of(w, xlat_op(o, e->u.tls_addr_of.dst), + e->u.tls_addr_of.sym, e->u.tls_addr_of.addend); + break; + case TOP_COPY_BYTES: + w->copy_bytes(w, xlat_op(o, e->u.agg.a), xlat_op(o, e->u.agg.b), + e->u.agg.agg); + break; + case TOP_SET_BYTES: + w->set_bytes(w, xlat_op(o, e->u.agg.a), 
xlat_op(o, e->u.agg.b), + e->u.agg.agg); + break; + case TOP_BITFIELD_LOAD: + w->bitfield_load(w, xlat_op(o, e->u.bitfield_load.dst), + xlat_op(o, e->u.bitfield_load.record), + e->u.bitfield_load.bf); + break; + case TOP_BITFIELD_STORE: + w->bitfield_store(w, xlat_op(o, e->u.bitfield_store.record), + xlat_op(o, e->u.bitfield_store.src), + e->u.bitfield_store.bf); + break; + case TOP_BINOP: + w->binop(w, e->u.binop.op, xlat_op(o, e->u.binop.dst), + xlat_op(o, e->u.binop.a), xlat_op(o, e->u.binop.b)); + break; + case TOP_UNOP: + w->unop(w, e->u.unop.op, xlat_op(o, e->u.unop.dst), + xlat_op(o, e->u.unop.a)); + break; + case TOP_CMP: + w->cmp(w, e->u.cmp.op, xlat_op(o, e->u.cmp.dst), + xlat_op(o, e->u.cmp.a), xlat_op(o, e->u.cmp.b)); + break; + case TOP_CONVERT: + w->convert(w, e->u.convert.kind, xlat_op(o, e->u.convert.dst), + xlat_op(o, e->u.convert.src)); + break; + case TOP_CALL: { + CGCallDesc cd = e->u.call.desc; + cd.callee = xlat_op(o, cd.callee); + CGABIValue* args = NULL; + if (cd.nargs) { + args = arena_array(o->c->tu, CGABIValue, cd.nargs); + for (u32 k = 0; k < cd.nargs; ++k) { + CGABIPart* parts = + e->u.call.args[k].nparts + ? arena_array(o->c->tu, CGABIPart, + e->u.call.args[k].nparts) + : NULL; + args[k] = xlat_abivalue(o, &e->u.call.args[k], parts); + } + cd.args = args; + } else { + cd.args = NULL; + } + CGABIPart* ret_parts = + cd.ret.nparts + ? arena_array(o->c->tu, CGABIPart, cd.ret.nparts) + : NULL; + cd.ret = xlat_abivalue(o, &e->u.call.desc.ret, ret_parts); + w->call(w, &cd); + break; + } + case TOP_RET: { + if (!e->u.ret.present) { + w->ret(w, NULL); + break; + } + CGABIPart* parts = + e->u.ret.val.nparts + ? 
arena_array(o->c->tu, CGABIPart, e->u.ret.val.nparts) + : NULL; + CGABIValue v = xlat_abivalue(o, &e->u.ret.val, parts); + w->ret(w, &v); + break; + } + case TOP_ALLOCA: + w->alloca_(w, xlat_op(o, e->u.alloca_.dst), + xlat_op(o, e->u.alloca_.size), e->u.alloca_.align); + break; + case TOP_VA_START: + w->va_start_(w, xlat_op(o, e->u.va_se.ap)); + break; + case TOP_VA_ARG: + w->va_arg_(w, xlat_op(o, e->u.va_arg_.dst), + xlat_op(o, e->u.va_arg_.ap), e->u.va_arg_.ty); + break; + case TOP_VA_END: + w->va_end_(w, xlat_op(o, e->u.va_se.ap)); + break; + case TOP_VA_COPY: + w->va_copy_(w, xlat_op(o, e->u.copy.dst), xlat_op(o, e->u.copy.src)); + break; + case TOP_SETJMP: + w->setjmp_(w, xlat_op(o, e->u.setjmp_.dst), + xlat_op(o, e->u.setjmp_.buf)); + break; + case TOP_LONGJMP: + w->longjmp_(w, xlat_op(o, e->u.longjmp_.buf), + xlat_op(o, e->u.longjmp_.val)); + break; + case TOP_ATOMIC_LOAD: + w->atomic_load(w, xlat_op(o, e->u.atomic_load.dst), + xlat_op(o, e->u.atomic_load.addr), + e->u.atomic_load.mem, e->u.atomic_load.mo); + break; + case TOP_ATOMIC_STORE: + w->atomic_store(w, xlat_op(o, e->u.atomic_store.addr), + xlat_op(o, e->u.atomic_store.src), + e->u.atomic_store.mem, e->u.atomic_store.mo); + break; + case TOP_ATOMIC_RMW: + w->atomic_rmw(w, e->u.atomic_rmw.op, xlat_op(o, e->u.atomic_rmw.dst), + xlat_op(o, e->u.atomic_rmw.addr), + xlat_op(o, e->u.atomic_rmw.val), e->u.atomic_rmw.mem, + e->u.atomic_rmw.mo); + break; + case TOP_ATOMIC_CAS: + w->atomic_cas(w, xlat_op(o, e->u.atomic_cas.prior), + xlat_op(o, e->u.atomic_cas.ok), + xlat_op(o, e->u.atomic_cas.addr), + xlat_op(o, e->u.atomic_cas.expected), + xlat_op(o, e->u.atomic_cas.desired), + e->u.atomic_cas.mem, e->u.atomic_cas.success, + e->u.atomic_cas.failure); + break; + case TOP_FENCE: + w->fence(w, e->u.fence.mo); + break; + case TOP_INTRINSIC: { + Operand* dsts = NULL; + Operand* args = NULL; + if (e->u.intrinsic.ndst) { + dsts = arena_array(o->c->tu, Operand, e->u.intrinsic.ndst); + for (u32 k = 0; k < 
e->u.intrinsic.ndst; ++k) { + dsts[k] = xlat_op(o, e->u.intrinsic.dsts[k]); + } + } + if (e->u.intrinsic.narg) { + args = arena_array(o->c->tu, Operand, e->u.intrinsic.narg); + for (u32 k = 0; k < e->u.intrinsic.narg; ++k) { + args[k] = xlat_op(o, e->u.intrinsic.args[k]); + } + } + w->intrinsic(w, e->u.intrinsic.kind, dsts, e->u.intrinsic.ndst, args, + e->u.intrinsic.narg); + break; + } + case TOP_SET_LOC: + w->set_loc(w, e->u.set_loc.loc); + break; + } + } +} + +/* ---- printer ---- */ + +static void wstr(Writer* w, const char* s) { + size_t n = 0; + while (s[n]) ++n; + if (n) w->write(w, s, n); +} + +/* Minimal i64 → decimal formatter. Writes into a 32-byte buffer (enough + * for INT64_MIN). Returns nothing; the caller hands the buffer to wstr. */ +static void fmt_i64(i64 v, char* out) { + char tmp[32]; + u32 n = 0; + u64 u; + int neg = 0; + if (v < 0) { + neg = 1; + u = (u64)(-(v + 1)) + 1u; /* avoid UB for INT64_MIN */ + } else { + u = (u64)v; + } + do { + tmp[n++] = (char)('0' + (u % 10u)); + u /= 10u; + } while (u); + if (neg) tmp[n++] = '-'; + /* reverse */ + for (u32 i = 0; i < n; ++i) out[i] = tmp[n - 1 - i]; + out[n] = 0; +} + +static void wint(Writer* w, i64 v) { + char buf[32]; + fmt_i64(v, buf); + wstr(w, buf); +} + +static const char* binop_name(BinOp op) { + switch (op) { + case BO_IADD: return "iadd"; + case BO_ISUB: return "isub"; + case BO_IMUL: return "imul"; + case BO_SDIV: return "sdiv"; + case BO_UDIV: return "udiv"; + case BO_SREM: return "srem"; + case BO_UREM: return "urem"; + case BO_FADD: return "fadd"; + case BO_FSUB: return "fsub"; + case BO_FMUL: return "fmul"; + case BO_FDIV: return "fdiv"; + case BO_AND: return "and"; + case BO_OR: return "or"; + case BO_XOR: return "xor"; + case BO_SHL: return "shl"; + case BO_SHR_S: return "shr_s"; + case BO_SHR_U: return "shr_u"; + } + return "?binop"; +} + +static const char* unop_name(UnOp op) { + switch (op) { + case UO_NEG: return "neg"; + case UO_NOT: return "not"; + case UO_BNOT: return 
"bnot"; + } + return "?unop"; +} + +static const char* cmp_name(CmpOp op) { + switch (op) { + case CMP_EQ: return "eq"; + case CMP_NE: return "ne"; + case CMP_LT_S: return "lt_s"; + case CMP_LE_S: return "le_s"; + case CMP_GT_S: return "gt_s"; + case CMP_GE_S: return "ge_s"; + case CMP_LT_U: return "lt_u"; + case CMP_LE_U: return "le_u"; + case CMP_GT_U: return "gt_u"; + case CMP_GE_U: return "ge_u"; + case CMP_LT_F: return "lt_f"; + case CMP_LE_F: return "le_f"; + case CMP_GT_F: return "gt_f"; + case CMP_GE_F: return "ge_f"; + } + return "?cmp"; +} + +static void print_operand(Writer* w, const Operand* op) { + switch ((OpKind)op->kind) { + case OPK_IMM: + wstr(w, "imm:"); + wint(w, op->v.imm); + return; + case OPK_REG: + wstr(w, "v"); + wint(w, (i64)op->v.reg); + return; + case OPK_LOCAL: + wstr(w, "fs"); + wint(w, (i64)op->v.frame_slot); + return; + case OPK_GLOBAL: + wstr(w, "sym"); + wint(w, (i64)op->v.global.sym); + if (op->v.global.addend) { + wstr(w, "+"); + wint(w, op->v.global.addend); + } + return; + case OPK_INDIRECT: + wstr(w, "[v"); + wint(w, (i64)op->v.ind.base); + if (op->v.ind.ofs) { + wstr(w, "+"); + wint(w, op->v.ind.ofs); + } + wstr(w, "]"); + return; + } + wstr(w, "?op"); +} + +static void print_tape(OptImpl* o, Writer* w) { + for (u32 i = 0; i < o->ntape; ++i) { + TapeEntry* e = &o->tape[i]; + if (e->dead) { + wstr(w, " ; dead\n"); + continue; + } + wstr(w, " "); + switch ((TapeOpKind)e->op) { + case TOP_FUNC_BEGIN: + wstr(w, "func_begin sym="); + wint(w, (i64)e->u.func_begin.desc.sym); + wstr(w, " nparams="); + wint(w, (i64)e->u.func_begin.desc.nparams); + break; + case TOP_FUNC_END: + wstr(w, "func_end"); + break; + case TOP_ALLOC_REG: + wstr(w, "alloc_reg v"); + wint(w, (i64)e->u.alloc_reg.vreg); + wstr(w, " cls="); + wint(w, (i64)e->u.alloc_reg.cls); + break; + case TOP_FRAME_SLOT: + wstr(w, "frame_slot fs"); + wint(w, (i64)e->u.frame_slot.vslot); + wstr(w, " size="); + wint(w, (i64)e->u.frame_slot.desc.size); + wstr(w, " kind="); + wint(w, 
(i64)e->u.frame_slot.desc.kind); + break; + case TOP_PARAM: + wstr(w, "param idx="); + wint(w, (i64)e->u.param.desc.index); + wstr(w, " fs="); + wint(w, (i64)e->u.param.desc.slot); + break; + case TOP_LABEL_NEW: + wstr(w, "label_new L"); + wint(w, (i64)e->u.label_new.vlabel); + break; + case TOP_LABEL_PLACE: + wstr(w, "label_place L"); + wint(w, (i64)e->u.label_op.vlabel); + break; + case TOP_JUMP: + wstr(w, "jump L"); + wint(w, (i64)e->u.label_op.vlabel); + break; + case TOP_CMP_BRANCH: + wstr(w, "cmp_branch "); + wstr(w, cmp_name(e->u.cmp_branch.op)); + wstr(w, " "); + print_operand(w, &e->u.cmp_branch.a); + wstr(w, ", "); + print_operand(w, &e->u.cmp_branch.b); + wstr(w, " -> L"); + wint(w, (i64)e->u.cmp_branch.vlabel); + break; + case TOP_SCOPE_BEGIN: + wstr(w, "scope_begin S"); + wint(w, (i64)e->u.scope_begin.vscope); + wstr(w, " kind="); + wint(w, (i64)e->u.scope_begin.desc.kind); + break; + case TOP_SCOPE_ELSE: + wstr(w, "scope_else S"); + wint(w, (i64)e->u.scope_op.vscope); + break; + case TOP_SCOPE_END: + wstr(w, "scope_end S"); + wint(w, (i64)e->u.scope_op.vscope); + break; + case TOP_BREAK_TO: + wstr(w, "break_to S"); + wint(w, (i64)e->u.scope_op.vscope); + break; + case TOP_CONTINUE_TO: + wstr(w, "continue_to S"); + wint(w, (i64)e->u.scope_op.vscope); + break; + case TOP_LOAD_IMM: + wstr(w, "load_imm "); + print_operand(w, &e->u.load_imm.dst); + wstr(w, ", "); + wint(w, e->u.load_imm.imm); + break; + case TOP_LOAD_CONST: + wstr(w, "load_const "); + print_operand(w, &e->u.load_const.dst); + wstr(w, ", <bytes:"); + wint(w, (i64)e->u.load_const.cb.size); + wstr(w, ">"); + break; + case TOP_COPY: + wstr(w, "copy "); + print_operand(w, &e->u.copy.dst); + wstr(w, ", "); + print_operand(w, &e->u.copy.src); + break; + case TOP_LOAD: + wstr(w, "load "); + print_operand(w, &e->u.load.dst); + wstr(w, ", "); + print_operand(w, &e->u.load.addr); + break; + case TOP_STORE: + wstr(w, "store "); + print_operand(w, &e->u.store.addr); + wstr(w, ", "); + print_operand(w, 
&e->u.store.src); + break; + case TOP_ADDR_OF: + wstr(w, "addr_of "); + print_operand(w, &e->u.copy.dst); + wstr(w, ", "); + print_operand(w, &e->u.copy.src); + break; + case TOP_TLS_ADDR_OF: + wstr(w, "tls_addr_of "); + print_operand(w, &e->u.tls_addr_of.dst); + wstr(w, ", sym"); + wint(w, (i64)e->u.tls_addr_of.sym); + break; + case TOP_COPY_BYTES: + wstr(w, "copy_bytes "); + print_operand(w, &e->u.agg.a); + wstr(w, ", "); + print_operand(w, &e->u.agg.b); + wstr(w, " size="); + wint(w, (i64)e->u.agg.agg.size); + break; + case TOP_SET_BYTES: + wstr(w, "set_bytes "); + print_operand(w, &e->u.agg.a); + wstr(w, ", "); + print_operand(w, &e->u.agg.b); + wstr(w, " size="); + wint(w, (i64)e->u.agg.agg.size); + break; + case TOP_BITFIELD_LOAD: + wstr(w, "bitfield_load "); + print_operand(w, &e->u.bitfield_load.dst); + wstr(w, ", "); + print_operand(w, &e->u.bitfield_load.record); + break; + case TOP_BITFIELD_STORE: + wstr(w, "bitfield_store "); + print_operand(w, &e->u.bitfield_store.record); + wstr(w, ", "); + print_operand(w, &e->u.bitfield_store.src); + break; + case TOP_BINOP: + wstr(w, binop_name(e->u.binop.op)); + wstr(w, " "); + print_operand(w, &e->u.binop.dst); + wstr(w, ", "); + print_operand(w, &e->u.binop.a); + wstr(w, ", "); + print_operand(w, &e->u.binop.b); + break; + case TOP_UNOP: + wstr(w, unop_name(e->u.unop.op)); + wstr(w, " "); + print_operand(w, &e->u.unop.dst); + wstr(w, ", "); + print_operand(w, &e->u.unop.a); + break; + case TOP_CMP: + wstr(w, "cmp."); + wstr(w, cmp_name(e->u.cmp.op)); + wstr(w, " "); + print_operand(w, &e->u.cmp.dst); + wstr(w, ", "); + print_operand(w, &e->u.cmp.a); + wstr(w, ", "); + print_operand(w, &e->u.cmp.b); + break; + case TOP_CONVERT: + wstr(w, "convert "); + print_operand(w, &e->u.convert.dst); + wstr(w, ", "); + print_operand(w, &e->u.convert.src); + wstr(w, " kind="); + wint(w, (i64)e->u.convert.kind); + break; + case TOP_CALL: + wstr(w, "call "); + print_operand(w, &e->u.call.desc.callee); + wstr(w, " nargs="); + 
wint(w, (i64)e->u.call.desc.nargs); + break; + case TOP_RET: + wstr(w, "ret"); + if (e->u.ret.present) { + wstr(w, " "); + print_operand(w, &e->u.ret.val.storage); + } + break; + case TOP_ALLOCA: + wstr(w, "alloca "); + print_operand(w, &e->u.alloca_.dst); + wstr(w, ", "); + print_operand(w, &e->u.alloca_.size); + break; + case TOP_VA_START: + wstr(w, "va_start "); + print_operand(w, &e->u.va_se.ap); + break; + case TOP_VA_ARG: + wstr(w, "va_arg "); + print_operand(w, &e->u.va_arg_.dst); + wstr(w, ", "); + print_operand(w, &e->u.va_arg_.ap); + break; + case TOP_VA_END: + wstr(w, "va_end "); + print_operand(w, &e->u.va_se.ap); + break; + case TOP_VA_COPY: + wstr(w, "va_copy "); + print_operand(w, &e->u.copy.dst); + wstr(w, ", "); + print_operand(w, &e->u.copy.src); + break; + case TOP_SETJMP: + wstr(w, "setjmp "); + print_operand(w, &e->u.setjmp_.dst); + wstr(w, ", "); + print_operand(w, &e->u.setjmp_.buf); + break; + case TOP_LONGJMP: + wstr(w, "longjmp "); + print_operand(w, &e->u.longjmp_.buf); + wstr(w, ", "); + print_operand(w, &e->u.longjmp_.val); + break; + case TOP_ATOMIC_LOAD: + wstr(w, "atomic_load "); + print_operand(w, &e->u.atomic_load.dst); + wstr(w, ", "); + print_operand(w, &e->u.atomic_load.addr); + break; + case TOP_ATOMIC_STORE: + wstr(w, "atomic_store "); + print_operand(w, &e->u.atomic_store.addr); + wstr(w, ", "); + print_operand(w, &e->u.atomic_store.src); + break; + case TOP_ATOMIC_RMW: + wstr(w, "atomic_rmw op="); + wint(w, (i64)e->u.atomic_rmw.op); + wstr(w, " "); + print_operand(w, &e->u.atomic_rmw.dst); + wstr(w, ", "); + print_operand(w, &e->u.atomic_rmw.addr); + wstr(w, ", "); + print_operand(w, &e->u.atomic_rmw.val); + break; + case TOP_ATOMIC_CAS: + wstr(w, "atomic_cas prior="); + print_operand(w, &e->u.atomic_cas.prior); + wstr(w, " ok="); + print_operand(w, &e->u.atomic_cas.ok); + wstr(w, " addr="); + print_operand(w, &e->u.atomic_cas.addr); + break; + case TOP_FENCE: + wstr(w, "fence mo="); + wint(w, (i64)e->u.fence.mo); + break; + 
case TOP_INTRINSIC: + wstr(w, "intrinsic kind="); + wint(w, (i64)e->u.intrinsic.kind); + wstr(w, " ndst="); + wint(w, (i64)e->u.intrinsic.ndst); + wstr(w, " narg="); + wint(w, (i64)e->u.intrinsic.narg); + break; + case TOP_SET_LOC: + wstr(w, "set_loc "); + wint(w, (i64)e->u.set_loc.loc.line); + wstr(w, ":"); + wint(w, (i64)e->u.set_loc.loc.col); + break; + } + wstr(w, "\n"); + } +} + +/* ---- Phase 2 peephole: integer constant folding ---- + * + * Pattern: LOAD_IMM(V_a, k_a); LOAD_IMM(V_b, k_b); BINOP(op, V_d, V_a, V_b) + * with op ∈ {IADD, ISUB, IMUL}. + * After: the BINOP is rewritten to LOAD_IMM(V_d, k_a OP k_b). + * + * Both operands must be OPK_REG referencing wrapper vregs whose only + * recorded definition was a LOAD_IMM. The intermediate LOAD_IMMs are + * left in place — they may have other uses, and DCE is a Phase 3 + * concern. + * + * Folding is done in 64-bit signed arithmetic and truncated by the + * target's load_imm based on the destination type. This matches C11 + * §6.5/3 ("two's-complement wraparound at the abstract machine level + * for signed and unsigned integer types alike" per cfree's no-UB + * stance — see doc/DESIGN.md §9). 
*/ + +typedef struct ImmInfo { + i64 val; + u8 known; +} ImmInfo; + +static void peephole_constfold(OptImpl* o) { + ImmInfo* imm; + u32 cap; + + if (o->next_vreg <= 1) return; + cap = o->next_vreg; + imm = arena_zarray(o->c->tu, ImmInfo, cap); + + for (u32 i = 0; i < o->ntape; ++i) { + TapeEntry* e = &o->tape[i]; + if (e->dead) continue; + switch ((TapeOpKind)e->op) { + case TOP_LOAD_IMM: + if (e->u.load_imm.dst.kind == OPK_REG) { + Reg r = e->u.load_imm.dst.v.reg; + if (r < cap) { + imm[r].val = e->u.load_imm.imm; + imm[r].known = 1; + } + } + break; + case TOP_BINOP: { + Operand a = e->u.binop.a; + Operand b = e->u.binop.b; + BinOp op = e->u.binop.op; + if (a.kind != OPK_REG || b.kind != OPK_REG) break; + if (a.v.reg >= cap || b.v.reg >= cap) break; + if (!imm[a.v.reg].known || !imm[b.v.reg].known) break; + if (op != BO_IADD && op != BO_ISUB && op != BO_IMUL) break; + + i64 av = imm[a.v.reg].val; + i64 bv = imm[b.v.reg].val; + u64 folded; + /* Compute in u64 to make wraparound deterministic, then cast + * back. cfree's no-UB stance forbids signed-overflow-is-UB + * exploitation (doc/DESIGN.md §9), so this is the right shape. 
*/ + switch (op) { + case BO_IADD: folded = (u64)av + (u64)bv; break; + case BO_ISUB: folded = (u64)av - (u64)bv; break; + case BO_IMUL: folded = (u64)av * (u64)bv; break; + default: continue; + } + + Operand dst = e->u.binop.dst; + memset(&e->u, 0, sizeof e->u); + e->op = (u8)TOP_LOAD_IMM; + e->u.load_imm.dst = dst; + e->u.load_imm.imm = (i64)folded; + if (dst.kind == OPK_REG && dst.v.reg < cap) { + imm[dst.v.reg].val = (i64)folded; + imm[dst.v.reg].known = 1; + } + break; + } + default: + break; + } + } +} + +/* ---- func_end: append TOP_FUNC_END, run peepholes, replay ---- */ + +static void w_func_end(CGTarget* t) { + OptImpl* o = impl_of(t); + tape_append(o, TOP_FUNC_END); + peephole_constfold(o); + if (o->dump_writer) print_tape(o, o->dump_writer); + replay(o); +} + +/* ---- public API: dump writer ---- */ + +void opt_set_dump_writer(CGTarget* t, Writer* w) { + /* Identify our own targets by the func_begin slot. Anything else is + * a non-opt CGTarget and the call is a silent no-op. 
*/ + if (!t || t->func_begin != w_func_begin) return; + impl_of(t)->dump_writer = w; +} + +/* ---- end-of-TU and destruction ---- */ + +static void w_finalize(CGTarget* t) { + CGTarget* wr = impl_of(t)->target; + if (wr->finalize) wr->finalize(wr); +} + +static void w_destroy(CGTarget* t) { + CGTarget* wr = impl_of(t)->target; + if (wr->destroy) wr->destroy(wr); +} + +/* ---- construction ---- */ + +CGTarget* opt_cgtarget_new(Compiler* c, CGTarget* target, int level) { + OptImpl* o; + CGTarget* t; + + if (!target) { + SrcLoc loc = {0, 0, 0}; + compiler_panic(c, loc, "opt_cgtarget_new: target is NULL"); + } + if (level < 1 || level > 2) { + SrcLoc loc = {0, 0, 0}; + compiler_panic(c, loc, "opt_cgtarget_new: level %d out of range [1, 2]", + level); + } + + o = arena_new(c->tu, OptImpl); + memset(o, 0, sizeof *o); + o->c = c; + o->target = target; + o->level = level; + + t = &o->base; + t->c = c; + t->obj = target->obj; + t->mc = target->mc; + t->debug = target->debug; + + t->func_begin = w_func_begin; + t->func_end = w_func_end; + + t->alloc_reg = w_alloc_reg; + t->free_reg = w_free_reg; + t->frame_slot = w_frame_slot; + t->param = w_param; + t->clobbers = w_clobbers; + t->spill_reg = w_spill_reg; + t->reload_reg = w_reload_reg; + + t->label_new = w_label_new; + t->label_place = w_label_place; + t->jump = w_jump; + t->cmp_branch = w_cmp_branch; + + t->scope_begin = w_scope_begin; + t->scope_else = w_scope_else; + t->scope_end = w_scope_end; + t->break_to = w_break_to; + t->continue_to = w_continue_to; + + t->load_imm = w_load_imm; + t->load_const = w_load_const; + t->copy = w_copy; + t->load = w_load; + t->store = w_store; + t->addr_of = w_addr_of; + t->tls_addr_of = w_tls_addr_of; + t->copy_bytes = w_copy_bytes; + t->set_bytes = w_set_bytes; + t->bitfield_load = w_bitfield_load; + t->bitfield_store = w_bitfield_store; + + t->binop = w_binop; + t->unop = w_unop; + t->cmp = w_cmp; + t->convert = w_convert; + + t->call = w_call; + t->ret = w_ret; + + t->alloca_ = 
w_alloca_; + t->va_start_ = w_va_start_; + t->va_arg_ = w_va_arg_; + t->va_end_ = w_va_end_; + t->va_copy_ = w_va_copy_; + + t->setjmp_ = target->setjmp_ ? w_setjmp_ : NULL; + t->longjmp_ = target->longjmp_ ? w_longjmp_ : NULL; + + t->atomic_load = w_atomic_load; + t->atomic_store = w_atomic_store; + t->atomic_rmw = w_atomic_rmw; + t->atomic_cas = w_atomic_cas; + t->fence = w_fence; + + t->intrinsic = w_intrinsic; + t->asm_block = w_asm_block; + + t->set_loc = w_set_loc; + t->finalize = w_finalize; + t->destroy = w_destroy; + + return t; +} diff --git a/src/opt/opt.h b/src/opt/opt.h @@ -77,4 +77,10 @@ void opt_dce(Func*); /* post-RA DCE */ * needs. Stamps each emitted insn's SrcLoc onto target via CGTarget.set_loc. */ void opt_emit(Compiler*, Func*, CGTarget* target); +/* When set, the wrapper writes a textual dump of each function's recorded + * tape to `w` on func_end, immediately before replay. Pass `w == NULL` to + * disable. The format is line-oriented and stable enough for golden-file + * diffs but otherwise unspecified. No-op if `t` is not an opt_cgtarget. */ +void opt_set_dump_writer(CGTarget* t, Writer* w); + #endif diff --git a/test/cg/harness/cg_runner.c b/test/cg/harness/cg_runner.c @@ -31,8 +31,15 @@ #include "debug/debug.h" #include "link/link.h" #include "obj/obj.h" +#include "opt/opt.h" #include "type/type.h" +/* --opt-level N: wrap the constructed CGTarget with opt_cgtarget_new(level) + * before each case runs. 0 (default) drives the backend directly; 1 / 2 + * exercise the opt pipeline. The corpus is the equivalence oracle — every + * case's exit code at level 0 must match levels 1 / 2. 
*/ +static int g_opt_level = 0; + /* ---- env ---- */ static void* h_alloc(CfreeHeap* h, size_t n, size_t a) { @@ -265,6 +272,9 @@ static int build_case(BuildState* st, const CgCase* cc) { if (cc->kind != CG_CASE_MC_ONLY) { st->target = cgtarget_new(c, st->ob, st->mc); + if (g_opt_level > 0) { + st->target = opt_cgtarget_new(c, st->target, g_opt_level); + } } else { st->target = NULL; } @@ -328,6 +338,96 @@ static int mode_expected(const char* name) { return 0; } +/* CfreeWriter that wraps stdout; used by --dump-tape. */ +typedef struct StdoutWriter { + CfreeWriter base; +} StdoutWriter; + +static void sw_write(CfreeWriter* w, const void* data, size_t n) { + (void)w; + fwrite(data, 1, n, stdout); +} +static void sw_seek(CfreeWriter* w, uint64_t off) { + (void)w; + (void)off; +} +static uint64_t sw_tell(CfreeWriter* w) { + (void)w; + return 0; +} +static int sw_error(CfreeWriter* w) { + (void)w; + return 0; +} +static void sw_close(CfreeWriter* w) { (void)w; } + +static StdoutWriter g_stdout_writer = {{sw_write, sw_seek, sw_tell, sw_error, + sw_close}}; + +/* --dump-tape NAME — build the case at the current --opt-level (must be + * >= 1) and print each function's recorded tape to stdout instead of + * just running the equivalence path. Useful for ad-hoc inspection and + * golden-file diffs. 
*/ +static int mode_dump_tape(const char* name) { + const CgCase* cc = find_case(name); + if (!cc) { + fprintf(stderr, "cg-runner: unknown case '%s'\n", name); + return 2; + } + if (g_opt_level < 1) { + fprintf(stderr, "cg-runner: --dump-tape requires --opt-level >= 1\n"); + return 2; + } + + CfreeTarget target; + target_aarch64_linux(&target); + CfreeEnv env; + memset(&env, 0, sizeof env); + env.heap = &g_heap; + env.diag = &g_diag; + env.execmem = &g_execmem; + env.now = -1; + + CfreeCompiler* cc_ = cfree_compiler_new(target, &env); + if (!cc_) return 2; + + BuildState st; + memset(&st, 0, sizeof st); + st.c = (Compiler*)cc_; + + /* Pre-empt build_case so we can install the dump writer before the + * case runs through func_begin/func_end. */ + Compiler* c = st.c; + if (setjmp(c->panic)) { + compiler_run_cleanups(c); + cfree_compiler_free(cc_); + return 1; + } + st.ob = obj_new(c); + st.mc = mc_new(c, st.ob); + st.target = cgtarget_new(c, st.ob, st.mc); + st.target = opt_cgtarget_new(c, st.target, g_opt_level); + opt_set_dump_writer(st.target, &g_stdout_writer.base); + + Sym text_name = pool_intern_cstr(c->global, ".text"); + ObjSecId text_sec = + obj_section(st.ob, text_name, SEC_TEXT, SF_ALLOC | SF_EXEC, 4); + + st.ctx.c = c; + st.ctx.ob = st.ob; + st.ctx.mc = st.mc; + st.ctx.target = st.target; + st.ctx.text_sec = text_sec; + st.ctx.pool = c->global; + st.ctx.debug = NULL; + st.mc->set_section(st.mc, text_sec); + cc->build(&st.ctx); + cgtarget_finalize(st.target); + + cfree_compiler_free(cc_); + return 0; +} + /* --dwarf-checks NAME — print the W-path directive blob registered for * NAME, or nothing if the case has no DWARF checks. The shell harness * pipes this into cg_check_dwarf <obj>. 
*/ @@ -502,11 +602,12 @@ static int mode_jit(const char* name) { static int usage(void) { fprintf(stderr, - "usage: cg-runner --list\n" - " cg-runner --expected NAME\n" - " cg-runner --dwarf-checks NAME\n" - " cg-runner --emit NAME OUT.o\n" - " cg-runner --jit NAME\n"); + "usage: cg-runner [--opt-level N] --list\n" + " cg-runner [--opt-level N] --expected NAME\n" + " cg-runner [--opt-level N] --dwarf-checks NAME\n" + " cg-runner [--opt-level N] --emit NAME OUT.o\n" + " cg-runner [--opt-level N] --jit NAME\n" + " cg-runner --opt-level N --dump-tape NAME\n"); return 2; } @@ -515,6 +616,12 @@ int main(int argc, char** argv) { long ps = sysconf(_SC_PAGESIZE); if (ps > 0) g_execmem.page_size = (size_t)ps; } + /* Optional leading --opt-level N flag. */ + if (argc >= 3 && !strcmp(argv[1], "--opt-level")) { + g_opt_level = atoi(argv[2]); + argc -= 2; + argv += 2; + } if (argc < 2) return usage(); if (!strcmp(argv[1], "--list")) return mode_list(); @@ -526,5 +633,7 @@ int main(int argc, char** argv) { return mode_emit(argv[2], argv[3]); else if (!strcmp(argv[1], "--jit") && argc == 3) return mode_jit(argv[2]); + else if (!strcmp(argv[1], "--dump-tape") && argc == 3) + return mode_dump_tape(argv[2]); return usage(); } diff --git a/test/cg/run.sh b/test/cg/run.sh @@ -51,8 +51,15 @@ ALLOW_SKIP="${CFREE_TEST_ALLOW_SKIP:-0}" # Filters (env vars or positional args; args win): # $1 / CFREE_TEST_FILTER — substring match against case name # $2 / CFREE_TEST_PATHS — subset of "DREJ" (default "DREJ") +# CFREE_OPT_LEVELS — space-separated opt levels to exercise. Default "0 1" +# so every case is built twice: directly against the +# backend (level 0) and through the opt_cgtarget +# wrapper (level 1). Path W (DWARF) only runs at +# level 0 — opt-level DWARF equivalence is a later +# phase concern. 
FILTER="${1:-${CFREE_TEST_FILTER:-}}" PATHS="${2:-${CFREE_TEST_PATHS:-DREJW}}" +OPT_LEVELS="${CFREE_OPT_LEVELS:-0 1}" case "$PATHS" in *D*) RUN_D=1;; *) RUN_D=0;; esac case "$PATHS" in *R*) RUN_R=1;; *) RUN_R=0;; esac case "$PATHS" in *E*) RUN_E=1;; *) RUN_E=0;; esac @@ -221,178 +228,195 @@ if [ $have_clang_cross -eq 1 ]; then fi fi -printf 'Running cases...\n' +CASES="$($CG_RUNNER --list)" + +# Each level wraps cg-runner with --opt-level N. Level 0 drives the AArch64 +# backend directly; level >0 inserts opt_cgtarget. Cases tagged with /L<N> +# in the output when level>0 so failures localize to the level. +for OPT_LEVEL in $OPT_LEVELS; do + if [ "$OPT_LEVEL" = "0" ]; then + CG_RUN=("$CG_RUNNER") + TAG="" + WORK_SUB="cg" + else + CG_RUN=("$CG_RUNNER" "--opt-level" "$OPT_LEVEL") + TAG="/L${OPT_LEVEL}" + WORK_SUB="cg-L${OPT_LEVEL}" + fi -# ---- per-case loop --------------------------------------------------------- + printf 'Running cases (opt-level %s)...\n' "$OPT_LEVEL" -CASES="$($CG_RUNNER --list)" + # Path E result bookkeeping (per level — flushed at end of this iteration). + E_NAMES=() + E_WORK=() + E_LINK_MS=() + E_EXPECTED=() + + for name in $CASES; do + [ -n "$FILTER" ] && [[ "$name" != *"$FILTER"* ]] && continue + work="$BUILD_DIR/$WORK_SUB/$name" + mkdir -p "$work" -# Path E result bookkeeping. We queue exes during the main loop and verify -# after a single batched podman flush. -E_NAMES=() -E_WORK=() -E_LINK_MS=() -E_EXPECTED=() - -for name in $CASES; do - [ -n "$FILTER" ] && [[ "$name" != *"$FILTER"* ]] && continue - work="$BUILD_DIR/cg/$name" - mkdir -p "$work" - - expected="$($CG_RUNNER --expected "$name" 2>/dev/null)" - expected="${expected:-0}" - # Exit codes are mod 256 on POSIX; mask the expected the same way so - # negative-return cases compare correctly. 
- expected_byte=$(( expected & 0xff )) - - # ---- Path D: in-process JIT (only on aarch64) ------------------------ - if [ $RUN_D -eq 1 ]; then - if [ $is_aarch64 -eq 1 ]; then - t0=$(now_ms) - "$CG_RUNNER" --jit "$name" >"$work/d.out" 2>"$work/d.err" - d_rc=$? - dt=$(( $(now_ms) - t0 )); T_D=$(( T_D + dt )) - if [ "$d_rc" -eq "$expected_byte" ]; then - note_pass "$name/D (${dt}ms)" + expected="$("${CG_RUN[@]}" --expected "$name" 2>/dev/null)" + expected="${expected:-0}" + # Exit codes are mod 256 on POSIX; mask the expected the same way so + # negative-return cases compare correctly. + expected_byte=$(( expected & 0xff )) + + # ---- Path D: in-process JIT (only on aarch64) ------------------------ + if [ $RUN_D -eq 1 ]; then + if [ $is_aarch64 -eq 1 ]; then + t0=$(now_ms) + "${CG_RUN[@]}" --jit "$name" >"$work/d.out" 2>"$work/d.err" + d_rc=$? + dt=$(( $(now_ms) - t0 )); T_D=$(( T_D + dt )) + if [ "$d_rc" -eq "$expected_byte" ]; then + note_pass "$name/D${TAG} (${dt}ms)" + else + note_fail "$name/D${TAG} (expected $expected_byte got $d_rc, ${dt}ms)" + fi else - note_fail "$name/D (expected $expected_byte got $d_rc, ${dt}ms)" + note_skip "$name/D${TAG}" "not on aarch64 host" fi - else - note_skip "$name/D" "not on aarch64 host" fi - fi - - # ---- emit (needed by R/E/J/W) ----------------------------------------- - obj="$work/$name.o" - if [ $RUN_R -eq 1 ] || [ $RUN_E -eq 1 ] || [ $RUN_J -eq 1 ] \ - || [ $RUN_W -eq 1 ]; then - if ! "$CG_RUNNER" --emit "$name" "$obj" 2>"$work/emit.err"; then - note_fail "$name/emit (cg-runner --emit failed; see $work/emit.err)" - continue - fi - fi - # ---- Path R: ELF roundtrip -------------------------------------------- - if [ $RUN_R -eq 1 ]; then - if [ $have_roundtrip -eq 1 ] && [ $have_readelf -eq 1 ] && [ $have_python3 -eq 1 ]; then - t0=$(now_ms) - rt="$work/$name.rt.o" - r_ok=1; r_msg="" - if ! 
"$ROUNDTRIP_BIN" "$obj" "$rt" 2>"$work/rt.err"; then - r_ok=0; r_msg=" (roundtrip failed)" - else - "$READELF_BIN" -aW "$obj" | python3 "$NORMALIZE" >"$work/golden.norm" 2>/dev/null - "$READELF_BIN" -aW "$rt" | python3 "$NORMALIZE" >"$work/rt.norm" 2>/dev/null - diff -u "$work/golden.norm" "$work/rt.norm" >"$work/r.diff" 2>&1 || r_ok=0 + # ---- emit (needed by R/E/J/W) ----------------------------------------- + obj="$work/$name.o" + if [ $RUN_R -eq 1 ] || [ $RUN_E -eq 1 ] || [ $RUN_J -eq 1 ] \ + || [ $RUN_W -eq 1 ]; then + if ! "${CG_RUN[@]}" --emit "$name" "$obj" 2>"$work/emit.err"; then + note_fail "$name/emit${TAG} (cg-runner --emit failed; see $work/emit.err)" + continue fi - dt=$(( $(now_ms) - t0 )); T_R=$(( T_R + dt )) - if [ $r_ok -eq 1 ]; then note_pass "$name/R (${dt}ms)" - else note_fail "$name/R${r_msg} (${dt}ms)"; fi - else - note_skip "$name/R" "missing roundtrip/readelf/python3" fi - fi - # ---- Path E: link + (batched) qemu/podman ------------------------------ - # Link now (per case); the run is queued for the post-loop flush. - if [ $RUN_E -eq 1 ]; then - if [ $have_exe_runner -eq 1 ] && [ $have_clang_cross -eq 1 ] \ - && [ $have_start_obj -eq 1 ]; then - t0=$(now_ms) - exe="$work/linked.exe" - if ! "$LINK_EXE_RUNNER" -o "$exe" "$obj" "$START_OBJ" \ - >"$work/exec_link.out" 2>"$work/exec_link.err"; then - dt=$(( $(now_ms) - t0 )); T_E=$(( T_E + dt )) - note_fail "$name/E (link failed, ${dt}ms)" - elif [ $have_runner -eq 1 ]; then - link_dt=$(( $(now_ms) - t0 )); T_E=$(( T_E + link_dt )) - E_NAMES+=("$name") - E_WORK+=("$work") - E_LINK_MS+=("$link_dt") - E_EXPECTED+=("$expected_byte") - exec_aarch64_queue "$name" "$exe" \ - "$work/exec.out" "$work/exec.err" "$work/exec.rc" + # ---- Path R: ELF roundtrip -------------------------------------------- + if [ $RUN_R -eq 1 ]; then + if [ $have_roundtrip -eq 1 ] && [ $have_readelf -eq 1 ] && [ $have_python3 -eq 1 ]; then + t0=$(now_ms) + rt="$work/$name.rt.o" + r_ok=1; r_msg="" + if ! 
"$ROUNDTRIP_BIN" "$obj" "$rt" 2>"$work/rt.err"; then + r_ok=0; r_msg=" (roundtrip failed)" + else + "$READELF_BIN" -aW "$obj" | python3 "$NORMALIZE" >"$work/golden.norm" 2>/dev/null + "$READELF_BIN" -aW "$rt" | python3 "$NORMALIZE" >"$work/rt.norm" 2>/dev/null + diff -u "$work/golden.norm" "$work/rt.norm" >"$work/r.diff" 2>&1 || r_ok=0 + fi + dt=$(( $(now_ms) - t0 )); T_R=$(( T_R + dt )) + if [ $r_ok -eq 1 ]; then note_pass "$name/R${TAG} (${dt}ms)" + else note_fail "$name/R${TAG}${r_msg} (${dt}ms)"; fi else - note_skip "$name/E" "no qemu/podman" + note_skip "$name/R${TAG}" "missing roundtrip/readelf/python3" fi - else - note_skip "$name/E" "no link-exe-runner, aarch64 clang, or start.o" fi - fi - # ---- Path J: jit-via-file --------------------------------------------- - if [ $RUN_J -eq 1 ]; then - if [ $have_jit_runner -eq 1 ]; then - t0=$(now_ms) - "$JIT_RUNNER" "$obj" >"$work/jit.out" 2>"$work/jit.err" - j_rc=$? - dt=$(( $(now_ms) - t0 )); T_J=$(( T_J + dt )) - if [ "$j_rc" -eq "$expected_byte" ]; then - note_pass "$name/J (${dt}ms)" + # ---- Path E: link + (batched) qemu/podman ------------------------------ + # Link now (per case); the run is queued for the post-loop flush. + if [ $RUN_E -eq 1 ]; then + if [ $have_exe_runner -eq 1 ] && [ $have_clang_cross -eq 1 ] \ + && [ $have_start_obj -eq 1 ]; then + t0=$(now_ms) + exe="$work/linked.exe" + if ! "$LINK_EXE_RUNNER" -o "$exe" "$obj" "$START_OBJ" \ + >"$work/exec_link.out" 2>"$work/exec_link.err"; then + dt=$(( $(now_ms) - t0 )); T_E=$(( T_E + dt )) + note_fail "$name/E${TAG} (link failed, ${dt}ms)" + elif [ $have_runner -eq 1 ]; then + link_dt=$(( $(now_ms) - t0 )); T_E=$(( T_E + link_dt )) + E_NAMES+=("$name") + E_WORK+=("$work") + E_LINK_MS+=("$link_dt") + E_EXPECTED+=("$expected_byte") + # Queue with a level-tagged key so cases at different + # opt levels don't collide in the batched runner. 
+ exec_aarch64_queue "L${OPT_LEVEL}_${name}" "$exe" \ + "$work/exec.out" "$work/exec.err" "$work/exec.rc" + else + note_skip "$name/E${TAG}" "no qemu/podman" + fi else - note_fail "$name/J (expected $expected_byte got $j_rc, ${dt}ms)" + note_skip "$name/E${TAG}" "no link-exe-runner, aarch64 clang, or start.o" fi - else - note_skip "$name/J" "no jit-runner (not aarch64 host)" fi - fi - # ---- Path W: DWARF check ---------------------------------------------- - # Cases that don't register directives produce empty stdout from - # --dwarf-checks; we silently skip those (no SKIP entry, since W is - # opt-in per case rather than per host). - if [ $RUN_W -eq 1 ]; then - "$CG_RUNNER" --dwarf-checks "$name" >"$work/w.directives" \ - 2>"$work/w.dc.err" - if [ -s "$work/w.directives" ]; then - if [ $have_dwarf_check -eq 1 ]; then + # ---- Path J: jit-via-file --------------------------------------------- + if [ $RUN_J -eq 1 ]; then + if [ $have_jit_runner -eq 1 ]; then t0=$(now_ms) - "$DWARF_CHECK" "$obj" <"$work/w.directives" \ - >"$work/w.out" 2>"$work/w.err" - w_rc=$? - dt=$(( $(now_ms) - t0 )); T_W=$(( T_W + dt )) - if [ "$w_rc" -eq 0 ]; then - note_pass "$name/W (${dt}ms)" + "$JIT_RUNNER" "$obj" >"$work/jit.out" 2>"$work/jit.err" + j_rc=$? + dt=$(( $(now_ms) - t0 )); T_J=$(( T_J + dt )) + if [ "$j_rc" -eq "$expected_byte" ]; then + note_pass "$name/J${TAG} (${dt}ms)" else - note_fail "$name/W (see $work/w.out, $work/w.err; ${dt}ms)" + note_fail "$name/J${TAG} (expected $expected_byte got $j_rc, ${dt}ms)" fi else - note_skip "$name/W" "no cg-check-dwarf" + note_skip "$name/J${TAG}" "no jit-runner (not aarch64 host)" fi fi - fi -done -# ---- batched path-E flush + verification ----------------------------------- -# Run every queued case in a single podman invocation, then iterate the -# queue to read each exit code and emit PASS/FAIL. 
- -T_E_BATCH=0 -if [ "$(exec_aarch64_queue_size)" -gt 0 ]; then - printf 'Running path E (%d cases batched)...\n' "$(exec_aarch64_queue_size)" - t0=$(now_ms) - exec_aarch64_flush - T_E_BATCH=$(( $(now_ms) - t0 )); T_E=$(( T_E + T_E_BATCH )) - - i=0 - while [ $i -lt ${#E_NAMES[@]} ]; do - name="${E_NAMES[$i]}" - work="${E_WORK[$i]}" - link_dt="${E_LINK_MS[$i]}" - expected_byte="${E_EXPECTED[$i]}" - if [ ! -f "$work/exec.rc" ]; then - note_fail "$name/E (no rc; podman batch did not produce results)" - else - RUN_RC="$(cat "$work/exec.rc")" - if [ "$RUN_RC" -eq "$expected_byte" ]; then - note_pass "$name/E (link ${link_dt}ms)" - else - note_fail "$name/E (expected $expected_byte got $RUN_RC, link ${link_dt}ms)" + # ---- Path W: DWARF check ---------------------------------------------- + # Cases that don't register directives produce empty stdout from + # --dwarf-checks; we silently skip those (no SKIP entry, since W is + # opt-in per case rather than per host). DWARF / opt-level + # equivalence is a Phase 5+ concern, so skip W when level > 0. + if [ $RUN_W -eq 1 ] && [ "$OPT_LEVEL" = "0" ]; then + "${CG_RUN[@]}" --dwarf-checks "$name" >"$work/w.directives" \ + 2>"$work/w.dc.err" + if [ -s "$work/w.directives" ]; then + if [ $have_dwarf_check -eq 1 ]; then + t0=$(now_ms) + "$DWARF_CHECK" "$obj" <"$work/w.directives" \ + >"$work/w.out" 2>"$work/w.err" + w_rc=$? + dt=$(( $(now_ms) - t0 )); T_W=$(( T_W + dt )) + if [ "$w_rc" -eq 0 ]; then + note_pass "$name/W (${dt}ms)" + else + note_fail "$name/W (see $work/w.out, $work/w.err; ${dt}ms)" + fi + else + note_skip "$name/W" "no cg-check-dwarf" + fi fi fi - i=$((i+1)) done -fi + + # ---- batched path-E flush + verification (per level) ------------------- + # Run every queued case in a single podman invocation, then iterate the + # queue to read each exit code and emit PASS/FAIL. 
+ if [ "$(exec_aarch64_queue_size)" -gt 0 ]; then + printf 'Running path E%s (%d cases batched)...\n' \ + "$TAG" "$(exec_aarch64_queue_size)" + t0=$(now_ms) + exec_aarch64_flush + DELTA=$(( $(now_ms) - t0 )) + T_E_BATCH=$(( ${T_E_BATCH:-0} + DELTA )); T_E=$(( T_E + DELTA )) + + i=0 + while [ $i -lt ${#E_NAMES[@]} ]; do + name="${E_NAMES[$i]}" + work="${E_WORK[$i]}" + link_dt="${E_LINK_MS[$i]}" + expected_byte="${E_EXPECTED[$i]}" + if [ ! -f "$work/exec.rc" ]; then + note_fail "$name/E${TAG} (no rc; podman batch did not produce results)" + else + RUN_RC="$(cat "$work/exec.rc")" + if [ "$RUN_RC" -eq "$expected_byte" ]; then + note_pass "$name/E${TAG} (link ${link_dt}ms)" + else + note_fail "$name/E${TAG} (expected $expected_byte got $RUN_RC, link ${link_dt}ms)" + fi + fi + i=$((i+1)) + done + fi +done + +T_E_BATCH=${T_E_BATCH:-0} # ---- summary ---------------------------------------------------------------