kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit ef59b5c177f8397efc36282cc508a49977c974f6
parent 57f1dd7b4466ca7780e7231a501980836d83c462
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Tue, 12 May 2026 07:25:32 -0700

Implement OPT1 lowering and allocation

Diffstat:
Adoc/OPT1.md | 80+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/opt/ir.h | 42++++++++++++++++++++++++++++++++++++++++++
Msrc/opt/opt.c | 78++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
Asrc/opt/pass_lower.c | 670+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atest/opt/opt_test.c | 495+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atest/opt/run.sh | 5+++++
Mtest/test.mk | 11++++++++++-
7 files changed, 1350 insertions(+), 31 deletions(-)

diff --git a/doc/OPT1.md b/doc/OPT1.md @@ -0,0 +1,80 @@ +# OPT1 Checklist + +This is the focused implementation checklist for the first production `-O1` +slice. The fuller design in `doc/OPT.md` is the source of truth for pass +semantics, phase order, and exit criteria, especially: + +- `doc/OPT.md` §3.3, "Lowering and Allocation" +- `doc/OPT.md` §4.1, "`-O1` Minimal Schedule" +- `doc/OPT.md` Phase A, "Production `-O1` Lowering" + +If this checklist is ambiguous, follow `doc/OPT.md`; do not reduce scope or +substitute a behaviorally similar shortcut without updating both documents. + +- [x] Set up `test/opt/` as the first task. Add IR-level and mock-target + red-green tests before implementation: + - `opt_liveness_branch` + - `opt_regalloc_priority` + - `opt_rewrite_spill_use_def` + - `opt_emit_no_virtual_alloc` + These tests must assert pass/rewrite shape, not just executable output. +- [x] Add the liveness data model: block `live_in`/`live_out`, value live + ranges, loop depth/frequency, and complete use/def walkers. +- [x] Implement simple `opt_regalloc(..., false)`: priority allocation by + tied hard-reg needs, frequency, live length, then stable id. +- [x] Add the rewrite pass: map virtual regs to hard regs or `FS_SPILL` + slots, inserting reloads/stores for spilled uses and defs. +- [x] Make `opt_emit` stop relying on wrapped-target `alloc_reg` for virtual + values after rewrite. +- [ ] Fill in target-aware `opt_machinize`/`opt_combine`, starting with + AArch64 ABI/call constraints, noop move deletion, and safe single-use + folds. Partially complete: AArch64 register pools/call-clobber handling + and noop physical move deletion are implemented; general safe single-use + folds remain future work. +- [x] Add focused `-O1` tests for branch liveness, call-clobber preservation, + spill pressure, inline asm tied/fixed registers, and post-rewrite DCE. +- [ ] Review AArch64 backend internal scratch-register usage and decide how + those backend scratch conventions should interact with `opt` register + allocation. + +## Completion Notes + +- Implemented in `src/opt/pass_lower.c`, `src/opt/opt.c`, and + `src/opt/ir.h`. +- Added `test/opt/opt_test.c`, `test/opt/run.sh`, and `make test-opt`. +- `-O1` now runs CFG, machinize, liveness, simple allocation/rewrite, + combine, DCE, then post-rewrite emit. +- Focused opt tests currently cover: + `opt_liveness_branch`, `opt_regalloc_priority`, + `opt_rewrite_spill_use_def`, `opt_emit_no_virtual_alloc`, + `opt_call_clobber_preservation`, `opt_spill_pressure`, + `opt_inline_asm_tied_fixed_regs`, and `opt_post_rewrite_dce`. +- Validation run after implementation: + - `make test-opt` passed with 23 checks. + - `CFREE_OPT_LEVELS=1 CFREE_TEST_PATHS=D make test-cg` passed + 196 cases with 0 failures. + - Targeted `h06`, `h07`, and `q05` `CFREE_TEST_PATHS=E` runs passed. + +## Deviations + +- `opt_combine` is intentionally narrow: it removes noop physical copies but + does not yet implement a broader safe single-use fold framework. +- `opt_dce` is conservative post-rewrite cleanup, currently covering `IR_NOP` + and empty non-side-effecting instructions rather than full dead-definition + elimination. +- AArch64 allocation uses a conservative hard-register pool and explicit + caller-saved save/restore around calls. This is enough for the current `-O1` + slice but should be reconciled with backend-owned scratch-register + conventions. +- x64 and RV64 register pools exist only as initial placeholders and have not + received the same targeted validation as AArch64. + +## Remaining Todos + +- Finish the general safe single-use fold portion of `opt_combine`. +- Expand post-rewrite DCE into true dead-definition elimination once the + rewritten IR has enough precise side-effect and use/def coverage. +- Review AArch64 backend internal scratch usage and decide the long-term + contract between backend scratch registers and opt register allocation. +- Add targeted x64/RV64 `-O1` lowering/allocation tests before treating those + targets as production-ready for OPT1. diff --git a/src/opt/ir.h b/src/opt/ir.h @@ -244,8 +244,38 @@ typedef struct Block { u32 npreds; u32 succ[2]; u8 nsucc; + u8 loop_depth; + u16 pad; + u32 frequency; + u64* live_in; + u64* live_out; + u64* live_use; + u64* live_def; } Block; +typedef enum OptAllocKind { + OPT_ALLOC_NONE, + OPT_ALLOC_HARD, + OPT_ALLOC_SPILL, +} OptAllocKind; + +typedef struct OptValInfo { + u32 first_pos; + u32 last_pos; + u32 live_length; + u32 frequency; + i32 tied_hard_reg; /* -1 means no fixed/tied physical register need. */ + Reg hard_reg; + FrameSlot spill_slot; + u8 alloc_kind; /* OptAllocKind */ + u8 cls; /* RegClass */ + u8 pad[2]; +} OptValInfo; + +#define OPT_REG_CLASSES 3u +#define OPT_MAX_HARD_REGS 32u +#define OPT_MAX_SCRATCH_REGS 4u + typedef struct Func { Arena* arena; CGFuncDesc desc; /* preserved for level-1 replay func_begin */ @@ -283,6 +313,18 @@ typedef struct Func { * skips them. */ u32* emit_order; u32 emit_order_n, emit_order_cap; + + Target opt_target; + u8 opt_has_target; + u8 opt_rewritten; + u16 opt_live_words; + u32 opt_position_count; + OptValInfo* val_info; /* indexed by Val, length nvals */ + + Reg opt_hard_regs[OPT_REG_CLASSES][OPT_MAX_HARD_REGS]; + u32 opt_hard_reg_count[OPT_REG_CLASSES]; + Reg opt_scratch_regs[OPT_REG_CLASSES][OPT_MAX_SCRATCH_REGS]; + u32 opt_scratch_reg_count[OPT_REG_CLASSES]; } Func; /* ---- API ---- */ diff --git a/src/opt/opt.c b/src/opt/opt.c @@ -4,10 +4,9 @@ * coincide with their IR counterparts (Reg ↔ Val, label ↔ block id, * vslot ↔ IR FrameSlot, vscope ↔ scope_aux index — doc/OPT.md §5.1). * - * Phase 3 (current): on func_end the wrapper optionally runs - * opt_build_cfg + opt_build_ssa (level 2; output discarded), then - * replays each Func into the wrapped target. Level 1 skips the SSA - * passes; in both cases replay is the level-1 1:1 walk. + * OPT1: level 1 records the CGTarget stream, runs the minimal backend + * lowering schedule, rewrites virtual regs to hard regs/spill slots, + * and emits the rewritten IR into the wrapped target. * * Methods the wrapper rejects under unbounded virtuals: * - clobbers / spill_reg / reload_reg are CG -O0 register-pressure @@ -739,7 +738,8 @@ static void w_set_loc(CGTarget* t, SrcLoc loc) { * ============================================================ */ typedef struct ReplayCtx { - OptImpl* o; + Compiler* c; + Func* f; CGTarget* tgt; Reg* val_to_reg; FrameSlot* slot_map; @@ -747,14 +747,16 @@ typedef struct ReplayCtx { CGScope* scope_map; u8* val_alloced; u8* block_label_placed; + u8 identity_regs; } ReplayCtx; static Reg val_to_target_reg(ReplayCtx* r, Val v) { - Func* f = r->o->f; + Func* f = r->f; if (v == VAL_NONE) return REG_NONE; + if (r->identity_regs) return (Reg)v; if (v >= f->nvals) { SrcLoc loc = {0, 0, 0}; - compiler_panic(r->o->c, loc, "opt replay: Val %u out of range", v); + compiler_panic(r->c, loc, "opt replay: Val %u out of range", v); } if (!r->val_alloced[v]) { r->val_to_reg[v] = @@ -766,9 +768,9 @@ static Reg val_to_target_reg(ReplayCtx* r, Val v) { static FrameSlot slot_to_target(ReplayCtx* r, FrameSlot vs) { if (vs == FRAME_SLOT_NONE) return FRAME_SLOT_NONE; - if (vs >= r->o->f->nframe_slots + 1u) { + if (vs >= r->f->nframe_slots + 1u) { SrcLoc loc = {0, 0, 0}; - compiler_panic(r->o->c, loc, "opt replay: vslot %u out of range", + compiler_panic(r->c, loc, "opt replay: vslot %u out of range", (unsigned)vs); } return r->slot_map[vs]; @@ -809,7 +811,7 @@ static CGABIValue xlat_abivalue(ReplayCtx* r, const CGABIValue* in, } static Label ensure_label(ReplayCtx* r, u32 b) { - if (b >= r->o->f->nblocks) return LABEL_NONE; + if (b >= r->f->nblocks) return LABEL_NONE; if (r->label_map[b] == LABEL_NONE) { r->label_map[b] = r->tgt->label_new(r->tgt); } @@ -819,7 +821,7 @@ static Label ensure_label(ReplayCtx* r, u32 b) { static void ensure_label_placed(ReplayCtx* r, u32 b) { if (r->block_label_placed[b]) return; r->block_label_placed[b] = 1; - if (b == r->o->f->entry) return; + if (b == r->f->entry) return; Label l = ensure_label(r, b); r->tgt->label_place(r->tgt, l); } @@ -841,13 +843,13 @@ static void replay_inst(ReplayCtx* r, u32 b, Inst* in) { Operand* in_ops_ = NULL; Operand* out_ops_ = NULL; if (aux->nin) { - in_ops_ = arena_array(r->o->f->arena, Operand, aux->nin); + in_ops_ = arena_array(r->f->arena, Operand, aux->nin); for (u32 k = 0; k < aux->nin; ++k) { in_ops_[k] = xlat_op(r, aux->in_ops[k]); } } if (aux->nout) { - out_ops_ = arena_array(r->o->f->arena, Operand, aux->nout); + out_ops_ = arena_array(r->f->arena, Operand, aux->nout); for (u32 k = 0; k < aux->nout; ++k) { out_ops_[k] = xlat_op(r, aux->out_ops[k]); } @@ -956,11 +958,11 @@ static void replay_inst(ReplayCtx* r, u32 b, Inst* in) { cd.callee = xlat_op(r, cd.callee); CGABIValue* args = NULL; if (cd.nargs) { - args = arena_array(r->o->f->arena, CGABIValue, cd.nargs); + args = arena_array(r->f->arena, CGABIValue, cd.nargs); for (u32 k = 0; k < cd.nargs; ++k) { CGABIPart* parts = aux->desc.args[k].nparts - ? arena_array(r->o->f->arena, CGABIPart, + ? arena_array(r->f->arena, CGABIPart, aux->desc.args[k].nparts) : NULL; args[k] = xlat_abivalue(r, &aux->desc.args[k], parts); @@ -971,14 +973,14 @@ static void replay_inst(ReplayCtx* r, u32 b, Inst* in) { } CGABIPart* ret_parts = cd.ret.nparts - ? arena_array(r->o->f->arena, CGABIPart, cd.ret.nparts) + ? arena_array(r->f->arena, CGABIPart, cd.ret.nparts) : NULL; cd.ret = xlat_abivalue(r, &aux->desc.ret, ret_parts); w->call(w, &cd); break; } case IR_BR: { - Block* bl = &r->o->f->blocks[b]; + Block* bl = &r->f->blocks[b]; if (bl->nsucc < 1) break; Label l = ensure_label(r, bl->succ[0]); w->jump(w, l); @@ -987,7 +989,7 @@ static void replay_inst(ReplayCtx* r, u32 b, Inst* in) { case IR_CMP_BRANCH: { Operand a = xlat_op(r, in->opnds[0]); Operand bo = xlat_op(r, in->opnds[1]); - Block* bl = &r->o->f->blocks[b]; + Block* bl = &r->f->blocks[b]; Label taken = ensure_label(r, bl->succ[0]); w->cmp_branch(w, (CmpOp)in->extra.imm, a, bo, taken); break; @@ -999,7 +1001,7 @@ static void replay_inst(ReplayCtx* r, u32 b, Inst* in) { } else { CGABIPart* parts = aux->val.nparts - ? arena_array(r->o->f->arena, CGABIPart, aux->val.nparts) + ? arena_array(r->f->arena, CGABIPart, aux->val.nparts) : NULL; CGABIValue v = xlat_abivalue(r, &aux->val, parts); w->ret(w, &v); @@ -1114,9 +1116,9 @@ static void replay_inst(ReplayCtx* r, u32 b, Inst* in) { case IR_INTRINSIC: { IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; Operand* dsts = - aux->ndst ? arena_array(r->o->f->arena, Operand, aux->ndst) : NULL; + aux->ndst ? arena_array(r->f->arena, Operand, aux->ndst) : NULL; Operand* args = - aux->narg ? arena_array(r->o->f->arena, Operand, aux->narg) : NULL; + aux->narg ? arena_array(r->f->arena, Operand, aux->narg) : NULL; for (u32 k = 0; k < aux->ndst; ++k) dsts[k] = xlat_op(r, aux->dsts[k]); for (u32 k = 0; k < aux->narg; ++k) args[k] = xlat_op(r, aux->args[k]); w->intrinsic(w, aux->kind, dsts, aux->ndst, args, aux->narg); @@ -1126,7 +1128,7 @@ static void replay_inst(ReplayCtx* r, u32 b, Inst* in) { } static void replay_block(ReplayCtx* r, u32 b) { - Func* f = r->o->f; + Func* f = r->f; if (b >= f->nblocks) return; ensure_label_placed(r, b); Block* bl = &f->blocks[b]; @@ -1135,13 +1137,12 @@ static void replay_block(ReplayCtx* r, u32 b) { } } -static void replay_func(OptImpl* o) { - Func* f = o->f; - CGTarget* w = o->target; - +static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) { ReplayCtx r; - r.o = o; + r.c = c; + r.f = f; r.tgt = w; + r.identity_regs = identity ? 1u : 0u; u32 nv = f->nvals ? f->nvals : 1u; r.val_to_reg = arena_zarray(f->arena, Reg, nv); for (u32 i = 0; i < nv; ++i) r.val_to_reg[i] = REG_NONE; @@ -1195,18 +1196,35 @@ static void replay_func(OptImpl* o) { w->func_end(w); } +static void replay_func(OptImpl* o) { + replay_func_to(o->c, o->f, o->target, 0); +} + +void opt_emit(Compiler* c, Func* f, CGTarget* target) { + replay_func_to(c, f, target, 1); +} + /* ---- func_end: optionally run dry-run passes; replay; reset ---- */ static void w_func_end(CGTarget* t) { OptImpl* o = impl_of(t); if (!o->f) return; - if (o->level >= 2) { + if (o->level == 1) { + opt_build_cfg(o->f); + opt_machinize(o->f, o->c->target); + opt_live_info(o->f); + opt_regalloc(o->f, 0); + opt_combine(o->f); + opt_dce(o->f); + opt_emit(o->c, o->f, o->target); + } else if (o->level >= 2) { opt_build_cfg(o->f); opt_build_ssa(o->f); + replay_func(o); + } else { + replay_func(o); } - - replay_func(o); o->f = NULL; o->cur = 0; } diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c @@ -0,0 +1,670 @@ +#include "opt/opt.h" + +#include <string.h> + +#include "core/arena.h" +#include "core/core.h" + +static u32 type_size_fallback(const Func* f, const Type* t) { + if (!t) return f->opt_target.ptr_size ? f->opt_target.ptr_size : 8u; + switch ((TypeKind)t->kind) { + case TY_BOOL: + case TY_CHAR: + case TY_SCHAR: + case TY_UCHAR: + return 1; + case TY_SHORT: + case TY_USHORT: + return 2; + case TY_INT: + case TY_UINT: + case TY_FLOAT: + return 4; + case TY_LONG: + case TY_ULONG: + case TY_LLONG: + case TY_ULLONG: + case TY_DOUBLE: + case TY_PTR: + return 8; + case TY_INT128: + case TY_UINT128: + case TY_LDOUBLE: + return 16; + default: + return f->opt_target.ptr_size ? f->opt_target.ptr_size : 8u; + } +} + +static u32 bit_words(u32 nvals) { return (nvals + 63u) / 64u; } + +static void bit_set(u64* bits, Val v) { bits[v / 64u] |= 1ull << (v % 64u); } +static int bit_has(const u64* bits, Val v) { + return (bits[v / 64u] & (1ull << (v % 64u))) != 0; +} +static int bit_or_changed(u64* dst, const u64* src, u32 n) { + int changed = 0; + for (u32 i = 0; i < n; ++i) { + u64 old = dst[i]; + dst[i] |= src[i]; + changed |= dst[i] != old; + } + return changed; +} +static int bit_copy_changed(u64* dst, const u64* src, u32 n) { + int changed = 0; + for (u32 i = 0; i < n; ++i) { + changed |= dst[i] != src[i]; + dst[i] = src[i]; + } + return changed; +} + +typedef void (*OperandWalkFn)(Func*, Inst*, Operand*, int is_def, void*); + +static int val_in_inst_defs(const Inst* in, Val v) { + if (v == VAL_NONE) return 0; + if (in->def == v) return 1; + for (u32 i = 0; i < in->ndefs; ++i) + if (in->defs[i] == v) return 1; + return 0; +} + +static void walk_operand(Func* f, Inst* in, Operand* op, int is_def, + OperandWalkFn fn, void* ctx) { + if (!op) return; + if (op->kind == OPK_REG) { + fn(f, in, op, is_def, ctx); + } else if (op->kind == OPK_INDIRECT) { + Operand base = *op; + base.kind = OPK_REG; + base.cls = RC_INT; + base.v.reg = op->v.ind.base; + fn(f, in, &base, 0, ctx); + op->v.ind.base = base.v.reg; + } +} + +static void walk_abivalue(Func* f, Inst* in, CGABIValue* v, int storage_def, + OperandWalkFn fn, void* ctx) { + walk_operand(f, in, &v->storage, storage_def, fn, ctx); + for (u32 i = 0; i < v->nparts; ++i) + walk_operand(f, in, (Operand*)&v->parts[i].op, storage_def, fn, ctx); +} + +static void walk_inst_operands(Func* f, Inst* in, OperandWalkFn fn, void* ctx) { + for (u32 i = 0; i < in->nopnds; ++i) { + int is_def = 0; + if (in->opnds[i].kind == OPK_REG) + is_def = i == 0 && val_in_inst_defs(in, (Val)in->opnds[i].v.reg); + if (!is_def) walk_operand(f, in, &in->opnds[i], 0, fn, ctx); + } + for (u32 i = 0; i < in->nopnds; ++i) { + int is_def = 0; + if (in->opnds[i].kind == OPK_REG) + is_def = i == 0 && val_in_inst_defs(in, (Val)in->opnds[i].v.reg); + if (is_def) walk_operand(f, in, &in->opnds[i], 1, fn, ctx); + } + + switch ((IROp)in->op) { + case IR_CALL: { + IRCallAux* aux = (IRCallAux*)in->extra.aux; + if (!aux) break; + walk_operand(f, in, &aux->desc.callee, 0, fn, ctx); + for (u32 i = 0; i < aux->desc.nargs; ++i) + walk_abivalue(f, in, (CGABIValue*)&aux->desc.args[i], 0, fn, ctx); + walk_abivalue(f, in, &aux->desc.ret, 1, fn, ctx); + break; + } + case IR_RET: { + IRRetAux* aux = (IRRetAux*)in->extra.aux; + if (aux && aux->present) walk_abivalue(f, in, &aux->val, 0, fn, ctx); + break; + } + case IR_SCOPE_BEGIN: { + IRScopeAux* aux = (IRScopeAux*)in->extra.aux; + if (aux) walk_operand(f, in, &aux->desc.cond, 0, fn, ctx); + break; + } + case IR_ASM_BLOCK: { + IRAsmAux* aux = (IRAsmAux*)in->extra.aux; + if (!aux) break; + for (u32 i = 0; i < aux->nin; ++i) + walk_operand(f, in, &aux->in_ops[i], 0, fn, ctx); + for (u32 i = 0; i < aux->nout; ++i) + walk_operand(f, in, &aux->out_ops[i], 1, fn, ctx); + break; + } + case IR_INTRINSIC: { + IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; + if (!aux) break; + for (u32 i = 0; i < aux->narg; ++i) + walk_operand(f, in, &aux->args[i], 0, fn, ctx); + for (u32 i = 0; i < aux->ndst; ++i) + walk_operand(f, in, &aux->dsts[i], 1, fn, ctx); + break; + } + default: + break; + } +} + +typedef struct UseDefCtx { + u64* use; + u64* def; + u32 pos; + u32 freq; +} UseDefCtx; + +static void collect_use_def(Func* f, Inst* in, Operand* op, int is_def, + void* arg) { + (void)in; + UseDefCtx* c = (UseDefCtx*)arg; + if (op->kind != OPK_REG) return; + Val v = (Val)op->v.reg; + if (v == VAL_NONE || v >= f->nvals) return; + if (!f->val_info) return; + OptValInfo* vi = &f->val_info[v]; + if (vi->first_pos == 0 || c->pos < vi->first_pos) vi->first_pos = c->pos; + if (c->pos > vi->last_pos) vi->last_pos = c->pos; + vi->frequency += c->freq ? c->freq : 1u; + vi->cls = f->val_cls[v]; + if (is_def) { + bit_set(c->def, v); + } else { + if (!bit_has(c->def, v)) bit_set(c->use, v); + } +} + +typedef struct BitsCtx { + u64* use; + u64* def; +} BitsCtx; + +static void collect_bits(Func* f, Inst* in, Operand* op, int is_def, + void* arg) { + (void)in; + BitsCtx* c = (BitsCtx*)arg; + if (op->kind != OPK_REG) return; + Val v = (Val)op->v.reg; + if (v == VAL_NONE || v >= f->nvals) return; + if (is_def) + bit_set(c->def, v); + else + bit_set(c->use, v); +} + +void opt_machinize(Func* f, Target t) { + f->opt_target = t; + f->opt_has_target = 1; + for (u32 c = 0; c < OPT_REG_CLASSES; ++c) { + f->opt_hard_reg_count[c] = 0; + f->opt_scratch_reg_count[c] = 0; + } + + switch (t.arch) { + case CFREE_ARCH_ARM_64: { + static const Reg ints[] = {13, 14, 15}; + static const Reg fps[] = {17, 18, 19, 20, 21, 22, 23}; + for (u32 i = 0; i < sizeof ints / sizeof ints[0]; ++i) + f->opt_hard_regs[RC_INT][f->opt_hard_reg_count[RC_INT]++] = ints[i]; + for (u32 i = 0; i < sizeof fps / sizeof fps[0]; ++i) + f->opt_hard_regs[RC_FP][f->opt_hard_reg_count[RC_FP]++] = fps[i]; + f->opt_scratch_regs[RC_INT][0] = 16; + f->opt_scratch_regs[RC_INT][1] = 17; + f->opt_scratch_reg_count[RC_INT] = 2; + f->opt_scratch_regs[RC_FP][0] = 24; + f->opt_scratch_regs[RC_FP][1] = 25; + f->opt_scratch_reg_count[RC_FP] = 2; + break; + } + case CFREE_ARCH_X86_64: { + static const Reg ints[] = {10}; + static const Reg fps[] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + for (u32 i = 0; i < sizeof ints / sizeof ints[0]; ++i) + f->opt_hard_regs[RC_INT][f->opt_hard_reg_count[RC_INT]++] = ints[i]; + for (u32 i = 0; i < sizeof fps / sizeof fps[0]; ++i) + f->opt_hard_regs[RC_FP][f->opt_hard_reg_count[RC_FP]++] = fps[i]; + f->opt_scratch_regs[RC_INT][0] = 11; + f->opt_scratch_reg_count[RC_INT] = 1; + f->opt_scratch_regs[RC_FP][0] = 0; + f->opt_scratch_reg_count[RC_FP] = 1; + break; + } + case CFREE_ARCH_RV64: { + for (Reg r = 28; r <= 31; ++r) + f->opt_hard_regs[RC_INT][f->opt_hard_reg_count[RC_INT]++] = r; + for (Reg r = 28; r <= 31; ++r) + f->opt_hard_regs[RC_FP][f->opt_hard_reg_count[RC_FP]++] = r; + f->opt_scratch_regs[RC_INT][0] = 5; + f->opt_scratch_regs[RC_INT][1] = 6; + f->opt_scratch_reg_count[RC_INT] = 2; + f->opt_scratch_regs[RC_FP][0] = 0; + f->opt_scratch_reg_count[RC_FP] = 1; + break; + } + default: + break; + } +} + +static void build_loop_depth(Func* f) { + for (u32 b = 0; b < f->nblocks; ++b) { + f->blocks[b].loop_depth = 0; + f->blocks[b].frequency = 1; + } + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + for (u32 s = 0; s < bl->nsucc; ++s) { + u32 h = bl->succ[s]; + if (h > b || h >= f->nblocks) continue; + for (u32 k = h; k <= b; ++k) + if (f->blocks[k].loop_depth < 31) ++f->blocks[k].loop_depth; + } + } + for (u32 b = 0; b < f->nblocks; ++b) + f->blocks[b].frequency = 1u << f->blocks[b].loop_depth; +} + +void opt_live_info(Func* f) { + f->opt_live_words = (u16)bit_words(f->nvals); + f->opt_position_count = 0; + u32 words = f->opt_live_words; + if (!f->val_info) f->val_info = arena_zarray(f->arena, OptValInfo, f->nvals); + for (u32 v = 0; v < f->nvals; ++v) { + f->val_info[v].first_pos = 0; + f->val_info[v].last_pos = 0; + f->val_info[v].live_length = 0; + f->val_info[v].frequency = 0; + f->val_info[v].tied_hard_reg = -1; + f->val_info[v].hard_reg = REG_NONE; + f->val_info[v].spill_slot = FRAME_SLOT_NONE; + f->val_info[v].alloc_kind = OPT_ALLOC_NONE; + f->val_info[v].cls = f->val_cls[v]; + } + build_loop_depth(f); + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + bl->live_in = arena_zarray(f->arena, u64, words); + bl->live_out = arena_zarray(f->arena, u64, words); + bl->live_use = arena_zarray(f->arena, u64, words); + bl->live_def = arena_zarray(f->arena, u64, words); + UseDefCtx c; + memset(&c, 0, sizeof c); + c.use = bl->live_use; + c.def = bl->live_def; + c.freq = bl->frequency; + for (u32 i = 0; i < bl->ninsts; ++i) { + c.pos = ++f->opt_position_count; + walk_inst_operands(f, &bl->insts[i], collect_use_def, &c); + } + } + int changed; + do { + changed = 0; + for (u32 bi = f->nblocks; bi > 0; --bi) { + u32 b = bi - 1u; + Block* bl = &f->blocks[b]; + u64* new_out = arena_zarray(f->arena, u64, words); + u64* new_in = arena_zarray(f->arena, u64, words); + for (u32 s = 0; s < bl->nsucc; ++s) { + u32 t = bl->succ[s]; + if (t < f->nblocks) bit_or_changed(new_out, f->blocks[t].live_in, words); + } + for (u32 w = 0; w < words; ++w) + new_in[w] = bl->live_use[w] | (new_out[w] & ~bl->live_def[w]); + changed |= bit_copy_changed(bl->live_out, new_out, words); + changed |= bit_copy_changed(bl->live_in, new_in, words); + } + } while (changed); + + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + for (Val v = 1; v < f->nvals; ++v) { + if (bit_has(bl->live_in, v) || bit_has(bl->live_out, v)) { + OptValInfo* vi = &f->val_info[v]; + if (vi->first_pos == 0) vi->first_pos = 1; + if (vi->last_pos < f->opt_position_count) + vi->last_pos = f->opt_position_count; + vi->frequency += bl->frequency; + } + } + } + for (Val v = 1; v < f->nvals; ++v) { + OptValInfo* vi = &f->val_info[v]; + if (vi->first_pos && vi->last_pos >= vi->first_pos) + vi->live_length = vi->last_pos - vi->first_pos + 1u; + } +} + +static int ranges_overlap(const OptValInfo* a, const OptValInfo* b) { + if (!a->first_pos || !b->first_pos) return 0; + return a->first_pos <= b->last_pos && b->first_pos <= a->last_pos; +} + +static int val_higher_priority(Func* f, Val a, Val b) { + OptValInfo* av = &f->val_info[a]; + OptValInfo* bv = &f->val_info[b]; + int at = av->tied_hard_reg >= 0; + int bt = bv->tied_hard_reg >= 0; + if (at != bt) return at > bt; + if (av->frequency != bv->frequency) return av->frequency > bv->frequency; + if (av->live_length != bv->live_length) return av->live_length < bv->live_length; + return a < b; +} + +static int hard_conflicts(Func* f, Val* assigned, u32 nassigned, Val v, Reg r) { + for (u32 i = 0; i < nassigned; ++i) { + Val ov = assigned[i]; + if (f->val_info[ov].alloc_kind != OPT_ALLOC_HARD) continue; + if (f->val_info[ov].hard_reg != r) continue; + if (ranges_overlap(&f->val_info[ov], &f->val_info[v])) return 1; + } + return 0; +} + +static FrameSlot spill_slot_for(Func* f, Val v) { + if (f->val_info[v].spill_slot != FRAME_SLOT_NONE) + return f->val_info[v].spill_slot; + FrameSlotDesc d; + memset(&d, 0, sizeof d); + d.type = f->val_type[v]; + d.size = type_size_fallback(f, f->val_type[v]); + d.align = d.size >= 8 ? 8 : d.size; + d.kind = FS_SPILL; + f->val_info[v].spill_slot = ir_frame_slot_new(f, &d); + return f->val_info[v].spill_slot; +} + +typedef struct RewriteList { + Inst* data; + u32 n; + u32 cap; +} RewriteList; + +static Inst* list_push(Func* f, RewriteList* l, IROp op) { + if (l->n == l->cap) { + u32 ncap = l->cap ? l->cap * 2u : 16u; + Inst* nb = arena_zarray(f->arena, Inst, ncap); + if (l->data) memcpy(nb, l->data, sizeof(Inst) * l->n); + l->data = nb; + l->cap = ncap; + } + Inst* in = &l->data[l->n++]; + memset(in, 0, sizeof *in); + in->op = (u16)op; + return in; +} + +static MemAccess spill_mem(Func* f, Val v) { + MemAccess m; + memset(&m, 0, sizeof m); + m.type = f->val_type[v]; + m.size = type_size_fallback(f, f->val_type[v]); + m.align = m.size >= 8 ? 8 : m.size; + m.alias.kind = ALIAS_LOCAL; + m.alias.v.local_id = (i32)spill_slot_for(f, v); + return m; +} + +static Operand spill_addr(Func* f, Val v) { + Operand o; + memset(&o, 0, sizeof o); + o.kind = OPK_LOCAL; + o.cls = RC_INT; + o.type = f->val_type[v]; + o.v.frame_slot = spill_slot_for(f, v); + return o; +} + +static Operand hard_operand(Func* f, Val v) { + Operand o; + memset(&o, 0, sizeof o); + o.kind = OPK_REG; + o.cls = f->val_info[v].cls; + o.type = f->val_type[v]; + o.v.reg = f->val_info[v].hard_reg; + return o; +} + +static void append_store_val(Func* f, RewriteList* out, Val v) { + Inst* st = list_push(f, out, IR_STORE); + st->opnds = arena_array(f->arena, Operand, 2); + st->opnds[0] = spill_addr(f, v); + st->opnds[1] = hard_operand(f, v); + st->nopnds = 2; + st->extra.mem = spill_mem(f, v); +} + +static void append_load_val(Func* f, RewriteList* out, Val v) { + Inst* ld = list_push(f, out, IR_LOAD); + ld->opnds = arena_array(f->arena, Operand, 2); + ld->opnds[0] = hard_operand(f, v); + ld->opnds[1] = spill_addr(f, v); + ld->nopnds = 2; + ld->extra.mem = spill_mem(f, v); +} + +static Reg scratch_for(Func* f, u8 cls, u32* next) { + u32 n = f->opt_scratch_reg_count[cls]; + if (!n) return REG_NONE; + Reg r = f->opt_scratch_regs[cls][*next % n]; + ++*next; + return r; +} + +typedef struct RewriteCtx { + RewriteList* before; + RewriteList* after; + u32 next_scratch[OPT_REG_CLASSES]; +} RewriteCtx; + +static void rewrite_one_operand(Func* f, Inst* owner, Operand* op, int is_def, + void* arg) { + (void)owner; + RewriteCtx* c = (RewriteCtx*)arg; + if (op->kind != OPK_REG) return; + Val v = (Val)op->v.reg; + if (v == VAL_NONE || v >= f->nvals) return; + OptValInfo* vi = &f->val_info[v]; + if (vi->alloc_kind == OPT_ALLOC_HARD) { + op->v.reg = vi->hard_reg; + return; + } + if (vi->alloc_kind != OPT_ALLOC_SPILL) return; + u8 cls = vi->cls; + Reg scratch = scratch_for(f, cls, &c->next_scratch[cls]); + op->v.reg = scratch; + if (!is_def) { + Inst* ld = list_push(f, c->before, IR_LOAD); + ld->opnds = arena_array(f->arena, Operand, 2); + ld->opnds[0] = *op; + ld->opnds[1] = spill_addr(f, v); + ld->nopnds = 2; + ld->extra.mem = spill_mem(f, v); + } else { + Inst* st = list_push(f, c->after, IR_STORE); + st->opnds = arena_array(f->arena, Operand, 2); + st->opnds[0] = spill_addr(f, v); + st->opnds[1] = *op; + st->nopnds = 2; + st->extra.mem = spill_mem(f, v); + } +} + +static void rewrite_func(Func* f) { + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + u64** live_after = arena_array(f->arena, u64*, bl->ninsts ? bl->ninsts : 1u); + u64* live = arena_zarray(f->arena, u64, f->opt_live_words); + for (u32 w = 0; w < f->opt_live_words; ++w) live[w] = bl->live_out[w]; + for (u32 ri = bl->ninsts; ri > 0; --ri) { + u32 i = ri - 1u; + live_after[i] = arena_zarray(f->arena, u64, f->opt_live_words); + for (u32 w = 0; w < f->opt_live_words; ++w) live_after[i][w] = live[w]; + u64* use = arena_zarray(f->arena, u64, f->opt_live_words); + u64* def = arena_zarray(f->arena, u64, f->opt_live_words); + BitsCtx bc = {use, def}; + walk_inst_operands(f, &bl->insts[i], collect_bits, &bc); + for (u32 w = 0; w < f->opt_live_words; ++w) + live[w] = (live[w] & ~def[w]) | use[w]; + } + + RewriteList out; + memset(&out, 0, sizeof out); + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst in = bl->insts[i]; + u64* def = arena_zarray(f->arena, u64, f->opt_live_words); + BitsCtx db = {arena_zarray(f->arena, u64, f->opt_live_words), def}; + walk_inst_operands(f, &in, collect_bits, &db); + RewriteList before, after; + memset(&before, 0, sizeof before); + memset(&after, 0, sizeof after); + RewriteCtx ctx; + memset(&ctx, 0, sizeof ctx); + ctx.before = &before; + ctx.after = &after; + walk_inst_operands(f, &in, rewrite_one_operand, &ctx); + for (u32 k = 0; k < before.n; ++k) { + Inst* dst = list_push(f, &out, (IROp)before.data[k].op); + *dst = before.data[k]; + } + if ((IROp)in.op == IR_CALL) { + for (Val v = 1; v < f->nvals; ++v) { + if (!bit_has(live_after[i], v) || bit_has(def, v)) continue; + if (f->val_info[v].alloc_kind == OPT_ALLOC_HARD) append_store_val(f, &out, v); + } + } + Inst* dst = list_push(f, &out, (IROp)in.op); + *dst = in; + if ((IROp)in.op == IR_CALL) { + for (Val v = 1; v < f->nvals; ++v) { + if (!bit_has(live_after[i], v) || bit_has(def, v)) continue; + if (f->val_info[v].alloc_kind == OPT_ALLOC_HARD) append_load_val(f, &out, v); + } + } + for (u32 k = 0; k < after.n; ++k) { + Inst* ad = list_push(f, &out, (IROp)after.data[k].op); + *ad = after.data[k]; + } + } + bl->insts = out.data; + bl->ninsts = out.n; + bl->cap = out.cap; + } + f->opt_rewritten = 1; +} + +void opt_regalloc(Func* f, int allow_live_range_split) { + (void)allow_live_range_split; + if (!f->val_info) opt_live_info(f); + Val* order = arena_array(f->arena, Val, f->nvals ? f->nvals : 1u); + u32 norder = 0; + for (Val v = 1; v < f->nvals; ++v) + if (f->val_info[v].first_pos) order[norder++] = v; + for (u32 i = 1; i < norder; ++i) { + Val key = order[i]; + u32 j = i; + while (j > 0 && val_higher_priority(f, key, order[j - 1u])) { + order[j] = order[j - 1u]; + --j; + } + order[j] = key; + } + Val* assigned = arena_array(f->arena, Val, norder ? norder : 1u); + u32 nassigned = 0; + for (u32 i = 0; i < norder; ++i) { + Val v = order[i]; + OptValInfo* vi = &f->val_info[v]; + u8 cls = vi->cls; + if (vi->tied_hard_reg >= 0 && + !hard_conflicts(f, assigned, nassigned, v, (Reg)vi->tied_hard_reg)) { + vi->alloc_kind = OPT_ALLOC_HARD; + vi->hard_reg = (Reg)vi->tied_hard_reg; + assigned[nassigned++] = v; + continue; + } + int found = 0; + for (u32 r = 0; r < f->opt_hard_reg_count[cls]; ++r) { + Reg hr = f->opt_hard_regs[cls][r]; + if (!hard_conflicts(f, assigned, nassigned, v, hr)) { + vi->alloc_kind = OPT_ALLOC_HARD; + vi->hard_reg = hr; + assigned[nassigned++] = v; + found = 1; + break; + } + } + if (!found) { + vi->alloc_kind = OPT_ALLOC_SPILL; + vi->spill_slot = spill_slot_for(f, v); + } + } + rewrite_func(f); +} + +void opt_combine(Func* f) { + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + u32 w = 0; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* in = &bl->insts[i]; + if ((IROp)in->op == IR_COPY && in->nopnds == 2 && + in->opnds[0].kind == OPK_REG && in->opnds[1].kind == OPK_REG && + in->opnds[0].v.reg == in->opnds[1].v.reg) { + continue; + } + bl->insts[w++] = *in; + } + bl->ninsts = w; + } +} + +static int side_effecting(IROp op) { + switch (op) { + case IR_STORE: + case IR_AGG_COPY: + case IR_AGG_SET: + case IR_BITFIELD_STORE: + case IR_CALL: + case IR_BR: + case IR_CONDBR: + case IR_CMP_BRANCH: + case IR_RET: + case IR_SCOPE_BEGIN: + case IR_SCOPE_ELSE: + case IR_SCOPE_END: + case IR_BREAK_TO: + case IR_CONTINUE_TO: + case IR_VA_START: + case IR_VA_END: + case IR_VA_COPY: + case IR_SETJMP: + case IR_LONGJMP: + case IR_ATOMIC_STORE: + case IR_ATOMIC_RMW: + case IR_ATOMIC_CAS: + case IR_FENCE: + case IR_ASM_BLOCK: + case IR_INTRINSIC: + return 1; + default: + return 0; + } +} + +void opt_dce(Func* f) { + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + u32 w = 0; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* in = &bl->insts[i]; + if ((IROp)in->op == IR_NOP) continue; + if (!side_effecting((IROp)in->op) && in->def == VAL_NONE && + in->ndefs == 0 && in->nopnds == 0) + continue; + bl->insts[w++] = *in; + } + bl->ninsts = w; + } +} diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c @@ -0,0 +1,495 @@ +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <cfree.h> + +#include "abi/abi.h" +#include "arch/arch.h" +#include "core/core.h" +#include "core/heap.h" +#include "opt/ir.h" +#include "opt/opt.h" +#include "type/type.h" + +static void* h_alloc(CfreeHeap* h, size_t n, size_t a) { + (void)h; + (void)a; + return n ? malloc(n) : NULL; +} +static void* h_realloc(CfreeHeap* h, void* p, size_t o, size_t n, size_t a) { + (void)h; + (void)o; + (void)a; + return realloc(p, n); +} +static void h_free(CfreeHeap* h, void* p, size_t n) { + (void)h; + (void)n; + free(p); +} +static CfreeHeap g_heap = {h_alloc, h_realloc, h_free, NULL}; + +static void diag_emit(CfreeDiagSink* s, CfreeDiagKind k, CfreeSrcLoc loc, + const char* fmt, va_list ap) { + static const char* names[] = {"note", "warning", "error", "fatal"}; + (void)s; + (void)loc; + fprintf(stderr, "%s: ", names[k]); + vfprintf(stderr, fmt, ap); + fputc('\n', stderr); +} +static CfreeDiagSink g_diag = {diag_emit, NULL, 0, 0}; + +static int g_fails; +static int g_checks; + +#define EXPECT(cond, ...) \ + do { \ + ++g_checks; \ + if (!(cond)) { \ + ++g_fails; \ + fprintf(stderr, "FAIL %s:%d: ", __FILE__, __LINE__); \ + fprintf(stderr, __VA_ARGS__); \ + fputc('\n', stderr); \ + } \ + } while (0) + +typedef struct TestCtx { + Compiler cc; + Compiler* c; + const Type* i32; + const Type* i64; +} TestCtx; + +static void tc_init(TestCtx* tc) { + CfreeEnv env; + memset(&env, 0, sizeof env); + env.heap = &g_heap; + env.diag = &g_diag; + env.now = -1; + + CfreeTarget tgt; + memset(&tgt, 0, sizeof tgt); + tgt.arch = CFREE_ARCH_ARM_64; + tgt.os = CFREE_OS_LINUX; + tgt.obj = CFREE_OBJ_ELF; + tgt.ptr_size = 8; + tgt.ptr_align = 8; + + static CfreeEnv s_env; + s_env = env; + compiler_init(&tc->cc, tgt, &s_env); + tc->c = &tc->cc; + tc->i32 = type_prim(tc->c->global, TY_INT); + tc->i64 = type_prim(tc->c->global, TY_LLONG); +} + +static void tc_fini(TestCtx* tc) { compiler_fini(&tc->cc); } + +static Operand op_reg_(Reg r, const Type* ty) { + Operand o; + memset(&o, 0, sizeof o); + o.kind = OPK_REG; + o.cls = RC_INT; + o.type = ty; + o.v.reg = r; + return o; +} + +static Operand op_imm_(i64 v, const Type* ty) { + Operand o; + memset(&o, 0, sizeof o); + o.kind = OPK_IMM; + o.cls = RC_INT; + o.type = ty; + o.v.imm = v; + return o; +} + +static Func* new_func(TestCtx* tc) { + CGFuncDesc fd; + memset(&fd, 0, sizeof fd); + fd.fn_type = type_func(tc->c->global, tc->i32, NULL, 0, 0); + Func* f = ir_func_new(tc->c, &fd); + f->entry = ir_block_new(f); + ir_note_emit(f, f->entry); + return f; +} + +static Val add_val(Func* f, const Type* ty) { + return ir_alloc_val(f, ty, RC_INT); +} + +static Inst* emit_load_imm(Func* f, u32 b, Val dst, const Type* ty, i64 imm) { + Inst* in = ir_emit(f, b, IR_LOAD_IMM); + in->opnds = arena_array(f->arena, Operand, 1); + in->opnds[0] = op_reg_(dst, ty); + in->nopnds = 1; + in->def = dst; + in->type = ty; + in->extra.imm = imm; + f->val_def_block[dst] = b; + f->val_def_inst[dst] = f->blocks[b].ninsts - 1u; + return in; +} + +static Inst* emit_copy(Func* f, u32 b, Val dst, Val src, const Type* ty) { + Inst* in = ir_emit(f, b, IR_COPY); + in->opnds = arena_array(f->arena, Operand, 2); + in->opnds[0] = op_reg_(dst, ty); + in->opnds[1] = op_reg_(src, ty); + in->nopnds = 2; + in->def = dst; + in->type = ty; + f->val_def_block[dst] = b; + f->val_def_inst[dst] = f->blocks[b].ninsts - 1u; + return in; +} + +static Inst* emit_binop(Func* f, u32 b, Val dst, Val a, Val c, const Type* ty) { + Inst* in = ir_emit(f, b, IR_BINOP); + in->opnds = arena_array(f->arena, Operand, 3); + in->opnds[0] = op_reg_(dst, ty); + in->opnds[1] = op_reg_(a, ty); + in->opnds[2] = op_reg_(c, ty); + in->nopnds = 3; + in->def = dst; + in->type = ty; + in->extra.imm = BO_IADD; + f->val_def_block[dst] = b; + f->val_def_inst[dst] = f->blocks[b].ninsts - 1u; + return in; +} + +static Inst* emit_call_void(Func* f, u32 b) { + Inst* in = ir_emit(f, b, IR_CALL); + IRCallAux* aux = arena_znew(f->arena, IRCallAux); + in->extra.aux = aux; + return in; +} + +static int bit_has(const u64* bits, Val v) { + return (bits[v / 64u] & (1ull << (v % 64u))) != 0; +} + +static void setup_one_int_reg(Func* f) { + f->opt_hard_regs[RC_INT][0] = 19; + f->opt_hard_reg_count[RC_INT] = 1; + f->opt_scratch_regs[RC_INT][0] = 9; + f->opt_scratch_regs[RC_INT][1] = 10; + f->opt_scratch_reg_count[RC_INT] = 2; +} + +static void opt_liveness_branch(void) { + TestCtx tc; + tc_init(&tc); + Func* f = new_func(&tc); + u32 b0 = f->entry; + u32 b1 = ir_block_new(f); + u32 b2 = ir_block_new(f); + ir_note_emit(f, b1); + ir_note_emit(f, b2); + Val v = add_val(f, tc.i32); + Val c = add_val(f, tc.i32); + Val t = add_val(f, tc.i32); + Val u = add_val(f, tc.i32); + emit_load_imm(f, b0, v, tc.i32, 7); + emit_load_imm(f, b0, c, tc.i32, 1); + Inst* br = ir_emit(f, b0, IR_CMP_BRANCH); + br->opnds = arena_array(f->arena, Operand, 2); + br->opnds[0] = op_reg_(c, tc.i32); + br->opnds[1] = op_imm_(0, tc.i32); + br->nopnds = 2; + br->extra.imm = CMP_NE; + f->blocks[b0].succ[0] = b1; + f->blocks[b0].succ[1] = b2; + f->blocks[b0].nsucc = 2; + emit_copy(f, b1, t, v, tc.i32); + ir_emit(f, b1, IR_RET); + emit_copy(f, b2, u, v, tc.i32); + ir_emit(f, b2, IR_RET); + + opt_build_cfg(f); + opt_live_info(f); + + EXPECT(bit_has(f->blocks[b0].live_out, v), "v%u live_out of branch block", + v); + EXPECT(bit_has(f->blocks[b1].live_in, v), "v%u live_in true block", v); + EXPECT(bit_has(f->blocks[b2].live_in, v), "v%u live_in false block", v); + EXPECT(!bit_has(f->blocks[b1].live_out, v), "v%u dies in true block", v); + tc_fini(&tc); +} + +static void opt_regalloc_priority(void) { + TestCtx tc; + tc_init(&tc); + Func* f = new_func(&tc); + setup_one_int_reg(f); + Val pinned = add_val(f, tc.i32); + Val hot = add_val(f, tc.i32); + Val out = add_val(f, tc.i32); + emit_load_imm(f, f->entry, pinned, tc.i32, 1); + emit_load_imm(f, f->entry, hot, tc.i32, 2); + emit_binop(f, f->entry, out, pinned, hot, tc.i32); + ir_emit(f, f->entry, IR_RET); + opt_build_cfg(f); + opt_live_info(f); + f->val_info[pinned].tied_hard_reg = 19; + f->val_info[hot].frequency += 1000; + opt_regalloc(f, 0); + EXPECT(f->val_info[pinned].alloc_kind == OPT_ALLOC_HARD, + "tied value should get a hard register"); + EXPECT(f->val_info[pinned].hard_reg == 19, + "tied value should get hard r19, got r%u", + (unsigned)f->val_info[pinned].hard_reg); + EXPECT(f->val_info[hot].alloc_kind == OPT_ALLOC_SPILL, + "overlapping untied value should spill under one-reg pressure"); + tc_fini(&tc); +} + +static int count_op(Func* f, IROp op) { + int n = 0; + for (u32 b = 0; b < f->nblocks; ++b) + for (u32 i = 0; i < f->blocks[b].ninsts; ++i) + if ((IROp)f->blocks[b].insts[i].op == op) ++n; + return n; +} + +static void opt_rewrite_spill_use_def(void) { + TestCtx tc; + tc_init(&tc); + Func* f = new_func(&tc); + setup_one_int_reg(f); + Val a = add_val(f, tc.i32); + Val b = add_val(f, tc.i32); + Val c = add_val(f, tc.i32); + emit_load_imm(f, f->entry, a, tc.i32, 1); + emit_load_imm(f, f->entry, b, tc.i32, 2); + emit_binop(f, f->entry, c, a, b, tc.i32); + ir_emit(f, f->entry, IR_RET); + opt_build_cfg(f); + opt_live_info(f); + opt_regalloc(f, 0); + EXPECT(f->opt_rewritten, "regalloc should rewrite pseudos"); + EXPECT(count_op(f, IR_STORE) >= 1, "spill def should insert store"); + EXPECT(count_op(f, IR_LOAD) >= 1, "spill use should insert reload"); + int saw_spill_slot = 0; + for (u32 i = 0; i < f->nframe_slots; ++i) + if (f->frame_slots[i].kind == FS_SPILL) saw_spill_slot = 1; + EXPECT(saw_spill_slot, "rewrite should allocate FS_SPILL slot"); + tc_fini(&tc); +} + +static void opt_call_clobber_preservation(void) { + TestCtx tc; + tc_init(&tc); + Func* f = new_func(&tc); + setup_one_int_reg(f); + Val live = add_val(f, tc.i32); + Val out = add_val(f, tc.i32); + emit_load_imm(f, f->entry, live, tc.i32, 11); + emit_call_void(f, f->entry); + emit_copy(f, f->entry, out, live, tc.i32); + ir_emit(f, f->entry, IR_RET); + opt_build_cfg(f); + opt_live_info(f); + opt_regalloc(f, 0); + + Block* b = &f->blocks[f->entry]; + int saw_call_save_restore = 0; + for (u32 i = 1; i + 1 < b->ninsts; ++i) { + if ((IROp)b->insts[i].op == IR_CALL && + (IROp)b->insts[i - 1u].op == IR_STORE && + (IROp)b->insts[i + 1u].op == IR_LOAD) { + saw_call_save_restore = 1; + } + } + EXPECT(saw_call_save_restore, + "live hard reg across call should be stored before and loaded after"); + tc_fini(&tc); +} + +static void opt_spill_pressure(void) { + TestCtx tc; + tc_init(&tc); + Func* f = new_func(&tc); + setup_one_int_reg(f); + Val a = add_val(f, tc.i32); + Val b = add_val(f, tc.i32); + Val c = add_val(f, tc.i32); + Val d = add_val(f, tc.i32); + emit_load_imm(f, f->entry, a, tc.i32, 1); + emit_load_imm(f, f->entry, b, tc.i32, 2); + emit_load_imm(f, f->entry, c, tc.i32, 3); + emit_binop(f, f->entry, d, a, b, tc.i32); + emit_binop(f, f->entry, d, d, c, tc.i32); + ir_emit(f, f->entry, IR_RET); + opt_build_cfg(f); + opt_live_info(f); + opt_regalloc(f, 0); + + int spills = 0; + for (Val v = 1; v < f->nvals; ++v) + if (f->val_info[v].alloc_kind == OPT_ALLOC_SPILL) ++spills; + EXPECT(spills >= 2, "one hard reg should force multiple spills, got %d", + spills); + EXPECT(count_op(f, IR_LOAD) >= 2, "spilled uses should reload"); + EXPECT(count_op(f, IR_STORE) >= 2, "spilled defs should store"); + tc_fini(&tc); +} + +static void opt_inline_asm_tied_fixed_regs(void) { + TestCtx tc; + tc_init(&tc); + Func* f = new_func(&tc); + setup_one_int_reg(f); + Val tied = add_val(f, tc.i32); + emit_load_imm(f, f->entry, tied, tc.i32, 5); + + Inst* in = ir_emit(f, f->entry, IR_ASM_BLOCK); + IRAsmAux* aux = arena_znew(f->arena, IRAsmAux); + aux->tmpl = "add %0, %0, #1"; + aux->nout = 1; + aux->nin = 1; + aux->outs = arena_array(f->arena, AsmConstraint, 1); + aux->ins = arena_array(f->arena, AsmConstraint, 1); + aux->out_ops = arena_array(f->arena, Operand, 1); + aux->in_ops = arena_array(f->arena, Operand, 1); + memset(aux->outs, 0, sizeof(AsmConstraint)); + memset(aux->ins, 0, sizeof(AsmConstraint)); + aux->outs[0].str = "+r"; + aux->outs[0].dir = ASM_INOUT; + aux->outs[0].type = tc.i32; + aux->ins[0].str = "0"; + aux->ins[0].dir = ASM_IN; + aux->ins[0].type = tc.i32; + aux->out_ops[0] = op_reg_(tied, tc.i32); + aux->in_ops[0] = op_reg_(tied, tc.i32); + in->extra.aux = aux; + ir_emit(f, f->entry, IR_RET); + + opt_build_cfg(f); + opt_live_info(f); + f->val_info[tied].tied_hard_reg = 19; + opt_regalloc(f, 0); + aux = (IRAsmAux*)in->extra.aux; + EXPECT(f->val_info[tied].alloc_kind == OPT_ALLOC_HARD, + "tied asm val should allocate hard"); + EXPECT(f->val_info[tied].hard_reg == 19, + "tied asm val should get r19"); + EXPECT(aux->out_ops[0].v.reg == 19 && aux->in_ops[0].v.reg == 19, + "asm tied operands should rewrite to the fixed hard reg"); + tc_fini(&tc); +} + +static void opt_post_rewrite_dce(void) { + TestCtx tc; + tc_init(&tc); + Func* f = new_func(&tc); + setup_one_int_reg(f); + Val a = add_val(f, tc.i32); + emit_load_imm(f, f->entry, a, tc.i32, 1); + emit_copy(f, f->entry, a, a, tc.i32); + ir_emit(f, f->entry, IR_NOP); + ir_emit(f, f->entry, IR_RET); + opt_build_cfg(f); + opt_live_info(f); + opt_regalloc(f, 0); + opt_combine(f); + opt_dce(f); + EXPECT(count_op(f, IR_COPY) == 0, "noop physical copy should be removed"); + EXPECT(count_op(f, IR_NOP) == 0, "post-rewrite DCE should remove nops"); + tc_fini(&tc); +} + +typedef struct MockEmit { + CGTarget base; + int alloc_calls; + int load_imm_calls; + Reg last_dst; +} MockEmit; + +static Reg me_alloc(CGTarget* t, RegClass cls, const Type* ty) { + (void)cls; + (void)ty; + MockEmit* m = (MockEmit*)t; + ++m->alloc_calls; + return 1000u + (Reg)m->alloc_calls; +} +static void me_func_begin(CGTarget* t, const CGFuncDesc* d) { + (void)t; + (void)d; +} +static void me_func_end(CGTarget* t) { (void)t; } +static void me_load_imm(CGTarget* t, Operand dst, i64 imm) { + (void)imm; + MockEmit* m = (MockEmit*)t; + ++m->load_imm_calls; + m->last_dst = dst.v.reg; +} +static void me_ret(CGTarget* t, const CGABIValue* v) { + (void)t; + (void)v; +} +static FrameSlot me_frame_slot(CGTarget* t, const FrameSlotDesc* d) { + (void)t; + (void)d; + static FrameSlot next = 1; + return next++; +} +static void me_set_loc(CGTarget* t, SrcLoc loc) { + (void)t; + (void)loc; +} + +static void mock_emit_init(MockEmit* m, Compiler* c) { + memset(m, 0, sizeof *m); + m->base.c = c; + m->base.func_begin = me_func_begin; + m->base.func_end = me_func_end; + m->base.alloc_reg = me_alloc; + m->base.frame_slot = me_frame_slot; + m->base.load_imm = me_load_imm; + m->base.ret = me_ret; + m->base.set_loc = me_set_loc; +} + +static void opt_emit_no_virtual_alloc(void) { + TestCtx tc; + tc_init(&tc); + Func* f = new_func(&tc); + setup_one_int_reg(f); + Val a = add_val(f, tc.i32); + emit_load_imm(f, f->entry, a, tc.i32, 42); + ir_emit(f, f->entry, IR_RET); + opt_build_cfg(f); + opt_live_info(f); + opt_regalloc(f, 0); + MockEmit m; + mock_emit_init(&m, tc.c); + opt_emit(tc.c, f, &m.base); + EXPECT(m.alloc_calls == 0, "opt_emit should not allocate virtual regs"); + EXPECT(m.load_imm_calls == 1, "expected one emitted load_imm"); + EXPECT(m.last_dst == 19, "emitted hard dst should be r19, got r%u", + (unsigned)m.last_dst); + tc_fini(&tc); +} + +int main(void) { + opt_liveness_branch(); + opt_regalloc_priority(); + opt_rewrite_spill_use_def(); + opt_call_clobber_preservation(); + opt_spill_pressure(); + opt_inline_asm_tied_fixed_regs(); + opt_post_rewrite_dce(); + opt_emit_no_virtual_alloc(); + if (g_fails) { + fprintf(stderr, "opt tests: %d failed (%d checks)\n", g_fails, g_checks); + return 1; + } + printf("opt tests: PASS (%d checks)\n", g_checks); + return 0; +} diff --git a/test/opt/run.sh b/test/opt/run.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT="$(cd "$(dirname "$0")/../.." && pwd)" +"$ROOT/build/test/opt_test" diff --git a/test/test.mk b/test/test.mk @@ -29,7 +29,7 @@ # parse_asm / cfree_disasm_iter_* are still stubs; the harness builds # and runs end-to-end so the wiring stays exercised. See doc/ASM.md. -.PHONY: test test-lex test-pp test-pp-err test-elf test-ar test-ar-driver test-link test-cg test-cg-binder test-dwarf test-debug test-parse test-parse-err test-asm test-isa test-aa64-inline test-libc test-musl test-glibc test-lib-deps test-smoke-x64 test-smoke-rv64 +.PHONY: test test-lex test-pp test-pp-err test-elf test-ar test-ar-driver test-link test-cg test-cg-binder test-opt test-dwarf test-debug test-parse test-parse-err test-asm test-isa test-aa64-inline test-libc test-musl test-glibc test-lib-deps test-smoke-x64 test-smoke-rv64 test: test-lex test-pp test-pp-err test-elf test-ar test-ar-driver test-link test-cg test-cg-binder test-dwarf test-debug test-parse test-parse-err test-asm test-isa test-aa64-inline test-lib-deps @@ -177,6 +177,15 @@ test-link: lib $(ROUNDTRIP_BIN) $(ROUNDTRIP_BIN_MACHO) $(LINK_EXE_RUNNER) $(JIT_ test-cg: lib $(ROUNDTRIP_BIN) $(LINK_EXE_RUNNER) $(JIT_RUNNER) bash test/cg/run.sh +OPT_TEST_BIN = build/test/opt_test + +test-opt: $(OPT_TEST_BIN) + bash test/opt/run.sh + +$(OPT_TEST_BIN): test/opt/opt_test.c $(LIB_AR) + @mkdir -p $(dir $@) + $(CC) $(DRIVER_CFLAGS) -Isrc test/opt/opt_test.c $(LIB_AR) -o $@ + test-parse: lib $(PARSE_RUNNER) $(ROUNDTRIP_BIN) $(LINK_EXE_RUNNER) $(JIT_RUNNER) bash test/parse/run.sh