kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit f60a16d14658662018ec245649772ee7990d67ba
parent a691bcbf26887ba8ddcb59d1ebbf17408d2a3fca
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Tue, 26 May 2026 17:53:43 -0700

opt: rewrite pipeline to consume CgIrFunc input; replace pass_emit with NativeTarget

Fundamental rearchitecture of the optimizer pipeline:

- cg_ir_lower.c (new): converts a completed CgIrFunc semantic recording into
  the optimizer's pseudo-register/frame-slot Func representation. This is the
  new pipeline intake point, replacing the old CGTarget-intercepting recorder.

- pass_addr_fold.c (new): address-folding optimizations extracted from pass_o2
  so they run at every optimization level from O1 up, not just at O2.

- pass_native_emit.c (new): replaces the deleted pass_emit.c (1384 lines).
  Drives a NativeTarget instead of replaying through a CGTarget, handling
  local-static-data sequences, frame-slot mapping, and label placement.

- pass_emit.c (deleted): old CGTarget replay loop, superseded above.

- opt.c: rewritten from ~1700 lines to ~680. Sheds CGTarget-wrapping recorder
  logic; entry point now accepts CgIrFunc and runs:
  cg_ir_lower → addr_fold → [O2] → lower → machinize → native_emit.

- pass_o2.c: addr-folding logic removed (now in pass_addr_fold).
- pass_machinize.c, pass_lower.c: updated for NativeTarget interface.
- pass_analysis.c, pass_cfg.c, pass_dce.c, ir_print.c: handle new
  local-static-data IR ops.

Includes cg_ir_lower_test.c: unit test for the CgIrFunc → Func conversion.

Diffstat:
A src/opt/cg_ir_lower.c | 1082 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/opt/ir_print.c | 10 +++++++++-
M src/opt/opt.c | 2124 +++++++++++++++++++++----------------------------------------------------------
A src/opt/pass_addr_fold.c | 760 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/opt/pass_analysis.c | 12 +++++++-----
M src/opt/pass_cfg.c | 6 ++++--
M src/opt/pass_dce.c | 30 ++++++++++++++++++++++++++++++
D src/opt/pass_emit.c | 1384 -------------------------------------------------------------------------------
M src/opt/pass_lower.c | 65 ++++++++++++++++++++++++++++++++++++++---------------------------
M src/opt/pass_machinize.c | 168 ++++++++++++++++++++++++-------------------------------------------------------
A src/opt/pass_native_emit.c | 1219 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/opt/pass_o2.c | 735 -------------------------------------------------------------------------------
A test/opt/cg_ir_lower_test.c | 199 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
13 files changed, 3951 insertions(+), 3843 deletions(-)

diff --git a/src/opt/cg_ir_lower.c b/src/opt/cg_ir_lower.c @@ -0,0 +1,1082 @@ +#include <string.h> + +#include "cg/ir.h" +#include "cg/type.h" +#include "opt/opt_internal.h" + +#undef Operand +#undef CGParamDesc +#undef CGCallDesc +#undef CGFuncDesc +#undef CGLocalStorage +#undef FrameSlotDesc + +typedef struct OptLocalMap { + OptCGLocalStorage storage; + NativeFrameSlot home_slot; + CfreeCgTypeId type; + u32 size; + u32 align; + u8 cls; + u8 address_taken; + u8 pad[2]; +} OptLocalMap; + +typedef struct CgIrLower { + Compiler* c; + const CgIrFunc* src; + Func* f; + OptLocalMap* locals; + u32 nlocals; + u32* label_block; + u32 nlabels; + u32* inst_block; + u8* leader; +} CgIrLower; + +static _Noreturn void lower_panic(CgIrLower* l, SrcLoc loc, const char* msg) { + compiler_panic(l->c, loc, "opt cg-ir lower: %s", msg); +} + +static u8 local_reg_class(Compiler* c, CfreeCgTypeId ty) { + return cg_type_is_float(c, ty) ? RC_FP : RC_INT; +} + +static OptCGFuncDesc lower_func_desc(Arena* a, const struct CGFuncDesc* in) { + OptCGFuncDesc out; + memset(&out, 0, sizeof out); + if (!in) return out; + out.sym = in->sym; + out.text_section_id = in->text_section_id; + out.group_id = in->group_id; + out.fn_type = in->fn_type; + out.result_types = in->result_types; + out.nresults = in->nresults; + out.nparams = in->nparams; + out.loc = in->loc; + out.flags = in->flags; + out.inline_policy = in->inline_policy; + out.atomize = in->atomize; + if (in->nparams && in->params) { + OptCGParamDesc* params = arena_zarray(a, OptCGParamDesc, in->nparams); + for (u32 i = 0; i < in->nparams; ++i) { + params[i].index = in->params[i].index; + params[i].name = in->params[i].name; + params[i].type = in->params[i].type; + params[i].size = in->params[i].size; + params[i].align = in->params[i].align; + params[i].flags = in->params[i].flags; + params[i].loc = in->params[i].loc; + } + out.params = params; + } + return out; +} + +static NativeFrameSlotDesc local_slot_desc(const CgIrLocal* in, u8 kind) { + 
NativeFrameSlotDesc out; + memset(&out, 0, sizeof out); + out.type = in->desc.type; + out.name = in->desc.name; + out.loc = in->desc.loc; + out.size = in->desc.size; + out.align = in->desc.align; + out.kind = kind; + if (in->address_taken || (in->desc.flags & CG_LOCAL_ADDR_TAKEN)) + out.flags |= FSF_ADDR_TAKEN; + if (in->desc.flags & CG_LOCAL_MEMORY_REQUIRED) + out.flags |= FSF_MEMORY_REQUIRED; + return out; +} + +static OptLocalMap* local_map(CgIrLower* l, CGLocal id, SrcLoc loc) { + if (id == CG_LOCAL_NONE || id > l->nlocals) + lower_panic(l, loc, "bad semantic local"); + return &l->locals[id - 1u]; +} + +static int local_needs_home(const CgIrLocal* in) { + return in->address_taken || + (in->desc.flags & (CG_LOCAL_ADDR_TAKEN | CG_LOCAL_MEMORY_REQUIRED)); +} + +static int operand_uses_local_addr(const Operand* op, CGLocal local) { + if (!op) return 0; + if (op->kind == OPK_LOCAL) return op->v.local == local; + return 0; +} + +static int local_address_used_in_cg_ir(const CgIrFunc* f, CGLocal local) { + for (u32 i = 0; i < f->ninsts; ++i) { + const CgIrInst* in = &f->insts[i]; + switch ((CgIrOp)in->op) { + case CG_IR_LOAD: + case CG_IR_BITFIELD_LOAD: + if (in->nopnds > 1u && operand_uses_local_addr(&in->opnds[1], local)) + return 1; + break; + case CG_IR_STORE: + case CG_IR_AGG_SET: + case CG_IR_BITFIELD_STORE: + if (in->nopnds > 0u && operand_uses_local_addr(&in->opnds[0], local)) + return 1; + break; + case CG_IR_ADDR_OF: + if (in->nopnds > 1u && operand_uses_local_addr(&in->opnds[1], local)) + return 1; + break; + case CG_IR_AGG_COPY: + case CG_IR_VA_COPY: + if ((in->nopnds > 0u && + operand_uses_local_addr(&in->opnds[0], local)) || + (in->nopnds > 1u && operand_uses_local_addr(&in->opnds[1], local))) + return 1; + break; + case CG_IR_VA_START: + case CG_IR_VA_END: + if (in->nopnds > 0u && operand_uses_local_addr(&in->opnds[0], local)) + return 1; + break; + default: + break; + } + } + return 0; +} + +static void lower_locals(CgIrLower* l) { + l->nlocals = 
l->src->nlocals; + l->locals = + arena_zarray(l->f->arena, OptLocalMap, l->nlocals ? l->nlocals : 1u); + for (u32 i = 0; i < l->src->nlocals; ++i) { + const CgIrLocal* in = &l->src->locals[i]; + OptLocalMap* m; + if (in->id == CG_LOCAL_NONE || in->id > l->src->nlocals) + lower_panic(l, in->desc.loc, "non-dense semantic local table"); + m = &l->locals[in->id - 1u]; + m->type = in->desc.type; + m->size = in->desc.size; + m->align = in->desc.align; + m->cls = local_reg_class(l->c, in->desc.type); + m->address_taken = + local_needs_home(in) || local_address_used_in_cg_ir(l->src, in->id); + + PReg r = ir_alloc_preg(l->f, in->desc.type, m->cls); + if (m->address_taken) { + m->storage.kind = CG_LOCAL_STORAGE_FRAME; + } else { + m->storage.kind = CG_LOCAL_STORAGE_REG; + m->storage.v.reg = (Reg)r; + } + + if (m->address_taken) { + NativeFrameSlotDesc fsd = + local_slot_desc(in, in->is_param ? FS_PARAM : FS_LOCAL); + m->home_slot = ir_frame_slot_new(l->f, &fsd); + m->storage.v.frame_slot = m->home_slot; + } else { + m->home_slot = FRAME_SLOT_NONE; + } + (void)ir_local_add(l->f, &in->desc, m->storage); + l->f->locals[l->f->nlocals - 1u].address_taken = m->address_taken; + l->f->locals[l->f->nlocals - 1u].home_slot = m->home_slot; + } +} + +static const CgIrParam* find_param(const CgIrFunc* f, CGLocal local) { + for (u32 i = 0; i < f->nparams; ++i) + if (f->params[i].local == local) return &f->params[i]; + return NULL; +} + +static void lower_params(CgIrLower* l) { + for (u32 i = 0; i < l->src->nlocals; ++i) { + const CgIrLocal* loc = &l->src->locals[i]; + if (!loc->is_param) continue; + const CgIrParam* p = find_param(l->src, loc->id); + OptLocalMap* m = local_map(l, loc->id, loc->desc.loc); + OptCGParamDesc d; + memset(&d, 0, sizeof d); + if (p) { + d.index = p->desc.index; + d.name = p->desc.name; + d.type = p->desc.type; + d.size = p->desc.size; + d.align = p->desc.align; + d.flags = p->desc.flags; + d.loc = p->desc.loc; + } else { + d.index = loc->param_index; + d.name = 
loc->desc.name; + d.type = loc->desc.type; + d.size = loc->desc.size; + d.align = loc->desc.align; + d.flags = loc->desc.flags; + d.loc = loc->desc.loc; + } + d.storage = m->storage; + ir_param_add(l->f, &d); + } +} + +static int cg_inst_terminates(const CgIrInst* in) { + if (!in) return 0; + switch ((CgIrOp)in->op) { + case CG_IR_BR: + case CG_IR_RET: + case CG_IR_CMP_BRANCH: + case CG_IR_SWITCH: + case CG_IR_INDIRECT_BRANCH: + case CG_IR_BREAK_TO: + case CG_IR_CONTINUE_TO: + return 1; + case CG_IR_INTRINSIC: { + const CgIrIntrinsicAux* aux = (const CgIrIntrinsicAux*)in->extra.aux; + return aux && (aux->kind == INTRIN_LONGJMP || aux->kind == INTRIN_TRAP || + aux->kind == INTRIN_UNREACHABLE); + } + default: + return 0; + } +} + +static u32 label_id_max(const CgIrFunc* f) { + u32 max = 0; + for (u32 i = 0; i < f->nlabels; ++i) + if (f->labels[i].id > max) max = f->labels[i].id; + return max; +} + +static void mark_label_leader(CgIrLower* l, Label label, const u32* place) { + if (label == LABEL_NONE || label > l->nlabels || place[label] == UINT32_MAX) + return; + l->leader[place[label]] = 1; +} + +static void mark_leaders(CgIrLower* l, u32* label_place) { + const CgIrFunc* f = l->src; + for (u32 i = 0; i <= f->ninsts; ++i) l->leader[i] = 0; + if (f->ninsts) l->leader[0] = 1; + for (u32 i = 0; i < f->ninsts; ++i) { + const CgIrInst* in = &f->insts[i]; + if ((CgIrOp)in->op == CG_IR_LABEL) { + Label label = (Label)in->extra.imm; + l->leader[i] = 1; + if (label && label <= l->nlabels && label_place[label] == UINT32_MAX) + label_place[label] = i; + } + } + for (u32 i = 0; i < f->ninsts; ++i) { + const CgIrInst* in = &f->insts[i]; + if (cg_inst_terminates(in) && i + 1u < f->ninsts) l->leader[i + 1u] = 1; + switch ((CgIrOp)in->op) { + case CG_IR_BR: + case CG_IR_LOAD_LABEL_ADDR: + mark_label_leader(l, (Label)in->extra.imm, label_place); + break; + case CG_IR_CMP_BRANCH: { + CgIrCmpBranchAux* aux = (CgIrCmpBranchAux*)in->extra.aux; + if (i + 1u < f->ninsts) l->leader[i + 1u] 
= 1; + if (aux) mark_label_leader(l, aux->target, label_place); + break; + } + case CG_IR_SWITCH: { + CgIrSwitchAux* aux = (CgIrSwitchAux*)in->extra.aux; + if (i + 1u < f->ninsts) l->leader[i + 1u] = 1; + if (aux) { + mark_label_leader(l, aux->default_label, label_place); + for (u32 c = 0; c < aux->ncases; ++c) + mark_label_leader(l, aux->cases[c].label, label_place); + } + break; + } + case CG_IR_INDIRECT_BRANCH: { + CgIrIndirectAux* aux = (CgIrIndirectAux*)in->extra.aux; + if (aux) { + for (u32 t = 0; t < aux->ntargets; ++t) + mark_label_leader(l, aux->targets[t], label_place); + } + break; + } + case CG_IR_SCOPE_BEGIN: + if (i + 1u < f->ninsts) l->leader[i + 1u] = 1; + break; + case CG_IR_SCOPE_ELSE: + case CG_IR_SCOPE_END: + l->leader[i] = 1; + if (i + 1u < f->ninsts) l->leader[i + 1u] = 1; + break; + default: + break; + } + } +} + +static void make_blocks(CgIrLower* l, const u32* label_place) { + const CgIrFunc* f = l->src; + u32 cur = UINT32_MAX; + l->inst_block = arena_zarray(l->f->arena, u32, f->ninsts ? f->ninsts : 1u); + for (u32 i = 0; i < f->ninsts; ++i) { + if (l->leader[i] || cur == UINT32_MAX) { + cur = ir_block_new(l->f); + ir_note_emit(l->f, cur); + if (l->f->nblocks == 1u) l->f->entry = cur; + } + l->inst_block[i] = cur; + } + l->label_block = + arena_zarray(l->f->arena, u32, l->nlabels ? l->nlabels + 1u : 1u); + for (u32 i = 0; i <= l->nlabels; ++i) l->label_block[i] = UINT32_MAX; + for (u32 label = 1; label <= l->nlabels; ++label) { + if (label_place[label] != UINT32_MAX) { + u32 place = label_place[label]; + l->label_block[label] = (place + 1u < f->ninsts) + ? 
l->inst_block[place + 1u] + : l->inst_block[place]; + } else { + l->label_block[label] = ir_block_new(l->f); + } + } + if (!l->f->nblocks) { + l->f->entry = ir_block_new(l->f); + ir_note_emit(l->f, l->f->entry); + } + l->f->emit_order_n = 0; + for (u32 i = 0; i < f->ninsts; ++i) ir_note_emit(l->f, l->inst_block[i]); + if (!f->ninsts) ir_note_emit(l->f, l->f->entry); +} + +static void emit_param_decls(CgIrLower* l) { + if (!l->f->nparams || l->f->entry >= l->f->nblocks) return; + for (u32 i = 0; i < l->f->nparams; ++i) { + IRParam* p = &l->f->params[i]; + Inst* in = ir_emit(l->f, l->f->entry, IR_PARAM_DECL); + IRParamDeclAux* aux = arena_znew(l->f->arena, IRParamDeclAux); + in->loc = p->loc; + in->type = p->type; + if (p->storage.kind == CG_LOCAL_STORAGE_REG) in->def = p->storage.v.reg; + memset(aux, 0, sizeof *aux); + aux->desc.index = p->index; + aux->desc.name = p->name; + aux->desc.type = p->type; + aux->desc.size = p->size; + aux->desc.align = p->align; + aux->desc.flags = p->flags; + aux->desc.loc = p->loc; + aux->desc.storage = p->storage; + aux->desc.abi = p->abi; + in->extra.aux = aux; + } +} + +static u32 block_for_label(CgIrLower* l, Label label, SrcLoc loc) { + if (label == LABEL_NONE || label > l->nlabels || + l->label_block[label] == UINT32_MAX) + lower_panic(l, loc, "bad label"); + return l->label_block[label]; +} + +static u32 fallthrough_block(CgIrLower* l, u32 inst_index) { + if (inst_index + 1u >= l->src->ninsts) return UINT32_MAX; + return l->inst_block[inst_index + 1u]; +} + +static void set_succ1(CgIrLower* l, u32 block, u32 succ) { + if (succ == UINT32_MAX) { + l->f->blocks[block].nsucc = 0; + return; + } + l->f->blocks[block].succ[0] = succ; + l->f->blocks[block].nsucc = 1; +} + +static OptOperand* dup_opt_ops(CgIrLower* l, const OptOperand* ops, u32 n) { + if (!n) return NULL; + OptOperand* out = arena_array(l->f->arena, OptOperand, n); + memcpy(out, ops, sizeof(*out) * n); + return out; +} + +static OptOperand opt_reg_operand(OptLocalMap* 
m) { + OptOperand out; + memset(&out, 0, sizeof out); + out.kind = OPK_REG; + out.cls = m->cls; + out.type = m->type; + out.v.reg = m->storage.v.reg; + return out; +} + +static OptOperand opt_frame_operand(OptLocalMap* m) { + OptOperand out; + memset(&out, 0, sizeof out); + out.kind = OPK_LOCAL; + out.cls = RC_INT; + out.type = m->type; + out.v.frame_slot = m->home_slot; + return out; +} + +static OptOperand lower_operand_value(CgIrLower* l, const Operand* in, + SrcLoc loc); + +static OptOperand lower_operand_addr(CgIrLower* l, const Operand* in, + SrcLoc loc) { + OptOperand out; + memset(&out, 0, sizeof out); + if (!in) return out; + out.type = in->type; + switch ((OpKind)in->kind) { + case OPK_LOCAL: { + OptLocalMap* m = local_map(l, in->v.local, loc); + if (m->home_slot == FRAME_SLOT_NONE) { + const CgIrLocal* src = &l->src->locals[in->v.local - 1u]; + NativeFrameSlotDesc fsd = + local_slot_desc(src, src->is_param ? FS_PARAM : FS_LOCAL); + m->home_slot = ir_frame_slot_new(l->f, &fsd); + m->address_taken = 1; + if (in->v.local - 1u < l->f->nlocals) { + l->f->locals[in->v.local - 1u].address_taken = 1; + l->f->locals[in->v.local - 1u].home_slot = m->home_slot; + } + } + return opt_frame_operand(m); + } + case OPK_GLOBAL: + out.kind = OPK_GLOBAL; + out.cls = RC_INT; + out.v.global.sym = in->v.global.sym; + out.v.global.addend = in->v.global.addend; + return out; + case OPK_INDIRECT: { + OptLocalMap* base = local_map(l, in->v.ind.base, loc); + out.kind = OPK_INDIRECT; + out.cls = RC_INT; + out.v.ind.base = base->storage.v.reg; + out.v.ind.index = REG_NONE; + if (in->v.ind.index != CG_LOCAL_NONE) { + OptLocalMap* idx = local_map(l, in->v.ind.index, loc); + out.v.ind.index = idx->storage.v.reg; + } + out.v.ind.log2_scale = in->v.ind.log2_scale; + out.v.ind.ofs = in->v.ind.ofs; + return out; + } + case OPK_IMM: + default: + lower_panic(l, loc, "operand is not addressable"); + } +} + +static OptOperand lower_operand_value(CgIrLower* l, const Operand* in, + SrcLoc loc) { 
+ OptOperand out; + memset(&out, 0, sizeof out); + if (!in) return out; + out.type = in->type; + switch ((OpKind)in->kind) { + case OPK_IMM: + out.kind = OPK_IMM; + out.cls = RC_INT; + out.v.imm = in->v.imm; + return out; + case OPK_LOCAL: { + OptLocalMap* m = local_map(l, in->v.local, loc); + return m->address_taken ? opt_frame_operand(m) : opt_reg_operand(m); + } + case OPK_GLOBAL: + out.kind = OPK_GLOBAL; + out.cls = RC_INT; + out.v.global.sym = in->v.global.sym; + out.v.global.addend = in->v.global.addend; + return out; + case OPK_INDIRECT: + return lower_operand_addr(l, in, loc); + default: + lower_panic(l, loc, "bad operand kind"); + } +} + +static void set_inst_def(Inst* out, const OptOperand* op) { + if (op && op->kind == OPK_REG) { + out->def = (Val)op->v.reg; + out->type = op->type; + } +} + +/* Lower `n` value operands. When `defs_first` is set, opnds[0] is the + * instruction's destination (def); otherwise all operands are uses. Branch + * terminators (CMP_BRANCH, SWITCH, INDIRECT_BRANCH) read their first operand + * and define nothing, so they must pass defs_first=0 -- otherwise dead-def + * elimination treats the branch as a redefinition of the tested value and + * removes the real producer. 
*/ +static void lower_value_ops_ex(CgIrLower* l, Inst* out, const CgIrInst* in, + u32 n, int defs_first) { + OptOperand tmp[5]; + if (n > 5u) lower_panic(l, in->loc, "too many operands"); + for (u32 i = 0; i < n; ++i) + tmp[i] = lower_operand_value(l, &in->opnds[i], in->loc); + out->opnds = dup_opt_ops(l, tmp, n); + out->nopnds = n; + if (n && defs_first) set_inst_def(out, &out->opnds[0]); +} + +static void lower_value_ops(CgIrLower* l, Inst* out, const CgIrInst* in, + u32 n) { + lower_value_ops_ex(l, out, in, n, 1); +} + +static void lower_use_ops(CgIrLower* l, Inst* out, const CgIrInst* in, u32 n) { + lower_value_ops_ex(l, out, in, n, 0); +} + +static void lower_addr_value_ops(CgIrLower* l, Inst* out, const CgIrInst* in, + u32 naddr, u32 nvalue) { + OptOperand tmp[5]; + u32 n = naddr + nvalue; + if (n > 5u) lower_panic(l, in->loc, "too many operands"); + for (u32 i = 0; i < naddr; ++i) + tmp[i] = lower_operand_addr(l, &in->opnds[i], in->loc); + for (u32 i = 0; i < nvalue; ++i) + tmp[naddr + i] = lower_operand_value(l, &in->opnds[naddr + i], in->loc); + out->opnds = dup_opt_ops(l, tmp, n); + out->nopnds = n; +} + +static OptCGABIValue abi_value_for_local(CgIrLower* l, CGLocal local, + SrcLoc loc) { + OptCGABIValue out; + memset(&out, 0, sizeof out); + OptLocalMap* m = local_map(l, local, loc); + out.type = m->type; + out.storage = m->address_taken ? 
opt_frame_operand(m) : opt_reg_operand(m); + return out; +} + +static void lower_call(CgIrLower* l, Inst* out, const CgIrInst* in) { + const CgIrCallAux* src = (const CgIrCallAux*)in->extra.aux; + IRCallAux* aux = arena_znew(l->f->arena, IRCallAux); + memset(aux, 0, sizeof *aux); + if (!src) { + out->extra.aux = aux; + return; + } + aux->desc.fn_type = src->desc.fn_type; + aux->desc.callee = lower_operand_value(l, &src->desc.callee, in->loc); + aux->desc.nargs = src->desc.nargs; + aux->desc.flags = src->desc.flags; + aux->desc.tail_policy = src->desc.tail_policy; + aux->desc.inline_policy = src->desc.inline_policy; + if (src->desc.nargs) { + aux->desc.args = arena_zarray(l->f->arena, OptCGABIValue, src->desc.nargs); + for (u32 i = 0; i < src->desc.nargs; ++i) + aux->desc.args[i] = abi_value_for_local(l, src->desc.args[i], in->loc); + } + if (src->desc.nresults) { + aux->desc.ret = abi_value_for_local(l, src->desc.results[0], in->loc); + set_inst_def(out, &aux->desc.ret.storage); + } + out->type = src->desc.fn_type; + out->extra.aux = aux; +} + +static void lower_ret(CgIrLower* l, Inst* out, const CgIrInst* in) { + const CgIrRetAux* src = (const CgIrRetAux*)in->extra.aux; + IRRetAux* aux = arena_znew(l->f->arena, IRRetAux); + if (src && src->nvalues) { + aux->present = 1; + aux->val = abi_value_for_local(l, src->values[0], in->loc); + } + out->extra.aux = aux; +} + +static void lower_intrinsic(CgIrLower* l, Inst* out, const CgIrInst* in) { + const CgIrIntrinsicAux* src = (const CgIrIntrinsicAux*)in->extra.aux; + IRIntrinAux* aux = arena_znew(l->f->arena, IRIntrinAux); + if (src) { + aux->kind = src->kind; + aux->ndst = src->ndst; + aux->narg = src->narg; + aux->dsts = + src->ndst ? arena_array(l->f->arena, OptOperand, src->ndst) : NULL; + aux->args = + src->narg ? 
arena_array(l->f->arena, OptOperand, src->narg) : NULL; + for (u32 i = 0; i < src->ndst; ++i) + aux->dsts[i] = lower_operand_value(l, &src->dsts[i], in->loc); + for (u32 i = 0; i < src->narg; ++i) + aux->args[i] = lower_operand_value(l, &src->args[i], in->loc); + if (src->ndst) { + out->ndefs = src->ndst; + out->defs = arena_array(l->f->arena, Val, src->ndst); + for (u32 i = 0; i < src->ndst; ++i) + out->defs[i] = aux->dsts[i].kind == OPK_REG ? aux->dsts[i].v.reg : 0; + out->def = out->defs[0]; + out->type = aux->dsts[0].type; + } + } + out->extra.aux = aux; +} + +static void lower_asm(CgIrLower* l, Inst* out, const CgIrInst* in) { + const CgIrAsmAux* src = (const CgIrAsmAux*)in->extra.aux; + IRAsmAux* aux = arena_znew(l->f->arena, IRAsmAux); + if (src) { + aux->tmpl = src->tmpl; + aux->outs = src->outs; + aux->ins = src->ins; + aux->clobbers = src->clobbers; + aux->nout = src->nout; + aux->nin = src->nin; + aux->nclob = src->nclob; + aux->out_ops = + src->nout ? arena_array(l->f->arena, OptOperand, src->nout) : NULL; + aux->in_ops = + src->nin ? arena_array(l->f->arena, OptOperand, src->nin) : NULL; + for (u32 i = 0; i < src->nout; ++i) + aux->out_ops[i] = lower_operand_value(l, &src->out_ops[i], in->loc); + for (u32 i = 0; i < src->nin; ++i) + aux->in_ops[i] = lower_operand_value(l, &src->in_ops[i], in->loc); + if (src->nout) { + out->ndefs = src->nout; + out->defs = arena_array(l->f->arena, Val, src->nout); + for (u32 i = 0; i < src->nout; ++i) + out->defs[i] = + aux->out_ops[i].kind == OPK_REG ? 
aux->out_ops[i].v.reg : 0; + out->def = out->defs[0]; + out->type = aux->out_ops[0].type; + } + } + out->extra.aux = aux; +} + +static void lower_one_inst(CgIrLower* l, u32 idx) { + const CgIrInst* in = &l->src->insts[idx]; + u32 block = l->inst_block[idx]; + Inst* out = NULL; + IROp op = IR_NOP; + switch ((CgIrOp)in->op) { + case CG_IR_LABEL: + return; + case CG_IR_LOAD_IMM: + op = IR_LOAD_IMM; + break; + case CG_IR_LOAD_CONST: + op = IR_LOAD_CONST; + break; + case CG_IR_COPY: + op = IR_COPY; + break; + case CG_IR_LOAD: + op = IR_LOAD; + break; + case CG_IR_STORE: + op = IR_STORE; + break; + case CG_IR_ADDR_OF: + op = IR_ADDR_OF; + break; + case CG_IR_TLS_ADDR_OF: + op = IR_TLS_ADDR_OF; + break; + case CG_IR_AGG_COPY: + op = IR_AGG_COPY; + break; + case CG_IR_AGG_SET: + op = IR_AGG_SET; + break; + case CG_IR_BITFIELD_LOAD: + op = IR_BITFIELD_LOAD; + break; + case CG_IR_BITFIELD_STORE: + op = IR_BITFIELD_STORE; + break; + case CG_IR_BINOP: + op = IR_BINOP; + break; + case CG_IR_UNOP: + op = IR_UNOP; + break; + case CG_IR_CMP: + op = IR_CMP; + break; + case CG_IR_CONVERT: + op = IR_CONVERT; + break; + case CG_IR_CALL: + op = IR_CALL; + break; + case CG_IR_RET: + op = IR_RET; + break; + case CG_IR_BR: + op = IR_BR; + break; + case CG_IR_CMP_BRANCH: + op = IR_CMP_BRANCH; + break; + case CG_IR_SWITCH: + op = IR_SWITCH; + break; + case CG_IR_INDIRECT_BRANCH: + op = IR_INDIRECT_BRANCH; + break; + case CG_IR_LOAD_LABEL_ADDR: + op = IR_LOAD_LABEL_ADDR; + break; + case CG_IR_LOCAL_STATIC_DATA_BEGIN: + op = IR_LOCAL_STATIC_DATA_BEGIN; + break; + case CG_IR_LOCAL_STATIC_DATA_WRITE: + op = IR_LOCAL_STATIC_DATA_WRITE; + break; + case CG_IR_LOCAL_STATIC_DATA_LABEL_ADDR: + op = IR_LOCAL_STATIC_DATA_LABEL_ADDR; + break; + case CG_IR_LOCAL_STATIC_DATA_END: + op = IR_LOCAL_STATIC_DATA_END; + break; + case CG_IR_SCOPE_BEGIN: + op = IR_SCOPE_BEGIN; + break; + case CG_IR_SCOPE_ELSE: + op = IR_SCOPE_ELSE; + break; + case CG_IR_SCOPE_END: + op = IR_SCOPE_END; + break; + case 
CG_IR_BREAK_TO: + op = IR_BREAK_TO; + break; + case CG_IR_CONTINUE_TO: + op = IR_CONTINUE_TO; + break; + case CG_IR_ALLOCA: + op = IR_ALLOCA; + break; + case CG_IR_VA_START: + op = IR_VA_START; + break; + case CG_IR_VA_ARG: + op = IR_VA_ARG; + break; + case CG_IR_VA_END: + op = IR_VA_END; + break; + case CG_IR_VA_COPY: + op = IR_VA_COPY; + break; + case CG_IR_ATOMIC_LOAD: + op = IR_ATOMIC_LOAD; + break; + case CG_IR_ATOMIC_STORE: + op = IR_ATOMIC_STORE; + break; + case CG_IR_ATOMIC_RMW: + op = IR_ATOMIC_RMW; + break; + case CG_IR_ATOMIC_CAS: + op = IR_ATOMIC_CAS; + break; + case CG_IR_FENCE: + op = IR_FENCE; + break; + case CG_IR_INTRINSIC: + op = IR_INTRINSIC; + break; + case CG_IR_ASM_BLOCK: + op = IR_ASM_BLOCK; + break; + default: + op = IR_NOP; + break; + } + out = ir_emit(l->f, block, op); + out->loc = in->loc; + switch ((CgIrOp)in->op) { + case CG_IR_LOAD_IMM: + lower_value_ops(l, out, in, 1); + out->extra.imm = in->extra.imm; + break; + case CG_IR_LOAD_CONST: + lower_value_ops(l, out, in, 1); + out->extra.cbytes = in->extra.cbytes; + break; + case CG_IR_COPY: + case CG_IR_BINOP: + case CG_IR_UNOP: + case CG_IR_CMP: + case CG_IR_CONVERT: + case CG_IR_ALLOCA: + case CG_IR_VA_ARG: + lower_value_ops(l, out, in, in->nopnds); + out->extra.imm = in->extra.imm; + break; + case CG_IR_LOAD: + case CG_IR_BITFIELD_LOAD: { + OptOperand ops[2]; + ops[0] = lower_operand_value(l, &in->opnds[0], in->loc); + ops[1] = lower_operand_addr(l, &in->opnds[1], in->loc); + out->opnds = dup_opt_ops(l, ops, 2); + out->nopnds = 2; + set_inst_def(out, &out->opnds[0]); + if ((CgIrOp)in->op == CG_IR_LOAD) + out->extra.mem = in->extra.mem; + else + out->extra.aux = in->extra.aux; + break; + } + case CG_IR_ATOMIC_LOAD: { + OptOperand ops[2]; + ops[0] = lower_operand_value(l, &in->opnds[0], in->loc); + ops[1] = lower_operand_value(l, &in->opnds[1], in->loc); + out->opnds = dup_opt_ops(l, ops, 2); + out->nopnds = 2; + set_inst_def(out, &out->opnds[0]); + out->extra.aux = in->extra.aux; + 
break; + } + case CG_IR_STORE: + case CG_IR_AGG_COPY: + case CG_IR_AGG_SET: + case CG_IR_BITFIELD_STORE: + lower_addr_value_ops(l, out, in, 1, in->nopnds - 1u); + if ((CgIrOp)in->op == CG_IR_STORE) + out->extra.mem = in->extra.mem; + else + out->extra.aux = in->extra.aux; + break; + case CG_IR_ATOMIC_STORE: { + OptOperand ops[2]; + ops[0] = lower_operand_value(l, &in->opnds[0], in->loc); + ops[1] = lower_operand_value(l, &in->opnds[1], in->loc); + out->opnds = dup_opt_ops(l, ops, 2); + out->nopnds = 2; + out->extra.aux = in->extra.aux; + break; + } + case CG_IR_ADDR_OF: { + OptOperand ops[2]; + ops[0] = lower_operand_value(l, &in->opnds[0], in->loc); + ops[1] = lower_operand_addr(l, &in->opnds[1], in->loc); + out->opnds = dup_opt_ops(l, ops, 2); + out->nopnds = 2; + set_inst_def(out, &out->opnds[0]); + break; + } + case CG_IR_TLS_ADDR_OF: + lower_value_ops(l, out, in, 1); + out->extra.aux = in->extra.aux; + break; + case CG_IR_CALL: + lower_call(l, out, in); + break; + case CG_IR_RET: + lower_ret(l, out, in); + l->f->blocks[block].nsucc = 0; + break; + case CG_IR_BR: + out->extra.imm = block_for_label(l, (Label)in->extra.imm, in->loc); + set_succ1(l, block, (u32)out->extra.imm); + break; + case CG_IR_CMP_BRANCH: { + CgIrCmpBranchAux* aux = (CgIrCmpBranchAux*)in->extra.aux; + lower_use_ops(l, out, in, 2); + out->extra.imm = aux ? aux->op : CMP_NE; + ir_block_set_nsucc(l->f, block, 2); + l->f->blocks[block].succ[0] = + aux ? block_for_label(l, aux->target, in->loc) : UINT32_MAX; + l->f->blocks[block].succ[1] = fallthrough_block(l, idx); + break; + } + case CG_IR_SWITCH: { + CgIrSwitchAux* src = (CgIrSwitchAux*)in->extra.aux; + IRSwitchAux* aux = arena_znew(l->f->arena, IRSwitchAux); + lower_use_ops(l, out, in, 1); + if (src) { + aux->selector_type = src->selector_type; + aux->ncases = src->ncases; + aux->hint = src->hint; + aux->has_default = src->default_label != LABEL_NONE; + aux->default_block = + aux->has_default ? 
block_for_label(l, src->default_label, in->loc) + : fallthrough_block(l, idx); + if (src->ncases) { + aux->cases = arena_array(l->f->arena, IRSwitchAuxCase, src->ncases); + for (u32 i = 0; i < src->ncases; ++i) { + aux->cases[i].value = src->cases[i].value; + aux->cases[i].block = + block_for_label(l, src->cases[i].label, in->loc); + } + } + ir_block_set_nsucc(l->f, block, src->ncases + 1u); + for (u32 i = 0; i < src->ncases; ++i) + l->f->blocks[block].succ[i] = aux->cases[i].block; + l->f->blocks[block].succ[src->ncases] = aux->default_block; + } + out->extra.aux = aux; + break; + } + case CG_IR_INDIRECT_BRANCH: { + CgIrIndirectAux* src = (CgIrIndirectAux*)in->extra.aux; + IRIndirectAux* aux = arena_znew(l->f->arena, IRIndirectAux); + lower_use_ops(l, out, in, 1); + if (src && src->ntargets) { + aux->ntargets = src->ntargets; + aux->targets = arena_array(l->f->arena, u32, src->ntargets); + ir_block_set_nsucc(l->f, block, src->ntargets); + for (u32 i = 0; i < src->ntargets; ++i) { + aux->targets[i] = block_for_label(l, src->targets[i], in->loc); + l->f->blocks[block].succ[i] = aux->targets[i]; + } + } + out->extra.aux = aux; + break; + } + case CG_IR_LOAD_LABEL_ADDR: + lower_value_ops(l, out, in, 1); + out->extra.imm = block_for_label(l, (Label)in->extra.imm, in->loc); + break; + case CG_IR_LOCAL_STATIC_DATA_BEGIN: + out->extra.aux = in->extra.aux; + break; + case CG_IR_LOCAL_STATIC_DATA_WRITE: + out->extra.aux = in->extra.aux; + break; + case CG_IR_LOCAL_STATIC_DATA_LABEL_ADDR: { + CgIrLocalStaticLabelAux* src = (CgIrLocalStaticLabelAux*)in->extra.aux; + CgIrLocalStaticLabelAux* aux = + arena_znew(l->f->arena, CgIrLocalStaticLabelAux); + if (src) { + *aux = *src; + aux->target = (Label)block_for_label(l, src->target, in->loc); + } + out->extra.aux = aux; + break; + } + case CG_IR_LOCAL_STATIC_DATA_END: + break; + case CG_IR_SCOPE_BEGIN: { + CgIrScopeAux* src = (CgIrScopeAux*)in->extra.aux; + IRScopeAux* aux = arena_znew(l->f->arena, IRScopeAux); + if (src) { + 
aux->scope_id = src->scope; + aux->desc.kind = src->desc.kind; + aux->desc.break_label = src->desc.break_label; + aux->desc.continue_label = src->desc.continue_label; + aux->desc.result_type = src->desc.result_type; + aux->desc.cond = lower_operand_value(l, &src->desc.cond, in->loc); + } + out->extra.aux = aux; + break; + } + case CG_IR_SCOPE_ELSE: + case CG_IR_SCOPE_END: + case CG_IR_BREAK_TO: + case CG_IR_CONTINUE_TO: + out->extra.imm = in->extra.imm; + break; + case CG_IR_VA_START: + case CG_IR_VA_END: + lower_addr_value_ops(l, out, in, 1, 0); + break; + case CG_IR_VA_COPY: + lower_addr_value_ops(l, out, in, 2, 0); + break; + case CG_IR_ATOMIC_RMW: + lower_value_ops(l, out, in, 3); + out->extra.aux = in->extra.aux; + break; + case CG_IR_ATOMIC_CAS: + lower_value_ops(l, out, in, 5); + out->ndefs = 2; + out->defs = arena_array(l->f->arena, Val, 2); + out->defs[0] = out->opnds[0].v.reg; + out->defs[1] = out->opnds[1].v.reg; + out->def = out->defs[0]; + out->type = out->opnds[0].type; + { + const CgIrAtomicAux* src = (const CgIrAtomicAux*)in->extra.aux; + IRCasAux* aux = arena_znew(l->f->arena, IRCasAux); + if (src) { + aux->mem = src->mem; + aux->success = src->order; + aux->failure = src->failure; + } + out->extra.aux = aux; + } + break; + case CG_IR_FENCE: + out->extra.imm = in->extra.imm; + break; + case CG_IR_INTRINSIC: + lower_intrinsic(l, out, in); + break; + case CG_IR_ASM_BLOCK: + lower_asm(l, out, in); + break; + default: + out->extra.aux = in->extra.aux; + break; + } +} + +static void add_fallthrough_succs(CgIrLower* l) { + for (u32 b = 0; b < l->f->nblocks; ++b) { + Block* bl = &l->f->blocks[b]; + if (bl->nsucc) continue; + if (bl->ninsts) { + Inst* last = &bl->insts[bl->ninsts - 1u]; + switch ((IROp)last->op) { + case IR_BR: + case IR_CONDBR: + case IR_CMP_BRANCH: + case IR_SWITCH: + case IR_INDIRECT_BRANCH: + case IR_RET: + case IR_BREAK_TO: + case IR_CONTINUE_TO: + continue; + case IR_INTRINSIC: { + IRIntrinAux* aux = (IRIntrinAux*)last->extra.aux; + 
if (aux && (aux->kind == INTRIN_LONGJMP || aux->kind == INTRIN_TRAP || + aux->kind == INTRIN_UNREACHABLE)) + continue; + break; + } + default: + break; + } + } + for (u32 i = 0; i + 1u < l->f->emit_order_n; ++i) { + if (l->f->emit_order[i] == b) { + set_succ1(l, b, l->f->emit_order[i + 1u]); + break; + } + } + } +} + +Func* opt_func_from_cg_ir(Compiler* c, const CgIrFunc* src) { + if (!c || !src) return NULL; + OptCGFuncDesc desc = lower_func_desc(c->tu, &src->desc); + Func* f = ir_func_new(c, &desc); + CgIrLower l; + memset(&l, 0, sizeof l); + l.c = c; + l.src = src; + l.f = f; + l.nlabels = label_id_max(src); + u32* label_place = + arena_array(f->arena, u32, l.nlabels ? l.nlabels + 1u : 1u); + for (u32 i = 0; i <= l.nlabels; ++i) label_place[i] = UINT32_MAX; + l.leader = arena_zarray(f->arena, u8, src->ninsts + 1u); + lower_locals(&l); + lower_params(&l); + mark_leaders(&l, label_place); + make_blocks(&l, label_place); + emit_param_decls(&l); + for (u32 i = 0; i < src->ninsts; ++i) lower_one_inst(&l, i); + add_fallthrough_succs(&l); + opt_build_cfg(f); + return f; +} diff --git a/src/opt/ir_print.c b/src/opt/ir_print.c @@ -78,6 +78,14 @@ static const char* op_name(IROp op) { return "indirect_branch"; case IR_LOAD_LABEL_ADDR: return "load_label_addr"; + case IR_LOCAL_STATIC_DATA_BEGIN: + return "local_static_data_begin"; + case IR_LOCAL_STATIC_DATA_WRITE: + return "local_static_data_write"; + case IR_LOCAL_STATIC_DATA_LABEL_ADDR: + return "local_static_data_label_addr"; + case IR_LOCAL_STATIC_DATA_END: + return "local_static_data_end"; case IR_RET: return "ret"; case IR_SCOPE_BEGIN: @@ -177,7 +185,7 @@ static void dump_operand(Writer* w, const Operand* op) { return; } strbuf_init(&sb, buf, sizeof buf); - switch ((OpKind)op->kind) { + switch ((OptOperandKind)op->kind) { case OPK_IMM: strbuf_puts(&sb, "imm:"); strbuf_put_i64(&sb, (i64)op->v.imm); diff --git a/src/opt/opt.c b/src/opt/opt.c @@ -1,1698 +1,678 @@ -/* opt.c — CGTarget wrapper that records each function 
as IR (doc/OPT.md - * §1). Each CGTarget call lands as exactly one Inst in the current - * Func's current block. CG virtual registers are recorded as mutable - * pseudo-register ids (PReg); O2 turns them into Val ids with - * opt_build_reg_ssa. Labels, frame slots, and scopes keep their direct IR id - * mappings (label ↔ block id, vslot ↔ IR FrameSlot, vscope ↔ scope_aux index). - * - * OPT1: level 1 records the CGTarget stream, runs the minimal backend - * lowering schedule, rewrites virtual regs to hard regs/spill slots, - * and emits the rewritten IR into the wrapped target. - * - * Methods the wrapper rejects under unbounded virtuals: - * - spill_reg / reload_reg are CG -O0 register-pressure - * mechanics. CG never invokes them on real backends in v1, and - * they're meaningless for opt's vreg space — calling them is a - * wiring bug, so we panic loudly. */ - #include <string.h> +#include "abi/abi.h" +#include "cg/ir.h" +#include "cg/ir_recorder.h" +#include "cg/native_direct_target.h" +#include "cg/type.h" #include "core/arena.h" #include "core/core.h" #include "core/metrics.h" -#include "core/slice.h" -#include "opt/ir.h" +#include "core/strbuf.h" #include "opt/opt_internal.h" -/* ---- wrapper state ---- */ +#undef Operand +#undef CGCallDesc +#undef CGFuncDesc +#undef CGParamDesc +#undef CGScopeDesc typedef struct OptImpl { - CGTarget base; - CGTarget* target; - int level; Compiler* c; - - /* Current function being recorded. NULL between functions. 
*/ - Func* f; - u32 cur; /* current block id */ - SrcLoc pending_loc; /* most recent set_loc; stamped on each Inst */ - FuncSet funcs; - + CgTarget* target; + NativeTarget* native; + int level; Writer* dump_writer; } OptImpl; -static OptImpl* impl_of(CGTarget* t) { return (OptImpl*)t; } - -static _Noreturn void panic_unsupported(OptImpl* o, const char* what) { - SrcLoc loc = {0, 0, 0}; - compiler_panic(o->c, loc, - "opt_cgtarget: %.*s called under unbounded virtuals", - SLICE_ARG(slice_from_cstr(what))); -} - -/* ---- recording helpers ---- */ - -static Inst* rec(OptImpl* o, IROp op) { - Inst* in = ir_emit(o->f, o->cur, op); - in->loc = o->pending_loc; - return in; -} - -static void set_preg_def(Func* f, Inst* in, u32 block, PReg r, - CfreeCgTypeId t) { - (void)f; - (void)block; - in->def = (Val)r; - in->type = t; -} - -static int intrinsic_terminates(IntrinKind kind) { - return kind == INTRIN_LONGJMP || kind == INTRIN_TRAP || - kind == INTRIN_UNREACHABLE; -} - -static void ensure_operand(Func* f, const Operand* op) { - if (!op) return; - if (op->kind == OPK_REG) { - ir_ensure_preg(f, (PReg)op->v.reg, op->type, op->cls); - } else if (op->kind == OPK_INDIRECT) { - ir_ensure_preg(f, (PReg)op->v.ind.base, 0, RC_INT); - if (op->v.ind.index != (Reg)REG_NONE) - ir_ensure_preg(f, (PReg)op->v.ind.index, 0, RC_INT); - } -} - -static void ensure_abivalue(Func* f, const CGABIValue* v) { - if (!v) return; - ensure_operand(f, &v->storage); - for (u32 i = 0; i < v->nparts; ++i) ensure_operand(f, &v->parts[i].op); -} - -static Operand* dup_opnds(Func* f, const Operand* src, u32 n) { - if (!n) return NULL; - for (u32 i = 0; i < n; ++i) ensure_operand(f, &src[i]); - Operand* dst = arena_array(f->arena, Operand, n); - memcpy(dst, src, sizeof(Operand) * n); - return dst; -} - -static int cur_terminated(OptImpl* o) { - Block* b = &o->f->blocks[o->cur]; - if (b->nsucc > 0) return 1; - if (b->ninsts == 0) return 0; - Inst* last = &b->insts[b->ninsts - 1]; - if ((IROp)last->op == IR_RET) 
return 1; - if ((IROp)last->op == IR_INTRINSIC) { - IRIntrinAux* aux = (IRIntrinAux*)last->extra.aux; - return aux && intrinsic_terminates(aux->kind); +typedef struct OptReplay { + OptImpl* o; + CGLocal* local_map; + u32 nlocals; + Label* label_map; + u32 nlabels; + CGScope* scope_map; + u32 nscopes; +} OptReplay; + +static int opt_type_large_or_aggregate(Compiler* c, CfreeCgTypeId ty) { + if (!ty) return 0; + return cg_type_is_aggregate(c, ty) || abi_cg_sizeof(c->abi, ty) > 8u; +} + +static int opt_func_needs_direct_replay(OptImpl* o, const CgIrFunc* f) { + for (u32 i = 0; i < f->desc.nresults; ++i) + if (opt_type_large_or_aggregate(o->c, f->desc.result_types[i])) return 1; + for (u32 i = 0; i < f->desc.nparams; ++i) + if (opt_type_large_or_aggregate(o->c, f->desc.params[i].type)) return 1; + for (u32 i = 0; i < f->ninsts; ++i) { + const CgIrInst* in = &f->insts[i]; + switch ((CgIrOp)in->op) { + case CG_IR_ASM_BLOCK: + case CG_IR_ALLOCA: + case CG_IR_INTRINSIC: + case CG_IR_VA_START: + case CG_IR_VA_ARG: + case CG_IR_VA_END: + case CG_IR_VA_COPY: + return 1; + case CG_IR_CALL: { + const CgIrCallAux* aux = (const CgIrCallAux*)in->extra.aux; + if (!aux) break; + for (u32 a = 0; a < aux->desc.nargs; ++a) { + CGLocal local = aux->desc.args[a]; + if (local && local <= f->nlocals && + opt_type_large_or_aggregate(o->c, + f->locals[local - 1u].desc.type)) + return 1; + } + for (u32 r = 0; r < aux->desc.nresults; ++r) { + CGLocal local = aux->desc.results[r]; + if (local && local <= f->nlocals && + opt_type_large_or_aggregate(o->c, + f->locals[local - 1u].desc.type)) + return 1; + } + break; + } + default: + break; + } } return 0; } -static void set_cur(OptImpl* o, u32 b) { - o->cur = b; - ir_note_emit(o->f, b); -} - -/* After emitting a terminator, allocate a fresh block for any - * subsequent (likely unreachable) recording. 
*/ -static void after_terminator(OptImpl* o) { set_cur(o, ir_block_new(o->f)); } - -/* ---- function lifecycle ---- */ - -static void w_func_begin(CGTarget* t, const CGFuncDesc* fd) { - OptImpl* o = impl_of(t); - o->f = ir_func_new(o->c, fd); - u32 entry = ir_block_new(o->f); - o->f->entry = entry; - set_cur(o, entry); - o->pending_loc = (SrcLoc){0, 0, 0}; -} - -static void w_func_end(CGTarget* t); -static void w_addr_of(CGTarget* t, Operand dst, Operand lv); - -/* ---- registers and frame slots ---- */ - -static FrameSlot w_frame_slot(CGTarget* t, const FrameSlotDesc* d) { - OptImpl* o = impl_of(t); - return ir_frame_slot_new(o->f, d); -} - -static FrameSlot opt_local_frame_slot(Func* f, const CGLocalDesc* d, - int force_addr_taken) { - FrameSlotDesc fsd; - memset(&fsd, 0, sizeof fsd); - fsd.type = d->type; - fsd.name = d->name; - fsd.loc = d->loc; - fsd.size = d->size; - fsd.align = d->align; - fsd.kind = FS_LOCAL; - if (force_addr_taken || (d->flags & CG_LOCAL_ADDR_TAKEN)) - fsd.flags |= FSF_ADDR_TAKEN; - return ir_frame_slot_new(f, &fsd); +static Label replay_label(OptReplay* r, Label label, SrcLoc loc) { + if (label == LABEL_NONE) return LABEL_NONE; + if (label > r->nlabels || !r->label_map[label]) + compiler_panic(r->o->c, loc, "opt direct replay: bad label"); + return r->label_map[label]; } -static FrameSlot opt_param_frame_slot(Func* f, const CGParamDesc* d) { - FrameSlotDesc fsd; - memset(&fsd, 0, sizeof fsd); - fsd.type = d->type; - fsd.name = d->name; - fsd.loc = d->loc; - fsd.size = d->size; - fsd.align = d->align; - fsd.kind = FS_PARAM; - if (d->flags & CG_LOCAL_ADDR_TAKEN) fsd.flags |= FSF_ADDR_TAKEN; - return ir_frame_slot_new(f, &fsd); +static CGLocal replay_local(OptReplay* r, CGLocal local, SrcLoc loc) { + if (local == CG_LOCAL_NONE) return CG_LOCAL_NONE; + if (local > r->nlocals || !r->local_map[local]) + compiler_panic(r->o->c, loc, "opt direct replay: bad local"); + return r->local_map[local]; } -static u8 opt_local_reg_class_for(Compiler* c, 
CfreeCgTypeId ty) { - CfreeCgTypeKind kind = cfree_cg_type_kind((CfreeCompiler*)c, ty); - return kind == CFREE_CG_TYPE_FLOAT ? RC_FP : RC_INT; +static CGScope replay_scope(OptReplay* r, CGScope scope, SrcLoc loc) { + if (scope == CG_SCOPE_NONE) return CG_SCOPE_NONE; + if (scope > r->nscopes || !r->scope_map[scope]) + compiler_panic(r->o->c, loc, "opt direct replay: bad scope"); + return r->scope_map[scope]; } -static u8 opt_local_reg_class(OptImpl* o, CfreeCgTypeId ty) { - return opt_local_reg_class_for(o->c, ty); -} - -static CGLocalStorage w_local(CGTarget* t, const CGLocalDesc* d) { - OptImpl* o = impl_of(t); - CGLocalStorage st; - memset(&st, 0, sizeof st); - if (o->level < 2 && - (d->flags & (CG_LOCAL_ADDR_TAKEN | CG_LOCAL_MEMORY_REQUIRED)) == 0) { - PReg v = ir_alloc_preg(o->f, d->type, opt_local_reg_class(o, d->type)); - st.kind = CG_LOCAL_STORAGE_REG; - st.v.reg = (Reg)v; - } else { - st.kind = CG_LOCAL_STORAGE_FRAME; - st.v.frame_slot = opt_local_frame_slot(o->f, d, 0); +static Operand replay_operand(OptReplay* r, Operand in, SrcLoc loc) { + if (in.kind == OPK_LOCAL) { + in.v.local = replay_local(r, in.v.local, loc); + } else if (in.kind == OPK_INDIRECT) { + in.v.ind.base = replay_local(r, in.v.ind.base, loc); + in.v.ind.index = replay_local(r, in.v.ind.index, loc); } - ir_local_add(o->f, d, st); - return st; -} - -static IRLocal* opt_find_local_by_reg(Func* f, Reg reg) { - for (u32 i = 0; i < f->nlocals; ++i) { - IRLocal* l = &f->locals[i]; - if (l->storage.kind == CG_LOCAL_STORAGE_REG && l->storage.v.reg == reg) - return l; - } - return NULL; + return in; } -static void w_local_addr(CGTarget* t, Operand dst, const CGLocalDesc* d, - CGLocalStorage st) { - OptImpl* o = impl_of(t); - IRLocal* local = NULL; - FrameSlot frame_slot = FRAME_SLOT_NONE; - const CGLocalDesc* desc = d; - if (st.kind == CG_LOCAL_STORAGE_REG) { - local = opt_find_local_by_reg(o->f, st.v.reg); - if (!local) { - compiler_panic(o->c, d ? 
d->loc : o->pending_loc, - "opt_cgtarget: unknown register-backed local address"); +static void replay_operands(OptReplay* r, Operand* dst, const Operand* src, + u32 n, SrcLoc loc) { + for (u32 i = 0; i < n; ++i) dst[i] = replay_operand(r, src[i], loc); +} + +static CGCallDesc replay_call_desc(OptReplay* r, const CGCallDesc* src, + SrcLoc loc) { + CGCallDesc out = *src; + out.callee = replay_operand(r, src->callee, loc); + if (src->nargs) { + CGLocal* args = arena_array(r->o->c->tu, CGLocal, src->nargs); + for (u32 i = 0; i < src->nargs; ++i) + args[i] = replay_local(r, src->args[i], loc); + out.args = args; + } + if (src->nresults) { + CGLocal* results = arena_array(r->o->c->tu, CGLocal, src->nresults); + for (u32 i = 0; i < src->nresults; ++i) + results[i] = replay_local(r, src->results[i], loc); + out.results = results; + } + return out; +} + +static void replay_switch(OptReplay* r, const CgIrInst* in) { + const CgIrSwitchAux* src = (const CgIrSwitchAux*)in->extra.aux; + CGSwitchDesc d; + memset(&d, 0, sizeof d); + d.selector = replay_operand(r, in->opnds[0], in->loc); + d.selector_type = src->selector_type; + d.default_label = replay_label(r, src->default_label, in->loc); + d.ncases = src->ncases; + d.hint = src->hint; + d.opt_level = src->opt_level; + if (src->ncases) { + CGSwitchCase* cases = arena_array(r->o->c->tu, CGSwitchCase, src->ncases); + for (u32 i = 0; i < src->ncases; ++i) { + cases[i] = src->cases[i]; + cases[i].label = replay_label(r, src->cases[i].label, in->loc); } - if (local->home_slot == FRAME_SLOT_NONE) - local->home_slot = opt_local_frame_slot(o->f, &local->desc, 1); - local->address_taken = 1; - local->desc.flags |= CG_LOCAL_ADDR_TAKEN | CG_LOCAL_MEMORY_REQUIRED; - frame_slot = local->home_slot; - desc = &local->desc; - } else { - frame_slot = st.v.frame_slot; - } - Operand lv; - memset(&lv, 0, sizeof lv); - lv.kind = OPK_LOCAL; - lv.cls = RC_INT; - lv.type = desc ? 
desc->type : dst.type; - lv.v.frame_slot = frame_slot; - w_addr_of(t, dst, lv); -} - -static Operand opt_local_addr_operand(IRLocal* l) { - Operand o; - memset(&o, 0, sizeof o); - o.kind = OPK_LOCAL; - o.cls = RC_INT; - o.type = l->desc.type; - o.v.frame_slot = l->home_slot; - return o; -} - -static MemAccess opt_local_mem(IRLocal* l) { - MemAccess m; - memset(&m, 0, sizeof m); - m.type = l->desc.type; - m.size = l->desc.size; - m.align = l->desc.align; - m.alias.kind = ALIAS_LOCAL; - m.alias.v.local_id = (i32)l->home_slot; - return m; -} - -static int inst_defines_val(const Inst* in, Val v) { - if (!in || v == VAL_NONE) return 0; - if (in->def == v) return 1; - for (u32 i = 0; i < in->ndefs; ++i) - if (in->defs[i] == v) return 1; - return 0; -} - -static int op_uses_reg(const Operand* op, Reg reg) { - if (!op) return 0; - if (op->kind == OPK_REG && op->v.reg == reg) return 1; - if (op->kind == OPK_INDIRECT && - (op->v.ind.base == reg || - (op->v.ind.index != (Reg)REG_NONE && op->v.ind.index == reg))) - return 1; - return 0; -} - -static int abivalue_uses_reg(const CGABIValue* v, Reg reg) { - if (!v) return 0; - if (op_uses_reg(&v->storage, reg)) return 1; - for (u32 i = 0; i < v->nparts; ++i) - if (op_uses_reg(&v->parts[i].op, reg)) return 1; - return 0; -} - -static int inst_uses_local_reg(const Inst* in, Reg reg) { - if (!in) return 0; - for (u32 i = 0; i < in->nopnds; ++i) { - int is_def = i == 0 && in->opnds[i].kind == OPK_REG && - inst_defines_val(in, (Val)in->opnds[i].v.reg); - if (!is_def && op_uses_reg(&in->opnds[i], reg)) return 1; - } - switch ((IROp)in->op) { - case IR_CALL: { - IRCallAux* aux = (IRCallAux*)in->extra.aux; - if (!aux) return 0; - if (op_uses_reg(&aux->desc.callee, reg)) return 1; - for (u32 i = 0; i < aux->desc.nargs; ++i) - if (abivalue_uses_reg(&aux->desc.args[i], reg)) return 1; - return 0; + d.cases = cases; + } + r->o->target->switch_(r->o->target, &d); +} + +static void replay_inst(OptReplay* r, const CgIrInst* in) { + CgTarget* t 
= r->o->target; + Operand ops[5]; + if (t->set_loc) t->set_loc(t, in->loc); + switch ((CgIrOp)in->op) { + case CG_IR_NOP: + return; + case CG_IR_LABEL: + t->label_place(t, replay_label(r, (Label)in->extra.imm, in->loc)); + return; + case CG_IR_LOAD_IMM: + ops[0] = replay_operand(r, in->opnds[0], in->loc); + t->load_imm(t, ops[0], in->extra.imm); + return; + case CG_IR_LOAD_CONST: + ops[0] = replay_operand(r, in->opnds[0], in->loc); + t->load_const(t, ops[0], in->extra.cbytes); + return; + case CG_IR_COPY: + replay_operands(r, ops, in->opnds, 2, in->loc); + t->copy(t, ops[0], ops[1]); + return; + case CG_IR_LOAD: + replay_operands(r, ops, in->opnds, 2, in->loc); + t->load(t, ops[0], ops[1], in->extra.mem); + return; + case CG_IR_STORE: + replay_operands(r, ops, in->opnds, 2, in->loc); + t->store(t, ops[0], ops[1], in->extra.mem); + return; + case CG_IR_ADDR_OF: + replay_operands(r, ops, in->opnds, 2, in->loc); + t->addr_of(t, ops[0], ops[1]); + return; + case CG_IR_TLS_ADDR_OF: { + const CgIrTlsAux* aux = (const CgIrTlsAux*)in->extra.aux; + ops[0] = replay_operand(r, in->opnds[0], in->loc); + t->tls_addr_of(t, ops[0], aux->sym, aux->addend); + return; } - case IR_RET: { - IRRetAux* aux = (IRRetAux*)in->extra.aux; - return aux && aux->present && abivalue_uses_reg(&aux->val, reg); + case CG_IR_AGG_COPY: { + const CgIrAggAux* aux = (const CgIrAggAux*)in->extra.aux; + replay_operands(r, ops, in->opnds, 2, in->loc); + t->copy_bytes(t, ops[0], ops[1], aux->access); + return; } - case IR_SCOPE_BEGIN: { - IRScopeAux* aux = (IRScopeAux*)in->extra.aux; - return aux && op_uses_reg(&aux->desc.cond, reg); + case CG_IR_AGG_SET: { + const CgIrAggAux* aux = (const CgIrAggAux*)in->extra.aux; + replay_operands(r, ops, in->opnds, 2, in->loc); + t->set_bytes(t, ops[0], ops[1], aux->access); + return; } - case IR_ASM_BLOCK: { - IRAsmAux* aux = (IRAsmAux*)in->extra.aux; - if (!aux) return 0; - for (u32 i = 0; i < aux->nin; ++i) - if (op_uses_reg(&aux->in_ops[i], reg)) return 1; - return 
0; + case CG_IR_BITFIELD_LOAD: { + const CgIrBitFieldAux* aux = (const CgIrBitFieldAux*)in->extra.aux; + replay_operands(r, ops, in->opnds, 2, in->loc); + t->bitfield_load(t, ops[0], ops[1], aux->access); + return; } - case IR_INTRINSIC: { - IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; - if (!aux) return 0; - for (u32 i = 0; i < aux->narg; ++i) - if (op_uses_reg(&aux->args[i], reg)) return 1; - return 0; + case CG_IR_BITFIELD_STORE: { + const CgIrBitFieldAux* aux = (const CgIrBitFieldAux*)in->extra.aux; + replay_operands(r, ops, in->opnds, 2, in->loc); + t->bitfield_store(t, ops[0], ops[1], aux->access); + return; } - default: - return 0; - } -} - -static void opt_make_local_load(Func* f, Inst* out, IRLocal* l, SrcLoc loc) { - memset(out, 0, sizeof *out); - out->op = IR_LOAD; - ir_assign_inst_id(f, out); - out->loc = loc; - out->type = l->desc.type; - out->def = (Val)l->storage.v.reg; - out->opnds = arena_array(f->arena, Operand, 2); - out->opnds[0].kind = OPK_REG; - out->opnds[0].cls = opt_local_reg_class_for(f->c, l->desc.type); - out->opnds[0].type = l->desc.type; - out->opnds[0].v.reg = l->storage.v.reg; - out->opnds[1] = opt_local_addr_operand(l); - out->nopnds = 2; - out->extra.mem = opt_local_mem(l); -} - -static void opt_make_local_store(Func* f, Inst* out, IRLocal* l, SrcLoc loc) { - memset(out, 0, sizeof *out); - out->op = IR_STORE; - ir_assign_inst_id(f, out); - out->loc = loc; - out->opnds = arena_array(f->arena, Operand, 2); - out->opnds[0] = opt_local_addr_operand(l); - out->opnds[1].kind = OPK_REG; - out->opnds[1].cls = opt_local_reg_class_for(f->c, l->desc.type); - out->opnds[1].type = l->desc.type; - out->opnds[1].v.reg = l->storage.v.reg; - out->nopnds = 2; - out->extra.mem = opt_local_mem(l); -} - -static IRLocal* opt_addr_taken_reg_local_defined_by(Func* f, const Inst* in) { - if (!in) return NULL; - for (u32 i = 0; i < f->nlocals; ++i) { - IRLocal* l = &f->locals[i]; - if (!l->address_taken || l->home_slot == FRAME_SLOT_NONE) continue; - if 
(l->storage.kind == CG_LOCAL_STORAGE_REG && - inst_defines_val(in, (Val)l->storage.v.reg)) - return l; - } - return NULL; -} - -static void opt_frame_home_addr_taken_locals(Func* f) { - int any = 0; - for (u32 i = 0; i < f->nlocals; ++i) { - IRLocal* l = &f->locals[i]; - if (l->address_taken && l->storage.kind == CG_LOCAL_STORAGE_REG && - l->home_slot != FRAME_SLOT_NONE) { - any = 1; - break; + case CG_IR_BINOP: + replay_operands(r, ops, in->opnds, 3, in->loc); + t->binop(t, (BinOp)in->extra.imm, ops[0], ops[1], ops[2]); + return; + case CG_IR_UNOP: + replay_operands(r, ops, in->opnds, 2, in->loc); + t->unop(t, (UnOp)in->extra.imm, ops[0], ops[1]); + return; + case CG_IR_CMP: + replay_operands(r, ops, in->opnds, 3, in->loc); + t->cmp(t, (CmpOp)in->extra.imm, ops[0], ops[1], ops[2]); + return; + case CG_IR_CONVERT: + replay_operands(r, ops, in->opnds, 2, in->loc); + t->convert(t, (ConvKind)in->extra.imm, ops[0], ops[1]); + return; + case CG_IR_CALL: { + const CgIrCallAux* aux = (const CgIrCallAux*)in->extra.aux; + CGCallDesc d = replay_call_desc(r, &aux->desc, in->loc); + t->call(t, &d); + return; } - } - if (!any) return; - - for (u32 b = 0; b < f->nblocks; ++b) { - Block* bl = &f->blocks[b]; - if (!bl->ninsts) continue; - u32 out_cap = bl->ninsts * (f->nlocals + 2u); - Inst* out = arena_zarray(f->arena, Inst, out_cap ? 
out_cap : 1u); - u32 nout = 0; - for (u32 i = 0; i < bl->ninsts; ++i) { - Inst in = bl->insts[i]; - for (u32 j = 0; j < f->nlocals; ++j) { - IRLocal* used = &f->locals[j]; - if (!used->address_taken || used->home_slot == FRAME_SLOT_NONE) - continue; - if (used->storage.kind != CG_LOCAL_STORAGE_REG) continue; - if (inst_uses_local_reg(&in, used->storage.v.reg)) - opt_make_local_load(f, &out[nout++], used, in.loc); + case CG_IR_RET: { + const CgIrRetAux* aux = (const CgIrRetAux*)in->extra.aux; + CGLocal* values = NULL; + if (aux && aux->nvalues) { + values = arena_array(r->o->c->tu, CGLocal, aux->nvalues); + for (u32 i = 0; i < aux->nvalues; ++i) + values[i] = replay_local(r, aux->values[i], in->loc); } - out[nout++] = in; - IRLocal* defined = opt_addr_taken_reg_local_defined_by(f, &in); - if (defined) opt_make_local_store(f, &out[nout++], defined, in.loc); + t->ret(t, values, aux ? aux->nvalues : 0u); + return; } - bl->insts = out; - bl->ninsts = nout; - bl->cap = bl->ninsts; - } -} - -static CGLocalStorage w_param(CGTarget* t, const CGParamDesc* d) { - OptImpl* o = impl_of(t); - CGLocalStorage st = d->storage; - CGLocalDesc local_desc; - memset(&local_desc, 0, sizeof local_desc); - local_desc.type = d->type; - local_desc.name = d->name; - local_desc.loc = d->loc; - local_desc.size = d->size; - local_desc.align = d->align; - local_desc.flags = d->flags; - if (st.kind == CG_LOCAL_STORAGE_FRAME && st.v.frame_slot == FRAME_SLOT_NONE) { - if (o->level < 2 && - (d->flags & (CG_LOCAL_ADDR_TAKEN | CG_LOCAL_MEMORY_REQUIRED)) == 0) { - PReg v = ir_alloc_preg(o->f, d->type, opt_local_reg_class(o, d->type)); - st.kind = CG_LOCAL_STORAGE_REG; - st.v.reg = (Reg)v; - } else { - st.kind = CG_LOCAL_STORAGE_FRAME; - st.v.frame_slot = opt_param_frame_slot(o->f, d); + case CG_IR_BR: + t->jump(t, replay_label(r, (Label)in->extra.imm, in->loc)); + return; + case CG_IR_CMP_BRANCH: { + const CgIrCmpBranchAux* aux = (const CgIrCmpBranchAux*)in->extra.aux; + replay_operands(r, ops, 
in->opnds, 2, in->loc); + t->cmp_branch(t, aux->op, ops[0], ops[1], + replay_label(r, aux->target, in->loc)); + return; } - } - /* Deep-copy parts so caller-stack memory isn't relied on. */ - CGParamDesc copy = *d; - copy.storage = st; - if (d->nincoming) { - CGABIPart* parts = arena_array(o->f->arena, CGABIPart, d->nincoming); - memcpy(parts, d->incoming, sizeof(CGABIPart) * d->nincoming); - copy.incoming = parts; - } - ir_param_add(o->f, &copy); - ir_local_add(o->f, &local_desc, st); - if (st.kind == CG_LOCAL_STORAGE_REG) { - ir_ensure_preg(o->f, (PReg)st.v.reg, d->type, - opt_local_reg_class(o, d->type)); - Inst* in = rec(o, IR_PARAM_DECL); - in->def = (Val)st.v.reg; - in->type = d->type; - } - return st; -} - -static void w_spill_reg(CGTarget* t, Operand src, FrameSlot s, MemAccess m) { - (void)src; - (void)s; - (void)m; - panic_unsupported(impl_of(t), "spill_reg"); -} -static void w_reload_reg(CGTarget* t, Operand dst, FrameSlot s, MemAccess m) { - (void)dst; - (void)s; - (void)m; - panic_unsupported(impl_of(t), "reload_reg"); -} - -static void w_get_allocable_regs(CGTarget* t, RegClass cls, const Reg** out, - u32* nregs) { - CGTarget* wr = impl_of(t)->target; - if (wr->get_allocable_regs) - wr->get_allocable_regs(wr, cls, out, nregs); - else { - *out = NULL; - *nregs = 0; - } -} - -static void w_get_phys_regs(CGTarget* t, RegClass cls, - const CGPhysRegInfo** out, u32* nregs) { - CGTarget* wr = impl_of(t)->target; - if (wr->get_phys_regs) - wr->get_phys_regs(wr, cls, out, nregs); - else { - *out = NULL; - *nregs = 0; - } -} - -static void w_get_scratch_regs(CGTarget* t, RegClass cls, const Reg** out, - u32* nregs) { - CGTarget* wr = impl_of(t)->target; - if (wr->get_scratch_regs) - wr->get_scratch_regs(wr, cls, out, nregs); - else { - *out = NULL; - *nregs = 0; - } -} - -static int w_is_caller_saved(CGTarget* t, RegClass cls, Reg r) { - CGTarget* wr = impl_of(t)->target; - if (wr->is_caller_saved) return wr->is_caller_saved(wr, cls, r); - return 0; -} - 
-static u32 w_call_clobber_mask(CGTarget* t, const CGCallDesc* d, RegClass cls) { - CGTarget* wr = impl_of(t)->target; - if (wr->call_clobber_mask) return wr->call_clobber_mask(wr, d, cls); - return 0; -} - -static u32 w_return_reg_mask(CGTarget* t, const ABIFuncInfo* abi, - RegClass cls) { - CGTarget* wr = impl_of(t)->target; - if (wr->return_reg_mask) return wr->return_reg_mask(wr, abi, cls); - return 0; -} - -static u32 w_callee_save_mask(CGTarget* t, RegClass cls) { - CGTarget* wr = impl_of(t)->target; - if (wr->callee_save_mask) return wr->callee_save_mask(wr, cls); - return 0; -} - -static void w_plan_hard_regs(CGTarget* t, RegClass cls, const Reg* regs, - u32 n) { - CGTarget* wr = impl_of(t)->target; - if (wr->plan_hard_regs) wr->plan_hard_regs(wr, cls, regs, n); -} - -static void w_reserve_hard_regs(CGTarget* t, RegClass cls, const Reg* regs, - u32 n) { - CGTarget* wr = impl_of(t)->target; - if (wr->reserve_hard_regs) wr->reserve_hard_regs(wr, cls, regs, n); -} - -static int w_resolve_reg_name(CGTarget* t, Sym name, Reg* out, - RegClass* cls_out) { - CGTarget* wr = impl_of(t)->target; - if (wr->resolve_reg_name) return wr->resolve_reg_name(wr, name, out, cls_out); - return 1; -} - -static void w_file_scope_asm(CGTarget* t, const char* src, size_t len) { - CGTarget* wr = impl_of(t)->target; - if (wr->file_scope_asm) wr->file_scope_asm(wr, src, len); -} - -/* ---- labels and control flow ---- */ - -static Label w_label_new(CGTarget* t) { - OptImpl* o = impl_of(t); - u32 block = ir_block_new(o->f); - /* Pre-allocate an MCLabel id so frontend code that needs a stable - * MCLabel before pass_emit replays (cfree_cg_data_label_addr in - * particular) has one. pass_emit places it through the wrapped - * target's label_place during replay. 
*/ - if (o->target && o->target->mc) { - o->f->blocks[block].mc_label = o->target->mc->label_new(o->target->mc); - } - return (Label)block; -} - -static MCLabel w_cg_label_to_mc_label(CGTarget* t, Label l) { - OptImpl* o = impl_of(t); - u32 block = (u32)l; - if (block >= o->f->nblocks) return MC_LABEL_NONE; - return o->f->blocks[block].mc_label; -} - -static void w_label_place(CGTarget* t, Label l) { - OptImpl* o = impl_of(t); - u32 target_blk = (u32)l; - if (target_blk >= o->f->nblocks) { - SrcLoc loc = {0, 0, 0}; - compiler_panic(o->c, loc, "opt: label_place(%u) out of range", (unsigned)l); - } - if (!cur_terminated(o)) { - Block* cb = &o->f->blocks[o->cur]; - rec(o, IR_BR); - cb->succ[0] = target_blk; - cb->nsucc = 1; - } - set_cur(o, target_blk); -} - -static void w_jump(CGTarget* t, Label l) { - OptImpl* o = impl_of(t); - u32 target_blk = (u32)l; - Block* cb = &o->f->blocks[o->cur]; - rec(o, IR_BR); - cb->succ[0] = target_blk; - cb->nsucc = 1; - after_terminator(o); -} - -static void w_cmp_branch(CGTarget* t, CmpOp op, Operand a, Operand b, Label l) { - OptImpl* o = impl_of(t); - u32 taken = (u32)l; - Inst* in = rec(o, IR_CMP_BRANCH); - Operand ops[2] = {a, b}; - in->opnds = dup_opnds(o->f, ops, 2); - in->nopnds = 2; - in->extra.imm = (i64)op; - u32 cur = o->cur; - u32 ft = ir_block_new(o->f); - Block* cb = &o->f->blocks[cur]; - cb->succ[0] = taken; - cb->succ[1] = ft; - cb->nsucc = 2; - set_cur(o, ft); -} - -static void w_switch_(CGTarget* t, const CGSwitchDesc* d) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_SWITCH); - IRSwitchAux* aux = arena_znew(o->f->arena, IRSwitchAux); - Operand sel = d->selector; - in->opnds = dup_opnds(o->f, &sel, 1); - in->nopnds = 1; - aux->selector_type = d->selector_type; - aux->ncases = d->ncases; - aux->hint = d->hint; - aux->cases = NULL; - if (d->ncases) { - aux->cases = arena_array(o->f->arena, IRSwitchAuxCase, d->ncases); - for (u32 i = 0; i < d->ncases; ++i) { - aux->cases[i].value = d->cases[i].value; - 
aux->cases[i].block = (u32)d->cases[i].label; + case CG_IR_SWITCH: + replay_switch(r, in); + return; + case CG_IR_INDIRECT_BRANCH: { + const CgIrIndirectAux* aux = (const CgIrIndirectAux*)in->extra.aux; + Label* targets = + arena_array(r->o->c->tu, Label, aux->ntargets ? aux->ntargets : 1u); + for (u32 i = 0; i < aux->ntargets; ++i) + targets[i] = replay_label(r, aux->targets[i], in->loc); + ops[0] = replay_operand(r, in->opnds[0], in->loc); + t->indirect_branch(t, ops[0], targets, aux->ntargets); + return; } - } - u32 cur = o->cur; - /* Default label may be LABEL_NONE meaning "fall through past the - * switch." Materialize a fresh post-switch block to land on in that - * case so the CFG still has a single block as default successor. */ - u32 default_blk; - if (d->default_label != LABEL_NONE) { - aux->has_default = 1; - default_blk = (u32)d->default_label; - } else { - aux->has_default = 0; - default_blk = ir_block_new(o->f); - } - aux->default_block = default_blk; - in->extra.aux = aux; - - ir_block_set_nsucc(o->f, cur, d->ncases + 1u); - Block* cb = &o->f->blocks[cur]; - for (u32 i = 0; i < d->ncases; ++i) cb->succ[i] = (u32)d->cases[i].label; - cb->succ[d->ncases] = default_blk; - /* No-default fall-through: emit a fresh post-switch block as the - * continuation point. With an explicit default the next recorded - * instruction is unreachable until a label_place re-anchors cur. 
*/ - if (!aux->has_default) { - set_cur(o, default_blk); - } else { - after_terminator(o); - } -} - -static void w_indirect_branch(CGTarget* t, Operand addr, const Label* targets, - u32 ntargets) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_INDIRECT_BRANCH); - IRIndirectAux* aux = arena_znew(o->f->arena, IRIndirectAux); - Operand a = addr; - in->opnds = dup_opnds(o->f, &a, 1); - in->nopnds = 1; - aux->ntargets = ntargets; - aux->targets = NULL; - if (ntargets) { - aux->targets = arena_array(o->f->arena, u32, ntargets); - for (u32 i = 0; i < ntargets; ++i) aux->targets[i] = (u32)targets[i]; - } - in->extra.aux = aux; - u32 cur = o->cur; - ir_block_set_nsucc(o->f, cur, ntargets); - Block* cb = &o->f->blocks[cur]; - for (u32 i = 0; i < ntargets; ++i) cb->succ[i] = (u32)targets[i]; - after_terminator(o); -} - -static void w_load_label_addr(CGTarget* t, Operand dst, Label l) { - OptImpl* o = impl_of(t); - ensure_operand(o->f, &dst); - Inst* in = rec(o, IR_LOAD_LABEL_ADDR); - in->opnds = dup_opnds(o->f, &dst, 1); - in->nopnds = 1; - in->extra.imm = (i64)(u32)l; - set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type); -} - -/* ---- structured scopes ---- */ - -static u32 scope_register(Func* f, Inst* in) { - if (f->nscopes == f->scopes_cap) { - u32 ncap = f->scopes_cap ? 
f->scopes_cap * 2u : 4u; - Inst** nb = arena_zarray(f->arena, Inst*, ncap); - if (f->scope_aux_inst) - memcpy(nb, f->scope_aux_inst, sizeof(Inst*) * f->nscopes); - f->scope_aux_inst = nb; - f->scopes_cap = ncap; - } - f->scope_aux_inst[f->nscopes++] = in; - return f->nscopes; -} - -static IRScopeAux* scope_lookup(OptImpl* o, CGScope s) { - if (s == CG_SCOPE_NONE || s > o->f->nscopes) { - SrcLoc loc = {0, 0, 0}; - compiler_panic(o->c, loc, "opt: bad scope id %u", (unsigned)s); - } - return (IRScopeAux*)o->f->scope_aux_inst[s - 1]->extra.aux; -} - -static CGScope w_scope_begin(CGTarget* t, const CGScopeDesc* d) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_SCOPE_BEGIN); - IRScopeAux* aux = arena_znew(o->f->arena, IRScopeAux); - aux->desc = *d; - in->extra.aux = aux; - u32 sid = scope_register(o->f, in); - aux->scope_id = sid; - - if (d->kind == SCOPE_IF) { - aux->if_then_block = ir_block_new(o->f); - aux->if_else_block = ir_block_new(o->f); - aux->if_end_block = ir_block_new(o->f); - Block* cb = &o->f->blocks[o->cur]; - cb->succ[0] = aux->if_then_block; - cb->succ[1] = aux->if_else_block; - cb->nsucc = 2; - set_cur(o, aux->if_then_block); - } else if (d->kind == SCOPE_LOOP || d->kind == SCOPE_BLOCK) { - aux->loop_break_block = - d->break_label != LABEL_NONE ? (u32)d->break_label : 0; - aux->loop_continue_block = - d->continue_label != LABEL_NONE ? 
(u32)d->continue_label : 0; - } - return (CGScope)sid; -} - -static void w_scope_else(CGTarget* t, CGScope s) { - OptImpl* o = impl_of(t); - IRScopeAux* aux = scope_lookup(o, s); - if (aux->desc.kind != SCOPE_IF) { - SrcLoc loc = {0, 0, 0}; - compiler_panic(o->c, loc, "opt: scope_else on non-IF scope %u", - (unsigned)s); - } - Inst* in = rec(o, IR_SCOPE_ELSE); - in->extra.imm = (i64)s; - if (!cur_terminated(o)) { - Block* cb = &o->f->blocks[o->cur]; - cb->succ[0] = aux->if_end_block; - cb->nsucc = 1; - } - aux->if_has_else = 1; - set_cur(o, aux->if_else_block); -} - -static void w_scope_end(CGTarget* t, CGScope s) { - OptImpl* o = impl_of(t); - IRScopeAux* aux = scope_lookup(o, s); - Inst* in = rec(o, IR_SCOPE_END); - in->extra.imm = (i64)s; - if (aux->desc.kind == SCOPE_IF) { - if (!cur_terminated(o)) { - Block* cb = &o->f->blocks[o->cur]; - cb->succ[0] = aux->if_end_block; - cb->nsucc = 1; + case CG_IR_LOAD_LABEL_ADDR: + ops[0] = replay_operand(r, in->opnds[0], in->loc); + t->load_label_addr(t, ops[0], + replay_label(r, (Label)in->extra.imm, in->loc)); + return; + case CG_IR_SCOPE_BEGIN: { + const CgIrScopeAux* aux = (const CgIrScopeAux*)in->extra.aux; + CGScopeDesc d = aux->desc; + d.break_label = replay_label(r, d.break_label, in->loc); + d.continue_label = replay_label(r, d.continue_label, in->loc); + d.cond = replay_operand(r, d.cond, in->loc); + r->scope_map[aux->scope] = t->scope_begin(t, &d); + return; } - if (!aux->if_has_else) { - Block* eb = &o->f->blocks[aux->if_else_block]; - if (eb->nsucc == 0) { - eb->succ[0] = aux->if_end_block; - eb->nsucc = 1; - } - /* Else block was never visited as cur, but it has code (the - * fall-through from scope_begin) — record it before end so emit - * order has it. 
*/ - ir_note_emit(o->f, aux->if_else_block); + case CG_IR_SCOPE_ELSE: + t->scope_else(t, replay_scope(r, (CGScope)in->extra.imm, in->loc)); + return; + case CG_IR_SCOPE_END: + t->scope_end(t, replay_scope(r, (CGScope)in->extra.imm, in->loc)); + return; + case CG_IR_BREAK_TO: + t->break_to(t, replay_scope(r, (CGScope)in->extra.imm, in->loc)); + return; + case CG_IR_CONTINUE_TO: + t->continue_to(t, replay_scope(r, (CGScope)in->extra.imm, in->loc)); + return; + case CG_IR_ALLOCA: + replay_operands(r, ops, in->opnds, 2, in->loc); + t->alloca_(t, ops[0], ops[1], (u32)in->extra.imm); + return; + case CG_IR_VA_START: + ops[0] = replay_operand(r, in->opnds[0], in->loc); + t->va_start_(t, ops[0]); + return; + case CG_IR_VA_ARG: + replay_operands(r, ops, in->opnds, 2, in->loc); + t->va_arg_(t, ops[0], ops[1], (CfreeCgTypeId)in->extra.imm); + return; + case CG_IR_VA_END: + ops[0] = replay_operand(r, in->opnds[0], in->loc); + t->va_end_(t, ops[0]); + return; + case CG_IR_VA_COPY: + replay_operands(r, ops, in->opnds, 2, in->loc); + t->va_copy_(t, ops[0], ops[1]); + return; + case CG_IR_ATOMIC_LOAD: { + const CgIrAtomicAux* aux = (const CgIrAtomicAux*)in->extra.aux; + replay_operands(r, ops, in->opnds, 2, in->loc); + t->atomic_load(t, ops[0], ops[1], aux->mem, aux->order); + return; } - set_cur(o, aux->if_end_block); - } -} - -static void w_break_to(CGTarget* t, CGScope s) { - OptImpl* o = impl_of(t); - IRScopeAux* aux = scope_lookup(o, s); - Inst* in = rec(o, IR_BREAK_TO); - in->extra.imm = (i64)s; - Block* cb = &o->f->blocks[o->cur]; - cb->succ[0] = aux->loop_break_block; - cb->nsucc = 1; - after_terminator(o); -} - -static void w_continue_to(CGTarget* t, CGScope s) { - OptImpl* o = impl_of(t); - IRScopeAux* aux = scope_lookup(o, s); - Inst* in = rec(o, IR_CONTINUE_TO); - in->extra.imm = (i64)s; - Block* cb = &o->f->blocks[o->cur]; - cb->succ[0] = aux->loop_continue_block; - cb->nsucc = 1; - after_terminator(o); -} - -/* ---- data movement ---- */ - -static void 
w_load_imm(CGTarget* t, Operand dst, i64 imm) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_LOAD_IMM); - Operand ops[1] = {dst}; - in->opnds = dup_opnds(o->f, ops, 1); - in->nopnds = 1; - in->extra.imm = imm; - if (dst.kind == OPK_REG) - set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type); -} - -static void w_load_const(CGTarget* t, Operand dst, ConstBytes cb) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_LOAD_CONST); - Operand ops[1] = {dst}; - in->opnds = dup_opnds(o->f, ops, 1); - in->nopnds = 1; - in->extra.cbytes = cb; - if (cb.size) { - u8* bytes = arena_array(o->f->arena, u8, cb.size); - memcpy(bytes, cb.bytes, cb.size); - in->extra.cbytes.bytes = bytes; - } - if (dst.kind == OPK_REG) - set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type); -} - -static void w_copy(CGTarget* t, Operand dst, Operand src) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_COPY); - Operand ops[2] = {dst, src}; - in->opnds = dup_opnds(o->f, ops, 2); - in->nopnds = 2; - if (dst.kind == OPK_REG) - set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type); -} - -static void w_load(CGTarget* t, Operand dst, Operand addr, MemAccess m) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_LOAD); - Operand ops[2] = {dst, addr}; - in->opnds = dup_opnds(o->f, ops, 2); - in->nopnds = 2; - in->extra.mem = m; - if (dst.kind == OPK_REG) - set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type); -} - -static void w_store(CGTarget* t, Operand addr, Operand src, MemAccess m) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_STORE); - Operand ops[2] = {addr, src}; - in->opnds = dup_opnds(o->f, ops, 2); - in->nopnds = 2; - in->extra.mem = m; -} - -static void w_addr_of(CGTarget* t, Operand dst, Operand lv) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_ADDR_OF); - Operand ops[2] = {dst, lv}; - in->opnds = dup_opnds(o->f, ops, 2); - in->nopnds = 2; - if (dst.kind == OPK_REG) - set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type); -} - -static void 
w_tls_addr_of(CGTarget* t, Operand dst, ObjSymId sym, i64 addend) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_TLS_ADDR_OF); - Operand ops[1] = {dst}; - in->opnds = dup_opnds(o->f, ops, 1); - in->nopnds = 1; - IRTlsAux* aux = arena_znew(o->f->arena, IRTlsAux); - aux->sym = sym; - aux->addend = addend; - in->extra.aux = aux; - if (dst.kind == OPK_REG) - set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type); -} - -static void w_copy_bytes(CGTarget* t, Operand dst, Operand src, - AggregateAccess agg) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_AGG_COPY); - Operand ops[2] = {dst, src}; - in->opnds = dup_opnds(o->f, ops, 2); - in->nopnds = 2; - IRAggAux* aux = arena_znew(o->f->arena, IRAggAux); - aux->access = agg; - in->extra.aux = aux; -} - -static void w_set_bytes(CGTarget* t, Operand dst, Operand byte, - AggregateAccess agg) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_AGG_SET); - Operand ops[2] = {dst, byte}; - in->opnds = dup_opnds(o->f, ops, 2); - in->nopnds = 2; - IRAggAux* aux = arena_znew(o->f->arena, IRAggAux); - aux->access = agg; - in->extra.aux = aux; -} - -static void w_bitfield_load(CGTarget* t, Operand dst, Operand record, - BitFieldAccess bf) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_BITFIELD_LOAD); - Operand ops[2] = {dst, record}; - in->opnds = dup_opnds(o->f, ops, 2); - in->nopnds = 2; - IRBitFieldAux* aux = arena_znew(o->f->arena, IRBitFieldAux); - aux->access = bf; - in->extra.aux = aux; - if (dst.kind == OPK_REG) - set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type); -} - -static void w_bitfield_store(CGTarget* t, Operand record, Operand src, - BitFieldAccess bf) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_BITFIELD_STORE); - Operand ops[2] = {record, src}; - in->opnds = dup_opnds(o->f, ops, 2); - in->nopnds = 2; - IRBitFieldAux* aux = arena_znew(o->f->arena, IRBitFieldAux); - aux->access = bf; - in->extra.aux = aux; -} - -/* ---- arithmetic / cmp / convert ---- */ - -static void w_binop(CGTarget* 
t, BinOp op, Operand dst, Operand a, Operand b) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_BINOP); - Operand ops[3] = {dst, a, b}; - in->opnds = dup_opnds(o->f, ops, 3); - in->nopnds = 3; - in->extra.imm = (i64)op; - if (dst.kind == OPK_REG) - set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type); -} - -static void w_unop(CGTarget* t, UnOp op, Operand dst, Operand a) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_UNOP); - Operand ops[2] = {dst, a}; - in->opnds = dup_opnds(o->f, ops, 2); - in->nopnds = 2; - in->extra.imm = (i64)op; - if (dst.kind == OPK_REG) - set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type); -} - -static void w_cmp(CGTarget* t, CmpOp op, Operand dst, Operand a, Operand b) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_CMP); - Operand ops[3] = {dst, a, b}; - in->opnds = dup_opnds(o->f, ops, 3); - in->nopnds = 3; - in->extra.imm = (i64)op; - if (dst.kind == OPK_REG) - set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type); -} - -static void w_convert(CGTarget* t, ConvKind k, Operand dst, Operand src) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_CONVERT); - Operand ops[2] = {dst, src}; - in->opnds = dup_opnds(o->f, ops, 2); - in->nopnds = 2; - in->extra.imm = (i64)k; - if (dst.kind == OPK_REG) - set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type); -} - -/* ---- calls / return ---- */ - -static CGABIPart* dup_parts(Arena* a, const CGABIPart* src, u32 n) { - if (!n) return NULL; - CGABIPart* dst = arena_array(a, CGABIPart, n); - memcpy(dst, src, sizeof(CGABIPart) * n); - return dst; -} - -static void w_call(CGTarget* t, const CGCallDesc* d) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_CALL); - IRCallAux* aux = arena_znew(o->f->arena, IRCallAux); - ensure_operand(o->f, &d->callee); - aux->desc = *d; - if (d->nargs) { - CGABIValue* args = arena_array(o->f->arena, CGABIValue, d->nargs); - for (u32 i = 0; i < d->nargs; ++i) { - ensure_abivalue(o->f, &d->args[i]); - args[i] = d->args[i]; - args[i].parts 
= - dup_parts(o->f->arena, d->args[i].parts, d->args[i].nparts); + case CG_IR_ATOMIC_STORE: { + const CgIrAtomicAux* aux = (const CgIrAtomicAux*)in->extra.aux; + replay_operands(r, ops, in->opnds, 2, in->loc); + t->atomic_store(t, ops[0], ops[1], aux->mem, aux->order); + return; } - aux->desc.args = args; - } - ensure_abivalue(o->f, &d->ret); - aux->desc.ret = d->ret; - aux->desc.ret.parts = dup_parts(o->f->arena, d->ret.parts, d->ret.nparts); - in->extra.aux = aux; - in->type = d->fn_type; - if (d->ret.storage.kind == OPK_REG) { - set_preg_def(o->f, in, o->cur, (PReg)d->ret.storage.v.reg, d->ret.type); - } -} - -static const char* w_tail_call_unrealizable_reason(CGTarget* t, - const CGCallDesc* d) { - (void)t; - (void)d; - /* The recorder accepts every tail call. Realizability depends on the laid- - * out frame, known only when the call is emitted onto the real backend - * during replay (pass_emit). There the real target's hook is consulted and - * the call is emitted as a tail, falls back to an ordinary call+return - * (ALLOWED), or diagnosed (MUST). 
*/ - return NULL; -} - -static void w_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) { - CGTarget* wr = impl_of(t)->target; - if (wr->plan_call) - wr->plan_call(wr, d, out); - else - memset(out, 0, sizeof *out); -} - -static void w_emit_call_plan(CGTarget* t, const CGCallPlan* p) { - CGTarget* wr = impl_of(t)->target; - if (wr->emit_call_plan) wr->emit_call_plan(wr, p); -} - -static void w_load_call_arg(CGTarget* t, Operand dst, const CGCallPlanMove* m) { - CGTarget* wr = impl_of(t)->target; - if (wr->load_call_arg) wr->load_call_arg(wr, dst, m); -} - -static void w_store_call_arg(CGTarget* t, const CGCallPlanMove* m) { - CGTarget* wr = impl_of(t)->target; - if (wr->store_call_arg) wr->store_call_arg(wr, m); -} - -static void w_store_call_ret(CGTarget* t, const CGCallPlanRet* ret, - Operand src) { - CGTarget* wr = impl_of(t)->target; - if (wr->store_call_ret) wr->store_call_ret(wr, ret, src); -} - -static void w_ret(CGTarget* t, const CGABIValue* v) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_RET); - IRRetAux* aux = arena_znew(o->f->arena, IRRetAux); - if (v) { - ensure_abivalue(o->f, v); - aux->present = 1; - aux->val = *v; - aux->val.parts = dup_parts(o->f->arena, v->parts, v->nparts); - } - in->extra.aux = aux; - Block* cb = &o->f->blocks[o->cur]; - cb->nsucc = 0; - after_terminator(o); -} - -/* ---- alloca / variadics / atomics / fence / intrinsic ---- */ - -static void w_alloca_(CGTarget* t, Operand dst, Operand size, u32 align) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_ALLOCA); - Operand ops[2] = {dst, size}; - in->opnds = dup_opnds(o->f, ops, 2); - in->nopnds = 2; - in->extra.imm = (i64)align; - if (dst.kind == OPK_REG) - set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type); -} - -static void w_va_start_(CGTarget* t, Operand ap) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_VA_START); - Operand ops[1] = {ap}; - in->opnds = dup_opnds(o->f, ops, 1); - in->nopnds = 1; -} - -static void w_va_arg_(CGTarget* t, Operand 
dst, Operand ap, CfreeCgTypeId ty) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_VA_ARG); - Operand ops[2] = {dst, ap}; - in->opnds = dup_opnds(o->f, ops, 2); - in->nopnds = 2; - in->extra.aux = (void*)(uintptr_t)ty; - if (dst.kind == OPK_REG) - set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type); -} - -static void w_va_end_(CGTarget* t, Operand ap) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_VA_END); - Operand ops[1] = {ap}; - in->opnds = dup_opnds(o->f, ops, 1); - in->nopnds = 1; -} - -static void w_va_copy_(CGTarget* t, Operand dst, Operand src) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_VA_COPY); - Operand ops[2] = {dst, src}; - in->opnds = dup_opnds(o->f, ops, 2); - in->nopnds = 2; -} - -static void w_atomic_load(CGTarget* t, Operand dst, Operand addr, MemAccess m, - MemOrder mo) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_ATOMIC_LOAD); - Operand ops[2] = {dst, addr}; - in->opnds = dup_opnds(o->f, ops, 2); - in->nopnds = 2; - IRAtomicAux* aux = arena_znew(o->f->arena, IRAtomicAux); - aux->mem = m; - aux->mo = mo; - in->extra.aux = aux; - if (dst.kind == OPK_REG) - set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type); -} - -static void w_atomic_store(CGTarget* t, Operand addr, Operand src, MemAccess m, - MemOrder mo) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_ATOMIC_STORE); - Operand ops[2] = {addr, src}; - in->opnds = dup_opnds(o->f, ops, 2); - in->nopnds = 2; - IRAtomicAux* aux = arena_znew(o->f->arena, IRAtomicAux); - aux->mem = m; - aux->mo = mo; - in->extra.aux = aux; -} - -static void w_atomic_rmw(CGTarget* t, AtomicOp op, Operand dst, Operand addr, - Operand val, MemAccess m, MemOrder mo) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_ATOMIC_RMW); - Operand ops[3] = {dst, addr, val}; - in->opnds = dup_opnds(o->f, ops, 3); - in->nopnds = 3; - IRAtomicAux* aux = arena_znew(o->f->arena, IRAtomicAux); - aux->mem = m; - aux->mo = mo; - aux->op = (u8)op; - in->extra.aux = aux; - if (dst.kind == OPK_REG) - 
set_preg_def(o->f, in, o->cur, (PReg)dst.v.reg, dst.type); -} - -static void w_atomic_cas(CGTarget* t, Operand prior, Operand ok, Operand addr, - Operand expected, Operand desired, MemAccess m, - MemOrder s, MemOrder f) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_ATOMIC_CAS); - Operand ops[5] = {prior, ok, addr, expected, desired}; - in->opnds = dup_opnds(o->f, ops, 5); - in->nopnds = 5; - IRCasAux* aux = arena_znew(o->f->arena, IRCasAux); - aux->mem = m; - aux->success = s; - aux->failure = f; - in->extra.aux = aux; - if (prior.kind == OPK_REG) - set_preg_def(o->f, in, o->cur, (PReg)prior.v.reg, prior.type); - if (ok.kind == OPK_REG) { - in->ndefs = 2; - in->defs = arena_array(o->f->arena, Val, 2); - in->defs[0] = (prior.kind == OPK_REG) ? (Val)prior.v.reg : VAL_NONE; - in->defs[1] = (Val)ok.v.reg; - } -} - -static void w_fence(CGTarget* t, MemOrder mo) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_FENCE); - in->extra.imm = (i64)mo; -} - -static void w_intrinsic(CGTarget* t, IntrinKind kind, Operand* dsts, u32 nd, - const Operand* args, u32 na) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_INTRINSIC); - IRIntrinAux* aux = arena_znew(o->f->arena, IRIntrinAux); - aux->kind = kind; - aux->ndst = nd; - aux->narg = na; - aux->dsts = nd ? arena_array(o->f->arena, Operand, nd) : NULL; - aux->args = na ? arena_array(o->f->arena, Operand, na) : NULL; - if (nd) { - memcpy(aux->dsts, dsts, sizeof(Operand) * nd); - for (u32 i = 0; i < nd; ++i) ensure_operand(o->f, &aux->dsts[i]); - } - if (na) { - memcpy(aux->args, args, sizeof(Operand) * na); - for (u32 i = 0; i < na; ++i) ensure_operand(o->f, &aux->args[i]); - } - in->extra.aux = aux; - if (nd == 1 && dsts[0].kind == OPK_REG) { - set_preg_def(o->f, in, o->cur, (PReg)dsts[0].v.reg, dsts[0].type); - } else if (nd > 1) { - in->ndefs = nd; - in->defs = arena_array(o->f->arena, Val, nd); - for (u32 i = 0; i < nd; ++i) { - in->defs[i] = (dsts[i].kind == OPK_REG) ? 
(Val)dsts[i].v.reg : VAL_NONE; + case CG_IR_ATOMIC_RMW: { + const CgIrAtomicAux* aux = (const CgIrAtomicAux*)in->extra.aux; + replay_operands(r, ops, in->opnds, 3, in->loc); + t->atomic_rmw(t, aux->op, ops[0], ops[1], ops[2], aux->mem, aux->order); + return; } - in->def = in->defs[0]; - in->type = dsts[0].type; - } - if (intrinsic_terminates(kind)) { - Block* cb = &o->f->blocks[o->cur]; - cb->nsucc = 0; - after_terminator(o); - } -} - -static void w_asm_block(CGTarget* t, const char* tmpl, - const AsmConstraint* outs, u32 nout, Operand* out_ops, - const AsmConstraint* ins, u32 nin, - const Operand* in_ops, const Sym* clobbers, u32 nclob) { - OptImpl* o = impl_of(t); - Inst* in = rec(o, IR_ASM_BLOCK); - IRAsmAux* aux = arena_znew(o->f->arena, IRAsmAux); - /* Template strings reach us via the parser's interned string pool, which - * outlives the CG/Opt arenas. Storing the pointer is safe; copy - * defensively into the IR arena anyway so the IR is self-contained. */ - if (tmpl) { - size_t tl = 0; - while (tmpl[tl]) ++tl; - aux->tmpl = arena_strdup(o->f->arena, tmpl, tl); - } else { - aux->tmpl = NULL; - } - aux->nout = nout; - aux->nin = nin; - aux->nclob = nclob; - if (nout) { - aux->outs = arena_array(o->f->arena, AsmConstraint, nout); - memcpy(aux->outs, outs, nout * sizeof *outs); - aux->out_ops = arena_array(o->f->arena, Operand, nout); - memcpy(aux->out_ops, out_ops, nout * sizeof *out_ops); - for (u32 i = 0; i < nout; ++i) ensure_operand(o->f, &aux->out_ops[i]); - } - if (nin) { - aux->ins = arena_array(o->f->arena, AsmConstraint, nin); - memcpy(aux->ins, ins, nin * sizeof *ins); - aux->in_ops = arena_array(o->f->arena, Operand, nin); - memcpy(aux->in_ops, in_ops, nin * sizeof *in_ops); - for (u32 i = 0; i < nin; ++i) ensure_operand(o->f, &aux->in_ops[i]); - } - if (nclob) { - aux->clobbers = arena_array(o->f->arena, Sym, nclob); - memcpy(aux->clobbers, clobbers, nclob * sizeof *clobbers); - } - in->extra.aux = aux; - if (nout) { - in->ndefs = nout; - in->defs 
= arena_array(o->f->arena, Val, nout); - for (u32 i = 0; i < nout; ++i) { - in->defs[i] = - (out_ops[i].kind == OPK_REG) ? (Val)out_ops[i].v.reg : VAL_NONE; + case CG_IR_ATOMIC_CAS: { + const CgIrAtomicAux* aux = (const CgIrAtomicAux*)in->extra.aux; + replay_operands(r, ops, in->opnds, 5, in->loc); + t->atomic_cas(t, ops[0], ops[1], ops[2], ops[3], ops[4], aux->mem, + aux->order, aux->failure); + return; } - in->def = in->defs[0]; - in->type = out_ops[0].type; - } -} - -static void w_set_loc(CGTarget* t, SrcLoc loc) { - OptImpl* o = impl_of(t); - o->pending_loc = loc; -} - -static u64 func_inst_count(Func* f) { - u64 n = 0; - if (!f) return 0; - for (u32 b = 0; b < f->nblocks; ++b) n += f->blocks[b].ninsts; - return n; -} - -static int inst_spill_local(Func* f, const Inst* in, u32 op_idx) { - FrameSlot fs; - if (!f || !in || op_idx >= in->nopnds) return 0; - if (in->opnds[op_idx].kind != OPK_LOCAL) return 0; - fs = in->opnds[op_idx].v.frame_slot; - return fs != FRAME_SLOT_NONE && fs <= f->nframe_slots && - f->frame_slots[fs - 1u].kind == FS_SPILL; -} - -static u64 func_spill_alloc_count(Func* f) { - u64 n = 0; - if (!f || (!f->preg_locs && !f->preg_info)) return 0; - for (PReg r = 1; r < opt_reg_count(f); ++r) - if (opt_preg_alloc_kind(f, r) == OPT_ALLOC_SPILL) ++n; - return n; -} - -static u64 blocks_spill_load_count(Func* f, Block* blocks, u32 nblocks) { - u64 n = 0; - if (!f || !blocks) return 0; - for (u32 b = 0; b < nblocks; ++b) { - Block* bl = &blocks[b]; - for (u32 i = 0; i < bl->ninsts; ++i) { - Inst* in = &bl->insts[i]; - if ((IROp)in->op == IR_LOAD && inst_spill_local(f, in, 1)) ++n; + case CG_IR_FENCE: + t->fence(t, (MemOrder)in->extra.imm); + return; + case CG_IR_INTRINSIC: { + const CgIrIntrinsicAux* aux = (const CgIrIntrinsicAux*)in->extra.aux; + Operand* dsts = + arena_array(r->o->c->tu, Operand, aux->ndst ? aux->ndst : 1u); + Operand* args = + arena_array(r->o->c->tu, Operand, aux->narg ? 
aux->narg : 1u); + replay_operands(r, dsts, aux->dsts, aux->ndst, in->loc); + replay_operands(r, args, aux->args, aux->narg, in->loc); + t->intrinsic(t, aux->kind, dsts, aux->ndst, args, aux->narg); + return; } - } - return n; -} - -static u64 func_spill_load_count(Func* f) { - if (!f) return 0; - if (f->mir) - return blocks_spill_load_count(f, f->mir->blocks, f->mir->nblocks); - return blocks_spill_load_count(f, f->blocks, f->nblocks); -} - -static u64 blocks_spill_store_count(Func* f, Block* blocks, u32 nblocks) { - u64 n = 0; - if (!f || !blocks) return 0; - for (u32 b = 0; b < nblocks; ++b) { - Block* bl = &blocks[b]; - for (u32 i = 0; i < bl->ninsts; ++i) { - Inst* in = &bl->insts[i]; - if ((IROp)in->op == IR_STORE && inst_spill_local(f, in, 0)) ++n; + case CG_IR_ASM_BLOCK: { + const CgIrAsmAux* aux = (const CgIrAsmAux*)in->extra.aux; + Operand* out_ops = + arena_array(r->o->c->tu, Operand, aux->nout ? aux->nout : 1u); + Operand* in_ops = + arena_array(r->o->c->tu, Operand, aux->nin ? aux->nin : 1u); + replay_operands(r, out_ops, aux->out_ops, aux->nout, in->loc); + replay_operands(r, in_ops, aux->in_ops, aux->nin, in->loc); + t->asm_block(t, aux->tmpl, aux->outs, aux->nout, out_ops, aux->ins, + aux->nin, in_ops, aux->clobbers, aux->nclob); + return; } + case CG_IR_LOCAL_STATIC_DATA_BEGIN: { + const CgIrLocalStaticBeginAux* aux = + (const CgIrLocalStaticBeginAux*)in->extra.aux; + if (!t->local_static_data_begin || + !t->local_static_data_begin(t, &aux->desc)) + compiler_panic(r->o->c, in->loc, + "opt direct replay: local static data unsupported"); + return; + } + case CG_IR_LOCAL_STATIC_DATA_WRITE: { + const CgIrLocalStaticWriteAux* aux = + (const CgIrLocalStaticWriteAux*)in->extra.aux; + t->local_static_data_write(t, aux->has_data ? 
aux->data : NULL, aux->len); + return; + } + case CG_IR_LOCAL_STATIC_DATA_LABEL_ADDR: { + const CgIrLocalStaticLabelAux* aux = + (const CgIrLocalStaticLabelAux*)in->extra.aux; + t->local_static_data_label_addr(t, replay_label(r, aux->target, in->loc), + aux->addend, aux->width, + aux->address_space); + return; + } + case CG_IR_LOCAL_STATIC_DATA_END: + t->local_static_data_end(t); + return; + } +} + +static void opt_replay_cg_ir_direct(OptImpl* o, const CgIrFunc* f) { + OptReplay r; + memset(&r, 0, sizeof r); + r.o = o; + r.nlocals = f->nlocals; + r.local_map = + arena_zarray(o->c->tu, CGLocal, f->nlocals ? f->nlocals + 1u : 1u); + for (u32 i = 0; i < f->nlabels; ++i) + if (f->labels[i].id > r.nlabels) r.nlabels = f->labels[i].id; + r.label_map = arena_zarray(o->c->tu, Label, r.nlabels ? r.nlabels + 1u : 1u); + r.nscopes = f->nscopes; + r.scope_map = + arena_zarray(o->c->tu, CGScope, f->nscopes ? f->nscopes + 1u : 1u); + + o->target->func_begin(o->target, &f->desc); + for (u32 i = 0; i < f->nlabels; ++i) + r.label_map[f->labels[i].id] = o->target->label_new(o->target); + for (u32 i = 0; i < f->nparams; ++i) { + const CgIrParam* p = &f->params[i]; + r.local_map[p->local] = o->target->param(o->target, &p->desc); + } + for (u32 i = 0; i < f->nlocals; ++i) { + const CgIrLocal* l = &f->locals[i]; + if (!r.local_map[l->id]) + r.local_map[l->id] = o->target->local(o->target, &l->desc); } - return n; + for (u32 i = 0; i < f->ninsts; ++i) replay_inst(&r, &f->insts[i]); + o->target->func_end(o->target); } -static u64 func_spill_store_count(Func* f) { - if (!f) return 0; - if (f->mir) - return blocks_spill_store_count(f, f->mir->blocks, f->mir->nblocks); - return blocks_spill_store_count(f, f->blocks, f->nblocks); +static void opt_dbg_dump(OptImpl* o, Func* f, const char* tag) { + extern char* getenv(const char*); + const char* s = getenv("CFREE_DUMP"); + CfreeWriter* w = NULL; + size_t len = 0; + const uint8_t* bytes; + if (!s) return; + cfree_writer_mem(o->c->ctx->heap, &w); 
+ opt_ir_dump(f, w); + bytes = cfree_writer_mem_bytes(w, &len); + compiler_panic(o->c, f->desc.loc, "DUMP %s:\n%.*s", tag, (int)len, + (const char*)bytes); } -static void opt_run_lowering_pipeline(OptImpl* o, const char* total_scope, - int allow_live_range_split) { - (void)allow_live_range_split; - metrics_scope_begin(o->c, total_scope); +static void opt_run_o1_native(OptImpl* o, Func* f) { + OptLiveInfo live; + OptLiveInfo regalloc_live; + if (!o->native) + compiler_panic(o->c, f ? f->desc.loc : (SrcLoc){0, 0, 0}, + "O1 optimizer requires a native target"); + opt_dbg_dump(o, f, "entry"); + + metrics_scope_begin(o->c, "opt.o1.total"); metrics_count(o->c, "opt.funcs", 1); - metrics_count(o->c, "opt.blocks", o->f->nblocks); - metrics_count(o->c, "opt.insts", func_inst_count(o->f)); - metrics_count(o->c, "opt.pregs", o->f->npregs); + metrics_count(o->c, "opt.blocks", f->nblocks); + metrics_count(o->c, "opt.pregs", f->npregs); + metrics_scope_begin(o->c, "opt.cfg.build_1"); - opt_build_cfg(o->f); + opt_build_cfg(f); metrics_scope_end(o->c, "opt.cfg.build_1"); metrics_scope_begin(o->c, "opt.cfg.jump_cleanup_cfg"); - opt_jump_cleanup(o->f, OPT_JUMP_CLEANUP_CFG); + opt_jump_cleanup(f, OPT_JUMP_CLEANUP_CFG); metrics_scope_end(o->c, "opt.cfg.jump_cleanup_cfg"); metrics_scope_begin(o->c, "opt.cfg.build_2"); - opt_build_cfg(o->f); + opt_build_cfg(f); metrics_scope_end(o->c, "opt.cfg.build_2"); metrics_scope_begin(o->c, "opt.cfg.simplify_local"); - opt_simplify_local(o->f); + opt_simplify_local(f); metrics_scope_end(o->c, "opt.cfg.simplify_local"); metrics_scope_begin(o->c, "opt.cfg.verify"); - opt_verify(o->f, "lowering-cfg"); + opt_verify(f, "lowering-cfg"); metrics_scope_end(o->c, "opt.cfg.verify"); + metrics_scope_begin(o->c, "opt.machinize"); - opt_machinize(o->f, o->target); + opt_machinize_native(f, o->native); metrics_scope_end(o->c, "opt.machinize"); metrics_scope_begin(o->c, "opt.machinize.verify"); - opt_verify(o->f, "lowering-machinize"); + opt_verify(f, 
"lowering-machinize"); metrics_scope_end(o->c, "opt.machinize.verify"); + metrics_scope_begin(o->c, "opt.o1.addr_xform_pregs"); - opt_addr_xform_pregs(o->f); + opt_addr_xform_pregs(f); metrics_scope_end(o->c, "opt.o1.addr_xform_pregs"); metrics_scope_begin(o->c, "opt.o1.addr_xform.verify"); - opt_verify(o->f, "o1-addr-xform"); + opt_verify(f, "o1-addr-xform"); metrics_scope_end(o->c, "opt.o1.addr_xform.verify"); metrics_scope_begin(o->c, "opt.o1.promote_scalar_locals"); - opt_promote_scalar_locals(o->f); + opt_promote_scalar_locals(f); metrics_scope_end(o->c, "opt.o1.promote_scalar_locals"); metrics_scope_begin(o->c, "opt.o1.promote_scalar.verify"); - opt_verify(o->f, "o1-promote-scalar"); + opt_verify(f, "o1-promote-scalar"); metrics_scope_end(o->c, "opt.o1.promote_scalar.verify"); metrics_scope_begin(o->c, "opt.o1.addr_of_global_cse"); - opt_addr_of_global_cse(o->f); + opt_addr_of_global_cse(f); metrics_scope_end(o->c, "opt.o1.addr_of_global_cse"); metrics_scope_begin(o->c, "opt.o1.addr_of_global.verify"); - opt_verify(o->f, "o1-addr-global-cse"); + opt_verify(f, "o1-addr-global-cse"); metrics_scope_end(o->c, "opt.o1.addr_of_global.verify"); + metrics_scope_begin(o->c, "opt.build_loop_tree"); - opt_build_loop_tree(o->f); + opt_build_loop_tree(f); metrics_scope_end(o->c, "opt.build_loop_tree"); + metrics_scope_begin(o->c, "opt.live_blocks.pre_dde"); - OptLiveInfo live; - opt_live_blocks(o->f, &live); - metrics_count(o->c, "opt.live_words", o->f->opt_live_words); - metrics_count(o->c, "opt.live.blocks", o->f->nblocks); - metrics_count(o->c, "opt.live.active_words", live.active_words); - metrics_count(o->c, "opt.live.block_bytes", live.block_bytes); - metrics_count(o->c, "opt.live.set_bit_scans", live.set_bit_scans); - metrics_count(o->c, "opt.live.bitset_words_touched", - live.bitset_words_touched); - metrics_count(o->c, "opt.live.dataflow_iterations", live.dataflow_iterations); - metrics_count(o->c, "opt.live.dataflow_block_visits", - live.dataflow_block_visits); 
- metrics_count(o->c, "opt.conflict_bytes", 0); + memset(&live, 0, sizeof live); + opt_live_blocks(f, &live); + metrics_count(o->c, "opt.live_words", f->opt_live_words); metrics_scope_end(o->c, "opt.live_blocks.pre_dde"); metrics_scope_begin(o->c, "opt.dead_def_elim"); - opt_dead_def_elim_with_live(o->f, &live); - metrics_count(o->c, "opt.dde.live_words_touched", - o->f->opt_dde_live_words_touched); + opt_dead_def_elim_with_live(f, &live); metrics_scope_end(o->c, "opt.dead_def_elim"); + metrics_scope_begin(o->c, "opt.regalloc"); - OptLiveInfo regalloc_live; - opt_regalloc_locations(o->f, 0, &regalloc_live); - metrics_count(o->c, "opt.alloc.used_loc_words", o->f->opt_used_loc_words); - metrics_count(o->c, "opt.alloc.hard_loc_words", - o->f->opt_alloc_hard_loc_words); - metrics_count(o->c, "opt.alloc.stack_loc_words", - o->f->opt_alloc_stack_loc_words); - metrics_count(o->c, "opt.alloc.stack_slots", o->f->opt_alloc_stack_slots); - metrics_count(o->c, "opt.alloc.hard_point_visits", - o->f->opt_alloc_hard_point_visits); - metrics_count(o->c, "opt.alloc.stack_point_visits", - o->f->opt_alloc_stack_point_visits); - metrics_count(o->c, "opt.alloc.hard_word_ors", o->f->opt_alloc_hard_word_ors); - metrics_count(o->c, "opt.alloc.stack_word_ors", - o->f->opt_alloc_stack_word_ors); - metrics_count(o->c, "opt.alloc.hard_mark_points", - o->f->opt_alloc_hard_mark_points); - metrics_count(o->c, "opt.alloc.stack_mark_points", - o->f->opt_alloc_stack_mark_points); - metrics_count(o->c, "opt.alloc.spills", func_spill_alloc_count(o->f)); + memset(&regalloc_live, 0, sizeof regalloc_live); + opt_regalloc_locations(f, 0, &regalloc_live); metrics_scope_end(o->c, "opt.regalloc"); metrics_scope_begin(o->c, "opt.regalloc.verify"); - opt_analysis_invalidate(o->f, OPT_ANALYSIS_DEF_USE); - opt_verify(o->f, "post-regalloc"); + opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE); + opt_verify(f, "post-regalloc"); metrics_scope_end(o->c, "opt.regalloc.verify"); + metrics_scope_begin(o->c, 
"opt.lower_mir"); - opt_lower_to_mir(o->f, &regalloc_live); - metrics_count(o->c, "opt.rewrite.reloads", func_spill_load_count(o->f)); - metrics_count(o->c, "opt.rewrite.stores", func_spill_store_count(o->f)); - metrics_count(o->c, "opt.rewrite.inserted_insts", - o->f->opt_rewrite_inserted_insts); - metrics_count(o->c, "opt.rewrite.live_words_touched", - o->f->opt_rewrite_live_words_touched); + opt_lower_to_mir(f, &regalloc_live); metrics_scope_end(o->c, "opt.lower_mir"); metrics_scope_begin(o->c, "opt.lower_mir.verify"); - opt_mir_verify(o->f, "lower-mir"); + opt_mir_verify(f, "lower-mir"); metrics_scope_end(o->c, "opt.lower_mir.verify"); metrics_scope_begin(o->c, "opt.combine"); - opt_mir_combine(o->f); + opt_mir_combine(f); metrics_scope_end(o->c, "opt.combine"); metrics_scope_begin(o->c, "opt.combine.verify"); - opt_mir_verify(o->f, "post-mir-combine"); + opt_mir_verify(f, "post-mir-combine"); metrics_scope_end(o->c, "opt.combine.verify"); metrics_scope_begin(o->c, "opt.dce"); - opt_mir_dce(o->f); + opt_mir_dce(f); metrics_scope_end(o->c, "opt.dce"); metrics_scope_begin(o->c, "opt.dce.verify"); - opt_mir_verify(o->f, "post-mir-dce"); + opt_mir_verify(f, "post-mir-dce"); metrics_scope_end(o->c, "opt.dce.verify"); metrics_scope_begin(o->c, "opt.post_ra.jump_cleanup_cfg"); - opt_mir_jump_cleanup(o->f, OPT_JUMP_CLEANUP_CFG); + opt_mir_jump_cleanup(f, OPT_JUMP_CLEANUP_CFG); metrics_scope_end(o->c, "opt.post_ra.jump_cleanup_cfg"); metrics_scope_begin(o->c, "opt.post_ra.build_cfg"); - opt_mir_build_cfg(o->f); + opt_mir_build_cfg(f); metrics_scope_end(o->c, "opt.post_ra.build_cfg"); metrics_scope_begin(o->c, "opt.post_ra.verify"); - opt_mir_verify(o->f, "post-mir-jump-cfg"); + opt_mir_verify(f, "post-mir-jump-cfg"); metrics_scope_end(o->c, "opt.post_ra.verify"); metrics_scope_begin(o->c, "opt.post_ra.jump_cleanup_layout"); - opt_mir_jump_cleanup(o->f, OPT_JUMP_CLEANUP_LAYOUT); + opt_mir_jump_cleanup(f, OPT_JUMP_CLEANUP_LAYOUT); metrics_scope_end(o->c, 
"opt.post_ra.jump_cleanup_layout"); + metrics_scope_begin(o->c, "opt.emit"); - opt_emit(o->c, o->f, o->target); + opt_emit_native(o->c, f, o->native); metrics_scope_end(o->c, "opt.emit"); - metrics_scope_end(o->c, total_scope); -} - -static void opt_run_o1_pipeline(OptImpl* o) { - opt_run_lowering_pipeline(o, "opt.o1.total", 0); -} - -static void opt_run_o2_pipeline(OptImpl* o) { - metrics_scope_begin(o->c, "opt.o2.ssa"); - opt_cleanup(o->f); - metrics_scope_end(o->c, "opt.o2.ssa"); - opt_run_lowering_pipeline(o, "opt.o2.total", 1); -} - -static void opt_funcset_add(OptImpl* o, Func* f) { - FuncSet* fs = &o->funcs; - if (fs->nfuncs == fs->cap) { - u32 ncap = fs->cap ? fs->cap * 2u : 8u; - Func** nf = arena_array(o->c->tu, Func*, ncap); - if (fs->funcs) memcpy(nf, fs->funcs, sizeof(Func*) * fs->nfuncs); - fs->funcs = nf; - fs->cap = ncap; - } - fs->funcs[fs->nfuncs++] = f; -} - -static int func_requires_non_ssa_o2(Func* f) { - if (!f) return 0; - for (u32 b = 0; b < f->nblocks; ++b) { - Block* bl = &f->blocks[b]; - for (u32 i = 0; i < bl->ninsts; ++i) { - switch ((IROp)bl->insts[i].op) { - case IR_ASM_BLOCK: - case IR_LOAD_LABEL_ADDR: - case IR_INDIRECT_BRANCH: - return 1; - default: - break; + metrics_scope_end(o->c, "opt.o1.total"); +} + +static void opt_dbg_dump_cg(OptImpl* o, const CgIrFunc* f) { + extern char* getenv(const char*); + StrBuf sb; + char buf[8192]; + if (!getenv("CFREE_DUMPCG")) return; + strbuf_init(&sb, buf, sizeof buf); + for (u32 i = 0; i < f->ninsts; ++i) { + const CgIrInst* in = &f->insts[i]; + strbuf_put_u64(&sb, in->op); + if (in->op == CG_IR_LOAD_IMM) { + strbuf_put_slice(&sb, SLICE_LIT("(imm=")); + strbuf_put_u64(&sb, (u64)in->extra.imm); + strbuf_put_slice(&sb, SLICE_LIT(")")); + } + strbuf_put_slice(&sb, SLICE_LIT(" [")); + for (u32 j = 0; j < in->nopnds; ++j) { + const Operand* op = &in->opnds[j]; + strbuf_put_slice(&sb, SLICE_LIT(" k")); + strbuf_put_u64(&sb, op->kind); + if (op->kind == OPK_LOCAL) { + strbuf_put_slice(&sb, 
SLICE_LIT(":L")); + strbuf_put_u64(&sb, op->v.local); + strbuf_put_slice(&sb, cg_type_is_ptr(o->c, op->type) + ? SLICE_LIT("(ptr)") + : SLICE_LIT("(val)")); + } else if (op->kind == OPK_INDIRECT) { + strbuf_put_slice(&sb, SLICE_LIT(":ind(b")); + strbuf_put_u64(&sb, op->v.ind.base); + strbuf_put_slice(&sb, SLICE_LIT(",i")); + strbuf_put_u64(&sb, op->v.ind.index); + strbuf_put_slice(&sb, SLICE_LIT(")")); + } else if (op->kind == OPK_IMM) { + strbuf_put_slice(&sb, SLICE_LIT(":i")); + strbuf_put_u64(&sb, (u64)op->v.imm); } } + strbuf_put_slice(&sb, SLICE_LIT(" ]\n")); } - return 0; + compiler_panic(o->c, f->desc.loc, "CGIR:\n%s", strbuf_cstr(&sb)); } -/* ---- func_end: optionally run dry-run passes; replay; reset ---- */ - -static void w_func_end(CGTarget* t) { - OptImpl* o = impl_of(t); - if (!o->f) return; - opt_frame_home_addr_taken_locals(o->f); - - if (o->level >= 2) { - opt_funcset_add(o, o->f); - } else if (o->level >= 1) { - opt_run_o1_pipeline(o); - } else { - opt_replay(o->c, o->f, o->target); +static void opt_on_func(void* user, CgIrFunc* cg_func) { + OptImpl* o = (OptImpl*)user; + Func* f; + opt_dbg_dump_cg(o, cg_func); + if (opt_func_needs_direct_replay(o, cg_func)) { + opt_replay_cg_ir_direct(o, cg_func); + return; } - o->f = NULL; - o->cur = 0; + metrics_scope_begin(o->c, "opt.o1.cg_ir_lower"); + f = opt_func_from_cg_ir(o->c, cg_func); + metrics_scope_end(o->c, "opt.o1.cg_ir_lower"); + if (o->dump_writer && f) opt_ir_dump(f, o->dump_writer); + opt_run_o1_native(o, f); } -/* ---- finalize / destroy ---- */ - -static void w_finalize(CGTarget* t) { - OptImpl* o = impl_of(t); - CGTarget* wr = o->target; - if (o->level >= 2 && o->funcs.nfuncs) { - opt_inline(&o->funcs, 1); - for (u32 i = 0; i < o->funcs.nfuncs; ++i) { - o->f = o->funcs.funcs[i]; - if (!o->f) continue; - if (!func_requires_non_ssa_o2(o->f)) - opt_run_o2_pipeline(o); - else - opt_run_o1_pipeline(o); - } - o->f = NULL; - } - if (wr->finalize) wr->finalize(wr); +static void opt_on_finalize(void* 
user, const CgIrModule* module) { + OptImpl* o = (OptImpl*)user; + (void)module; + if (o->native && o->native->finalize) o->native->finalize(o->native); } -static void w_destroy(CGTarget* t) { - CGTarget* wr = impl_of(t)->target; - if (wr->destroy) wr->destroy(wr); +static void opt_on_destroy(void* user) { + OptImpl* o = (OptImpl*)user; + if (o->native && o->native->destroy) o->native->destroy(o->native); + (void)o; } -/* ---- public dump-writer API ---- */ - -void opt_set_dump_writer(CGTarget* t, Writer* w) { - if (!t || t->func_begin != w_func_begin) return; - impl_of(t)->dump_writer = w; +static int opt_on_local_static_data_begin(void* user, + const CGLocalStaticDataDesc* desc) { + (void)desc; + OptImpl* o = (OptImpl*)user; + return o && o->target && o->target->local_static_data_begin && + o->target->local_static_data_write && + o->target->local_static_data_label_addr && + o->target->local_static_data_end; } -/* ---- construction ---- */ - -CGTarget* opt_cgtarget_new(Compiler* c, CGTarget* target, int level) { - if (!target) { - SrcLoc loc = {0, 0, 0}; - compiler_panic(c, loc, "opt_cgtarget_new: target is NULL"); - } - if (level < 1 || level > 2) { - SrcLoc loc = {0, 0, 0}; - compiler_panic(c, loc, "opt_cgtarget_new: level %d out of range [1, 2]", - level); - } +static const char* opt_on_tail_call_unrealizable_reason( + void* user, const struct CGFuncDesc* caller, const CGCallDesc* call) { + (void)user; + (void)caller; + (void)call; + return NULL; +} - OptImpl* o = arena_new(c->tu, OptImpl); - memset(o, 0, sizeof *o); +CgTarget* opt_cgtarget_new(Compiler* c, CgTarget* target, int level) { + if (!target) + compiler_panic(c, (SrcLoc){0, 0, 0}, "opt_cgtarget_new: target is NULL"); + if (level < 1) + compiler_panic(c, (SrcLoc){0, 0, 0}, + "opt_cgtarget_new: level %d out of range", level); + OptImpl* o = arena_znew(c->tu, OptImpl); o->c = c; o->target = target; - o->level = level; - o->funcs.c = c; - o->funcs.arena = c->tu; - - CGTarget* t = &o->base; - t->c = c; - 
t->obj = target->obj; - t->mc = target->mc; - t->debug = target->debug; - t->virtual_regs = 1; - - t->func_begin = w_func_begin; - t->func_end = w_func_end; - - t->frame_slot = w_frame_slot; - t->local = w_local; - t->local_addr = w_local_addr; - t->param = w_param; - t->spill_reg = w_spill_reg; - t->reload_reg = w_reload_reg; - - t->get_allocable_regs = w_get_allocable_regs; - t->get_phys_regs = w_get_phys_regs; - t->get_scratch_regs = w_get_scratch_regs; - t->is_caller_saved = w_is_caller_saved; - t->call_clobber_mask = w_call_clobber_mask; - t->return_reg_mask = w_return_reg_mask; - t->callee_save_mask = w_callee_save_mask; - t->plan_hard_regs = w_plan_hard_regs; - t->reserve_hard_regs = w_reserve_hard_regs; - - t->label_new = w_label_new; - t->label_place = w_label_place; - t->cg_label_to_mc_label = w_cg_label_to_mc_label; - t->jump = w_jump; - t->cmp_branch = w_cmp_branch; - t->switch_ = w_switch_; - t->indirect_branch = w_indirect_branch; - t->load_label_addr = w_load_label_addr; - - t->scope_begin = w_scope_begin; - t->scope_else = w_scope_else; - t->scope_end = w_scope_end; - t->break_to = w_break_to; - t->continue_to = w_continue_to; - - t->load_imm = w_load_imm; - t->load_const = w_load_const; - t->copy = w_copy; - t->load = w_load; - t->store = w_store; - t->addr_of = w_addr_of; - t->tls_addr_of = w_tls_addr_of; - t->copy_bytes = w_copy_bytes; - t->set_bytes = w_set_bytes; - t->bitfield_load = w_bitfield_load; - t->bitfield_store = w_bitfield_store; - - t->binop = w_binop; - t->unop = w_unop; - t->cmp = w_cmp; - t->convert = w_convert; - - t->call = w_call; - t->tail_call_unrealizable_reason = w_tail_call_unrealizable_reason; - t->plan_call = w_plan_call; - t->load_call_arg = w_load_call_arg; - t->store_call_arg = w_store_call_arg; - t->store_call_ret = w_store_call_ret; - t->emit_call_plan = w_emit_call_plan; - t->ret = w_ret; - - t->alloca_ = w_alloca_; - t->va_start_ = w_va_start_; - t->va_arg_ = w_va_arg_; - t->va_end_ = w_va_end_; - t->va_copy_ = 
w_va_copy_; - - t->atomic_load = w_atomic_load; - t->atomic_store = w_atomic_store; - t->atomic_rmw = w_atomic_rmw; - t->atomic_cas = w_atomic_cas; - t->fence = w_fence; - - t->intrinsic = w_intrinsic; - t->asm_block = w_asm_block; - t->resolve_reg_name = w_resolve_reg_name; - /* Only expose file_scope_asm if the wrapped target overrides it. Native - * targets leave it NULL so cfree_cg_file_scope_asm falls through to the - * MC-based asm_parse path; setting the wrapper unconditionally would - * swallow that NULL signal and silently drop file-scope asm. */ - if (target->file_scope_asm) t->file_scope_asm = w_file_scope_asm; - - t->set_loc = w_set_loc; - t->finalize = w_finalize; - t->destroy = w_destroy; - - return t; + o->native = native_direct_target_native(target); + o->level = 1; + + CgIrRecorderConfig cfg; + memset(&cfg, 0, sizeof cfg); + cfg.func_recorded = opt_on_func; + cfg.finalize = opt_on_finalize; + cfg.destroy = opt_on_destroy; + cfg.local_static_data_begin = opt_on_local_static_data_begin; + cfg.tail_call_unrealizable_reason = opt_on_tail_call_unrealizable_reason; + cfg.user = o; + return cg_ir_recorder_new(c, target->obj, &cfg); +} + +void opt_set_dump_writer(CgTarget* t, Writer* w) { + CgIrRecorder* rec = cg_ir_recorder_from_target(t); + (void)rec; + (void)w; } diff --git a/src/opt/pass_addr_fold.c b/src/opt/pass_addr_fold.c @@ -0,0 +1,760 @@ +/* O1 HIR address-folding passes. + * + * Split out of pass_o2.c so the always-on O1 lowering pipeline does not depend + * on the (currently disabled) O2 pass translation unit. These three passes run + * at every opt_level >= 1: + * + * - opt_addr_xform_pregs: PReg-namespace addr-of-local folding + * - opt_promote_scalar_locals: promote non-escaped scalar locals to PRegs + * - opt_addr_of_global_cse: hoist/CSE duplicate ADDR_OF(global) + */ +#include <cfree/cg.h> +#include <string.h> + +#include "opt/opt_internal.h" + +/* Private copy of the tiny inst-removal helper shared with pass_o2.c's + * opt_addr_xform. 
Both are file-local statics, so there is no link conflict. */ +static void addr_inst_remove(Inst* in) { + in->op = IR_NOP; + in->def = VAL_NONE; + in->ndefs = 0; + in->defs = NULL; + in->nopnds = 0; + in->opnds = NULL; +} + +/* PReg-namespace variant of opt_addr_xform for the O1 pipeline (no SSA, no + * Val-keyed def-use chains). Scans the whole function once per candidate + * IR_ADDR_OF def to classify uses of its PReg result. + * + * Use classifications (see addr_xform_pregs_classify_use): + * + * OPF_ESCAPE The use is something other than a non-observable + * IR_LOAD/IR_STORE base operand. The IR_ADDR_OF cannot + * be folded; the local's address truly escapes. + * OPF_FOLD_LOCAL Zero-EA use: `OPK_INDIRECT(base=p, ofs=0, index=NONE)` + * in load/store base position. Foldable to OPK_LOCAL. + * OPF_FOLD_EA EA-shaped use: same load/store base position, but with + * nonzero `ofs` or `index != REG_NONE`. The EA must stay + * on the load/store (the operand layout for OPK_LOCAL + * cannot carry the EA today), so the operand is left + * alone and the IR_ADDR_OF def must stay alive to feed + * the OPK_INDIRECT base. The use is still recognized as + * "non-escape" for downstream analysis (e.g. scalar + * promotion's non-escape check). + * + * After classification: if any use is OPF_ESCAPE, no rewrite happens. If + * every use is OPF_FOLD_LOCAL, fold all uses to OPK_LOCAL and NOP the + * IR_ADDR_OF. If a mix of OPF_FOLD_LOCAL and OPF_FOLD_EA, fold the + * zero-EA uses but keep the IR_ADDR_OF alive for the EA-shaped uses. 
*/ + +typedef enum AddrXformUseClass { + OPF_ESCAPE = 0, + OPF_FOLD_LOCAL = 1, + OPF_FOLD_EA = 2, +} AddrXformUseClass; + +static int addr_xform_pregs_main_op_position_ok(Inst* in, u32 op_idx) { + if ((IROp)in->op != IR_LOAD && (IROp)in->op != IR_STORE) return 0; + if (opt_mem_observable(&in->extra.mem)) return 0; + if ((IROp)in->op == IR_LOAD && op_idx != 1u) return 0; + if ((IROp)in->op == IR_STORE && op_idx != 0u) return 0; + return 1; +} + +static AddrXformUseClass addr_xform_pregs_classify_use(Inst* in, Operand* op, + u32 op_idx) { + if (op->kind != OPK_INDIRECT) return OPF_ESCAPE; + if (!addr_xform_pregs_main_op_position_ok(in, op_idx)) return OPF_ESCAPE; + if (op->v.ind.ofs == 0 && op->v.ind.index == (Reg)REG_NONE) + return OPF_FOLD_LOCAL; + return OPF_FOLD_EA; +} + +static int addr_xform_pregs_op_uses(const Operand* op, PReg p) { + if (!op) return 0; + if (op->kind == OPK_REG && (PReg)op->v.reg == p) return 1; + if (op->kind == OPK_INDIRECT) { + if ((PReg)op->v.ind.base == p) return 1; + if (op->v.ind.index != (Reg)REG_NONE && (PReg)op->v.ind.index == p) + return 1; + } + return 0; +} + +static int addr_xform_pregs_abivalue_uses(const CGABIValue* v, PReg p) { + if (!v) return 0; + if (addr_xform_pregs_op_uses(&v->storage, p)) return 1; + for (u32 i = 0; i < v->nparts; ++i) + if (addr_xform_pregs_op_uses((const Operand*)&v->parts[i].op, p)) return 1; + return 0; +} + +static int addr_xform_pregs_aux_uses(Inst* in, PReg p) { + switch ((IROp)in->op) { + case IR_CALL: { + IRCallAux* aux = (IRCallAux*)in->extra.aux; + if (!aux) return 0; + if (aux->use_plan_replay) { + if (addr_xform_pregs_op_uses(&aux->plan.callee, p)) return 1; + for (u32 i = 0; i < aux->plan.nargs; ++i) + if (addr_xform_pregs_op_uses(&aux->plan.args[i].src, p)) return 1; + for (u32 i = 0; i < aux->plan.nrets; ++i) + if (addr_xform_pregs_op_uses(&aux->plan.rets[i].dst, p)) return 1; + } else { + if (addr_xform_pregs_op_uses(&aux->desc.callee, p)) return 1; + for (u32 i = 0; i < 
aux->desc.nargs; ++i) + if (addr_xform_pregs_abivalue_uses( + (const CGABIValue*)&aux->desc.args[i], p)) + return 1; + if (addr_xform_pregs_abivalue_uses(&aux->desc.ret, p)) return 1; + } + return 0; + } + case IR_RET: { + IRRetAux* aux = (IRRetAux*)in->extra.aux; + if (!aux || !aux->present) return 0; + return addr_xform_pregs_abivalue_uses(&aux->val, p); + } + case IR_SCOPE_BEGIN: { + IRScopeAux* aux = (IRScopeAux*)in->extra.aux; + if (!aux) return 0; + return addr_xform_pregs_op_uses(&aux->desc.cond, p); + } + case IR_ASM_BLOCK: { + IRAsmAux* aux = (IRAsmAux*)in->extra.aux; + if (!aux) return 0; + for (u32 i = 0; i < aux->nin; ++i) + if (addr_xform_pregs_op_uses(&aux->in_ops[i], p)) return 1; + for (u32 i = 0; i < aux->nout; ++i) + if (addr_xform_pregs_op_uses(&aux->out_ops[i], p)) return 1; + return 0; + } + case IR_INTRINSIC: { + IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; + if (!aux) return 0; + for (u32 i = 0; i < aux->narg; ++i) + if (addr_xform_pregs_op_uses(&aux->args[i], p)) return 1; + for (u32 i = 0; i < aux->ndst; ++i) + if (addr_xform_pregs_op_uses(&aux->dsts[i], p)) return 1; + return 0; + } + default: + return 0; + } +} + +/* Returns nonzero if every use of `p` is foldable (OPF_FOLD_LOCAL or + * OPF_FOLD_EA) and at least one use exists. *out_has_ea is set to 1 if any + * use was OPF_FOLD_EA; in that case the rewrite must keep the IR_ADDR_OF + * alive (the EA-shaped use still names p as the OPK_INDIRECT base). 
*/ +static int addr_xform_pregs_classify(Func* f, PReg p, Inst* def_inst, + int* out_has_ea) { + int has_foldable_use = 0; + int has_ea = 0; + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* in = &bl->insts[i]; + if (in == def_inst) continue; + for (u32 o = 0; o < in->nopnds; ++o) { + Operand* op = &in->opnds[o]; + if (!addr_xform_pregs_op_uses(op, p)) continue; + AddrXformUseClass uc = addr_xform_pregs_classify_use(in, op, o); + if (uc == OPF_ESCAPE) return 0; + has_foldable_use = 1; + if (uc == OPF_FOLD_EA) has_ea = 1; + } + if (addr_xform_pregs_aux_uses(in, p)) return 0; + } + } + if (out_has_ea) *out_has_ea = has_ea; + return has_foldable_use; +} + +void opt_addr_xform_pregs(Func* f) { + if (!f || f->opt_reg_ssa || f->opt_rewritten) return; + int changed = 0; + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* in = &bl->insts[i]; + if ((IROp)in->op != IR_ADDR_OF) continue; + if (in->nopnds < 2) continue; + if (in->opnds[0].kind != OPK_REG) continue; + if (in->opnds[1].kind != OPK_LOCAL) continue; + PReg p = (PReg)in->opnds[0].v.reg; + if (!opt_reg_valid(f, p)) continue; + int has_ea = 0; + if (!addr_xform_pregs_classify(f, p, in, &has_ea)) continue; + Operand local = in->opnds[1]; + /* Fold every zero-EA use of p to OPK_LOCAL. EA-shaped uses are left + * as OPK_INDIRECT(base=p, ofs, index, log2_scale) so the EA stays on + * the load/store; the IR_ADDR_OF def must survive to feed them. 
*/ + for (u32 bb = 0; bb < f->nblocks; ++bb) { + Block* rb = &f->blocks[bb]; + for (u32 ii = 0; ii < rb->ninsts; ++ii) { + Inst* use = &rb->insts[ii]; + if (use == in) continue; + for (u32 o = 0; o < use->nopnds; ++o) { + Operand* op = &use->opnds[o]; + if (op->kind != OPK_INDIRECT) continue; + if ((PReg)op->v.ind.base != p) continue; + if (op->v.ind.ofs != 0 || op->v.ind.index != (Reg)REG_NONE) + continue; /* EA-shaped; leave alone */ + Operand folded = local; + folded.type = + use->extra.mem.type ? use->extra.mem.type : local.type; + *op = folded; + } + } + } + if (!has_ea) addr_inst_remove(in); + changed = 1; + } + } + /* After folding, walk all frame slots and clear FSF_ADDR_TAKEN on any + * slot whose surviving IR_ADDR_OF defs (if any) have all been retired. + * The frontend-set ADDR_TAKEN flag is conservative; if we proved the + * address no longer escapes, downstream passes (opt_promote_scalar_locals) + * can take advantage of the actual non-escape state. */ + if (changed) { + u8* still_taken = + arena_zarray(f->arena, u8, f->nframe_slots ? f->nframe_slots : 1u); + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* in = &bl->insts[i]; + if ((IROp)in->op != IR_ADDR_OF) continue; + if (in->nopnds < 2 || in->opnds[1].kind != OPK_LOCAL) continue; + FrameSlot slot = in->opnds[1].v.frame_slot; + if (slot && slot <= f->nframe_slots) still_taken[slot - 1u] = 1; + } + } + for (u32 s = 0; s < f->nframe_slots; ++s) { + if (!still_taken[s]) f->frame_slots[s].flags &= (u16)~FSF_ADDR_TAKEN; + } + } + if (changed) + opt_analysis_invalidate( + f, OPT_ANALYSIS_DEF_USE | OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP); +} + +/* Scalar local promotion for the O1 pipeline. Runs after + * `opt_addr_xform_pregs` has folded zero-EA `OPK_INDIRECT(p)` uses to + * `OPK_LOCAL(slot)` and retired non-escaping `IR_ADDR_OF` defs. 
For each + * frame slot that is now only referenced as the base of matching-type, + * non-observable `IR_LOAD`/`IR_STORE`, the slot is replaced by a fresh + * mutable PReg: each store becomes `IR_COPY P_slot, src` (or `IR_LOAD_IMM` + * for an immediate source), each load becomes `IR_COPY dst, P_slot`. The + * slot becomes unreferenced and the backend drops it from the frame. + * + * A mutable PReg in `-O1` IR has the same data-flow semantics as a named + * memory cell that does not escape (multiple defs, multiple uses, value at + * a use comes from whichever def reaches it via CFG edges). No phis are + * required because the IR model has no phis; PReg flow becomes hard-reg + * flow after regalloc, and regalloc already handles it. + * + * Conditions for promotion (per slot): + * + * 1. Slot kind is FS_LOCAL (real locals, not spills, sret, alloca). + * 2. Slot has no FSF_ADDR_TAKEN, FSF_VOLATILE flag (after + * `opt_addr_xform_pregs` has cleared the conservative ADDR_TAKEN + * flag for slots whose IR_ADDR_OF defs were all retired). + * 3. Slot's declared type is scalar (int, float, bool, ptr, enum). + * 4. Every appearance of `OPK_LOCAL(slot)` in any instruction operand is + * either: + * - `IR_LOAD.opnds[1]` with matching `access.type == slot.type`, + * no observable mem flags, dst is OPK_REG; + * - `IR_STORE.opnds[0]` with matching `access.type == slot.type`, + * no observable mem flags, src is OPK_REG or OPK_IMM. + * 5. Slot does not appear in any aux operand position (calls, asm, etc.) + * or as an OPK_LOCAL anywhere else (e.g., a surviving IR_ADDR_OF). + * + * Param-slot case: FS_PARAM slots are excluded. The backend prologue is + * responsible for moving the ABI-incoming hard reg into the slot, and that + * move is not visible in the IR (there is no `IR_STORE OPK_LOCAL(slot)` to + * rewrite). 
At O1 the wrapper already places scalar params in REG storage + * when the frontend does not force a memory home, so the param's value + * arrives in a PReg without needing this pass. If a future scheme records + * the entry-move as a synthetic IR_STORE OPK_LOCAL(slot), this pass would + * promote it the same way it promotes any other store-to-slot. */ + +static int promote_local_type_is_scalar(Func* f, CfreeCgTypeId ty) { + if (!ty) return 0; + CfreeCgTypeKind kind = cfree_cg_type_kind((CfreeCompiler*)f->c, ty); + switch (kind) { + case CFREE_CG_TYPE_BOOL: + case CFREE_CG_TYPE_INT: + case CFREE_CG_TYPE_FLOAT: + case CFREE_CG_TYPE_PTR: + case CFREE_CG_TYPE_ENUM: + return 1; + default: + return 0; + } +} + +static int promote_op_uses_slot(const Operand* op, FrameSlot slot) { + return op && op->kind == OPK_LOCAL && op->v.frame_slot == slot; +} + +static int promote_abivalue_uses_slot(const CGABIValue* v, FrameSlot slot) { + if (!v) return 0; + if (promote_op_uses_slot(&v->storage, slot)) return 1; + for (u32 i = 0; i < v->nparts; ++i) + if (promote_op_uses_slot((const Operand*)&v->parts[i].op, slot)) return 1; + return 0; +} + +static int promote_aux_uses_slot(const Inst* in, FrameSlot slot) { + switch ((IROp)in->op) { + case IR_CALL: { + IRCallAux* aux = (IRCallAux*)in->extra.aux; + if (!aux) return 0; + if (aux->use_plan_replay) { + if (promote_op_uses_slot(&aux->plan.callee, slot)) return 1; + for (u32 i = 0; i < aux->plan.nargs; ++i) + if (promote_op_uses_slot(&aux->plan.args[i].src, slot)) return 1; + for (u32 i = 0; i < aux->plan.nrets; ++i) + if (promote_op_uses_slot(&aux->plan.rets[i].dst, slot)) return 1; + } else { + if (promote_op_uses_slot(&aux->desc.callee, slot)) return 1; + for (u32 i = 0; i < aux->desc.nargs; ++i) + if (promote_abivalue_uses_slot((const CGABIValue*)&aux->desc.args[i], + slot)) + return 1; + if (promote_abivalue_uses_slot(&aux->desc.ret, slot)) return 1; + } + return 0; + } + case IR_RET: { + IRRetAux* aux = (IRRetAux*)in->extra.aux; + 
if (!aux || !aux->present) return 0; + return promote_abivalue_uses_slot(&aux->val, slot); + } + case IR_SCOPE_BEGIN: { + IRScopeAux* aux = (IRScopeAux*)in->extra.aux; + if (!aux) return 0; + return promote_op_uses_slot(&aux->desc.cond, slot); + } + case IR_ASM_BLOCK: { + IRAsmAux* aux = (IRAsmAux*)in->extra.aux; + if (!aux) return 0; + for (u32 i = 0; i < aux->nin; ++i) + if (promote_op_uses_slot(&aux->in_ops[i], slot)) return 1; + for (u32 i = 0; i < aux->nout; ++i) + if (promote_op_uses_slot(&aux->out_ops[i], slot)) return 1; + return 0; + } + case IR_INTRINSIC: { + IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; + if (!aux) return 0; + for (u32 i = 0; i < aux->narg; ++i) + if (promote_op_uses_slot(&aux->args[i], slot)) return 1; + for (u32 i = 0; i < aux->ndst; ++i) + if (promote_op_uses_slot(&aux->dsts[i], slot)) return 1; + return 0; + } + default: + return 0; + } +} + +/* Per-inst check. Returns: + * 1 = "instruction touches slot in a promotable position" (load/store base). + * 0 = "instruction does not touch slot at all". + * -1 = "instruction touches slot in a non-promotable way" (e.g., wrong + * operand position, type mismatch, observable flags, aux use). */ +static int promote_inst_classify(const Inst* in, FrameSlot slot, + CfreeCgTypeId slot_ty) { + int touched = 0; + /* IR_LOAD: opnds[0]=dst REG, opnds[1]=addr (allowed: OPK_LOCAL slot). */ + if ((IROp)in->op == IR_LOAD) { + if (in->nopnds >= 2 && promote_op_uses_slot(&in->opnds[1], slot)) { + if (opt_mem_observable(&in->extra.mem)) return -1; + if (in->opnds[0].kind != OPK_REG) return -1; + CfreeCgTypeId at = in->extra.mem.type; + if (at && at != slot_ty) return -1; + touched = 1; + } + /* opnds[0] is the dst REG — never OPK_LOCAL by construction. 
*/ + if (in->nopnds >= 1 && promote_op_uses_slot(&in->opnds[0], slot)) return -1; + } else if ((IROp)in->op == IR_STORE) { + if (in->nopnds >= 1 && promote_op_uses_slot(&in->opnds[0], slot)) { + if (opt_mem_observable(&in->extra.mem)) return -1; + if (in->nopnds < 2) return -1; + Operand* src = &in->opnds[1]; + if (src->kind != OPK_REG && src->kind != OPK_IMM) return -1; + CfreeCgTypeId at = in->extra.mem.type; + if (at && at != slot_ty) return -1; + touched = 1; + } + /* opnds[1] is the src value — should never be OPK_LOCAL for a scalar. */ + if (in->nopnds >= 2 && promote_op_uses_slot(&in->opnds[1], slot)) return -1; + } else { + /* Any other instruction with an OPK_LOCAL(slot) operand blocks promotion. + */ + for (u32 o = 0; o < in->nopnds; ++o) + if (promote_op_uses_slot(&in->opnds[o], slot)) return -1; + } + if (promote_aux_uses_slot(in, slot)) return -1; + return touched; +} + +/* Rewrite an `IR_STORE OPK_LOCAL(slot), src` into a PReg def. If src is + * OPK_IMM, emit IR_LOAD_IMM into preg; otherwise emit IR_COPY. */ +static void promote_rewrite_store(Func* f, Inst* in, PReg preg, + CfreeCgTypeId ty, u8 cls) { + Operand src = in->opnds[1]; + Operand* opnds = arena_array(f->arena, Operand, 2); + memset(&opnds[0], 0, sizeof opnds[0]); + opnds[0].kind = OPK_REG; + opnds[0].type = ty; + opnds[0].cls = cls; + opnds[0].v.reg = (Reg)preg; + in->type = ty; + in->def = (Val)preg; + if (src.kind == OPK_IMM) { + in->op = IR_LOAD_IMM; + in->nopnds = 1; + in->opnds = opnds; + in->extra.imm = src.v.imm; + } else { + opnds[1] = src; + opnds[1].type = ty; + opnds[1].cls = cls; + in->op = IR_COPY; + in->nopnds = 2; + in->opnds = opnds; + memset(&in->extra, 0, sizeof in->extra); + } +} + +/* Rewrite an `IR_LOAD dst, OPK_LOCAL(slot)` into `IR_COPY dst, preg`. 
*/ +static void promote_rewrite_load(Func* f, Inst* in, PReg preg, CfreeCgTypeId ty, + u8 cls) { + Operand dst = in->opnds[0]; + Operand* opnds = arena_array(f->arena, Operand, 2); + opnds[0] = dst; + opnds[0].type = ty; + opnds[0].cls = cls; + memset(&opnds[1], 0, sizeof opnds[1]); + opnds[1].kind = OPK_REG; + opnds[1].type = ty; + opnds[1].cls = cls; + opnds[1].v.reg = (Reg)preg; + in->op = IR_COPY; + in->type = ty; + in->nopnds = 2; + in->opnds = opnds; + memset(&in->extra, 0, sizeof in->extra); +} + +void opt_promote_scalar_locals(Func* f) { + if (!f || f->opt_reg_ssa || f->opt_rewritten) return; + if (!f->nframe_slots) return; + int changed = 0; + for (u32 sidx = 0; sidx < f->nframe_slots; ++sidx) { + IRFrameSlot* slot = &f->frame_slots[sidx]; + FrameSlot id = slot->id; + /* FS_PARAM slots are owned by the backend prologue (which copies the + * ABI-incoming hard reg into the slot before any user IR runs); there + * is no IR-level store to rewrite. At O1, the wrapper already places + * scalar params in REG storage when the frontend does not force a + * memory home, so the FS_PARAM promotion path is normally a no-op. + * Only promote FS_LOCAL slots. */ + if (slot->kind != FS_LOCAL) continue; + if (slot->flags & (FSF_ADDR_TAKEN | FSF_VOLATILE)) continue; + if (!promote_local_type_is_scalar(f, slot->type)) continue; + int touched_count = 0; + int rejected = 0; + for (u32 b = 0; b < f->nblocks && !rejected; ++b) { + Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* in = &bl->insts[i]; + int r = promote_inst_classify(in, id, slot->type); + if (r < 0) { + rejected = 1; + break; + } + touched_count += r; + } + } + if (rejected || !touched_count) continue; + u8 cls = (cfree_cg_type_kind((CfreeCompiler*)f->c, slot->type) == + CFREE_CG_TYPE_FLOAT) + ? 
RC_FP + : RC_INT; + PReg preg = ir_alloc_preg(f, slot->type, cls); + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* in = &bl->insts[i]; + if ((IROp)in->op == IR_LOAD && in->nopnds >= 2 && + promote_op_uses_slot(&in->opnds[1], id)) { + promote_rewrite_load(f, in, preg, slot->type, cls); + } else if ((IROp)in->op == IR_STORE && in->nopnds >= 2 && + promote_op_uses_slot(&in->opnds[0], id)) { + promote_rewrite_store(f, in, preg, slot->type, cls); + } + } + } + /* The frame slot is now unreferenced. Leave the slot table entry in + * place (compaction would require remapping every other slot id); + * the backend's frame layout pass simply omits unreferenced slots. */ + changed = 1; + } + if (changed) + opt_analysis_invalidate( + f, OPT_ANALYSIS_DEF_USE | OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP); +} + +/* CSE-style hoist of `IR_ADDR_OF(OPK_GLOBAL{sym, addend})` defs that appear + * more than once in the same function. The address is a link-time constant + * (TLS and IFUNC live on separate IROps), so all occurrences compute the + * same value; consolidating to a single entry-block def shrinks each loop + * body by the per-iter `adrp`/`add` pair the backend would otherwise re-emit. + * + * Implementation: + * - Walk all insts, group ADDR_OF defs by (sym, addend). + * - For each key with >= 2 defs: allocate a fresh PReg, materialize one + * IR_ADDR_OF in block 0 (after any IR_PARAM_DECL prologue), build a + * preg-remap from each original def-PReg to the new PReg, and NOP each + * original def. + * - One IR walk applies the remap to every operand `v.reg` / + * `v.ind.base`. + * + * Runs after opt_addr_xform_pregs so local addr-of has already been folded + * out; the remaining IR_ADDR_OF defs are global. 
*/ + +typedef struct AddrCseEntry { + ObjSymId sym; + i64 addend; + PReg canonical; /* freshly allocated PReg, def in block 0 */ + CfreeCgTypeId addr_type; /* operand[0].type from the first def */ + u8 cls; /* operand[0].cls from the first def */ + u32 count; /* number of original ADDR_OF defs seen */ +} AddrCseEntry; + +static u32 addr_cse_find_or_add(AddrCseEntry** entries, u32* n, u32* cap, + Arena* arena, ObjSymId sym, i64 addend) { + for (u32 i = 0; i < *n; ++i) { + if ((*entries)[i].sym == sym && (*entries)[i].addend == addend) return i; + } + if (*n == *cap) { + u32 ncap = *cap ? *cap * 2u : 16u; + AddrCseEntry* nv = arena_array(arena, AddrCseEntry, ncap); + if (*entries) memcpy(nv, *entries, sizeof(AddrCseEntry) * (*n)); + *entries = nv; + *cap = ncap; + } + u32 idx = (*n)++; + AddrCseEntry* e = &(*entries)[idx]; + memset(e, 0, sizeof *e); + e->sym = sym; + e->addend = addend; + e->canonical = PREG_NONE; + e->count = 0; + return idx; +} + +static void addr_cse_apply_to_operand(Operand* op, const PReg* remap) { + /* remap is zero-initialized; 0 means "no remap" (preg 0 is reserved as + * unused). PREG_NONE = 0xffffffff and would be a valid remap target but + * we never produce that. */ + if (!op) return; + if (op->kind == OPK_REG) { + PReg p = (PReg)op->v.reg; + if (p != PREG_NONE && p != 0 && remap[p] != 0) op->v.reg = remap[p]; + } else if (op->kind == OPK_INDIRECT) { + PReg p = (PReg)op->v.ind.base; + if (p != PREG_NONE && p != 0 && remap[p] != 0) op->v.ind.base = remap[p]; + if (op->v.ind.index != (Reg)REG_NONE) { + PReg pi = (PReg)op->v.ind.index; + if (pi != PREG_NONE && pi != 0 && remap[pi] != 0) + op->v.ind.index = remap[pi]; + } + } +} + +static void addr_cse_apply_to_inst(Inst* in, const PReg* remap) { + for (u32 o = 0; o < in->nopnds; ++o) + addr_cse_apply_to_operand(&in->opnds[o], remap); + /* IR_CALL aux carries operands too; rewrite both replay variants. 
*/ + if ((IROp)in->op == IR_CALL) { + IRCallAux* aux = (IRCallAux*)in->extra.aux; + if (!aux) return; + if (aux->use_plan_replay) { + addr_cse_apply_to_operand(&aux->plan.callee, remap); + for (u32 i = 0; i < aux->plan.nargs; ++i) + addr_cse_apply_to_operand(&aux->plan.args[i].src, remap); + for (u32 i = 0; i < aux->plan.nrets; ++i) + addr_cse_apply_to_operand(&aux->plan.rets[i].dst, remap); + } else { + addr_cse_apply_to_operand(&aux->desc.callee, remap); + for (u32 i = 0; i < aux->desc.nargs; ++i) { + CGABIValue* v = (CGABIValue*)&aux->desc.args[i]; + addr_cse_apply_to_operand(&v->storage, remap); + for (u32 k = 0; k < v->nparts; ++k) + addr_cse_apply_to_operand((Operand*)&v->parts[k].op, remap); + } + addr_cse_apply_to_operand(&aux->desc.ret.storage, remap); + for (u32 k = 0; k < aux->desc.ret.nparts; ++k) + addr_cse_apply_to_operand((Operand*)&aux->desc.ret.parts[k].op, remap); + } + } else if ((IROp)in->op == IR_RET) { + IRRetAux* aux = (IRRetAux*)in->extra.aux; + if (aux && aux->present) { + addr_cse_apply_to_operand(&aux->val.storage, remap); + for (u32 k = 0; k < aux->val.nparts; ++k) + addr_cse_apply_to_operand((Operand*)&aux->val.parts[k].op, remap); + } + } else if ((IROp)in->op == IR_ASM_BLOCK) { + IRAsmAux* aux = (IRAsmAux*)in->extra.aux; + if (!aux) return; + for (u32 i = 0; i < aux->nin; ++i) + addr_cse_apply_to_operand(&aux->in_ops[i], remap); + for (u32 i = 0; i < aux->nout; ++i) + addr_cse_apply_to_operand(&aux->out_ops[i], remap); + } else if ((IROp)in->op == IR_INTRINSIC) { + IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; + if (!aux) return; + for (u32 i = 0; i < aux->narg; ++i) + addr_cse_apply_to_operand(&aux->args[i], remap); + for (u32 i = 0; i < aux->ndst; ++i) + addr_cse_apply_to_operand(&aux->dsts[i], remap); + } +} + +static Inst* block_insert_at(Func* f, Block* bl, u32 at, u32 k) { + if (at > bl->ninsts) at = bl->ninsts; + if (bl->ninsts + k > bl->cap) { + u32 ncap = bl->cap ? 
bl->cap : 8u; + while (ncap < bl->ninsts + k) ncap *= 2u; + Inst* nb = arena_zarray(f->arena, Inst, ncap); + if (bl->insts && at) memcpy(nb, bl->insts, sizeof(Inst) * at); + if (bl->insts && bl->ninsts > at) + memcpy(nb + at + k, bl->insts + at, sizeof(Inst) * (bl->ninsts - at)); + bl->insts = nb; + bl->cap = ncap; + } else { + if (bl->ninsts > at) + memmove(bl->insts + at + k, bl->insts + at, + sizeof(Inst) * (bl->ninsts - at)); + } + for (u32 i = 0; i < k; ++i) memset(&bl->insts[at + i], 0, sizeof(Inst)); + bl->ninsts += k; + return &bl->insts[at]; +} + +void opt_addr_of_global_cse(Func* f) { + if (!f || f->opt_reg_ssa || f->opt_rewritten) return; + if (f->nblocks == 0) return; + + /* Pass 1: index ADDR_OF(global) defs by (sym, addend). */ + AddrCseEntry* entries = NULL; + u32 n_entries = 0; + u32 cap_entries = 0; + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* in = &bl->insts[i]; + if ((IROp)in->op != IR_ADDR_OF) continue; + if (in->nopnds < 2) continue; + if (in->opnds[0].kind != OPK_REG) continue; + if (in->opnds[1].kind != OPK_GLOBAL) continue; + u32 idx = addr_cse_find_or_add(&entries, &n_entries, &cap_entries, + f->arena, in->opnds[1].v.global.sym, + in->opnds[1].v.global.addend); + AddrCseEntry* e = &entries[idx]; + if (e->count == 0) { + e->addr_type = in->opnds[0].type; + e->cls = in->opnds[0].cls; + } + ++e->count; + } + } + if (!n_entries) return; + + /* Pass 2: for each duplicate key, allocate a canonical PReg. */ + u32 dup_count = 0; + for (u32 i = 0; i < n_entries; ++i) { + if (entries[i].count >= 2) { + entries[i].canonical = + ir_alloc_preg(f, entries[i].addr_type, entries[i].cls); + ++dup_count; + } + } + if (!dup_count) return; + + /* Pass 3: walk again, build per-old-PReg remap and NOP duplicate defs. 
*/ + PReg* remap = arena_zarray(f->arena, PReg, opt_reg_count(f)); + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* in = &bl->insts[i]; + if ((IROp)in->op != IR_ADDR_OF) continue; + if (in->nopnds < 2) continue; + if (in->opnds[0].kind != OPK_REG) continue; + if (in->opnds[1].kind != OPK_GLOBAL) continue; + u32 idx = addr_cse_find_or_add(&entries, &n_entries, &cap_entries, + f->arena, in->opnds[1].v.global.sym, + in->opnds[1].v.global.addend); + if (entries[idx].canonical == PREG_NONE) continue; /* singleton */ + PReg old = (PReg)in->opnds[0].v.reg; + if (opt_reg_valid(f, old)) remap[old] = entries[idx].canonical; + /* NOP the original def. */ + in->op = IR_NOP; + in->def = VAL_NONE; + in->ndefs = 0; + in->defs = NULL; + in->nopnds = 0; + in->opnds = NULL; + } + } + + /* Pass 4: hoist a single ADDR_OF for each duplicated key to the entry + * block, inserted after any leading IR_PARAM_DECL instructions. */ + if (f->entry >= f->nblocks) return; + Block* entry = &f->blocks[f->entry]; + u32 insert_at = 0; + while (insert_at < entry->ninsts && + (IROp)entry->insts[insert_at].op == IR_PARAM_DECL) + ++insert_at; + Inst* slot = block_insert_at(f, entry, insert_at, dup_count); + u32 w = 0; + for (u32 i = 0; i < n_entries; ++i) { + if (entries[i].canonical == PREG_NONE) continue; + Inst* in = &slot[w++]; + in->op = (u16)IR_ADDR_OF; + in->def = (Val)entries[i].canonical; + in->type = entries[i].addr_type; + in->nopnds = 2; + in->opnds = arena_array(f->arena, Operand, 2); + memset(&in->opnds[0], 0, sizeof(Operand)); + in->opnds[0].kind = OPK_REG; + in->opnds[0].cls = entries[i].cls; + in->opnds[0].type = entries[i].addr_type; + in->opnds[0].v.reg = entries[i].canonical; + memset(&in->opnds[1], 0, sizeof(Operand)); + in->opnds[1].kind = OPK_GLOBAL; + in->opnds[1].cls = entries[i].cls; + in->opnds[1].type = entries[i].addr_type; + in->opnds[1].v.global.sym = entries[i].sym; + in->opnds[1].v.global.addend = 
entries[i].addend; + ir_assign_inst_id(f, in); + } + + /* Pass 5: apply remap to all operand uses in the function. */ + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) { + addr_cse_apply_to_inst(&bl->insts[i], remap); + } + } + + opt_analysis_invalidate( + f, OPT_ANALYSIS_DEF_USE | OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP); +} diff --git a/src/opt/pass_analysis.c b/src/opt/pass_analysis.c @@ -75,7 +75,7 @@ static void verify_storage(Func* f, const char* stage, CGLocalStorage st, static void verify_operand_shape(Func* f, const char* stage, const Operand* op, int physical_regs) { if (!op) return; - switch ((OpKind)op->kind) { + switch ((OptOperandKind)op->kind) { case OPK_IMM: case OPK_GLOBAL: break; @@ -578,9 +578,11 @@ static void verify_values(Func* f, const char* stage) { opt_fail(f, stage, "phi input type mismatch", b, p); } } else if ((IROp)in->op == IR_PARAM_DECL) { + IRParamDeclAux* aux = (IRParamDeclAux*)in->extra.aux; if (in->nopnds || in->opnds) opt_fail(f, stage, "param_decl should not carry operands", b, i); - if (in->def == VAL_NONE) + if ((!aux || aux->desc.storage.kind == CG_LOCAL_STORAGE_REG) && + in->def == VAL_NONE) opt_fail(f, stage, "param_decl missing def", b, i); } } @@ -693,9 +695,11 @@ static void verify_rewritten(Func* f, const char* stage) { if ((IROp)in->op == IR_PHI) opt_fail(f, stage, "phi survived rewrite", b, i); if ((IROp)in->op == IR_PARAM_DECL) { + IRParamDeclAux* aux = (IRParamDeclAux*)in->extra.aux; if (in->nopnds || in->opnds) opt_fail(f, stage, "param_decl carries operands after rewrite", b, i); - if (in->def == VAL_NONE || in->def >= opt_reg_count(f)) + if ((!aux || aux->desc.storage.kind == CG_LOCAL_STORAGE_REG) && + (in->def == VAL_NONE || in->def >= opt_reg_count(f))) opt_fail(f, stage, "bad param_decl def after rewrite", b, i); continue; } @@ -804,8 +808,6 @@ void opt_verify(Func* f, const char* stage) { if (bl->id != b) opt_fail(f, stage, "block id mismatch", bl->id, 
b); if (!a.reachable[b] && (bl->ninsts || bl->nsucc || bl->npreds)) opt_fail(f, stage, "unreachable block still connected", b, bl->ninsts); - if (bl->ninsts == 0 && bl->nsucc != 0) - opt_fail(f, stage, "empty block has successors", b, bl->nsucc); if (bl->ninsts) { u32 expected = 0; if (fixed_terminator_succ_count(&bl->insts[bl->ninsts - 1], &expected) && diff --git a/src/opt/pass_cfg.c b/src/opt/pass_cfg.c @@ -220,7 +220,10 @@ void opt_build_cfg(Func* f) { for (u32 b = 0; b < f->nblocks; ++b) { Block* bl = &f->blocks[b]; if (bl->ninsts == 0) { - bl->nsucc = 0; + /* Empty blocks are valid label-only blocks. Their fallthrough successor + * is assigned by the lowering/layout pass and must survive CFG rebuilds + * so branches to labels placed immediately before another block remain + * connected. */ continue; } const Inst* last = &bl->insts[bl->ninsts - 1]; @@ -230,7 +233,6 @@ void opt_build_cfg(Func* f) { bl->nsucc = nsucc; continue; } - bl->nsucc = 0; continue; } switch ((IROp)last->op) { diff --git a/src/opt/pass_dce.c b/src/opt/pass_dce.c @@ -1,8 +1,34 @@ #include "core/arena.h" #include "opt/opt_internal.h" +/* A value-producing op whose destination is an OPK_LOCAL operand writes to an + * address-taken (frame-homed) local. cg_ir_lower emits those as a value op with + * a frame destination rather than a separate IR_STORE, so the write is a memory + * side effect even though the op itself (e.g. IR_LOAD_IMM, IR_COPY) is + * otherwise pure. Without this, dead-def elimination drops stores to escaped + * locals. + * + * Returns 1 only when `in` is one of the ops listed in the switch, has at + * least one operand, and its first operand has kind OPK_LOCAL; every other + * instruction yields 0. + * NOTE(review): other ops that take a destination in operand 0 (for example + * IR_BITFIELD_LOAD, IR_VA_ARG) are not listed -- confirm they never carry an + * OPK_LOCAL destination or are already treated as side-effecting. 
*/ +static int opt_inst_writes_frame_local(const Inst* in) { + switch ((IROp)in->op) { + case IR_LOAD_IMM: + case IR_LOAD_CONST: + case IR_LOAD_LABEL_ADDR: + case IR_COPY: + case IR_LOAD: + case IR_ADDR_OF: + case IR_TLS_ADDR_OF: + case IR_BINOP: + case IR_UNOP: + case IR_CMP: + case IR_CONVERT: + return in->nopnds > 0 && in->opnds[0].kind == OPK_LOCAL; + default: + return 0; + } +} + int opt_inst_has_side_effect(Func* f, const Inst* in) { (void)f; + if (opt_inst_writes_frame_local(in)) return 1; switch ((IROp)in->op) { case IR_LOAD: return opt_mem_observable(&in->extra.mem); @@ -22,6 +48,10 @@ int opt_inst_has_side_effect(Func* f, const Inst* in) { case IR_CMP_BRANCH: case IR_SWITCH: case IR_INDIRECT_BRANCH: + case IR_LOCAL_STATIC_DATA_BEGIN: + case IR_LOCAL_STATIC_DATA_WRITE: + case IR_LOCAL_STATIC_DATA_LABEL_ADDR: + case IR_LOCAL_STATIC_DATA_END: case IR_RET: case IR_SCOPE_BEGIN: case IR_SCOPE_ELSE: diff --git a/src/opt/pass_emit.c b/src/opt/pass_emit.c @@ -1,1384 +0,0 @@ -#include <string.h> - -#include "arch/regalloc.h" -#include "core/arena.h" -#include "core/core.h" -#include "core/metrics.h" -#include "core/slice.h" -#include "opt/ir.h" -#include "opt/opt_internal.h" - -typedef struct ReplayCtx { - Compiler* c; - Func* f; - CGTarget* tgt; - Reg* val_to_reg; - FrameSlot* slot_map; - Label* label_map; - CGScope* scope_map; - u8* val_alloced; - u8* block_label_placed; - u8 identity_regs; - CGSimpleRegAlloc regalloc; - /* Cached hard-reg collection: filled once when identity_regs is set and - * reused by plan_hard_regs and reserve_hard_regs callbacks. */ - Reg used_hard_regs[OPT_REG_CLASSES][OPT_MAX_HARD_REGS]; - u32 nused_hard_regs[OPT_REG_CLASSES]; - u8 used_hard_regs_valid; - /* Last source location pushed to the target — used to skip redundant - * set_loc calls when consecutive insts share a loc (the common case). 
*/ - SrcLoc last_loc; - u8 last_loc_valid; - /* When debug info isn't being emitted, set_loc only affects the panic - * loc — we set it once per function in func_begin and skip per-inst - * updates entirely. */ - u8 wants_loc; -} ReplayCtx; - -static inline int srcloc_eq(SrcLoc a, SrcLoc b) { - return a.file_id == b.file_id && a.line == b.line && a.col == b.col; -} - -static Reg val_to_target_reg(ReplayCtx* r, Val v) { - Func* f = r->f; - if (v == VAL_NONE) return REG_NONE; - if (r->identity_regs) return (Reg)v; - if (v >= f->nvals) { - SrcLoc loc = {0, 0, 0}; - compiler_panic(r->c, loc, "opt replay: Val %u out of range", v); - } - if (!r->val_alloced[v]) { - r->val_to_reg[v] = - cg_simple_regalloc_alloc(&r->regalloc, (RegClass)f->val_cls[v]); - if (r->val_to_reg[v] == (Reg)REG_NONE) { - SrcLoc loc = {0, 0, 0}; - compiler_panic(r->c, loc, "opt replay: hard reg pool exhausted"); - } - r->val_alloced[v] = 1; - } - return r->val_to_reg[v]; -} - -static FrameSlot slot_to_target(ReplayCtx* r, FrameSlot vs) { - if (vs == FRAME_SLOT_NONE) return FRAME_SLOT_NONE; - if (vs >= r->f->nframe_slots + 1u) { - SrcLoc loc = {0, 0, 0}; - compiler_panic(r->c, loc, "opt replay: vslot %u out of range", - (unsigned)vs); - } - return r->slot_map[vs]; -} - -static CGLocalStorage xlat_storage(ReplayCtx* r, CGLocalStorage st, - CfreeCgTypeId ty) { - (void)ty; - if (st.kind == CG_LOCAL_STORAGE_REG) { - PReg pr = (PReg)st.v.reg; - if (r->identity_regs && r->f->opt_rewritten && opt_reg_valid(r->f, pr)) { - u8 alloc_kind = opt_preg_alloc_kind(r->f, pr); - if (alloc_kind == OPT_ALLOC_HARD) { - st.v.reg = opt_preg_hard_reg(r->f, pr); - } else if (alloc_kind == OPT_ALLOC_SPILL) { - st.kind = CG_LOCAL_STORAGE_FRAME; - st.v.frame_slot = slot_to_target(r, opt_preg_spill_slot(r->f, pr)); - } else { - st.v.reg = val_to_target_reg(r, (Val)pr); - } - } else { - st.v.reg = val_to_target_reg(r, (Val)pr); - } - } else { - st.v.frame_slot = slot_to_target(r, st.v.frame_slot); - } - return st; -} - -static 
int replay_reg_storage_unused(ReplayCtx* r, CGLocalStorage st) { - if (!r || st.kind != CG_LOCAL_STORAGE_REG) return 0; - if (!(r->identity_regs && r->f->opt_rewritten)) return 0; - PReg pr = (PReg)st.v.reg; - if (pr == 0 || pr >= opt_reg_count(r->f)) return 0; - if (opt_preg_alloc_kind(r->f, pr) == OPT_ALLOC_NONE) return 1; - return r->f->preg_info && r->f->preg_info[pr].use_freq == 0; -} - -static Operand xlat_op(ReplayCtx* r, Operand op) { - switch ((OpKind)op.kind) { - case OPK_IMM: - case OPK_GLOBAL: - return op; - case OPK_REG: - if (r->identity_regs && r->f->opt_rewritten) return op; - op.v.reg = val_to_target_reg(r, (Val)op.v.reg); - return op; - case OPK_LOCAL: - op.v.frame_slot = slot_to_target(r, op.v.frame_slot); - return op; - case OPK_INDIRECT: - if (!(r->identity_regs && r->f->opt_rewritten)) { - op.v.ind.base = val_to_target_reg(r, (Val)op.v.ind.base); - if (op.v.ind.index != (Reg)REG_NONE) - op.v.ind.index = val_to_target_reg(r, (Val)op.v.ind.index); - } - return op; - } - return op; -} - -static CGABIValue xlat_abivalue(ReplayCtx* r, const CGABIValue* in, - CGABIPart* parts_out) { - CGABIValue out = *in; - out.storage = xlat_op(r, in->storage); - if (in->nparts && parts_out) { - for (u32 i = 0; i < in->nparts; ++i) { - parts_out[i] = in->parts[i]; - parts_out[i].op = xlat_op(r, in->parts[i].op); - } - out.parts = parts_out; - } else { - out.parts = NULL; - } - return out; -} - -typedef struct ReplayParallelMove { - Operand dst; - Operand src; - MemAccess mem; - const CGCallPlanRet* ret; - u32 src_offset; - u32 dst_offset; - u32 stack_offset; - u8 dst_kind; - u8 src_kind; - u8 is_ret; - u8 done; -} ReplayParallelMove; - -static Operand phys_reg_operand(Reg r, RegClass cls, CfreeCgTypeId ty) { - Operand op; - memset(&op, 0, sizeof op); - op.kind = OPK_REG; - op.cls = (u8)cls; - op.type = ty; - op.v.reg = r; - return op; -} - -static int operand_reg_eq(const Operand* a, const Operand* b) { - return a && b && a->kind == OPK_REG && b->kind == OPK_REG 
&& - a->cls == b->cls && a->v.reg == b->v.reg; -} - -static int operand_uses_reg_for_replay(const Operand* op, const Operand* r) { - if (!op || !r || r->kind != OPK_REG) return 0; - if (op->kind == OPK_REG) return operand_reg_eq(op, r); - if (op->kind == OPK_INDIRECT) - return r->cls == RC_INT && - (op->v.ind.base == r->v.reg || - (op->v.ind.index != (Reg)REG_NONE && op->v.ind.index == r->v.reg)); - return 0; -} - -static int replay_move_src_ready(const ReplayParallelMove* moves, u32 n, - u32 idx) { - const Operand* dst = &moves[idx].dst; - for (u32 i = 0; i < n; ++i) { - if (i == idx || moves[i].done) continue; - if (operand_uses_reg_for_replay(&moves[i].src, dst)) return 0; - } - return 1; -} - -static int replay_find_move_dst(const ReplayParallelMove* moves, u32 n, - const Operand* dst) { - for (u32 i = 0; i < n; ++i) { - if (!moves[i].done && operand_reg_eq(&moves[i].dst, dst)) return (int)i; - } - return -1; -} - -static Reg replay_scratch_reg(ReplayCtx* r, RegClass cls, Reg avoid) { - if ((u32)cls >= OPT_REG_CLASSES) return REG_NONE; - for (u32 i = 0; i < r->f->opt_scratch_reg_count[cls]; ++i) { - Reg sr = r->f->opt_scratch_regs[cls][i]; - if (sr != avoid) return sr; - } - return REG_NONE; -} - -static void replay_emit_move(CGTarget* w, const ReplayParallelMove* move) { - Operand dst = move->dst; - Operand src = move->src; - MemAccess mem = move->mem; - if (move->dst_kind == CG_CALL_PLAN_STACK || - move->dst_kind == CG_CALL_PLAN_TAIL_STACK) { - CGCallPlanMove m; - memset(&m, 0, sizeof m); - m.src = src; - m.src_kind = move->src_kind; - m.dst_kind = move->dst_kind; - m.cls = dst.cls; - m.src_offset = move->src_offset; - m.stack_offset = move->stack_offset; - m.mem = mem; - w->store_call_arg(w, &m); - } else if (dst.kind == OPK_REG) { - if (move->src_kind == CG_CALL_PLAN_SRC_ADDR || move->src_offset) { - CGCallPlanMove m; - memset(&m, 0, sizeof m); - m.src = src; - m.src_kind = move->src_kind; - m.dst_kind = CG_CALL_PLAN_REG; - m.cls = dst.cls; - m.dst_reg = 
dst.v.reg; - m.src_offset = move->src_offset; - m.mem = mem; - w->load_call_arg(w, dst, &m); - return; - } - if (src.kind == OPK_REG) { - if (!operand_reg_eq(&dst, &src)) w->copy(w, dst, src); - } else if (src.kind == OPK_IMM) { - w->load_imm(w, dst, src.v.imm); - } else if (src.kind == OPK_LOCAL || src.kind == OPK_INDIRECT) { - w->load(w, dst, src, mem); - } else if (src.kind == OPK_GLOBAL) { - w->addr_of(w, dst, src); - } - } else if (dst.kind == OPK_LOCAL || dst.kind == OPK_INDIRECT) { - if (move->is_ret && move->dst_offset) { - CGCallPlanRet ret = move->ret ? *move->ret : (CGCallPlanRet){0}; - ret.dst = dst; - ret.dst_offset = move->dst_offset; - ret.mem = mem; - w->store_call_ret(w, &ret, src); - return; - } - w->store(w, dst, src, mem); - } -} - -/* `avoid` names a physical register the caller has reserved across these - * moves (e.g. the scratch holding an indirect call's target). Cycle-breaking - * temporaries must steer clear of it, or they would clobber the live value. */ -static void replay_parallel_moves(ReplayCtx* r, ReplayParallelMove* moves, - u32 n, Reg avoid) { - CGTarget* w = r->tgt; - u32 remaining = 0; - for (u32 i = 0; i < n; ++i) { - if (operand_reg_eq(&moves[i].dst, &moves[i].src)) { - moves[i].done = 1; - } else { - ++remaining; - } - } - - while (remaining) { - int progressed = 0; - for (u32 i = 0; i < n; ++i) { - if (moves[i].done || !replay_move_src_ready(moves, n, i)) continue; - replay_emit_move(w, &moves[i]); - moves[i].done = 1; - --remaining; - progressed = 1; - } - if (progressed) continue; - - for (u32 i = 0; i < n; ++i) { - if (moves[i].done || moves[i].src.kind == OPK_REG) continue; - Reg sr = replay_scratch_reg(r, (RegClass)moves[i].dst.cls, avoid); - if (sr == (Reg)REG_NONE) continue; - Operand tmp = - phys_reg_operand(sr, (RegClass)moves[i].dst.cls, moves[i].dst.type); - ReplayParallelMove tmp_move = moves[i]; - tmp_move.dst = tmp; - tmp_move.dst_kind = CG_CALL_PLAN_REG; - replay_emit_move(w, &tmp_move); - moves[i].src = tmp; 
- moves[i].src_kind = CG_CALL_PLAN_SRC_VALUE; - moves[i].src_offset = 0; - progressed = 1; - break; - } - if (progressed) continue; - - u32 first = 0; - while (first < n && moves[first].done) ++first; - if (first == n) break; - Operand save = moves[first].src; - Reg sr = replay_scratch_reg(r, (RegClass)save.cls, avoid); - if (sr == (Reg)REG_NONE) { - SrcLoc loc = {0, 0, 0}; - compiler_panic(r->c, loc, - "opt replay: no scratch register for parallel call move"); - } - Operand tmp = phys_reg_operand(sr, (RegClass)save.cls, save.type); - w->copy(w, tmp, save); - - Operand hole = save; - for (;;) { - int idx = replay_find_move_dst(moves, n, &hole); - if (idx < 0 || (u32)idx == first) break; - replay_emit_move(w, &moves[idx]); - hole = moves[idx].src; - moves[idx].done = 1; - --remaining; - } - moves[first].src = tmp; - moves[first].src_kind = CG_CALL_PLAN_SRC_VALUE; - moves[first].src_offset = 0; - replay_emit_move(w, &moves[first]); - moves[first].done = 1; - --remaining; - } -} - -static int replay_plan_supported(CGTarget* w, const CGCallPlan* p, - const char** reason) { - if (reason) *reason = NULL; - if (!p) { - if (reason) *reason = "missing plan"; - return 0; - } - for (u32 i = 0; i < p->nargs; ++i) { - if ((p->args[i].dst_kind == CG_CALL_PLAN_STACK || - p->args[i].dst_kind == CG_CALL_PLAN_TAIL_STACK) && - !w->store_call_arg) { - if (reason) *reason = "stack arg without store_call_arg"; - return 0; - } - if (p->args[i].dst_kind == CG_CALL_PLAN_REG && - (p->args[i].src_kind == CG_CALL_PLAN_SRC_ADDR || - p->args[i].src_offset) && - !w->load_call_arg) { - if (reason) *reason = "reg arg without load_call_arg"; - return 0; - } - } - for (u32 i = 0; i < p->nrets; ++i) - if (p->rets[i].dst.kind != OPK_REG && p->rets[i].dst.kind != OPK_LOCAL && - p->rets[i].dst.kind != OPK_INDIRECT) { - if (reason) *reason = "unsupported ret destination"; - return 0; - } - for (u32 i = 0; i < p->nrets; ++i) - if (p->rets[i].dst_offset && - (p->rets[i].dst.kind == OPK_LOCAL || - 
p->rets[i].dst.kind == OPK_INDIRECT) && - !w->store_call_ret) { - if (reason) *reason = "ret offset without store_call_ret"; - return 0; - } - return 1; -} - -/* Materialize args and emit the call instruction for a planned call. Does not - * emit return-value moves. */ -static void emit_call_and_args(ReplayCtx* r, const CGCallPlan* src_plan) { - CGCallPlan plan = *src_plan; - plan.callee = xlat_op(r, src_plan->callee); - plan.args = src_plan->nargs - ? arena_array(r->f->arena, CGCallPlanMove, src_plan->nargs) - : NULL; - - ReplayParallelMove* arg_moves = - src_plan->nargs - ? arena_zarray(r->f->arena, ReplayParallelMove, src_plan->nargs) - : NULL; - u32 nargs = 0; - for (u32 i = 0; i < src_plan->nargs; ++i) { - plan.args[i] = src_plan->args[i]; - plan.args[i].src = xlat_op(r, src_plan->args[i].src); - if ((src_plan->flags & CG_CALL_TAIL) && - plan.args[i].dst_kind == CG_CALL_PLAN_STACK) { - plan.args[i].dst_kind = CG_CALL_PLAN_TAIL_STACK; - } - Operand dst; - if (plan.args[i].dst_kind == CG_CALL_PLAN_REG) { - dst = phys_reg_operand(plan.args[i].dst_reg, (RegClass)plan.args[i].cls, - plan.args[i].mem.type); - } else { - memset(&dst, 0, sizeof dst); - dst.kind = OPK_LOCAL; - dst.cls = plan.args[i].cls; - dst.type = plan.args[i].mem.type; - } - arg_moves[nargs].dst = dst; - arg_moves[nargs].src = plan.args[i].src; - arg_moves[nargs].mem = plan.args[i].mem; - arg_moves[nargs].src_offset = plan.args[i].src_offset; - arg_moves[nargs].stack_offset = plan.args[i].stack_offset; - arg_moves[nargs].dst_kind = plan.args[i].dst_kind; - arg_moves[nargs].src_kind = plan.args[i].src_kind; - ++nargs; - } - - Reg callee_scratch = REG_NONE; - if (plan.callee.kind == OPK_REG) { - for (u32 i = 0; i < nargs; ++i) { - if (arg_moves[i].dst_kind != CG_CALL_PLAN_REG || - !operand_reg_eq(&arg_moves[i].dst, &plan.callee)) - continue; - callee_scratch = replay_scratch_reg(r, RC_INT, REG_NONE); - if (callee_scratch == (Reg)REG_NONE) { - SrcLoc loc = {0, 0, 0}; - compiler_panic(r->c, loc, - "opt 
replay: no scratch register for indirect call"); - } - Operand tmp = phys_reg_operand(callee_scratch, RC_INT, plan.callee.type); - r->tgt->copy(r->tgt, tmp, plan.callee); - plan.callee = tmp; - break; - } - } - - replay_parallel_moves(r, arg_moves, nargs, callee_scratch); - r->tgt->emit_call_plan(r->tgt, &plan); -} - -/* An ALLOWED tail call the backend could not realize: emit it as an ordinary - * call and return its result. The callee's return registers are the function's - * (the return shapes match by CG's precondition) and the epilogue preserves - * them, so a plain call followed by a bare return forwards the value. The - * frame already reserves outgoing space for this call (the known-frame sizing - * counts tail calls' stack args), so the ordinary call's args fit. */ -static void replay_tail_fallback(ReplayCtx* r, const IRCallAux* aux) { - /* Reuse the recorded plan (its arg sources are the post-regalloc operands) - * but clear CG_CALL_TAIL so emit_call_and_args emits an ordinary BL and - * keeps stack args in the outgoing area rather than the tail slots. */ - CGCallPlan plan = aux->plan; - plan.flags &= (u16)~CG_CALL_TAIL; - emit_call_and_args(r, &plan); - r->tgt->ret(r->tgt, NULL); -} - -static void replay_planned_call(ReplayCtx* r, const IRCallAux* aux, - SrcLoc loc) { - const CGCallPlan* src_plan = &aux->plan; - - /* The opt recorder accepted this tail call unconditionally; resolve it now - * against the real backend, which has a laid-out frame. NULL => realizable, - * emit the sibling call. Otherwise MUST diagnoses and ALLOWED falls back. */ - if (src_plan->flags & CG_CALL_TAIL) { - const char* reason = - r->tgt->tail_call_unrealizable_reason - ? 
r->tgt->tail_call_unrealizable_reason(r->tgt, &aux->desc) - : "target does not support tail calls"; - if (reason) { - if (aux->desc.tail_policy == CFREE_CG_TAIL_MUST) - compiler_panic(r->c, loc, "musttail call not realizable: %s", reason); - replay_tail_fallback(r, aux); - return; - } - } - - emit_call_and_args(r, src_plan); - if (src_plan->flags & CG_CALL_TAIL) return; - - CGCallPlanRet* rets = - src_plan->nrets ? arena_array(r->f->arena, CGCallPlanRet, src_plan->nrets) - : NULL; - ReplayParallelMove* ret_moves = - src_plan->nrets - ? arena_zarray(r->f->arena, ReplayParallelMove, src_plan->nrets) - : NULL; - u32 nrets = 0; - for (u32 i = 0; i < src_plan->nrets; ++i) { - rets[i] = src_plan->rets[i]; - rets[i].dst = xlat_op(r, src_plan->rets[i].dst); - Operand src = phys_reg_operand(rets[i].src_reg, (RegClass)rets[i].cls, - rets[i].mem.type); - ret_moves[nrets].dst = rets[i].dst; - ret_moves[nrets].src = src; - ret_moves[nrets].mem = rets[i].mem; - ret_moves[nrets].ret = &rets[i]; - ret_moves[nrets].dst_offset = rets[i].dst_offset; - ret_moves[nrets].dst_kind = CG_CALL_PLAN_REG; - ret_moves[nrets].src_kind = CG_CALL_PLAN_SRC_VALUE; - ret_moves[nrets].is_ret = 1; - ++nrets; - } - replay_parallel_moves(r, ret_moves, nrets, REG_NONE); -} - -static Label ensure_label(ReplayCtx* r, u32 b) { - if (b >= r->f->nblocks) return LABEL_NONE; - if (r->label_map[b] == LABEL_NONE) { - /* If w_label_new pre-allocated an MCLabel during recording (so - * cfree_cg_data_label_addr could queue a deferred fixup against - * it), reuse it here so the place we emit lines up with the - * existing pending fixup list. 
*/ - Block* bl = &r->f->blocks[b]; - if (bl->mc_label != MC_LABEL_NONE) { - r->label_map[b] = (Label)bl->mc_label; - } else { - r->label_map[b] = r->tgt->label_new(r->tgt); - } - } - return r->label_map[b]; -} - -static void ensure_label_placed(ReplayCtx* r, u32 b) { - if (r->block_label_placed[b]) return; - r->block_label_placed[b] = 1; - if (b == r->f->entry) return; - Label l = ensure_label(r, b); - r->tgt->label_place(r->tgt, l); -} - -static void replay_inst(ReplayCtx* r, u32 b, Inst* in) { - CGTarget* w = r->tgt; - /* set_loc serves two purposes (see arch/mc.c and the per-arch emit code): - * 1. error reporting via compiler_panic - needs some recent loc - * 2. DWARF line-info rows via debug_emit_row, gated on mc->debug - * When debug info isn't being emitted we still set the loc once (the - * first inst's check catches that via last_loc_valid=0), so panic - * messages still point at a real source location, but subsequent updates - * are skipped. When debug info IS being emitted we update on every - * change so line rows stay accurate. 
*/ - if (r->wants_loc) { - if (!r->last_loc_valid || !srcloc_eq(r->last_loc, in->loc)) { - w->set_loc(w, in->loc); - r->last_loc = in->loc; - r->last_loc_valid = 1; - } - } else if (!r->last_loc_valid) { - w->set_loc(w, in->loc); - r->last_loc = in->loc; - r->last_loc_valid = 1; - } - - switch ((IROp)in->op) { - case IR_NOP: - case IR_CONST_I: - case IR_CONST_BYTES: - case IR_PARAM_DECL: - case IR_PHI: - case IR_CONDBR: - break; - case IR_ASM_BLOCK: { - IRAsmAux* aux = (IRAsmAux*)in->extra.aux; - Operand* in_ops_ = NULL; - Operand* out_ops_ = NULL; - if (aux->nin) { - in_ops_ = arena_array(r->f->arena, Operand, aux->nin); - for (u32 k = 0; k < aux->nin; ++k) { - in_ops_[k] = xlat_op(r, aux->in_ops[k]); - } - } - if (aux->nout) { - out_ops_ = arena_array(r->f->arena, Operand, aux->nout); - for (u32 k = 0; k < aux->nout; ++k) { - out_ops_[k] = xlat_op(r, aux->out_ops[k]); - } - } - w->asm_block(w, aux->tmpl, aux->outs, aux->nout, out_ops_, aux->ins, - aux->nin, in_ops_, aux->clobbers, aux->nclob); - break; - } - case IR_LOAD_IMM: { - Operand dst = xlat_op(r, in->opnds[0]); - w->load_imm(w, dst, in->extra.imm); - break; - } - case IR_LOAD_CONST: { - Operand dst = xlat_op(r, in->opnds[0]); - w->load_const(w, dst, in->extra.cbytes); - break; - } - case IR_COPY: { - Operand dst = xlat_op(r, in->opnds[0]); - Operand src = xlat_op(r, in->opnds[1]); - w->copy(w, dst, src); - break; - } - case IR_LOAD: { - Operand dst = xlat_op(r, in->opnds[0]); - Operand addr = xlat_op(r, in->opnds[1]); - w->load(w, dst, addr, in->extra.mem); - break; - } - case IR_STORE: { - Operand addr = xlat_op(r, in->opnds[0]); - Operand src = xlat_op(r, in->opnds[1]); - w->store(w, addr, src, in->extra.mem); - break; - } - case IR_ADDR_OF: { - Operand dst = xlat_op(r, in->opnds[0]); - Operand lv = xlat_op(r, in->opnds[1]); - w->addr_of(w, dst, lv); - break; - } - case IR_TLS_ADDR_OF: { - Operand dst = xlat_op(r, in->opnds[0]); - IRTlsAux* aux = (IRTlsAux*)in->extra.aux; - w->tls_addr_of(w, dst, 
aux->sym, aux->addend); - break; - } - case IR_AGG_COPY: { - Operand a = xlat_op(r, in->opnds[0]); - Operand bo = xlat_op(r, in->opnds[1]); - IRAggAux* aux = (IRAggAux*)in->extra.aux; - w->copy_bytes(w, a, bo, aux->access); - break; - } - case IR_AGG_SET: { - Operand a = xlat_op(r, in->opnds[0]); - Operand bo = xlat_op(r, in->opnds[1]); - IRAggAux* aux = (IRAggAux*)in->extra.aux; - w->set_bytes(w, a, bo, aux->access); - break; - } - case IR_BITFIELD_LOAD: { - Operand dst = xlat_op(r, in->opnds[0]); - Operand rec_ = xlat_op(r, in->opnds[1]); - IRBitFieldAux* aux = (IRBitFieldAux*)in->extra.aux; - w->bitfield_load(w, dst, rec_, aux->access); - break; - } - case IR_BITFIELD_STORE: { - Operand rec_ = xlat_op(r, in->opnds[0]); - Operand src = xlat_op(r, in->opnds[1]); - IRBitFieldAux* aux = (IRBitFieldAux*)in->extra.aux; - w->bitfield_store(w, rec_, src, aux->access); - break; - } - case IR_BINOP: { - Operand dst = xlat_op(r, in->opnds[0]); - Operand a = xlat_op(r, in->opnds[1]); - Operand bo = xlat_op(r, in->opnds[2]); - w->binop(w, (BinOp)in->extra.imm, dst, a, bo); - break; - } - case IR_UNOP: { - Operand dst = xlat_op(r, in->opnds[0]); - Operand a = xlat_op(r, in->opnds[1]); - w->unop(w, (UnOp)in->extra.imm, dst, a); - break; - } - case IR_CMP: { - Operand dst = xlat_op(r, in->opnds[0]); - Operand a = xlat_op(r, in->opnds[1]); - Operand bo = xlat_op(r, in->opnds[2]); - w->cmp(w, (CmpOp)in->extra.imm, dst, a, bo); - break; - } - case IR_CONVERT: { - Operand dst = xlat_op(r, in->opnds[0]); - Operand src = xlat_op(r, in->opnds[1]); - w->convert(w, (ConvKind)in->extra.imm, dst, src); - break; - } - case IR_CALL: { - IRCallAux* aux = (IRCallAux*)in->extra.aux; - const char* plan_reason = NULL; - if (aux && aux->use_plan_replay && w->emit_call_plan && - replay_plan_supported(w, &aux->plan, &plan_reason)) { - replay_planned_call(r, aux, in->loc); - break; - } - compiler_panic( - r->c, in->loc, "opt replay: call has no supported call plan%.*s%.*s", - SLICE_ARG(plan_reason ? 
SLICE_LIT(": ") : SLICE_NULL), - SLICE_ARG(plan_reason ? slice_from_cstr(plan_reason) : SLICE_NULL)); - break; - } - case IR_BR: { - Block* bl = &r->f->blocks[b]; - if (bl->nsucc < 1) break; - Label l = ensure_label(r, bl->succ[0]); - w->jump(w, l); - break; - } - case IR_CMP_BRANCH: { - Operand a = xlat_op(r, in->opnds[0]); - Operand bo = xlat_op(r, in->opnds[1]); - Block* bl = &r->f->blocks[b]; - Label taken = ensure_label(r, bl->succ[0]); - w->cmp_branch(w, (CmpOp)in->extra.imm, a, bo, taken); - break; - } - case IR_SWITCH: { - IRSwitchAux* aux = (IRSwitchAux*)in->extra.aux; - Operand sel = xlat_op(r, in->opnds[0]); - CGSwitchDesc d; - CGSwitchCase* cases = NULL; - memset(&d, 0, sizeof d); - d.selector = sel; - d.selector_type = aux->selector_type; - /* default_block is always a real successor block in the IR (the - * recorder synthesizes one for no-default switches). Replay must - * emit an explicit jump to it so fall-through layout assumptions - * don't depend on block placement. */ - d.default_label = ensure_label(r, aux->default_block); - d.ncases = aux->ncases; - d.hint = aux->hint; - /* opt only invokes pass_emit at level >= 1; cfree_cg_switch - * already routed dense/forced-table switches through - * cg_emit_switch_table, so anything that survives in IR_SWITCH - * is chain by construction. Set the field anyway to keep the - * desc fully populated. 
*/ - d.opt_level = 1u; - if (aux->ncases) { - cases = arena_array(r->f->arena, CGSwitchCase, aux->ncases); - for (u32 i = 0; i < aux->ncases; ++i) { - cases[i].value = aux->cases[i].value; - cases[i].label = ensure_label(r, aux->cases[i].block); - } - d.cases = cases; - } - if (w->switch_) { - w->switch_(w, &d); - } else { - cg_lower_switch_default(w, &d); - } - break; - } - case IR_INDIRECT_BRANCH: { - IRIndirectAux* aux = (IRIndirectAux*)in->extra.aux; - Operand addr = xlat_op(r, in->opnds[0]); - Label* labels = NULL; - if (aux->ntargets) { - labels = arena_array(r->f->arena, Label, aux->ntargets); - for (u32 i = 0; i < aux->ntargets; ++i) - labels[i] = ensure_label(r, aux->targets[i]); - } - w->indirect_branch(w, addr, labels, aux->ntargets); - break; - } - case IR_LOAD_LABEL_ADDR: { - Operand dst = xlat_op(r, in->opnds[0]); - Label l = ensure_label(r, (u32)in->extra.imm); - w->load_label_addr(w, dst, l); - break; - } - case IR_RET: { - IRRetAux* aux = (IRRetAux*)in->extra.aux; - if (!aux || !aux->present) { - w->ret(w, NULL); - } else { - CGABIPart* parts = aux->val.nparts ? arena_array(r->f->arena, CGABIPart, - aux->val.nparts) - : NULL; - CGABIValue v = xlat_abivalue(r, &aux->val, parts); - w->ret(w, &v); - } - break; - } - case IR_SCOPE_BEGIN: { - IRScopeAux* aux = (IRScopeAux*)in->extra.aux; - CGScopeDesc d = aux->desc; - d.cond = xlat_op(r, d.cond); - if (aux->desc.kind == SCOPE_LOOP || aux->desc.kind == SCOPE_BLOCK) { - d.break_label = aux->loop_break_block - ? ensure_label(r, aux->loop_break_block) - : LABEL_NONE; - d.continue_label = aux->loop_continue_block - ? 
ensure_label(r, aux->loop_continue_block) - : LABEL_NONE; - } - CGScope cs = w->scope_begin(w, &d); - r->scope_map[aux->scope_id] = cs; - break; - } - case IR_SCOPE_ELSE: - w->scope_else(w, r->scope_map[(u32)in->extra.imm]); - break; - case IR_SCOPE_END: - w->scope_end(w, r->scope_map[(u32)in->extra.imm]); - break; - case IR_BREAK_TO: - w->break_to(w, r->scope_map[(u32)in->extra.imm]); - break; - case IR_CONTINUE_TO: - w->continue_to(w, r->scope_map[(u32)in->extra.imm]); - break; - case IR_ALLOCA: { - Operand dst = xlat_op(r, in->opnds[0]); - Operand size = xlat_op(r, in->opnds[1]); - w->alloca_(w, dst, size, (u32)in->extra.imm); - break; - } - case IR_VA_START: { - Operand ap = xlat_op(r, in->opnds[0]); - w->va_start_(w, ap); - break; - } - case IR_VA_ARG: { - Operand dst = xlat_op(r, in->opnds[0]); - Operand ap = xlat_op(r, in->opnds[1]); - CfreeCgTypeId ty = (CfreeCgTypeId)(uintptr_t)in->extra.aux; - w->va_arg_(w, dst, ap, ty); - break; - } - case IR_VA_END: { - Operand ap = xlat_op(r, in->opnds[0]); - w->va_end_(w, ap); - break; - } - case IR_VA_COPY: { - Operand a = xlat_op(r, in->opnds[0]); - Operand src = xlat_op(r, in->opnds[1]); - w->va_copy_(w, a, src); - break; - } - case IR_ATOMIC_LOAD: { - Operand dst = xlat_op(r, in->opnds[0]); - Operand addr = xlat_op(r, in->opnds[1]); - IRAtomicAux* aux = (IRAtomicAux*)in->extra.aux; - w->atomic_load(w, dst, addr, aux->mem, aux->mo); - break; - } - case IR_ATOMIC_STORE: { - Operand addr = xlat_op(r, in->opnds[0]); - Operand src = xlat_op(r, in->opnds[1]); - IRAtomicAux* aux = (IRAtomicAux*)in->extra.aux; - w->atomic_store(w, addr, src, aux->mem, aux->mo); - break; - } - case IR_ATOMIC_RMW: { - Operand dst = xlat_op(r, in->opnds[0]); - Operand addr = xlat_op(r, in->opnds[1]); - Operand val = xlat_op(r, in->opnds[2]); - IRAtomicAux* aux = (IRAtomicAux*)in->extra.aux; - w->atomic_rmw(w, (AtomicOp)aux->op, dst, addr, val, aux->mem, aux->mo); - break; - } - case IR_ATOMIC_CAS: { - Operand prior = xlat_op(r, 
in->opnds[0]); - Operand ok = xlat_op(r, in->opnds[1]); - Operand addr = xlat_op(r, in->opnds[2]); - Operand expected = xlat_op(r, in->opnds[3]); - Operand desired = xlat_op(r, in->opnds[4]); - IRCasAux* aux = (IRCasAux*)in->extra.aux; - w->atomic_cas(w, prior, ok, addr, expected, desired, aux->mem, - aux->success, aux->failure); - break; - } - case IR_FENCE: - w->fence(w, (MemOrder)in->extra.imm); - break; - case IR_INTRINSIC: { - IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; - Operand* dsts = - aux->ndst ? arena_array(r->f->arena, Operand, aux->ndst) : NULL; - Operand* args = - aux->narg ? arena_array(r->f->arena, Operand, aux->narg) : NULL; - for (u32 k = 0; k < aux->ndst; ++k) dsts[k] = xlat_op(r, aux->dsts[k]); - for (u32 k = 0; k < aux->narg; ++k) args[k] = xlat_op(r, aux->args[k]); - w->intrinsic(w, aux->kind, dsts, aux->ndst, args, aux->narg); - break; - } - } -} - -static void replay_block(ReplayCtx* r, u32 b) { - Func* f = r->f; - if (b >= f->nblocks) return; - ensure_label_placed(r, b); - Block* bl = &f->blocks[b]; - for (u32 i = 0; i < bl->ninsts; ++i) { - replay_inst(r, b, &bl->insts[i]); - } -} - -static void add_unique_reg(Reg* used, u32* nused, u32 cap, Reg r) { - for (u32 i = 0; i < *nused; ++i) { - if (used[i] == r) return; - } - if (*nused < cap) used[(*nused)++] = r; -} - -static void collect_replayed_reg(Func* f, Reg raw, RegClass cls, Reg* used, - u32* nused, u32 cap) { - if (raw == (Reg)REG_NONE) return; - if (f && f->opt_rewritten && opt_reg_valid(f, (PReg)raw)) { - PReg pr = (PReg)raw; - if (opt_preg_alloc_kind(f, pr) == OPT_ALLOC_HARD && - opt_preg_loc_cls(f, pr) == (u8)cls) - add_unique_reg(used, nused, cap, opt_preg_hard_reg(f, pr)); - } - add_unique_reg(used, nused, cap, raw); -} - -static void collect_replayed_operand_reg(Func* f, const Operand* op, - RegClass cls, Reg* used, u32* nused, - u32 cap) { - if (!op) return; - if (op->kind == OPK_REG) { - if (op->cls == cls) - collect_replayed_reg(f, op->v.reg, cls, used, nused, cap); - } 
else if (op->kind == OPK_INDIRECT) { - if (cls == RC_INT) { - collect_replayed_reg(f, op->v.ind.base, cls, used, nused, cap); - if (op->v.ind.index != (Reg)REG_NONE) - collect_replayed_reg(f, op->v.ind.index, cls, used, nused, cap); - } - } -} - -static void collect_replayed_abivalue_regs(Func* f, const CGABIValue* v, - RegClass cls, Reg* used, u32* nused, - u32 cap) { - if (!v) return; - collect_replayed_operand_reg(f, &v->storage, cls, used, nused, cap); - for (u32 i = 0; i < v->nparts; ++i) - collect_replayed_operand_reg(f, &v->parts[i].op, cls, used, nused, cap); -} - -static void collect_replayed_param_regs(Func* f, RegClass cls, Reg* used, - u32* nused, u32 cap) { - if (!f->opt_rewritten) return; - for (u32 i = 0; i < f->nparams; ++i) { - IRParam* p = &f->params[i]; - if (p->storage.kind != CG_LOCAL_STORAGE_REG) continue; - PReg pr = (PReg)p->storage.v.reg; - if (pr == 0 || pr >= opt_reg_count(f)) continue; - if (opt_preg_alloc_kind(f, pr) != OPT_ALLOC_HARD || - opt_preg_loc_cls(f, pr) != (u8)cls) - continue; - add_unique_reg(used, nused, cap, opt_preg_hard_reg(f, pr)); - } -} - -static u32 collect_replayed_hard_regs(Func* f, CGTarget* w, RegClass cls, - Reg* used, u32 cap) { - u32 nused = 0; - collect_replayed_param_regs(f, cls, used, &nused, cap); - for (u32 b = 0; b < f->nblocks; ++b) { - Block* bl = &f->blocks[b]; - for (u32 i = 0; i < bl->ninsts; ++i) { - Inst* in = &bl->insts[i]; - if ((IROp)in->op == IR_PARAM_DECL) continue; - for (u32 j = 0; j < in->nopnds; ++j) - collect_replayed_operand_reg(f, &in->opnds[j], cls, used, &nused, cap); - - switch ((IROp)in->op) { - case IR_CALL: { - IRCallAux* aux = (IRCallAux*)in->extra.aux; - if (!aux) break; - if (aux->use_plan_replay) { - collect_replayed_operand_reg(f, &aux->plan.callee, cls, used, - &nused, cap); - for (u32 j = 0; j < aux->plan.nargs; ++j) { - collect_replayed_operand_reg(f, &aux->plan.args[j].src, cls, used, - &nused, cap); - if (aux->plan.args[j].dst_kind == CG_CALL_PLAN_REG && - 
aux->plan.args[j].cls == (u8)cls) - add_unique_reg(used, &nused, cap, aux->plan.args[j].dst_reg); - } - for (u32 j = 0; j < aux->plan.nrets; ++j) { - collect_replayed_operand_reg(f, &aux->plan.rets[j].dst, cls, used, - &nused, cap); - if (aux->plan.rets[j].cls == (u8)cls) - add_unique_reg(used, &nused, cap, aux->plan.rets[j].src_reg); - } - } else { - collect_replayed_operand_reg(f, &aux->desc.callee, cls, used, - &nused, cap); - for (u32 j = 0; j < aux->desc.nargs; ++j) - collect_replayed_abivalue_regs(f, &aux->desc.args[j], cls, used, - &nused, cap); - collect_replayed_abivalue_regs(f, &aux->desc.ret, cls, used, &nused, - cap); - } - break; - } - case IR_RET: { - IRRetAux* aux = (IRRetAux*)in->extra.aux; - if (aux && aux->present) - collect_replayed_abivalue_regs(f, &aux->val, cls, used, &nused, - cap); - break; - } - case IR_SCOPE_BEGIN: { - IRScopeAux* aux = (IRScopeAux*)in->extra.aux; - if (aux) - collect_replayed_operand_reg(f, &aux->desc.cond, cls, used, &nused, - cap); - break; - } - case IR_ASM_BLOCK: { - IRAsmAux* aux = (IRAsmAux*)in->extra.aux; - if (!aux) break; - for (u32 j = 0; j < aux->nin; ++j) - collect_replayed_operand_reg(f, &aux->in_ops[j], cls, used, &nused, - cap); - for (u32 j = 0; j < aux->nout; ++j) - collect_replayed_operand_reg(f, &aux->out_ops[j], cls, used, &nused, - cap); - break; - } - case IR_INTRINSIC: { - IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; - if (!aux) break; - for (u32 j = 0; j < aux->narg; ++j) - collect_replayed_operand_reg(f, &aux->args[j], cls, used, &nused, - cap); - for (u32 j = 0; j < aux->ndst; ++j) - collect_replayed_operand_reg(f, &aux->dsts[j], cls, used, &nused, - cap); - break; - } - default: - break; - } - } - } - if (w->resolve_reg_name) { - for (u32 b = 0; b < f->nblocks; ++b) { - Block* bl = &f->blocks[b]; - for (u32 i = 0; i < bl->ninsts; ++i) { - Inst* in = &bl->insts[i]; - if ((IROp)in->op != IR_ASM_BLOCK) continue; - IRAsmAux* aux = (IRAsmAux*)in->extra.aux; - if (!aux) continue; - for (u32 j = 0; 
j < aux->nclob; ++j) { - Reg r; - RegClass rcls; - if (w->resolve_reg_name(w, aux->clobbers[j], &r, &rcls) != 0) - continue; - if (rcls == cls) add_unique_reg(used, &nused, cap, r); - } - } - } - } - return nused; -} - -static int replay_operand_uses_frame_slot(const Operand* op) { - return op && op->kind == OPK_LOCAL && op->v.frame_slot != FRAME_SLOT_NONE; -} - -static int replay_storage_uses_frame_slot(CGLocalStorage st) { - return st.kind == CG_LOCAL_STORAGE_FRAME && - st.v.frame_slot != FRAME_SLOT_NONE; -} - -static int replay_param_storage_uses_frame_slot(Func* f, CGLocalStorage st) { - if (replay_storage_uses_frame_slot(st)) return 1; - if (st.kind != CG_LOCAL_STORAGE_REG || !f || !f->opt_rewritten) return 0; - PReg pr = (PReg)st.v.reg; - return opt_reg_valid(f, pr) && - opt_preg_alloc_kind(f, pr) == OPT_ALLOC_SPILL && - opt_preg_spill_slot(f, pr) != FRAME_SLOT_NONE; -} - -static int replay_abivalue_uses_frame_slot(const CGABIValue* v) { - if (!v) return 0; - if (replay_operand_uses_frame_slot(&v->storage)) return 1; - for (u32 i = 0; i < v->nparts; ++i) - if (replay_operand_uses_frame_slot(&v->parts[i].op)) return 1; - return 0; -} - -static int replay_inst_uses_frame_slot(const Inst* in) { - for (u32 i = 0; i < in->nopnds; ++i) - if (replay_operand_uses_frame_slot(&in->opnds[i])) return 1; - switch ((IROp)in->op) { - case IR_CALL: { - IRCallAux* aux = (IRCallAux*)in->extra.aux; - if (!aux) return 0; - if (aux->use_plan_replay) { - if (replay_operand_uses_frame_slot(&aux->plan.callee)) return 1; - for (u32 i = 0; i < aux->plan.nargs; ++i) - if (replay_operand_uses_frame_slot(&aux->plan.args[i].src)) return 1; - for (u32 i = 0; i < aux->plan.nrets; ++i) - if (replay_operand_uses_frame_slot(&aux->plan.rets[i].dst)) return 1; - } else { - if (replay_operand_uses_frame_slot(&aux->desc.callee)) return 1; - for (u32 i = 0; i < aux->desc.nargs; ++i) - if (replay_abivalue_uses_frame_slot(&aux->desc.args[i])) return 1; - if 
(replay_abivalue_uses_frame_slot(&aux->desc.ret)) return 1; - } - return 0; - } - case IR_RET: { - IRRetAux* aux = (IRRetAux*)in->extra.aux; - return aux && aux->present && replay_abivalue_uses_frame_slot(&aux->val); - } - case IR_SCOPE_BEGIN: { - IRScopeAux* aux = (IRScopeAux*)in->extra.aux; - return aux && replay_operand_uses_frame_slot(&aux->desc.cond); - } - case IR_ASM_BLOCK: { - IRAsmAux* aux = (IRAsmAux*)in->extra.aux; - if (!aux) return 0; - for (u32 i = 0; i < aux->nin; ++i) - if (replay_operand_uses_frame_slot(&aux->in_ops[i])) return 1; - for (u32 i = 0; i < aux->nout; ++i) - if (replay_operand_uses_frame_slot(&aux->out_ops[i])) return 1; - return 0; - } - case IR_INTRINSIC: { - IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; - if (!aux) return 0; - for (u32 i = 0; i < aux->narg; ++i) - if (replay_operand_uses_frame_slot(&aux->args[i])) return 1; - for (u32 i = 0; i < aux->ndst; ++i) - if (replay_operand_uses_frame_slot(&aux->dsts[i])) return 1; - return 0; - } - default: - return 0; - } -} - -static int replay_func_uses_frame_slot(Func* f) { - for (u32 b = 0; b < f->nblocks; ++b) { - Block* bl = &f->blocks[b]; - for (u32 i = 0; i < bl->ninsts; ++i) - if (replay_inst_uses_frame_slot(&bl->insts[i])) return 1; - } - for (u32 i = 0; i < f->nparams; ++i) - if (replay_param_storage_uses_frame_slot(f, f->params[i].storage)) return 1; - for (u32 i = 0; i < f->nframe_slots; ++i) - if (f->frame_slots[i].flags & (FSF_ADDR_TAKEN | FSF_VOLATILE)) return 1; - return 0; -} - -static void collect_known_frame(Func* f, CGTarget* w, CGKnownFrameDesc* out) { - memset(out, 0, sizeof(*out)); - FrameSlotDesc* slots = NULL; - int uses_frame_slot = replay_func_uses_frame_slot(f); - if (uses_frame_slot && f->nframe_slots) { - slots = arena_zarray(f->arena, FrameSlotDesc, f->nframe_slots); - for (u32 i = 0; i < f->nframe_slots; ++i) { - IRFrameSlot* s = &f->frame_slots[i]; - slots[i].type = s->type; - slots[i].name = s->name; - slots[i].loc = s->loc; - slots[i].size = s->size; - 
slots[i].align = s->align; - slots[i].kind = s->kind; - slots[i].flags = s->flags; - } - } - out->slots = slots; - out->nslots = uses_frame_slot ? f->nframe_slots : 0; - - for (u32 b = 0; b < f->nblocks; ++b) { - Block* bl = &f->blocks[b]; - for (u32 i = 0; i < bl->ninsts; ++i) { - Inst* in = &bl->insts[i]; - if ((IROp)in->op == IR_ALLOCA) { - out->has_alloca = 1; - } else if ((IROp)in->op == IR_CALL) { - IRCallAux* aux = (IRCallAux*)in->extra.aux; - if (!aux) { - out->has_call = 1; - continue; - } - /* A non-tail call, or an ALLOWED tail that may fall back to an - * ordinary call (clobbering the link register), needs a frame. A MUST - * tail never falls back, so it alone does not force one. */ - if ((aux->desc.flags & CG_CALL_TAIL) == 0 || - aux->desc.tail_policy != CFREE_CG_TAIL_MUST) - out->has_call = 1; - u32 need = 0; - if (aux->use_plan_replay) { - need = aux->plan.stack_arg_size; - for (u32 j = 0; j < aux->plan.nargs; ++j) { - CGCallPlanMove* m = &aux->plan.args[j]; - if (m->dst_kind == CG_CALL_PLAN_STACK || - m->dst_kind == CG_CALL_PLAN_TAIL_STACK) { - u32 end = m->stack_offset + (m->mem.size > 8u ? m->mem.size : 8u); - if (end > need) need = end; - } - } - need = (need + 15u) & ~15u; - } else if (w->call_stack_size) { - need = w->call_stack_size(w, &aux->desc); - } - if (need > out->max_outgoing) out->max_outgoing = need; - } - } - } - out->may_omit_frame = (!out->has_call && !out->has_alloca && - out->nslots == 0 && out->max_outgoing == 0) - ? 1u - : 0u; -} - -static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) { - ReplayCtx r; - metrics_scope_begin(c, "opt.emit.setup"); - r.c = c; - r.f = f; - r.tgt = w; - r.identity_regs = identity ? 1u : 0u; - cg_simple_regalloc_init(&r.regalloc); - u32 nv = f->nvals ? 
f->nvals : 1u; - r.val_to_reg = arena_zarray(f->arena, Reg, nv); - for (u32 i = 0; i < nv; ++i) r.val_to_reg[i] = REG_NONE; - r.val_alloced = arena_zarray(f->arena, u8, nv); - r.slot_map = arena_zarray(f->arena, FrameSlot, f->nframe_slots + 1u); - for (u32 i = 0; i <= f->nframe_slots; ++i) r.slot_map[i] = FRAME_SLOT_NONE; - u32 nb = f->nblocks ? f->nblocks : 1u; - r.label_map = arena_zarray(f->arena, Label, nb); - for (u32 i = 0; i < f->nblocks; ++i) r.label_map[i] = LABEL_NONE; - r.scope_map = arena_zarray(f->arena, CGScope, f->nscopes + 1u); - for (u32 i = 0; i <= f->nscopes; ++i) r.scope_map[i] = CG_SCOPE_NONE; - r.block_label_placed = arena_zarray(f->arena, u8, nb); - r.used_hard_regs_valid = 0; - r.last_loc_valid = 0; - /* If the target isn't emitting debug info, we only need to keep the - * panic loc accurate at function granularity. Set once at func entry - * (handled by the first replay_inst's dedup check) and skip the rest. */ - r.wants_loc = w->debug != NULL; - metrics_scope_end(c, "opt.emit.setup"); - - metrics_scope_begin(c, "opt.emit.plan_hard_regs"); - if (identity && (w->plan_hard_regs || w->reserve_hard_regs)) { - /* Collect once; reuse for both plan_hard_regs (here) and - * reserve_hard_regs (after the body). The IR doesn't change between - * these two callbacks, so a second scan would compute the same data. */ - for (u32 cidx = 0; cidx < OPT_REG_CLASSES; ++cidx) { - r.nused_hard_regs[cidx] = collect_replayed_hard_regs( - f, w, (RegClass)cidx, r.used_hard_regs[cidx], OPT_MAX_HARD_REGS); - } - r.used_hard_regs_valid = 1; - if (w->plan_hard_regs) { - for (u32 cidx = 0; cidx < OPT_REG_CLASSES; ++cidx) { - w->plan_hard_regs(w, (RegClass)cidx, r.used_hard_regs[cidx], - r.nused_hard_regs[cidx]); - } - } - } - metrics_scope_end(c, "opt.emit.plan_hard_regs"); - - metrics_scope_begin(c, "opt.emit.func_begin"); - int known_frame = w->func_begin_known_frame != NULL; - if (known_frame) { - CGKnownFrameDesc frame; - FrameSlot* target_slots = - f->nframe_slots ? 
arena_zarray(f->arena, FrameSlot, f->nframe_slots) - : NULL; - collect_known_frame(f, w, &frame); - w->func_begin_known_frame(w, &f->desc, &frame, target_slots); - for (u32 i = 0; i < f->nframe_slots; ++i) - r.slot_map[f->frame_slots[i].id] = target_slots[i]; - } - if (!known_frame) { - /* func_begin with the recorded descriptor. Parameter storage is replayed - * through target->param below after frame slots are mapped. */ - w->func_begin(w, &f->desc); - } - - if (!r.identity_regs) { - for (u32 cidx = 0; cidx < OPT_REG_CLASSES; ++cidx) { - const Reg* regs = NULL; - u32 nregs = 0; - if (w->get_allocable_regs) - w->get_allocable_regs(w, (RegClass)cidx, &regs, &nregs); - if (regs && nregs) - cg_simple_regalloc_set_ordered(&r.regalloc, (RegClass)cidx, regs, - nregs); - } - } - - if (!known_frame) { - for (u32 i = 0; i < f->nframe_slots; ++i) { - IRFrameSlot* s = &f->frame_slots[i]; - FrameSlotDesc d = {0}; - d.type = s->type; - d.name = s->name; - d.loc = s->loc; - d.size = s->size; - d.align = s->align; - d.kind = s->kind; - d.flags = s->flags; - r.slot_map[s->id] = w->frame_slot(w, &d); - } - } - - for (u32 i = 0; i < f->nparams; ++i) { - IRParam* p = &f->params[i]; - CGParamDesc d = {0}; - d.index = p->index; - d.name = p->name; - d.type = p->type; - d.size = p->size; - d.align = p->align; - d.flags = p->flags; - if (replay_reg_storage_unused(&r, p->storage)) { - d.storage = p->storage; - d.storage.v.reg = REG_NONE; - } else { - d.storage = xlat_storage(&r, p->storage, p->type); - } - if (known_frame && d.storage.kind == CG_LOCAL_STORAGE_FRAME && - d.storage.v.frame_slot == FRAME_SLOT_NONE) { - SrcLoc loc = p->loc; - compiler_panic(c, loc, - "opt replay: frame-backed param %u missing known-frame " - "slot mapping", - (unsigned)i); - } - d.abi = p->abi; - d.loc = p->loc; - (void)w->param(w, &d); - } - metrics_scope_end(c, "opt.emit.func_begin"); - - metrics_scope_begin(c, "opt.emit.body"); - /* Body in emit order — the order CG's emit cursor visited each - * block. 
Block-creation order can differ when label_new precedes a - * cmp_branch whose fallthrough block must physically follow. */ - for (u32 i = 0; i < f->emit_order_n; ++i) { - replay_block(&r, f->emit_order[i]); - } - metrics_scope_end(c, "opt.emit.body"); - - metrics_scope_begin(c, "opt.emit.reserve_hard_regs"); - /* At -O1, opt managed allocation and emitted hard regs directly, - * bypassing backend-local allocation. Tell the backend which hard - * regs are still visible in replay so it can save the right callee-saved - * subset in prologue/epilogue. Reuses the cached collection from the - * plan_hard_regs pass — the IR hasn't changed since. - * - * The backend records only callee-saved members of this set for - * prologue/epilogue preservation. */ - if (r.identity_regs && w->reserve_hard_regs && r.used_hard_regs_valid) { - for (u32 c = 0; c < OPT_REG_CLASSES; ++c) { - if (r.nused_hard_regs[c]) - w->reserve_hard_regs(w, (RegClass)c, r.used_hard_regs[c], - r.nused_hard_regs[c]); - } - } else if (!r.identity_regs && w->reserve_hard_regs) { - for (u32 c = 0; c < OPT_REG_CLASSES; ++c) { - Reg used[CG_SIMPLE_REGALLOC_MAX_REGS]; - u32 nused = cg_simple_regalloc_used_regs(&r.regalloc, (RegClass)c, used, - CG_SIMPLE_REGALLOC_MAX_REGS); - if (nused) w->reserve_hard_regs(w, (RegClass)c, used, nused); - } - } - - metrics_scope_end(c, "opt.emit.reserve_hard_regs"); - metrics_scope_begin(c, "opt.emit.func_end"); - w->func_end(w); - metrics_scope_end(c, "opt.emit.func_end"); -} - -void opt_replay(Compiler* c, Func* f, CGTarget* target) { - replay_func_to(c, f, target, 0); -} - -void opt_emit(Compiler* c, Func* f, CGTarget* target) { - if (f && f->mir) { - Func view = *f; - view.blocks = f->mir->blocks; - view.nblocks = f->mir->nblocks; - view.entry = f->mir->entry; - view.emit_order = f->mir->emit_order; - view.emit_order_n = f->mir->emit_order_n; - view.emit_order_cap = f->mir->emit_order_cap; - view.opt_rewritten = 1; - view.mir = NULL; - replay_func_to(c, &view, target, 1); - 
return; - } - replay_func_to(c, f, target, 1); -} diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c @@ -474,7 +474,7 @@ static u32 alloc_alloc_stack_slot(Func* f, OptAllocator* a, FrameSlot fs) { static u32 hard_reg_alloc_score(Func* f, const OptAllocator* a, const OptPRegInfo* vi, Reg hr) { const CGPhysRegInfo* pi = phys_info_for(f, vi->cls, hr); - u32 score = pi ? pi->use_cost : 0; + u32 score = pi ? pi->spill_cost : 0; if (vi->live_across_call_freq) { if (is_caller_saved(f, vi->cls, hr)) score += 1000u + vi->live_across_call_freq; @@ -484,7 +484,7 @@ static u32 hard_reg_alloc_score(Func* f, const OptAllocator* a, u32 bit = hard_loc_bit(vi->cls, hr); int already_open = a->hard_open && bit < a->hard_loc_bits && a->hard_open[bit]; - if (!already_open) score += pi ? pi->save_cost : 50u; + if (!already_open) score += pi ? pi->copy_cost : 50u; } return score; } @@ -1158,6 +1158,35 @@ static void rewrite_call_arg_operand(Func* f, Operand* op) { } } +static void rewrite_store_value_operand(Func* f, Inst* owner, Operand* op, + RewriteCtx* ctx) { + PReg v; + u8 alloc_kind; + const OptAllocSegment* seg; + if (!op || op->kind != OPK_REG) return; + v = (PReg)op->v.reg; + if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return; + alloc_kind = opt_preg_alloc_kind(f, v); + if (alloc_kind == OPT_ALLOC_HARD) { + op->v.reg = opt_preg_hard_reg(f, v); + return; + } + if (alloc_kind == OPT_ALLOC_SPLIT) { + seg = split_segment_at(f, v, ctx->raw_point); + if (seg && seg->loc_kind == OPT_LOC_HARD) { + op->v.reg = seg->hard_reg; + return; + } + *op = spill_addr(f, v); + return; + } + if (alloc_kind == OPT_ALLOC_SPILL) { + *op = spill_addr(f, v); + return; + } + rewrite_one_operand(f, owner, op, 0, ctx); +} + static void rewrite_call_arg_indirect_base(Func* f, Inst* owner, Operand* op, RewriteCtx* ctx) { if (!op || op->kind != OPK_INDIRECT) return; @@ -1492,6 +1521,9 @@ static void rewrite_func(Func* f, const OptLiveInfo* live_info) { &ctx); } } + } else if ((IROp)in.op == 
IR_STORE && in.nopnds >= 2) { + opt_walk_operand(f, &in, &in.opnds[0], 0, rewrite_one_operand, &ctx); + rewrite_store_value_operand(f, &in, &in.opnds[1], &ctx); } else { opt_walk_inst_operands(f, &in, rewrite_one_operand, &ctx); } @@ -1520,34 +1552,10 @@ static void rewrite_func(Func* f, const OptLiveInfo* live_info) { f->opt_rewritten = 1; } -static Block* lower_copy_blocks(Func* f) { - Block* blocks = arena_zarray(f->arena, Block, f->nblocks ? f->nblocks : 1u); - for (u32 b = 0; b < f->nblocks; ++b) { - Block* dst = &blocks[b]; - Block* src = &f->blocks[b]; - *dst = *src; - if (src->ninsts) { - dst->insts = arena_array(f->arena, Inst, src->ninsts); - memcpy(dst->insts, src->insts, sizeof(Inst) * src->ninsts); - dst->cap = src->ninsts; - } - if (src->nsucc) { - dst->succ = arena_array(f->arena, u32, src->nsucc); - memcpy(dst->succ, src->succ, sizeof(u32) * src->nsucc); - dst->succ_cap = src->nsucc; - } - if (src->npreds) { - dst->preds = arena_array(f->arena, u32, src->npreds); - memcpy(dst->preds, src->preds, sizeof(u32) * src->npreds); - } - } - return blocks; -} - void opt_lower_to_mir(Func* f, const OptLiveInfo* live_info) { if (!f) return; Func phys = *f; - phys.blocks = lower_copy_blocks(f); + phys.blocks = f->blocks; phys.opt_rewritten = 0; phys.mir = NULL; @@ -1565,6 +1573,9 @@ void opt_lower_to_mir(Func* f, const OptLiveInfo* live_info) { memcpy(m->emit_order, phys.emit_order, sizeof(u32) * phys.emit_order_n); f->mir = m; + f->blocks = phys.blocks; + f->nblocks = phys.nblocks; + f->blocks_cap = phys.blocks_cap; f->frame_slots = phys.frame_slots; f->nframe_slots = phys.nframe_slots; f->frame_slots_cap = phys.frame_slots_cap; diff --git a/src/opt/pass_machinize.c b/src/opt/pass_machinize.c @@ -1,7 +1,5 @@ #include <string.h> -#include "core/arena.h" -#include "core/core.h" #include "core/pool.h" #include "core/slice.h" #include "opt/opt_internal.h" @@ -13,24 +11,32 @@ static const char* asm_constraint_body(const char* s) { return s; } -static int 
asm_resolve_fixed_constraint(Func* f, CGTarget* target, +static int native_resolve_reg(NativeTarget* target, Sym name, Reg* out, + RegClass* cls_out) { + NativeAllocClass cls; + if (!target || !target->regs || !target->regs->resolve_name) return 1; + if (target->regs->resolve_name(target->regs, name, out, &cls) != 0) return 1; + if (cls_out) *cls_out = (RegClass)cls; + return 0; +} + +static int asm_resolve_fixed_constraint(Func* f, NativeTarget* target, const char* constraint, Reg* reg_out, RegClass* cls_out) { const char* body = asm_constraint_body(constraint); - if (!target->resolve_reg_name) return 0; if (body[0] != '{') return 0; const char* end = body + 1; while (*end && *end != '}') ++end; if (*end != '}' || end == body + 1) return 0; Sym name = pool_intern_slice( f->c->global, (Slice){.s = body + 1, .len = (size_t)(end - body - 1)}); - return target->resolve_reg_name(target, name, reg_out, cls_out) == 0; + return native_resolve_reg(target, name, reg_out, cls_out) == 0; } -static void asm_prepare_constraints(Func* f, CGTarget* target, IRAsmAux* aux) { +static void asm_prepare_constraints(Func* f, NativeTarget* target, + IRAsmAux* aux) { if (!aux) return; for (u32 c = 0; c < OPT_REG_CLASSES; ++c) aux->clobber_mask[c] = 0; - if (aux->nout && !aux->out_fixed_regs) { aux->out_fixed_regs = arena_array(f->arena, i32, aux->nout); aux->out_fixed_cls = arena_zarray(f->arena, u8, aux->nout); @@ -41,18 +47,12 @@ static void asm_prepare_constraints(Func* f, CGTarget* target, IRAsmAux* aux) { aux->in_fixed_cls = arena_zarray(f->arena, u8, aux->nin); for (u32 i = 0; i < aux->nin; ++i) aux->in_fixed_regs[i] = -1; } - - if (target->resolve_reg_name) { - for (u32 i = 0; i < aux->nclob; ++i) { - Reg r; - RegClass cls; - if (target->resolve_reg_name(target, aux->clobbers[i], &r, &cls) != 0) - continue; - if ((u32)cls < OPT_REG_CLASSES && r < 32) - aux->clobber_mask[cls] |= 1u << r; - } + for (u32 i = 0; i < aux->nclob; ++i) { + Reg r; + RegClass cls; + if 
(native_resolve_reg(target, aux->clobbers[i], &r, &cls) != 0) continue; + if ((u32)cls < OPT_REG_CLASSES && r < 32) aux->clobber_mask[cls] |= 1u << r; } - for (u32 i = 0; i < aux->nout; ++i) { Reg r; RegClass cls; @@ -71,10 +71,7 @@ static void asm_prepare_constraints(Func* f, CGTarget* target, IRAsmAux* aux) { } } -static int call_plan_replay_supported(const IRCallAux* aux, - const CGTarget* target); - -static void machinize_reset(Func* f, CGTarget* target) { +static void machinize_reset(Func* f, NativeTarget* target) { f->opt_target = target->c->target; f->opt_has_target = 1; for (u32 c = 0; c < OPT_REG_CLASSES; ++c) { @@ -89,80 +86,46 @@ static void machinize_reset(Func* f, CGTarget* target) { } } -static void machinize_prepare_insts(Func* f, CGTarget* target) { +static void machinize_prepare_insts(Func* f, NativeTarget* target) { for (u32 b = 0; b < f->nblocks; ++b) { Block* bl = &f->blocks[b]; for (u32 i = 0; i < bl->ninsts; ++i) { Inst* in = &bl->insts[i]; - if ((IROp)in->op == IR_ASM_BLOCK) { + if ((IROp)in->op == IR_ASM_BLOCK) asm_prepare_constraints(f, target, (IRAsmAux*)in->extra.aux); - } else if ((IROp)in->op == IR_CALL && target->plan_call) { - IRCallAux* aux = (IRCallAux*)in->extra.aux; - if (aux) { - target->plan_call(target, &aux->desc, &aux->plan); - aux->plan_valid = 1; - aux->use_plan_replay = call_plan_replay_supported(aux, target); - } - } } } } -static void machinize_collect_regs(Func* f, CGTarget* target) { - for (u32 c = 0; c < OPT_REG_CLASSES; ++c) { - const CGPhysRegInfo* phys = NULL; - u32 nphys = 0; - if (target->get_phys_regs) - target->get_phys_regs(target, (RegClass)c, &phys, &nphys); - if (phys) { - u32 phys_limit = nphys < OPT_MAX_HARD_REGS ? 
nphys : OPT_MAX_HARD_REGS; - for (u32 i = 0; i < phys_limit; ++i) { - Reg hr = phys[i].reg; - u16 flags = phys[i].flags; - if (hr < 32u) { - if (flags & CG_REG_CALLER_SAVED) f->opt_caller_saved[c] |= 1u << hr; - if (flags & CG_REG_CALLEE_SAVED) f->opt_callee_saved[c] |= 1u << hr; - if (flags & CG_REG_RESERVED) f->opt_reserved_regs[c] |= 1u << hr; - if (flags & CG_REG_ARG) f->opt_arg_regs[c] |= 1u << hr; - if (flags & CG_REG_RET) f->opt_ret_regs[c] |= 1u << hr; - } - f->opt_phys_regs[c][f->opt_phys_reg_count[c]++] = phys[i]; - if ((flags & CG_REG_ALLOCABLE) && !(flags & CG_REG_RESERVED)) { - f->opt_hard_regs[c][f->opt_hard_reg_count[c]++] = hr; - } - } - } else { - const Reg* hard = NULL; - u32 nhard = 0; - if (target->get_allocable_regs) - target->get_allocable_regs(target, (RegClass)c, &hard, &nhard); - u32 hard_limit = nhard < OPT_MAX_HARD_REGS ? nhard : OPT_MAX_HARD_REGS; - for (u32 i = 0; i < hard_limit; ++i) - f->opt_hard_regs[c][f->opt_hard_reg_count[c]++] = hard[i]; - } - - const Reg* scratch = NULL; - u32 nscratch = 0; - if (target->get_scratch_regs) - target->get_scratch_regs(target, (RegClass)c, &scratch, &nscratch); - u32 scratch_limit = - nscratch < OPT_MAX_SCRATCH_REGS ? 
nscratch : OPT_MAX_SCRATCH_REGS; - for (u32 i = 0; i < scratch_limit; ++i) - f->opt_scratch_regs[c][f->opt_scratch_reg_count[c]++] = scratch[i]; - - if (!phys && target->is_caller_saved) { - for (u32 i = 0; i < f->opt_hard_reg_count[c]; ++i) { - Reg hr = f->opt_hard_regs[c][i]; - if (target->is_caller_saved(target, (RegClass)c, hr)) - f->opt_caller_saved[c] |= (1u << hr); - } - } - u32* callee_saved = &f->opt_callee_saved[c]; - if (target->callee_save_mask) { - u32 mask = target->callee_save_mask(target, (RegClass)c); - *callee_saved |= mask; - } +static void collect_class(Func* f, const NativeAllocClassInfo* ci) { + u32 cls = ci->cls; + if (cls >= OPT_REG_CLASSES) return; + f->opt_caller_saved[cls] = ci->caller_saved_mask; + f->opt_callee_saved[cls] = ci->callee_saved_mask; + f->opt_reserved_regs[cls] = ci->reserved_mask; + f->opt_arg_regs[cls] = ci->arg_mask; + f->opt_ret_regs[cls] = ci->ret_mask; + for (u32 i = 0; + i < ci->nphys && f->opt_phys_reg_count[cls] < OPT_MAX_HARD_REGS; ++i) { + const NativePhysRegInfo* src = &ci->phys[i]; + CGPhysRegInfo* dst = &f->opt_phys_regs[cls][f->opt_phys_reg_count[cls]++]; + memset(dst, 0, sizeof *dst); + dst->reg = src->reg; + dst->cls = src->cls; + dst->abi_index = src->abi_index; + dst->flags = src->flags; + if ((src->flags & CG_REG_ALLOCABLE) && !(src->flags & CG_REG_RESERVED) && + f->opt_hard_reg_count[cls] < OPT_MAX_HARD_REGS) + f->opt_hard_regs[cls][f->opt_hard_reg_count[cls]++] = src->reg; } + for (u32 i = 0; i < ci->nscratch && i < OPT_MAX_SCRATCH_REGS; ++i) + f->opt_scratch_regs[cls][f->opt_scratch_reg_count[cls]++] = ci->scratch[i]; +} + +static void machinize_collect_regs(Func* f, NativeTarget* target) { + if (!target || !target->regs) return; + for (u32 i = 0; i < target->regs->nclasses; ++i) + collect_class(f, &target->regs->classes[i]); } static void machinize_check_overlap(Func* f) { @@ -171,8 +134,7 @@ static void machinize_check_overlap(Func* f) { Reg hr = f->opt_hard_regs[c][i]; for (u32 s = 0; s < 
f->opt_scratch_reg_count[c]; ++s) { if (f->opt_scratch_regs[c][s] == hr) { - SrcLoc loc = {0, 0, 0}; - compiler_panic(f->c, loc, + compiler_panic(f->c, (SrcLoc){0, 0, 0}, "opt_machinize: hard reg %u overlaps scratch reg " "in class %u", (unsigned)hr, (unsigned)c); @@ -182,37 +144,9 @@ static void machinize_check_overlap(Func* f) { } } -void opt_machinize(Func* f, CGTarget* target) { +void opt_machinize_native(Func* f, NativeTarget* target) { machinize_reset(f, target); machinize_prepare_insts(f, target); machinize_collect_regs(f, target); machinize_check_overlap(f); } - -static int call_plan_replay_supported(const IRCallAux* aux, - const CGTarget* target) { - if (!aux || !aux->plan_valid || !target || !target->emit_call_plan) return 0; - for (u32 i = 0; i < aux->plan.nargs; ++i) { - if ((aux->plan.args[i].dst_kind == CG_CALL_PLAN_STACK || - aux->plan.args[i].dst_kind == CG_CALL_PLAN_TAIL_STACK) && - !target->store_call_arg) - return 0; - if (aux->plan.args[i].dst_kind == CG_CALL_PLAN_REG && - (aux->plan.args[i].src_kind == CG_CALL_PLAN_SRC_ADDR || - aux->plan.args[i].src_offset) && - !target->load_call_arg) - return 0; - } - for (u32 i = 0; i < aux->plan.nrets; ++i) - if (aux->plan.rets[i].dst.kind != OPK_REG && - aux->plan.rets[i].dst.kind != OPK_LOCAL && - aux->plan.rets[i].dst.kind != OPK_INDIRECT) - return 0; - for (u32 i = 0; i < aux->plan.nrets; ++i) - if (aux->plan.rets[i].dst_offset && - (aux->plan.rets[i].dst.kind == OPK_LOCAL || - aux->plan.rets[i].dst.kind == OPK_INDIRECT) && - !target->store_call_ret) - return 0; - return 1; -} diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c @@ -0,0 +1,1219 @@ +#include <string.h> + +#include "cg/type.h" +#include "core/metrics.h" +#include "core/pool.h" +#include "opt/opt_internal.h" + +#undef Operand +#undef CGParamDesc +#undef CGCallDesc +#undef CGFuncDesc +#undef CGLocalStorage +#undef CGABIValue +#undef CGABIPart +#undef CGCallPlan +#undef CGCallPlanMove +#undef CGCallPlanRet +#undef 
CGScopeDesc + +typedef struct NativeEmitCtx { + Compiler* c; + Func* f; + NativeTarget* target; + NativeFrameSlot* slot_map; + NativeFrameSlot* param_home_by_preg; + MCLabel* labels; + u8* label_placed; + u32 max_outgoing; + ObjSecId local_static_sec; + ObjSymId local_static_sym; + u32 local_static_base; + u32 local_static_size; + u8 local_static_active; +} NativeEmitCtx; + +static _Noreturn void emit_panic(NativeEmitCtx* e, SrcLoc loc, + const char* msg) { + compiler_panic(e->c, loc, "opt native emit: %s", msg); +} + +static void emit_local_static_begin(NativeEmitCtx* e, + const CGLocalStaticDataDesc* desc, + SrcLoc loc) { + Sym name; + SecKind kind; + u16 flags; + u32 align; + if (!desc) emit_panic(e, loc, "missing local static data descriptor"); + if (e->local_static_active) emit_panic(e, loc, "nested local static data"); + if (desc->attrs.section) { + name = (Sym)desc->attrs.section; + kind = + (desc->attrs.flags & CFREE_CG_DATADEF_READONLY) ? SEC_RODATA : SEC_DATA; + flags = (desc->attrs.flags & CFREE_CG_DATADEF_READONLY) + ? SF_ALLOC + : (SF_ALLOC | SF_WRITE); + } else if (desc->attrs.flags & CFREE_CG_DATADEF_READONLY) { + name = pool_intern_slice(e->c->global, SLICE_LIT(".rodata")); + kind = SEC_RODATA; + flags = SF_ALLOC; + } else { + name = pool_intern_slice(e->c->global, SLICE_LIT(".data")); + kind = SEC_DATA; + flags = SF_ALLOC | SF_WRITE; + } + align = desc->align ? 
desc->align : 1u; + e->local_static_sec = obj_section(e->target->obj, name, kind, flags, align); + e->local_static_base = + obj_align_to(e->target->obj, e->local_static_sec, align); + e->local_static_size = 0; + e->local_static_sym = desc->sym; + e->local_static_active = 1; +} + +static void emit_local_static_write(NativeEmitCtx* e, const u8* data, u64 len, + SrcLoc loc) { + u8 zero[64]; + u64 orig_len = len; + if (!e->local_static_active) emit_panic(e, loc, "local static data inactive"); + if (!len) return; + if (data) { + obj_write(e->target->obj, e->local_static_sec, data, (size_t)len); + } else { + memset(zero, 0, sizeof zero); + while (len >= sizeof zero) { + obj_write(e->target->obj, e->local_static_sec, zero, sizeof zero); + len -= sizeof zero; + } + if (len) obj_write(e->target->obj, e->local_static_sec, zero, (size_t)len); + } + e->local_static_size += (u32)orig_len; +} + +static void emit_local_static_label_addr(NativeEmitCtx* e, MCLabel target, + i64 addend, u32 width, SrcLoc loc) { + u8 zero[8]; + u32 off; + if (!e->local_static_active) emit_panic(e, loc, "local static data inactive"); + if (width != 8u) emit_panic(e, loc, "unsupported local static label width"); + memset(zero, 0, sizeof zero); + off = e->local_static_base + e->local_static_size; + obj_write(e->target->obj, e->local_static_sec, zero, width); + e->target->mc->emit_label_data_reloc(e->target->mc, e->local_static_sec, off, + target, R_ABS64, width, addend); + e->local_static_size += width; +} + +static void emit_local_static_end(NativeEmitCtx* e, SrcLoc loc) { + if (!e->local_static_active) emit_panic(e, loc, "local static data inactive"); + obj_symbol_define(e->target->obj, e->local_static_sym, e->local_static_sec, + e->local_static_base, e->local_static_size); + e->local_static_active = 0; + e->local_static_sec = OBJ_SEC_NONE; + e->local_static_sym = OBJ_SYM_NONE; + e->local_static_base = 0; + e->local_static_size = 0; +} + +static u32 type_size_or(Compiler* c, CfreeCgTypeId type, u32 
fallback) { + u64 n = type ? cg_type_size(c, type) : 0u; + if (!n || n > 0xffffffffull) return fallback; + return (u32)n; +} + +static u32 type_align_or(Compiler* c, CfreeCgTypeId type, u32 fallback) { + u64 n = type ? cg_type_align(c, type) : 0u; + if (!n || n > 0xffffffffull) return fallback; + return (u32)n; +} + +static MemAccess mem_for_type(Compiler* c, CfreeCgTypeId type) { + MemAccess mem; + memset(&mem, 0, sizeof mem); + mem.type = type; + mem.size = type_size_or(c, type, 8u); + mem.align = type_align_or(c, type, mem.size >= 8u ? 8u : mem.size); + return mem; +} + +static NativeAllocClass class_for_type(NativeEmitCtx* e, CfreeCgTypeId type) { + if (e->target->class_for_type) + return e->target->class_for_type(e->target, type); + return cg_type_is_float(e->c, type) ? NATIVE_REG_FP : NATIVE_REG_INT; +} + +static NativeLoc loc_none(void) { + NativeLoc loc; + memset(&loc, 0, sizeof loc); + return loc; +} + +static NativeLoc loc_reg(CfreeCgTypeId type, NativeAllocClass cls, Reg reg) { + NativeLoc loc; + memset(&loc, 0, sizeof loc); + loc.kind = NATIVE_LOC_REG; + loc.cls = (u8)cls; + loc.type = type; + loc.v.reg = reg; + return loc; +} + +static NativeLoc loc_frame(CfreeCgTypeId type, NativeAllocClass cls, + NativeFrameSlot slot) { + NativeLoc loc; + memset(&loc, 0, sizeof loc); + loc.kind = NATIVE_LOC_FRAME; + loc.cls = (u8)cls; + loc.type = type; + loc.v.frame = slot; + return loc; +} + +static NativeLoc loc_imm(CfreeCgTypeId type, i64 imm) { + NativeLoc loc; + memset(&loc, 0, sizeof loc); + loc.kind = NATIVE_LOC_IMM; + loc.cls = NATIVE_REG_INT; + loc.type = type; + loc.v.imm = imm; + return loc; +} + +static NativeLoc loc_global(CfreeCgTypeId type, ObjSymId sym, i64 addend) { + NativeLoc loc; + memset(&loc, 0, sizeof loc); + loc.kind = NATIVE_LOC_GLOBAL; + loc.cls = NATIVE_REG_INT; + loc.type = type; + loc.v.global.sym = sym; + loc.v.global.addend = addend; + return loc; +} + +static int loc_same_frame(NativeLoc a, NativeLoc b) { + return a.kind == 
NATIVE_LOC_FRAME && b.kind == NATIVE_LOC_FRAME && + a.v.frame == b.v.frame; +} + +static Reg scratch_reg(NativeEmitCtx* e, NativeAllocClass cls, Reg a, Reg b, + SrcLoc loc) { + u32 c = (u32)cls; + if (c < OPT_REG_CLASSES) { + for (u32 i = 0; i < e->f->opt_scratch_reg_count[c]; ++i) { + Reg r = e->f->opt_scratch_regs[c][i]; + if (r != a && r != b) return r; + } + } + emit_panic(e, loc, "no scratch register for native emission"); +} + +static int scratch_available(NativeEmitCtx* e, NativeAllocClass cls, Reg a, + Reg b) { + u32 c = (u32)cls; + if (c < OPT_REG_CLASSES) { + for (u32 i = 0; i < e->f->opt_scratch_reg_count[c]; ++i) { + Reg r = e->f->opt_scratch_regs[c][i]; + if (r != a && r != b) return 1; + } + } + return 0; +} + +static NativeLoc scratch_loc(NativeEmitCtx* e, CfreeCgTypeId type, + NativeAllocClass cls, Reg a, Reg b, SrcLoc loc) { + return loc_reg(type, cls, scratch_reg(e, cls, a, b, loc)); +} + +static NativeFrameSlot map_slot(NativeEmitCtx* e, NativeFrameSlot slot, + SrcLoc loc) { + if (slot == NATIVE_FRAME_SLOT_NONE) return NATIVE_FRAME_SLOT_NONE; + if (slot > e->f->nframe_slots) emit_panic(e, loc, "bad frame slot"); + if (!e->slot_map[slot]) emit_panic(e, loc, "unmapped frame slot"); + return e->slot_map[slot]; +} + +static MCLabel ensure_label(NativeEmitCtx* e, u32 block, SrcLoc loc) { + if (block >= e->f->nblocks) emit_panic(e, loc, "bad block label"); + if (e->labels[block] == MC_LABEL_NONE) + e->labels[block] = e->target->label_new(e->target); + return e->labels[block]; +} + +static NativeAddr addr_from_loc(NativeEmitCtx* e, NativeLoc loc, + SrcLoc src_loc) { + NativeAddr addr; + memset(&addr, 0, sizeof addr); + addr.base_type = loc.type; + switch ((NativeLocKind)loc.kind) { + case NATIVE_LOC_FRAME: + addr.base_kind = NATIVE_ADDR_BASE_FRAME; + addr.base.frame = loc.v.frame; + return addr; + case NATIVE_LOC_STACK: + addr.base_kind = NATIVE_ADDR_BASE_FRAME; + addr.base.frame = loc.v.stack.slot; + addr.offset = loc.v.stack.offset; + return addr; + 
case NATIVE_LOC_GLOBAL: + addr.base_kind = NATIVE_ADDR_BASE_GLOBAL; + addr.base.global.sym = loc.v.global.sym; + addr.base.global.addend = loc.v.global.addend; + return addr; + case NATIVE_LOC_REG: + addr.base_kind = NATIVE_ADDR_BASE_REG; + addr.cls = loc.cls; + addr.base.reg = loc.v.reg; + return addr; + case NATIVE_LOC_ADDR: + return loc.v.addr; + default: + emit_panic(e, src_loc, "location is not addressable"); + } +} + +static NativeAddr addr_from_operand(NativeEmitCtx* e, const OptOperand* op, + SrcLoc loc) { + NativeAddr addr; + memset(&addr, 0, sizeof addr); + if (!op) emit_panic(e, loc, "missing address operand"); + addr.base_type = op->type; + switch ((OptOperandKind)op->kind) { + case OPT_OPK_LOCAL: + addr.base_kind = NATIVE_ADDR_BASE_FRAME; + addr.base.frame = map_slot(e, op->v.frame_slot, loc); + return addr; + case OPT_OPK_GLOBAL: + addr.base_kind = NATIVE_ADDR_BASE_GLOBAL; + addr.base.global.sym = op->v.global.sym; + addr.base.global.addend = op->v.global.addend; + return addr; + case OPT_OPK_INDIRECT: + addr.base_kind = NATIVE_ADDR_BASE_REG; + addr.cls = NATIVE_REG_INT; + addr.base.reg = op->v.ind.base; + addr.index_kind = op->v.ind.index == (Reg)REG_NONE + ? 
NATIVE_ADDR_INDEX_NONE + : NATIVE_ADDR_INDEX_REG; + addr.index_cls = NATIVE_REG_INT; + addr.index.reg = op->v.ind.index; + addr.log2_scale = op->v.ind.log2_scale; + addr.offset = op->v.ind.ofs; + return addr; + case OPT_OPK_REG: + addr.base_kind = NATIVE_ADDR_BASE_REG; + addr.cls = op->cls; + addr.base.reg = op->v.reg; + return addr; + default: + emit_panic(e, loc, "operand is not addressable"); + } +} + +static NativeAddr pointer_addr_from_operand(NativeEmitCtx* e, + const OptOperand* op, SrcLoc loc, + Reg avoid_a, Reg avoid_b) { + NativeAddr addr; + memset(&addr, 0, sizeof addr); + if (!op) emit_panic(e, loc, "missing pointer operand"); + addr.base_type = op->type; + switch ((OptOperandKind)op->kind) { + case OPT_OPK_LOCAL: { + NativeAddr frame; + NativeLoc dst; + NativeAllocClass cls = class_for_type(e, op->type); + Reg r = scratch_reg(e, cls, avoid_a, avoid_b, loc); + memset(&frame, 0, sizeof frame); + frame.base_kind = NATIVE_ADDR_BASE_FRAME; + frame.base.frame = map_slot(e, op->v.frame_slot, loc); + frame.base_type = op->type; + dst = loc_reg(op->type, cls, r); + e->target->load(e->target, dst, frame, mem_for_type(e->c, op->type)); + addr.base_kind = NATIVE_ADDR_BASE_REG; + addr.cls = (u8)cls; + addr.base.reg = r; + return addr; + } + case OPT_OPK_GLOBAL: + addr.base_kind = NATIVE_ADDR_BASE_GLOBAL; + addr.base.global.sym = op->v.global.sym; + addr.base.global.addend = op->v.global.addend; + return addr; + case OPT_OPK_INDIRECT: + return addr_from_operand(e, op, loc); + case OPT_OPK_REG: + addr.base_kind = NATIVE_ADDR_BASE_REG; + addr.cls = op->cls; + addr.base.reg = op->v.reg; + return addr; + default: + emit_panic(e, loc, "operand is not a pointer address"); + } +} + +static Reg addr_base_reg(const NativeAddr* addr) { + return addr && addr->base_kind == NATIVE_ADDR_BASE_REG ? addr->base.reg + : REG_NONE; +} + +static Reg addr_index_reg(const NativeAddr* addr) { + return addr && addr->index_kind == NATIVE_ADDR_INDEX_REG ? 
addr->index.reg + : REG_NONE; +} + +static void collapse_addr_to_reg(NativeEmitCtx* e, NativeAddr* addr, + SrcLoc loc) { + Reg r = addr_base_reg(addr); + NativeLoc dst; + if (r == (Reg)REG_NONE) + r = scratch_reg(e, NATIVE_REG_INT, REG_NONE, REG_NONE, loc); + dst = loc_reg(addr->base_type, NATIVE_REG_INT, r); + e->target->load_addr(e->target, dst, *addr); + memset(addr, 0, sizeof *addr); + addr->base_kind = NATIVE_ADDR_BASE_REG; + addr->cls = NATIVE_REG_INT; + addr->base.reg = r; + addr->base_type = dst.type; +} + +/* Collapse an address the target cannot encode for this access (e.g. an + * index scale aarch64 cannot fold into a load/store) into a single base + * register via load_addr. Mirrors NativeDirectTarget's nd_addr_materialize so + * the O1 emit path legalizes the same address shapes as direct -O0 emission. */ +static void legalize_addr(NativeEmitCtx* e, NativeAddr* addr, MemAccess mem, + SrcLoc loc) { + if (e->target->addr_legal && !e->target->addr_legal(e->target, addr, mem)) + collapse_addr_to_reg(e, addr, loc); +} + +static NativeLoc loc_from_operand(NativeEmitCtx* e, const OptOperand* op, + SrcLoc loc) { + if (!op) return loc_none(); + switch ((OptOperandKind)op->kind) { + case OPT_OPK_REG: + return loc_reg(op->type, (NativeAllocClass)op->cls, op->v.reg); + case OPT_OPK_IMM: + return loc_imm(op->type, op->v.imm); + case OPT_OPK_GLOBAL: + return loc_global(op->type, op->v.global.sym, op->v.global.addend); + case OPT_OPK_LOCAL: + return loc_frame(op->type, class_for_type(e, op->type), + map_slot(e, op->v.frame_slot, loc)); + case OPT_OPK_INDIRECT: { + NativeLoc out = loc_none(); + out.kind = NATIVE_LOC_ADDR; + out.cls = op->cls; + out.type = op->type; + out.v.addr = addr_from_operand(e, op, loc); + return out; + } + } + emit_panic(e, loc, "bad operand kind"); +} + +static NativeLoc materialize(NativeEmitCtx* e, NativeLoc src, + NativeAllocClass cls, CfreeCgTypeId type, + Reg avoid_a, Reg avoid_b, SrcLoc loc) { + NativeLoc dst; + NativeAddr addr; + 
MemAccess mem; + if (src.kind == NATIVE_LOC_REG) return src; + dst = scratch_loc(e, type ? type : src.type, cls, avoid_a, avoid_b, loc); + switch ((NativeLocKind)src.kind) { + case NATIVE_LOC_IMM: + e->target->load_imm(e->target, dst, src.v.imm); + return dst; + case NATIVE_LOC_GLOBAL: + addr = addr_from_loc(e, src, loc); + e->target->load_addr(e->target, dst, addr); + return dst; + case NATIVE_LOC_FRAME: + case NATIVE_LOC_STACK: + case NATIVE_LOC_ADDR: + addr = addr_from_loc(e, src, loc); + mem = mem_for_type(e->c, dst.type); + e->target->load(e->target, dst, addr, mem); + return dst; + default: + emit_panic(e, loc, "cannot materialize location"); + } +} + +static void write_loc(NativeEmitCtx* e, NativeLoc dst, NativeLoc src, + MemAccess mem, SrcLoc loc) { + NativeAddr addr; + NativeLoc tmp; + if (dst.kind == NATIVE_LOC_NONE) return; + if (loc_same_frame(dst, src)) return; + if (dst.kind == NATIVE_LOC_REG) { + if (src.kind == NATIVE_LOC_REG) { + if (dst.v.reg != src.v.reg || dst.cls != src.cls) + e->target->move(e->target, dst, src); + return; + } + tmp = materialize(e, src, (NativeAllocClass)dst.cls, dst.type, dst.v.reg, + REG_NONE, loc); + if (tmp.v.reg != dst.v.reg || tmp.cls != dst.cls) + e->target->move(e->target, dst, tmp); + return; + } + addr = addr_from_loc(e, dst, loc); + if (src.kind != NATIVE_LOC_REG) + src = materialize(e, src, (NativeAllocClass)dst.cls, dst.type, REG_NONE, + REG_NONE, loc); + e->target->store(e->target, addr, src, mem); +} + +static CGFuncDesc semantic_func_desc(NativeEmitCtx* e) { + OptCGFuncDesc* in = &e->f->desc; + CGFuncDesc out; + memset(&out, 0, sizeof out); + out.sym = in->sym; + out.text_section_id = in->text_section_id; + out.group_id = in->group_id; + out.fn_type = in->fn_type; + out.result_types = in->result_types; + out.nresults = in->nresults; + out.nparams = in->nparams; + out.loc = in->loc; + out.flags = in->flags; + out.inline_policy = in->inline_policy; + out.atomize = in->atomize; + if (in->nparams && in->params) { 
+ CGParamDesc* params = arena_zarray(e->f->arena, CGParamDesc, in->nparams); + for (u32 i = 0; i < in->nparams; ++i) { + params[i].index = in->params[i].index; + params[i].name = in->params[i].name; + params[i].type = in->params[i].type; + params[i].size = in->params[i].size; + params[i].align = in->params[i].align; + params[i].flags = in->params[i].flags; + params[i].loc = in->params[i].loc; + } + out.params = params; + } + return out; +} + +static CGParamDesc semantic_param_desc(const IRParam* p) { + CGParamDesc out; + memset(&out, 0, sizeof out); + out.index = p->index; + out.name = p->name; + out.type = p->type; + out.size = p->size; + out.align = p->align; + out.flags = p->flags; + out.loc = p->loc; + return out; +} + +static NativeFrameSlot local_home_for_preg(Func* f, PReg preg) { + for (u32 i = 0; i < f->nlocals; ++i) { + IRLocal* l = &f->locals[i]; + if (l->storage.kind == CG_LOCAL_STORAGE_REG && + (PReg)l->storage.v.reg == preg && l->home_slot) + return l->home_slot; + } + return NATIVE_FRAME_SLOT_NONE; +} + +static NativeFrameSlot allocate_param_home(NativeEmitCtx* e, const IRParam* p) { + NativeFrameSlot opt_home = NATIVE_FRAME_SLOT_NONE; + NativeFrameSlotDesc d; + if (p->storage.kind == CG_LOCAL_STORAGE_REG) + opt_home = local_home_for_preg(e->f, (PReg)p->storage.v.reg); + if (opt_home) return map_slot(e, opt_home, p->loc); + memset(&d, 0, sizeof d); + d.type = p->type; + d.name = p->name; + d.loc = p->loc; + d.size = p->size ? p->size : type_size_or(e->c, p->type, 8u); + d.align = p->align ? 
p->align : type_align_or(e->c, p->type, 8u); + d.kind = NATIVE_FRAME_SLOT_PARAM; + if (p->flags & CG_LOCAL_ADDR_TAKEN) d.flags |= NATIVE_FRAME_SLOT_ADDR_TAKEN; + if (p->flags & CG_LOCAL_MEMORY_REQUIRED) + d.flags |= NATIVE_FRAME_SLOT_MEMORY_REQUIRED; + return e->target->frame_slot(e->target, &d); +} + +static NativeLoc loc_for_preg(NativeEmitCtx* e, PReg preg, CfreeCgTypeId type, + SrcLoc loc) { + u8 kind = opt_preg_alloc_kind(e->f, preg); + if (kind == OPT_ALLOC_HARD) + return loc_reg(type, (NativeAllocClass)opt_preg_loc_cls(e->f, preg), + opt_preg_hard_reg(e->f, preg)); + if (kind == OPT_ALLOC_SPILL) + return loc_frame(type, class_for_type(e, type), + map_slot(e, opt_preg_spill_slot(e->f, preg), loc)); + return loc_none(); +} + +static void bind_params(NativeEmitCtx* e) { + u32 nregs = opt_reg_count(e->f); + e->param_home_by_preg = + arena_zarray(e->f->arena, NativeFrameSlot, nregs ? nregs : 1u); + for (u32 i = 0; i < e->f->nparams; ++i) { + IRParam* p = &e->f->params[i]; + CGParamDesc sd = semantic_param_desc(p); + NativeFrameSlot home = allocate_param_home(e, p); + if (p->storage.kind == CG_LOCAL_STORAGE_REG && p->storage.v.reg < nregs) + e->param_home_by_preg[p->storage.v.reg] = home; + if (p->storage.kind == CG_LOCAL_STORAGE_FRAME) + home = map_slot(e, p->storage.v.frame_slot, p->loc); + if (e->target->bind_param) e->target->bind_param(e->target, &sd, home); + } +} + +static void emit_param_decl(NativeEmitCtx* e, Inst* in) { + IRParamDeclAux* aux = (IRParamDeclAux*)in->extra.aux; + NativeFrameSlot home; + NativeLoc src, dst; + MemAccess mem; + if (!aux || aux->desc.storage.kind != CG_LOCAL_STORAGE_REG) return; + PReg preg = (PReg)aux->desc.storage.v.reg; + if (!preg || preg >= opt_reg_count(e->f)) return; + home = e->param_home_by_preg ? 
e->param_home_by_preg[preg] : 0u; + if (!home) return; + src = loc_frame(aux->desc.type, class_for_type(e, aux->desc.type), home); + dst = loc_for_preg(e, preg, aux->desc.type, in->loc); + mem = mem_for_type(e->c, aux->desc.type); + write_loc(e, dst, src, mem, in->loc); +} + +static NativeFrameSlot temp_slot(NativeEmitCtx* e, CfreeCgTypeId type, + SrcLoc loc, NativeFrameSlotKind kind) { + NativeFrameSlotDesc d; + memset(&d, 0, sizeof d); + d.type = type; + d.loc = loc; + d.size = type_size_or(e->c, type, 8u); + d.align = type_align_or(e->c, type, d.size >= 8u ? 8u : d.size); + d.kind = kind; + return e->target->frame_slot(e->target, &d); +} + +static NativeLoc abi_storage_loc(NativeEmitCtx* e, const OptCGABIValue* v, + SrcLoc loc) { + if (!v) return loc_none(); + return loc_from_operand(e, &v->storage, loc); +} + +static void emit_call(NativeEmitCtx* e, Inst* in) { + IRCallAux* aux = (IRCallAux*)in->extra.aux; + NativeCallDesc d; + NativeCallPlan plan; + NativeLoc* args = NULL; + NativeLoc* results = NULL; + NativeLoc final_result = loc_none(); + NativeFrameSlot result_slot = NATIVE_FRAME_SLOT_NONE; + MemAccess result_mem; + if (!aux) return; + memset(&d, 0, sizeof d); + memset(&plan, 0, sizeof plan); + if (aux->desc.nargs) + args = arena_zarray(e->f->arena, NativeLoc, aux->desc.nargs); + for (u32 i = 0; i < aux->desc.nargs; ++i) + args[i] = abi_storage_loc(e, &aux->desc.args[i], in->loc); + if (aux->desc.ret.storage.kind) { + results = arena_zarray(e->f->arena, NativeLoc, 1); + final_result = abi_storage_loc(e, &aux->desc.ret, in->loc); + result_slot = + temp_slot(e, aux->desc.ret.type, in->loc, NATIVE_FRAME_SLOT_SPILL); + results[0] = loc_frame(aux->desc.ret.type, + class_for_type(e, aux->desc.ret.type), result_slot); + } + d.fn_type = aux->desc.fn_type; + d.callee = loc_from_operand(e, &aux->desc.callee, in->loc); + d.args = args; + d.results = results; + d.nargs = aux->desc.nargs; + d.nresults = results ? 
1u : 0u; + d.flags = aux->desc.flags; + d.tail_policy = aux->desc.tail_policy; + d.inline_policy = aux->desc.inline_policy; + e->target->plan_call(e->target, &d, &plan); + if (plan.stack_arg_size > e->max_outgoing) + e->max_outgoing = plan.stack_arg_size; + for (u32 i = 0; i < plan.nargs; ++i) + write_loc(e, plan.args[i].dst, plan.args[i].src, plan.args[i].mem, in->loc); + if (plan.callee.kind != NATIVE_LOC_REG && + plan.callee.kind != NATIVE_LOC_GLOBAL) + plan.callee = materialize(e, plan.callee, NATIVE_REG_INT, plan.callee.type, + REG_NONE, REG_NONE, in->loc); + e->target->emit_call(e->target, &plan); + for (u32 i = 0; i < plan.nrets; ++i) + write_loc(e, plan.rets[i].dst, plan.rets[i].src, plan.rets[i].mem, in->loc); + if (result_slot && final_result.kind != NATIVE_LOC_NONE) { + NativeLoc tmp = loc_frame( + aux->desc.ret.type, class_for_type(e, aux->desc.ret.type), result_slot); + result_mem = mem_for_type(e->c, aux->desc.ret.type); + write_loc(e, final_result, tmp, result_mem, in->loc); + } +} + +static void emit_ret(NativeEmitCtx* e, Inst* in, const CGFuncDesc* fd) { + IRRetAux* aux = (IRRetAux*)in->extra.aux; + NativeLoc value = loc_none(); + NativeLoc* values = NULL; + NativeCallPlanRet* rets = NULL; + u32 nrets = 0; + if (aux && aux->present) { + NativeLoc final = abi_storage_loc(e, &aux->val, in->loc); + NativeFrameSlot slot = + temp_slot(e, aux->val.type, in->loc, NATIVE_FRAME_SLOT_SPILL); + NativeLoc frame = + loc_frame(aux->val.type, class_for_type(e, aux->val.type), slot); + write_loc(e, frame, final, mem_for_type(e->c, aux->val.type), in->loc); + value = frame; + values = &value; + } + e->target->plan_ret(e->target, fd, values, values ? 
1u : 0u, &rets, &nrets); + for (u32 i = 0; i < nrets; ++i) + write_loc(e, rets[i].dst, rets[i].src, rets[i].mem, in->loc); + e->target->ret(e->target); +} + +static void emit_inst(NativeEmitCtx* e, u32 block, u32 order_index, Inst* in, + const CGFuncDesc* fd) { + NativeLoc dst, a, b, src, tmp; + NativeAddr addr, addr2; + Reg dst_reg; + (void)block; + if (e->target->set_loc) e->target->set_loc(e->target, in->loc); + switch ((IROp)in->op) { + case IR_NOP: + case IR_CONST_I: + case IR_CONST_BYTES: + case IR_PHI: + case IR_SCOPE_BEGIN: + case IR_SCOPE_ELSE: + case IR_SCOPE_END: + return; + case IR_PARAM_DECL: + emit_param_decl(e, in); + return; + case IR_LOAD_IMM: + dst = loc_from_operand(e, &in->opnds[0], in->loc); + write_loc(e, dst, loc_imm(in->opnds[0].type, in->extra.imm), + mem_for_type(e->c, in->opnds[0].type), in->loc); + return; + case IR_LOAD_CONST: + dst = loc_from_operand(e, &in->opnds[0], in->loc); + if (dst.kind != NATIVE_LOC_REG) + dst = materialize(e, dst, class_for_type(e, in->opnds[0].type), + in->opnds[0].type, REG_NONE, REG_NONE, in->loc); + e->target->load_const(e->target, dst, in->extra.cbytes); + return; + case IR_COPY: + dst = loc_from_operand(e, &in->opnds[0], in->loc); + src = loc_from_operand(e, &in->opnds[1], in->loc); + write_loc(e, dst, src, mem_for_type(e->c, in->opnds[0].type), in->loc); + return; + case IR_LOAD: + dst = loc_from_operand(e, &in->opnds[0], in->loc); + addr = addr_from_operand(e, &in->opnds[1], in->loc); + legalize_addr(e, &addr, in->extra.mem, in->loc); + if (dst.kind == NATIVE_LOC_REG) { + e->target->load(e->target, dst, addr, in->extra.mem); + } else { + if (!scratch_available(e, class_for_type(e, in->opnds[0].type), + addr_base_reg(&addr), addr_index_reg(&addr))) + collapse_addr_to_reg(e, &addr, in->loc); + tmp = scratch_loc(e, in->opnds[0].type, + class_for_type(e, in->opnds[0].type), + addr_base_reg(&addr), addr_index_reg(&addr), in->loc); + e->target->load(e->target, tmp, addr, in->extra.mem); + write_loc(e, dst, 
tmp, in->extra.mem, in->loc); + } + return; + case IR_STORE: + addr = addr_from_operand(e, &in->opnds[0], in->loc); + legalize_addr(e, &addr, in->extra.mem, in->loc); + src = loc_from_operand(e, &in->opnds[1], in->loc); + if (src.kind == NATIVE_LOC_REG && (src.v.reg == addr_base_reg(&addr) || + src.v.reg == addr_index_reg(&addr))) { + NativeFrameSlot slot = + temp_slot(e, in->opnds[1].type, in->loc, NATIVE_FRAME_SLOT_SPILL); + NativeLoc frame = loc_frame(in->opnds[1].type, + class_for_type(e, in->opnds[1].type), slot); + write_loc(e, frame, src, mem_for_type(e->c, in->opnds[1].type), + in->loc); + collapse_addr_to_reg(e, &addr, in->loc); + src = materialize(e, frame, class_for_type(e, in->opnds[1].type), + in->opnds[1].type, addr_base_reg(&addr), REG_NONE, + in->loc); + } + if (src.kind != NATIVE_LOC_REG) { + if (!scratch_available(e, class_for_type(e, in->opnds[1].type), + addr_base_reg(&addr), addr_index_reg(&addr))) + collapse_addr_to_reg(e, &addr, in->loc); + src = materialize(e, src, class_for_type(e, in->opnds[1].type), + in->opnds[1].type, addr_base_reg(&addr), + addr_index_reg(&addr), in->loc); + } + e->target->store(e->target, addr, src, in->extra.mem); + return; + case IR_ADDR_OF: + dst = loc_from_operand(e, &in->opnds[0], in->loc); + addr = addr_from_operand(e, &in->opnds[1], in->loc); + if (dst.kind != NATIVE_LOC_REG) + dst = materialize(e, dst, class_for_type(e, in->opnds[0].type), + in->opnds[0].type, REG_NONE, REG_NONE, in->loc); + e->target->load_addr(e->target, dst, addr); + return; + case IR_TLS_ADDR_OF: { + IRTlsAux* aux = (IRTlsAux*)in->extra.aux; + dst = loc_from_operand(e, &in->opnds[0], in->loc); + if (dst.kind != NATIVE_LOC_REG) + dst = materialize(e, dst, NATIVE_REG_INT, in->opnds[0].type, REG_NONE, + REG_NONE, in->loc); + e->target->tls_addr_of(e->target, dst, aux->sym, aux->addend); + return; + } + case IR_AGG_COPY: { + IRAggAux* aux = (IRAggAux*)in->extra.aux; + addr = pointer_addr_from_operand(e, &in->opnds[0], in->loc, REG_NONE, + 
REG_NONE); + addr2 = pointer_addr_from_operand( + e, &in->opnds[1], in->loc, + addr.base_kind == NATIVE_ADDR_BASE_REG ? addr.base.reg : REG_NONE, + REG_NONE); + e->target->copy_bytes(e->target, addr, addr2, aux->access); + return; + } + case IR_AGG_SET: { + IRAggAux* aux = (IRAggAux*)in->extra.aux; + addr = pointer_addr_from_operand(e, &in->opnds[0], in->loc, REG_NONE, + REG_NONE); + src = loc_from_operand(e, &in->opnds[1], in->loc); + if (src.kind != NATIVE_LOC_REG) { + if (!scratch_available(e, NATIVE_REG_INT, addr_base_reg(&addr), + addr_index_reg(&addr))) + collapse_addr_to_reg(e, &addr, in->loc); + src = materialize(e, src, NATIVE_REG_INT, in->opnds[1].type, + addr_base_reg(&addr), addr_index_reg(&addr), in->loc); + } + e->target->set_bytes(e->target, addr, src, aux->access); + return; + } + case IR_BITFIELD_LOAD: { + IRBitFieldAux* aux = (IRBitFieldAux*)in->extra.aux; + dst = loc_from_operand(e, &in->opnds[0], in->loc); + addr = addr_from_operand(e, &in->opnds[1], in->loc); + if (dst.kind != NATIVE_LOC_REG) + dst = materialize(e, dst, class_for_type(e, in->opnds[0].type), + in->opnds[0].type, REG_NONE, REG_NONE, in->loc); + e->target->bitfield_load(e->target, dst, addr, aux->access); + return; + } + case IR_BITFIELD_STORE: { + IRBitFieldAux* aux = (IRBitFieldAux*)in->extra.aux; + addr = addr_from_operand(e, &in->opnds[0], in->loc); + src = loc_from_operand(e, &in->opnds[1], in->loc); + if (src.kind != NATIVE_LOC_REG) + src = materialize(e, src, class_for_type(e, in->opnds[1].type), + in->opnds[1].type, REG_NONE, REG_NONE, in->loc); + e->target->bitfield_store(e->target, addr, src, aux->access); + return; + } + case IR_BINOP: + dst = loc_from_operand(e, &in->opnds[0], in->loc); + dst_reg = dst.kind == NATIVE_LOC_REG ? 
dst.v.reg : REG_NONE; + a = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc), + class_for_type(e, in->opnds[1].type), in->opnds[1].type, + dst_reg, REG_NONE, in->loc); + b = materialize(e, loc_from_operand(e, &in->opnds[2], in->loc), + class_for_type(e, in->opnds[2].type), in->opnds[2].type, + a.v.reg, dst_reg, in->loc); + if (dst.kind != NATIVE_LOC_REG) + dst = scratch_loc(e, in->opnds[0].type, + class_for_type(e, in->opnds[0].type), a.v.reg, + b.v.reg, in->loc); + e->target->binop(e->target, (BinOp)in->extra.imm, dst, a, b); + if (in->opnds[0].kind != OPK_REG) + write_loc(e, loc_from_operand(e, &in->opnds[0], in->loc), dst, + mem_for_type(e->c, in->opnds[0].type), in->loc); + return; + case IR_UNOP: + dst = loc_from_operand(e, &in->opnds[0], in->loc); + dst_reg = dst.kind == NATIVE_LOC_REG ? dst.v.reg : REG_NONE; + a = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc), + class_for_type(e, in->opnds[1].type), in->opnds[1].type, + dst_reg, REG_NONE, in->loc); + if (dst.kind != NATIVE_LOC_REG) + dst = scratch_loc(e, in->opnds[0].type, + class_for_type(e, in->opnds[0].type), a.v.reg, + REG_NONE, in->loc); + e->target->unop(e->target, (UnOp)in->extra.imm, dst, a); + if (in->opnds[0].kind != OPK_REG) + write_loc(e, loc_from_operand(e, &in->opnds[0], in->loc), dst, + mem_for_type(e->c, in->opnds[0].type), in->loc); + return; + case IR_CMP: + dst = loc_from_operand(e, &in->opnds[0], in->loc); + dst_reg = dst.kind == NATIVE_LOC_REG ? 
dst.v.reg : REG_NONE; + a = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc), + class_for_type(e, in->opnds[1].type), in->opnds[1].type, + dst_reg, REG_NONE, in->loc); + b = materialize(e, loc_from_operand(e, &in->opnds[2], in->loc), + class_for_type(e, in->opnds[2].type), in->opnds[2].type, + a.v.reg, dst_reg, in->loc); + if (dst.kind != NATIVE_LOC_REG) + dst = scratch_loc(e, in->opnds[0].type, + class_for_type(e, in->opnds[0].type), a.v.reg, + b.v.reg, in->loc); + e->target->cmp(e->target, (CmpOp)in->extra.imm, dst, a, b); + if (in->opnds[0].kind != OPK_REG) + write_loc(e, loc_from_operand(e, &in->opnds[0], in->loc), dst, + mem_for_type(e->c, in->opnds[0].type), in->loc); + return; + case IR_CONVERT: + dst = loc_from_operand(e, &in->opnds[0], in->loc); + dst_reg = dst.kind == NATIVE_LOC_REG ? dst.v.reg : REG_NONE; + src = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc), + class_for_type(e, in->opnds[1].type), in->opnds[1].type, + dst_reg, REG_NONE, in->loc); + if (dst.kind != NATIVE_LOC_REG) + dst = scratch_loc(e, in->opnds[0].type, + class_for_type(e, in->opnds[0].type), src.v.reg, + REG_NONE, in->loc); + e->target->convert(e->target, (ConvKind)in->extra.imm, dst, src); + if (in->opnds[0].kind != OPK_REG) + write_loc(e, loc_from_operand(e, &in->opnds[0], in->loc), dst, + mem_for_type(e->c, in->opnds[0].type), in->loc); + return; + case IR_CALL: + emit_call(e, in); + return; + case IR_BR: + e->target->jump(e->target, + ensure_label(e, e->f->blocks[block].succ[0], in->loc)); + return; + case IR_CMP_BRANCH: { + u32 next = order_index + 1u < e->f->emit_order_n + ? 
e->f->emit_order[order_index + 1u] + : UINT32_MAX; + a = materialize(e, loc_from_operand(e, &in->opnds[0], in->loc), + class_for_type(e, in->opnds[0].type), in->opnds[0].type, + REG_NONE, REG_NONE, in->loc); + b = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc), + class_for_type(e, in->opnds[1].type), in->opnds[1].type, + a.v.reg, REG_NONE, in->loc); + e->target->cmp_branch( + e->target, (CmpOp)in->extra.imm, a, b, + ensure_label(e, e->f->blocks[block].succ[0], in->loc)); + if (e->f->blocks[block].nsucc > 1u && e->f->blocks[block].succ[1] != next) + e->target->jump(e->target, + ensure_label(e, e->f->blocks[block].succ[1], in->loc)); + return; + } + case IR_SWITCH: { + IRSwitchAux* aux = (IRSwitchAux*)in->extra.aux; + NativeLoc sel = + materialize(e, loc_from_operand(e, &in->opnds[0], in->loc), + class_for_type(e, in->opnds[0].type), in->opnds[0].type, + REG_NONE, REG_NONE, in->loc); + NativeLoc imm = + scratch_loc(e, in->opnds[0].type, (NativeAllocClass)sel.cls, + sel.v.reg, REG_NONE, in->loc); + for (u32 i = 0; aux && i < aux->ncases; ++i) { + e->target->load_imm(e->target, imm, (i64)aux->cases[i].value); + e->target->cmp_branch(e->target, CMP_EQ, sel, imm, + ensure_label(e, aux->cases[i].block, in->loc)); + } + if (aux) + e->target->jump(e->target, + ensure_label(e, aux->default_block, in->loc)); + return; + } + case IR_INDIRECT_BRANCH: { + IRIndirectAux* aux = (IRIndirectAux*)in->extra.aux; + MCLabel* labels = aux && aux->ntargets + ? arena_array(e->f->arena, MCLabel, aux->ntargets) + : NULL; + for (u32 i = 0; aux && i < aux->ntargets; ++i) + labels[i] = ensure_label(e, aux->targets[i], in->loc); + src = materialize(e, loc_from_operand(e, &in->opnds[0], in->loc), + NATIVE_REG_INT, in->opnds[0].type, REG_NONE, REG_NONE, + in->loc); + e->target->indirect_branch(e->target, src, labels, + aux ? 
aux->ntargets : 0u); + return; + } + case IR_LOAD_LABEL_ADDR: + dst = loc_from_operand(e, &in->opnds[0], in->loc); + if (dst.kind != NATIVE_LOC_REG) + dst = materialize(e, dst, NATIVE_REG_INT, in->opnds[0].type, REG_NONE, + REG_NONE, in->loc); + e->target->load_label_addr(e->target, dst, + ensure_label(e, (u32)in->extra.imm, in->loc)); + return; + case IR_LOCAL_STATIC_DATA_BEGIN: { + CgIrLocalStaticBeginAux* aux = (CgIrLocalStaticBeginAux*)in->extra.aux; + emit_local_static_begin(e, aux ? &aux->desc : NULL, in->loc); + return; + } + case IR_LOCAL_STATIC_DATA_WRITE: { + CgIrLocalStaticWriteAux* aux = (CgIrLocalStaticWriteAux*)in->extra.aux; + if (!aux) emit_panic(e, in->loc, "missing local static data write"); + emit_local_static_write(e, aux->has_data ? aux->data : NULL, aux->len, + in->loc); + return; + } + case IR_LOCAL_STATIC_DATA_LABEL_ADDR: { + CgIrLocalStaticLabelAux* aux = (CgIrLocalStaticLabelAux*)in->extra.aux; + if (!aux) emit_panic(e, in->loc, "missing local static label data"); + (void)aux->address_space; + emit_local_static_label_addr(e, + ensure_label(e, (u32)aux->target, in->loc), + aux->addend, aux->width, in->loc); + return; + } + case IR_LOCAL_STATIC_DATA_END: + emit_local_static_end(e, in->loc); + return; + case IR_RET: + emit_ret(e, in, fd); + return; + case IR_ALLOCA: + dst = loc_from_operand(e, &in->opnds[0], in->loc); + src = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc), + NATIVE_REG_INT, in->opnds[1].type, REG_NONE, REG_NONE, + in->loc); + if (dst.kind != NATIVE_LOC_REG) + dst = scratch_loc(e, in->opnds[0].type, NATIVE_REG_INT, src.v.reg, + REG_NONE, in->loc); + e->target->alloca_(e->target, dst, src, (u32)in->extra.imm); + return; + case IR_ATOMIC_LOAD: { + IRAtomicAux* aux = (IRAtomicAux*)in->extra.aux; + dst = loc_from_operand(e, &in->opnds[0], in->loc); + addr = pointer_addr_from_operand(e, &in->opnds[1], in->loc, REG_NONE, + REG_NONE); + if (dst.kind != NATIVE_LOC_REG) + dst = scratch_loc(e, in->opnds[0].type, + 
class_for_type(e, in->opnds[0].type), REG_NONE, + REG_NONE, in->loc); + e->target->atomic_load(e->target, dst, addr, aux->mem, aux->mo); + if (in->opnds[0].kind != OPK_REG) + write_loc(e, loc_from_operand(e, &in->opnds[0], in->loc), dst, aux->mem, + in->loc); + return; + } + case IR_ATOMIC_STORE: { + IRAtomicAux* aux = (IRAtomicAux*)in->extra.aux; + addr = pointer_addr_from_operand(e, &in->opnds[0], in->loc, REG_NONE, + REG_NONE); + src = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc), + class_for_type(e, in->opnds[1].type), in->opnds[1].type, + REG_NONE, REG_NONE, in->loc); + e->target->atomic_store(e->target, addr, src, aux->mem, aux->mo); + return; + } + case IR_ATOMIC_RMW: { + IRAtomicAux* aux = (IRAtomicAux*)in->extra.aux; + dst = loc_from_operand(e, &in->opnds[0], in->loc); + addr = pointer_addr_from_operand(e, &in->opnds[1], in->loc, REG_NONE, + REG_NONE); + src = materialize(e, loc_from_operand(e, &in->opnds[2], in->loc), + class_for_type(e, in->opnds[2].type), in->opnds[2].type, + REG_NONE, REG_NONE, in->loc); + if (dst.kind != NATIVE_LOC_REG) + dst = scratch_loc(e, in->opnds[0].type, + class_for_type(e, in->opnds[0].type), src.v.reg, + REG_NONE, in->loc); + e->target->atomic_rmw(e->target, (AtomicOp)aux->op, dst, addr, src, + aux->mem, aux->mo); + if (in->opnds[0].kind != OPK_REG) + write_loc(e, loc_from_operand(e, &in->opnds[0], in->loc), dst, aux->mem, + in->loc); + return; + } + case IR_ATOMIC_CAS: { + IRCasAux* aux = (IRCasAux*)in->extra.aux; + NativeLoc ok; + NativeLoc expected; + NativeLoc desired; + dst = loc_from_operand(e, &in->opnds[0], in->loc); + ok = loc_from_operand(e, &in->opnds[1], in->loc); + addr = pointer_addr_from_operand(e, &in->opnds[2], in->loc, REG_NONE, + REG_NONE); + expected = materialize(e, loc_from_operand(e, &in->opnds[3], in->loc), + class_for_type(e, in->opnds[3].type), + in->opnds[3].type, REG_NONE, REG_NONE, in->loc); + desired = + materialize(e, loc_from_operand(e, &in->opnds[4], in->loc), + class_for_type(e, 
in->opnds[4].type), in->opnds[4].type, + expected.v.reg, REG_NONE, in->loc); + if (dst.kind != NATIVE_LOC_REG) + dst = scratch_loc(e, in->opnds[0].type, + class_for_type(e, in->opnds[0].type), expected.v.reg, + desired.v.reg, in->loc); + if (ok.kind != NATIVE_LOC_REG) + ok = scratch_loc(e, in->opnds[1].type, + class_for_type(e, in->opnds[1].type), dst.v.reg, + expected.v.reg, in->loc); + e->target->atomic_cas(e->target, dst, ok, addr, expected, desired, + aux->mem, aux->success, aux->failure); + if (in->opnds[0].kind != OPK_REG) + write_loc(e, loc_from_operand(e, &in->opnds[0], in->loc), dst, aux->mem, + in->loc); + if (in->opnds[1].kind != OPK_REG) + write_loc(e, loc_from_operand(e, &in->opnds[1], in->loc), ok, + mem_for_type(e->c, in->opnds[1].type), in->loc); + return; + } + case IR_VA_START: + case IR_VA_ARG: + case IR_VA_END: + case IR_VA_COPY: + case IR_BREAK_TO: + case IR_CONTINUE_TO: + case IR_ASM_BLOCK: + emit_panic(e, in->loc, "operation is not wired to NativeTarget yet"); + case IR_FENCE: + e->target->fence(e->target, (MemOrder)in->extra.imm); + return; + case IR_INTRINSIC: { + IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; + NativeLoc* dsts = aux && aux->ndst + ? arena_array(e->f->arena, NativeLoc, aux->ndst) + : NULL; + NativeLoc* args = aux && aux->narg + ? 
arena_array(e->f->arena, NativeLoc, aux->narg) + : NULL; + for (u32 i = 0; aux && i < aux->ndst; ++i) + dsts[i] = loc_from_operand(e, &aux->dsts[i], in->loc); + for (u32 i = 0; aux && i < aux->narg; ++i) { + if (aux->args[i].kind == OPK_IMM) { + args[i] = loc_from_operand(e, &aux->args[i], in->loc); + } else { + args[i] = materialize(e, loc_from_operand(e, &aux->args[i], in->loc), + class_for_type(e, aux->args[i].type), + aux->args[i].type, REG_NONE, REG_NONE, in->loc); + } + } + e->target->intrinsic(e->target, aux->kind, dsts, aux->ndst, args, + aux->narg); + return; + } + default: + emit_panic(e, in->loc, "unknown IR op"); + } +} + +static int native_emit_terminates(const Inst* in) { + if (!in) return 0; + switch ((IROp)in->op) { + case IR_BR: + case IR_CONDBR: + case IR_CMP_BRANCH: + case IR_SWITCH: + case IR_INDIRECT_BRANCH: + case IR_RET: + case IR_BREAK_TO: + case IR_CONTINUE_TO: + return 1; + case IR_INTRINSIC: { + IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; + return aux && (aux->kind == INTRIN_LONGJMP || aux->kind == INTRIN_TRAP || + aux->kind == INTRIN_UNREACHABLE); + } + default: + return 0; + } +} + +static void emit_block(NativeEmitCtx* e, u32 block, u32 order_index, + const CGFuncDesc* fd) { + if (block >= e->f->nblocks) return; + if (!e->label_placed[block]) { + e->label_placed[block] = 1u; + if (block != e->f->entry) + e->target->label_place(e->target, + ensure_label(e, block, (SrcLoc){0, 0, 0})); + } + Block* bl = &e->f->blocks[block]; + for (u32 i = 0; i < bl->ninsts; ++i) + emit_inst(e, block, order_index, &bl->insts[i], fd); + if (bl->nsucc == 1u && + (bl->ninsts == 0 || + !native_emit_terminates(&bl->insts[bl->ninsts - 1u]))) { + u32 next = order_index + 1u < e->f->emit_order_n + ? 
e->f->emit_order[order_index + 1u] + : UINT32_MAX; + if (bl->succ[0] != next) + e->target->jump(e->target, + ensure_label(e, bl->succ[0], (SrcLoc){0, 0, 0})); + } +} + +static void map_frame_slots(NativeEmitCtx* e) { + e->slot_map = + arena_zarray(e->f->arena, NativeFrameSlot, e->f->nframe_slots + 1u); + for (u32 i = 0; i < e->f->nframe_slots; ++i) { + IRFrameSlot* s = &e->f->frame_slots[i]; + NativeFrameSlotDesc d; + memset(&d, 0, sizeof d); + d.type = s->type; + d.name = s->name; + d.loc = s->loc; + d.size = s->size; + d.align = s->align; + d.kind = s->kind; + d.flags = s->flags; + e->slot_map[s->id] = e->target->frame_slot(e->target, &d); + } +} + +void opt_emit_native(Compiler* c, Func* f, NativeTarget* target) { + NativeEmitCtx e; + Func view; + CGFuncDesc fd; + NativeFramePatchState state; + if (!f || !target) return; + memset(&e, 0, sizeof e); + if (f->mir) { + view = *f; + view.blocks = f->mir->blocks; + view.nblocks = f->mir->nblocks; + view.entry = f->mir->entry; + view.emit_order = f->mir->emit_order; + view.emit_order_n = f->mir->emit_order_n; + view.emit_order_cap = f->mir->emit_order_cap; + view.opt_rewritten = 1; + view.mir = NULL; + e.f = &view; + } else { + e.f = f; + } + e.c = c; + e.target = target; + metrics_scope_begin(c, "opt.native_emit.setup"); + e.labels = arena_array(e.f->arena, MCLabel, e.f->nblocks ? e.f->nblocks : 1u); + e.label_placed = + arena_zarray(e.f->arena, u8, e.f->nblocks ? 
e.f->nblocks : 1u); + for (u32 i = 0; i < e.f->nblocks; ++i) e.labels[i] = MC_LABEL_NONE; + fd = semantic_func_desc(&e); + metrics_scope_end(c, "opt.native_emit.setup"); + + metrics_scope_begin(c, "opt.native_emit.func_begin"); + target->func_begin(target, &fd); + map_frame_slots(&e); + bind_params(&e); + metrics_scope_end(c, "opt.native_emit.func_begin"); + + metrics_scope_begin(c, "opt.native_emit.body"); + for (u32 i = 0; i < e.f->emit_order_n; ++i) + emit_block(&e, e.f->emit_order[i], i, &fd); + metrics_scope_end(c, "opt.native_emit.body"); + + memset(&state, 0, sizeof state); + state.max_outgoing = e.max_outgoing; + if (target->note_frame_state) target->note_frame_state(target, &state); + if (target->patch_apply) target->patch_apply(target); + metrics_scope_begin(c, "opt.native_emit.func_end"); + target->func_end(target); + metrics_scope_end(c, "opt.native_emit.func_end"); +} diff --git a/src/opt/pass_o2.c b/src/opt/pass_o2.c @@ -619,741 +619,6 @@ void opt_addr_xform(Func* f) { opt_rebuild_def_use(f); } -/* PReg-namespace variant of opt_addr_xform for the O1 pipeline (no SSA, no - * Val-keyed def-use chains). Scans the whole function once per candidate - * IR_ADDR_OF def to classify uses of its PReg result. - * - * Use classifications (see addr_xform_pregs_classify_use): - * - * OPF_ESCAPE The use is something other than a non-observable - * IR_LOAD/IR_STORE base operand. The IR_ADDR_OF cannot - * be folded; the local's address truly escapes. - * OPF_FOLD_LOCAL Zero-EA use: `OPK_INDIRECT(base=p, ofs=0, index=NONE)` - * in load/store base position. Foldable to OPK_LOCAL. - * OPF_FOLD_EA EA-shaped use: same load/store base position, but with - * nonzero `ofs` or `index != REG_NONE`. The EA must stay - * on the load/store (the operand layout for OPK_LOCAL - * cannot carry the EA today), so the operand is left - * alone and the IR_ADDR_OF def must stay alive to feed - * the OPK_INDIRECT base. 
The use is still recognized as - * "non-escape" for downstream analysis (e.g. scalar - * promotion's non-escape check). - * - * After classification: if any use is OPF_ESCAPE, no rewrite happens. If - * every use is OPF_FOLD_LOCAL, fold all uses to OPK_LOCAL and NOP the - * IR_ADDR_OF. If a mix of OPF_FOLD_LOCAL and OPF_FOLD_EA, fold the - * zero-EA uses but keep the IR_ADDR_OF alive for the EA-shaped uses. */ - -typedef enum AddrXformUseClass { - OPF_ESCAPE = 0, - OPF_FOLD_LOCAL = 1, - OPF_FOLD_EA = 2, -} AddrXformUseClass; - -static int addr_xform_pregs_main_op_position_ok(Inst* in, u32 op_idx) { - if ((IROp)in->op != IR_LOAD && (IROp)in->op != IR_STORE) return 0; - if (opt_mem_observable(&in->extra.mem)) return 0; - if ((IROp)in->op == IR_LOAD && op_idx != 1u) return 0; - if ((IROp)in->op == IR_STORE && op_idx != 0u) return 0; - return 1; -} - -static AddrXformUseClass addr_xform_pregs_classify_use(Inst* in, Operand* op, - u32 op_idx) { - if (op->kind != OPK_INDIRECT) return OPF_ESCAPE; - if (!addr_xform_pregs_main_op_position_ok(in, op_idx)) return OPF_ESCAPE; - if (op->v.ind.ofs == 0 && op->v.ind.index == (Reg)REG_NONE) - return OPF_FOLD_LOCAL; - return OPF_FOLD_EA; -} - -static int addr_xform_pregs_op_uses(const Operand* op, PReg p) { - if (!op) return 0; - if (op->kind == OPK_REG && (PReg)op->v.reg == p) return 1; - if (op->kind == OPK_INDIRECT) { - if ((PReg)op->v.ind.base == p) return 1; - if (op->v.ind.index != (Reg)REG_NONE && (PReg)op->v.ind.index == p) - return 1; - } - return 0; -} - -static int addr_xform_pregs_abivalue_uses(const CGABIValue* v, PReg p) { - if (!v) return 0; - if (addr_xform_pregs_op_uses(&v->storage, p)) return 1; - for (u32 i = 0; i < v->nparts; ++i) - if (addr_xform_pregs_op_uses((const Operand*)&v->parts[i].op, p)) return 1; - return 0; -} - -static int addr_xform_pregs_aux_uses(Inst* in, PReg p) { - switch ((IROp)in->op) { - case IR_CALL: { - IRCallAux* aux = (IRCallAux*)in->extra.aux; - if (!aux) return 0; - if 
(aux->use_plan_replay) { - if (addr_xform_pregs_op_uses(&aux->plan.callee, p)) return 1; - for (u32 i = 0; i < aux->plan.nargs; ++i) - if (addr_xform_pregs_op_uses(&aux->plan.args[i].src, p)) return 1; - for (u32 i = 0; i < aux->plan.nrets; ++i) - if (addr_xform_pregs_op_uses(&aux->plan.rets[i].dst, p)) return 1; - } else { - if (addr_xform_pregs_op_uses(&aux->desc.callee, p)) return 1; - for (u32 i = 0; i < aux->desc.nargs; ++i) - if (addr_xform_pregs_abivalue_uses( - (const CGABIValue*)&aux->desc.args[i], p)) - return 1; - if (addr_xform_pregs_abivalue_uses(&aux->desc.ret, p)) return 1; - } - return 0; - } - case IR_RET: { - IRRetAux* aux = (IRRetAux*)in->extra.aux; - if (!aux || !aux->present) return 0; - return addr_xform_pregs_abivalue_uses(&aux->val, p); - } - case IR_SCOPE_BEGIN: { - IRScopeAux* aux = (IRScopeAux*)in->extra.aux; - if (!aux) return 0; - return addr_xform_pregs_op_uses(&aux->desc.cond, p); - } - case IR_ASM_BLOCK: { - IRAsmAux* aux = (IRAsmAux*)in->extra.aux; - if (!aux) return 0; - for (u32 i = 0; i < aux->nin; ++i) - if (addr_xform_pregs_op_uses(&aux->in_ops[i], p)) return 1; - for (u32 i = 0; i < aux->nout; ++i) - if (addr_xform_pregs_op_uses(&aux->out_ops[i], p)) return 1; - return 0; - } - case IR_INTRINSIC: { - IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; - if (!aux) return 0; - for (u32 i = 0; i < aux->narg; ++i) - if (addr_xform_pregs_op_uses(&aux->args[i], p)) return 1; - for (u32 i = 0; i < aux->ndst; ++i) - if (addr_xform_pregs_op_uses(&aux->dsts[i], p)) return 1; - return 0; - } - default: - return 0; - } -} - -/* Returns nonzero if every use of `p` is foldable (OPF_FOLD_LOCAL or - * OPF_FOLD_EA) and at least one use exists. *out_has_ea is set to 1 if any - * use was OPF_FOLD_EA; in that case the rewrite must keep the IR_ADDR_OF - * alive (the EA-shaped use still names p as the OPK_INDIRECT base). 
*/ -static int addr_xform_pregs_classify(Func* f, PReg p, Inst* def_inst, - int* out_has_ea) { - int has_foldable_use = 0; - int has_ea = 0; - for (u32 b = 0; b < f->nblocks; ++b) { - Block* bl = &f->blocks[b]; - for (u32 i = 0; i < bl->ninsts; ++i) { - Inst* in = &bl->insts[i]; - if (in == def_inst) continue; - for (u32 o = 0; o < in->nopnds; ++o) { - Operand* op = &in->opnds[o]; - if (!addr_xform_pregs_op_uses(op, p)) continue; - AddrXformUseClass uc = addr_xform_pregs_classify_use(in, op, o); - if (uc == OPF_ESCAPE) return 0; - has_foldable_use = 1; - if (uc == OPF_FOLD_EA) has_ea = 1; - } - if (addr_xform_pregs_aux_uses(in, p)) return 0; - } - } - if (out_has_ea) *out_has_ea = has_ea; - return has_foldable_use; -} - -void opt_addr_xform_pregs(Func* f) { - if (!f || f->opt_reg_ssa || f->opt_rewritten) return; - int changed = 0; - for (u32 b = 0; b < f->nblocks; ++b) { - Block* bl = &f->blocks[b]; - for (u32 i = 0; i < bl->ninsts; ++i) { - Inst* in = &bl->insts[i]; - if ((IROp)in->op != IR_ADDR_OF) continue; - if (in->nopnds < 2) continue; - if (in->opnds[0].kind != OPK_REG) continue; - if (in->opnds[1].kind != OPK_LOCAL) continue; - PReg p = (PReg)in->opnds[0].v.reg; - if (!opt_reg_valid(f, p)) continue; - int has_ea = 0; - if (!addr_xform_pregs_classify(f, p, in, &has_ea)) continue; - Operand local = in->opnds[1]; - /* Fold every zero-EA use of p to OPK_LOCAL. EA-shaped uses are left - * as OPK_INDIRECT(base=p, ofs, index, log2_scale) so the EA stays on - * the load/store; the IR_ADDR_OF def must survive to feed them. 
*/ - for (u32 bb = 0; bb < f->nblocks; ++bb) { - Block* rb = &f->blocks[bb]; - for (u32 ii = 0; ii < rb->ninsts; ++ii) { - Inst* use = &rb->insts[ii]; - if (use == in) continue; - for (u32 o = 0; o < use->nopnds; ++o) { - Operand* op = &use->opnds[o]; - if (op->kind != OPK_INDIRECT) continue; - if ((PReg)op->v.ind.base != p) continue; - if (op->v.ind.ofs != 0 || op->v.ind.index != (Reg)REG_NONE) - continue; /* EA-shaped; leave alone */ - Operand folded = local; - folded.type = - use->extra.mem.type ? use->extra.mem.type : local.type; - *op = folded; - } - } - } - if (!has_ea) addr_inst_remove(in); - changed = 1; - } - } - /* After folding, walk all frame slots and clear FSF_ADDR_TAKEN on any - * slot whose surviving IR_ADDR_OF defs (if any) have all been retired. - * The frontend-set ADDR_TAKEN flag is conservative; if we proved the - * address no longer escapes, downstream passes (opt_promote_scalar_locals) - * can take advantage of the actual non-escape state. */ - if (changed) { - u8* still_taken = - arena_zarray(f->arena, u8, f->nframe_slots ? f->nframe_slots : 1u); - for (u32 b = 0; b < f->nblocks; ++b) { - Block* bl = &f->blocks[b]; - for (u32 i = 0; i < bl->ninsts; ++i) { - Inst* in = &bl->insts[i]; - if ((IROp)in->op != IR_ADDR_OF) continue; - if (in->nopnds < 2 || in->opnds[1].kind != OPK_LOCAL) continue; - FrameSlot slot = in->opnds[1].v.frame_slot; - if (slot && slot <= f->nframe_slots) still_taken[slot - 1u] = 1; - } - } - for (u32 s = 0; s < f->nframe_slots; ++s) { - if (!still_taken[s]) f->frame_slots[s].flags &= (u16)~FSF_ADDR_TAKEN; - } - } - if (changed) - opt_analysis_invalidate( - f, OPT_ANALYSIS_DEF_USE | OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP); -} - -/* Scalar local promotion for the O1 pipeline. Runs after - * `opt_addr_xform_pregs` has folded zero-EA `OPK_INDIRECT(p)` uses to - * `OPK_LOCAL(slot)` and retired non-escaping `IR_ADDR_OF` defs. 
For each - * frame slot that is now only referenced as the base of matching-type, - * non-observable `IR_LOAD`/`IR_STORE`, the slot is replaced by a fresh - * mutable PReg: each store becomes `IR_COPY P_slot, src` (or `IR_LOAD_IMM` - * for an immediate source), each load becomes `IR_COPY dst, P_slot`. The - * slot becomes unreferenced and the backend drops it from the frame. - * - * A mutable PReg in `-O1` IR has the same data-flow semantics as a named - * memory cell that does not escape (multiple defs, multiple uses, value at - * a use comes from whichever def reaches it via CFG edges). No phis are - * required because the IR model has no phis; PReg flow becomes hard-reg - * flow after regalloc, and regalloc already handles it. - * - * Conditions for promotion (per slot): - * - * 1. Slot kind is FS_LOCAL (real locals, not spills, sret, alloca). - * 2. Slot has no FSF_ADDR_TAKEN, FSF_VOLATILE flag (after - * `opt_addr_xform_pregs` has cleared the conservative ADDR_TAKEN - * flag for slots whose IR_ADDR_OF defs were all retired). - * 3. Slot's declared type is scalar (int, float, bool, ptr, enum). - * 4. Every appearance of `OPK_LOCAL(slot)` in any instruction operand is - * either: - * - `IR_LOAD.opnds[1]` with matching `access.type == slot.type`, - * no observable mem flags, dst is OPK_REG; - * - `IR_STORE.opnds[0]` with matching `access.type == slot.type`, - * no observable mem flags, src is OPK_REG or OPK_IMM. - * 5. Slot does not appear in any aux operand position (calls, asm, etc.) - * or as an OPK_LOCAL anywhere else (e.g., a surviving IR_ADDR_OF). - * - * Param-slot case: FS_PARAM slots are excluded. The backend prologue is - * responsible for moving the ABI-incoming hard reg into the slot, and that - * move is not visible in the IR (there is no `IR_STORE OPK_LOCAL(slot)` to - * rewrite). 
At O1 the wrapper already places scalar params in REG storage - * when the frontend does not force a memory home, so the param's value - * arrives in a PReg without needing this pass. If a future scheme records - * the entry-move as a synthetic IR_STORE OPK_LOCAL(slot), this pass would - * promote it the same way it promotes any other store-to-slot. */ - -static int promote_local_type_is_scalar(Func* f, CfreeCgTypeId ty) { - if (!ty) return 0; - CfreeCgTypeKind kind = cfree_cg_type_kind((CfreeCompiler*)f->c, ty); - switch (kind) { - case CFREE_CG_TYPE_BOOL: - case CFREE_CG_TYPE_INT: - case CFREE_CG_TYPE_FLOAT: - case CFREE_CG_TYPE_PTR: - case CFREE_CG_TYPE_ENUM: - return 1; - default: - return 0; - } -} - -static int promote_op_uses_slot(const Operand* op, FrameSlot slot) { - return op && op->kind == OPK_LOCAL && op->v.frame_slot == slot; -} - -static int promote_abivalue_uses_slot(const CGABIValue* v, FrameSlot slot) { - if (!v) return 0; - if (promote_op_uses_slot(&v->storage, slot)) return 1; - for (u32 i = 0; i < v->nparts; ++i) - if (promote_op_uses_slot((const Operand*)&v->parts[i].op, slot)) return 1; - return 0; -} - -static int promote_aux_uses_slot(const Inst* in, FrameSlot slot) { - switch ((IROp)in->op) { - case IR_CALL: { - IRCallAux* aux = (IRCallAux*)in->extra.aux; - if (!aux) return 0; - if (aux->use_plan_replay) { - if (promote_op_uses_slot(&aux->plan.callee, slot)) return 1; - for (u32 i = 0; i < aux->plan.nargs; ++i) - if (promote_op_uses_slot(&aux->plan.args[i].src, slot)) return 1; - for (u32 i = 0; i < aux->plan.nrets; ++i) - if (promote_op_uses_slot(&aux->plan.rets[i].dst, slot)) return 1; - } else { - if (promote_op_uses_slot(&aux->desc.callee, slot)) return 1; - for (u32 i = 0; i < aux->desc.nargs; ++i) - if (promote_abivalue_uses_slot((const CGABIValue*)&aux->desc.args[i], - slot)) - return 1; - if (promote_abivalue_uses_slot(&aux->desc.ret, slot)) return 1; - } - return 0; - } - case IR_RET: { - IRRetAux* aux = (IRRetAux*)in->extra.aux; - 
if (!aux || !aux->present) return 0; - return promote_abivalue_uses_slot(&aux->val, slot); - } - case IR_SCOPE_BEGIN: { - IRScopeAux* aux = (IRScopeAux*)in->extra.aux; - if (!aux) return 0; - return promote_op_uses_slot(&aux->desc.cond, slot); - } - case IR_ASM_BLOCK: { - IRAsmAux* aux = (IRAsmAux*)in->extra.aux; - if (!aux) return 0; - for (u32 i = 0; i < aux->nin; ++i) - if (promote_op_uses_slot(&aux->in_ops[i], slot)) return 1; - for (u32 i = 0; i < aux->nout; ++i) - if (promote_op_uses_slot(&aux->out_ops[i], slot)) return 1; - return 0; - } - case IR_INTRINSIC: { - IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; - if (!aux) return 0; - for (u32 i = 0; i < aux->narg; ++i) - if (promote_op_uses_slot(&aux->args[i], slot)) return 1; - for (u32 i = 0; i < aux->ndst; ++i) - if (promote_op_uses_slot(&aux->dsts[i], slot)) return 1; - return 0; - } - default: - return 0; - } -} - -/* Per-inst check. Returns: - * 1 = "instruction touches slot in a promotable position" (load/store base). - * 0 = "instruction does not touch slot at all". - * -1 = "instruction touches slot in a non-promotable way" (e.g., wrong - * operand position, type mismatch, observable flags, aux use). */ -static int promote_inst_classify(const Inst* in, FrameSlot slot, - CfreeCgTypeId slot_ty) { - int touched = 0; - /* IR_LOAD: opnds[0]=dst REG, opnds[1]=addr (allowed: OPK_LOCAL slot). */ - if ((IROp)in->op == IR_LOAD) { - if (in->nopnds >= 2 && promote_op_uses_slot(&in->opnds[1], slot)) { - if (opt_mem_observable(&in->extra.mem)) return -1; - if (in->opnds[0].kind != OPK_REG) return -1; - CfreeCgTypeId at = in->extra.mem.type; - if (at && at != slot_ty) return -1; - touched = 1; - } - /* opnds[0] is the dst REG — never OPK_LOCAL by construction. 
*/ - if (in->nopnds >= 1 && promote_op_uses_slot(&in->opnds[0], slot)) return -1; - } else if ((IROp)in->op == IR_STORE) { - if (in->nopnds >= 1 && promote_op_uses_slot(&in->opnds[0], slot)) { - if (opt_mem_observable(&in->extra.mem)) return -1; - if (in->nopnds < 2) return -1; - Operand* src = &in->opnds[1]; - if (src->kind != OPK_REG && src->kind != OPK_IMM) return -1; - CfreeCgTypeId at = in->extra.mem.type; - if (at && at != slot_ty) return -1; - touched = 1; - } - /* opnds[1] is the src value — should never be OPK_LOCAL for a scalar. */ - if (in->nopnds >= 2 && promote_op_uses_slot(&in->opnds[1], slot)) return -1; - } else { - /* Any other instruction with an OPK_LOCAL(slot) operand blocks promotion. - */ - for (u32 o = 0; o < in->nopnds; ++o) - if (promote_op_uses_slot(&in->opnds[o], slot)) return -1; - } - if (promote_aux_uses_slot(in, slot)) return -1; - return touched; -} - -/* Rewrite an `IR_STORE OPK_LOCAL(slot), src` into a PReg def. If src is - * OPK_IMM, emit IR_LOAD_IMM into preg; otherwise emit IR_COPY. */ -static void promote_rewrite_store(Func* f, Inst* in, PReg preg, - CfreeCgTypeId ty, u8 cls) { - Operand src = in->opnds[1]; - Operand* opnds = arena_array(f->arena, Operand, 2); - memset(&opnds[0], 0, sizeof opnds[0]); - opnds[0].kind = OPK_REG; - opnds[0].type = ty; - opnds[0].cls = cls; - opnds[0].v.reg = (Reg)preg; - in->type = ty; - in->def = (Val)preg; - if (src.kind == OPK_IMM) { - in->op = IR_LOAD_IMM; - in->nopnds = 1; - in->opnds = opnds; - in->extra.imm = src.v.imm; - } else { - opnds[1] = src; - opnds[1].type = ty; - opnds[1].cls = cls; - in->op = IR_COPY; - in->nopnds = 2; - in->opnds = opnds; - memset(&in->extra, 0, sizeof in->extra); - } -} - -/* Rewrite an `IR_LOAD dst, OPK_LOCAL(slot)` into `IR_COPY dst, preg`. 
*/ -static void promote_rewrite_load(Func* f, Inst* in, PReg preg, CfreeCgTypeId ty, - u8 cls) { - Operand dst = in->opnds[0]; - Operand* opnds = arena_array(f->arena, Operand, 2); - opnds[0] = dst; - opnds[0].type = ty; - opnds[0].cls = cls; - memset(&opnds[1], 0, sizeof opnds[1]); - opnds[1].kind = OPK_REG; - opnds[1].type = ty; - opnds[1].cls = cls; - opnds[1].v.reg = (Reg)preg; - in->op = IR_COPY; - in->type = ty; - in->nopnds = 2; - in->opnds = opnds; - memset(&in->extra, 0, sizeof in->extra); -} - -void opt_promote_scalar_locals(Func* f) { - if (!f || f->opt_reg_ssa || f->opt_rewritten) return; - if (!f->nframe_slots) return; - int changed = 0; - for (u32 sidx = 0; sidx < f->nframe_slots; ++sidx) { - IRFrameSlot* slot = &f->frame_slots[sidx]; - FrameSlot id = slot->id; - /* FS_PARAM slots are owned by the backend prologue (which copies the - * ABI-incoming hard reg into the slot before any user IR runs); there - * is no IR-level store to rewrite. At O1, the wrapper already places - * scalar params in REG storage when the frontend does not force a - * memory home, so the FS_PARAM promotion path is normally a no-op. - * Only promote FS_LOCAL slots. */ - if (slot->kind != FS_LOCAL) continue; - if (slot->flags & (FSF_ADDR_TAKEN | FSF_VOLATILE)) continue; - if (!promote_local_type_is_scalar(f, slot->type)) continue; - int touched_count = 0; - int rejected = 0; - for (u32 b = 0; b < f->nblocks && !rejected; ++b) { - Block* bl = &f->blocks[b]; - for (u32 i = 0; i < bl->ninsts; ++i) { - Inst* in = &bl->insts[i]; - int r = promote_inst_classify(in, id, slot->type); - if (r < 0) { - rejected = 1; - break; - } - touched_count += r; - } - } - if (rejected || !touched_count) continue; - u8 cls = (cfree_cg_type_kind((CfreeCompiler*)f->c, slot->type) == - CFREE_CG_TYPE_FLOAT) - ? 
RC_FP - : RC_INT; - PReg preg = ir_alloc_preg(f, slot->type, cls); - for (u32 b = 0; b < f->nblocks; ++b) { - Block* bl = &f->blocks[b]; - for (u32 i = 0; i < bl->ninsts; ++i) { - Inst* in = &bl->insts[i]; - if ((IROp)in->op == IR_LOAD && in->nopnds >= 2 && - promote_op_uses_slot(&in->opnds[1], id)) { - promote_rewrite_load(f, in, preg, slot->type, cls); - } else if ((IROp)in->op == IR_STORE && in->nopnds >= 2 && - promote_op_uses_slot(&in->opnds[0], id)) { - promote_rewrite_store(f, in, preg, slot->type, cls); - } - } - } - /* The frame slot is now unreferenced. Leave the slot table entry in - * place (compaction would require remapping every other slot id); - * the backend's frame layout pass simply omits unreferenced slots. */ - changed = 1; - } - if (changed) - opt_analysis_invalidate( - f, OPT_ANALYSIS_DEF_USE | OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP); -} - -/* CSE-style hoist of `IR_ADDR_OF(OPK_GLOBAL{sym, addend})` defs that appear - * more than once in the same function. The address is a link-time constant - * (TLS and IFUNC live on separate IROps), so all occurrences compute the - * same value; consolidating to a single entry-block def shrinks each loop - * body by the per-iter `adrp`/`add` pair the backend would otherwise re-emit. - * - * Implementation: - * - Walk all insts, group ADDR_OF defs by (sym, addend). - * - For each key with >= 2 defs: allocate a fresh PReg, materialize one - * IR_ADDR_OF in block 0 (after any IR_PARAM_DECL prologue), build a - * preg-remap from each original def-PReg to the new PReg, and NOP each - * original def. - * - One IR walk applies the remap to every operand `v.reg` / - * `v.ind.base`. - * - * Runs after opt_addr_xform_pregs so local addr-of has already been folded - * out; the remaining IR_ADDR_OF defs are global. 
*/ - -typedef struct AddrCseEntry { - ObjSymId sym; - i64 addend; - PReg canonical; /* freshly allocated PReg, def in block 0 */ - CfreeCgTypeId addr_type; /* operand[0].type from the first def */ - u8 cls; /* operand[0].cls from the first def */ - u32 count; /* number of original ADDR_OF defs seen */ -} AddrCseEntry; - -static u32 addr_cse_find_or_add(AddrCseEntry** entries, u32* n, u32* cap, - Arena* arena, ObjSymId sym, i64 addend) { - for (u32 i = 0; i < *n; ++i) { - if ((*entries)[i].sym == sym && (*entries)[i].addend == addend) return i; - } - if (*n == *cap) { - u32 ncap = *cap ? *cap * 2u : 16u; - AddrCseEntry* nv = arena_array(arena, AddrCseEntry, ncap); - if (*entries) memcpy(nv, *entries, sizeof(AddrCseEntry) * (*n)); - *entries = nv; - *cap = ncap; - } - u32 idx = (*n)++; - AddrCseEntry* e = &(*entries)[idx]; - memset(e, 0, sizeof *e); - e->sym = sym; - e->addend = addend; - e->canonical = PREG_NONE; - e->count = 0; - return idx; -} - -static void addr_cse_apply_to_operand(Operand* op, const PReg* remap) { - /* remap is zero-initialized; 0 means "no remap" (preg 0 is reserved as - * unused). PREG_NONE = 0xffffffff and would be a valid remap target but - * we never produce that. */ - if (!op) return; - if (op->kind == OPK_REG) { - PReg p = (PReg)op->v.reg; - if (p != PREG_NONE && p != 0 && remap[p] != 0) op->v.reg = remap[p]; - } else if (op->kind == OPK_INDIRECT) { - PReg p = (PReg)op->v.ind.base; - if (p != PREG_NONE && p != 0 && remap[p] != 0) op->v.ind.base = remap[p]; - if (op->v.ind.index != (Reg)REG_NONE) { - PReg pi = (PReg)op->v.ind.index; - if (pi != PREG_NONE && pi != 0 && remap[pi] != 0) - op->v.ind.index = remap[pi]; - } - } -} - -static void addr_cse_apply_to_inst(Inst* in, const PReg* remap) { - for (u32 o = 0; o < in->nopnds; ++o) - addr_cse_apply_to_operand(&in->opnds[o], remap); - /* IR_CALL aux carries operands too; rewrite both replay variants. 
*/ - if ((IROp)in->op == IR_CALL) { - IRCallAux* aux = (IRCallAux*)in->extra.aux; - if (!aux) return; - if (aux->use_plan_replay) { - addr_cse_apply_to_operand(&aux->plan.callee, remap); - for (u32 i = 0; i < aux->plan.nargs; ++i) - addr_cse_apply_to_operand(&aux->plan.args[i].src, remap); - for (u32 i = 0; i < aux->plan.nrets; ++i) - addr_cse_apply_to_operand(&aux->plan.rets[i].dst, remap); - } else { - addr_cse_apply_to_operand(&aux->desc.callee, remap); - for (u32 i = 0; i < aux->desc.nargs; ++i) { - CGABIValue* v = (CGABIValue*)&aux->desc.args[i]; - addr_cse_apply_to_operand(&v->storage, remap); - for (u32 k = 0; k < v->nparts; ++k) - addr_cse_apply_to_operand((Operand*)&v->parts[k].op, remap); - } - addr_cse_apply_to_operand(&aux->desc.ret.storage, remap); - for (u32 k = 0; k < aux->desc.ret.nparts; ++k) - addr_cse_apply_to_operand((Operand*)&aux->desc.ret.parts[k].op, remap); - } - } else if ((IROp)in->op == IR_RET) { - IRRetAux* aux = (IRRetAux*)in->extra.aux; - if (aux && aux->present) { - addr_cse_apply_to_operand(&aux->val.storage, remap); - for (u32 k = 0; k < aux->val.nparts; ++k) - addr_cse_apply_to_operand((Operand*)&aux->val.parts[k].op, remap); - } - } else if ((IROp)in->op == IR_ASM_BLOCK) { - IRAsmAux* aux = (IRAsmAux*)in->extra.aux; - if (!aux) return; - for (u32 i = 0; i < aux->nin; ++i) - addr_cse_apply_to_operand(&aux->in_ops[i], remap); - for (u32 i = 0; i < aux->nout; ++i) - addr_cse_apply_to_operand(&aux->out_ops[i], remap); - } else if ((IROp)in->op == IR_INTRINSIC) { - IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; - if (!aux) return; - for (u32 i = 0; i < aux->narg; ++i) - addr_cse_apply_to_operand(&aux->args[i], remap); - for (u32 i = 0; i < aux->ndst; ++i) - addr_cse_apply_to_operand(&aux->dsts[i], remap); - } -} - -static Inst* block_insert_at(Func* f, Block* bl, u32 at, u32 k) { - if (at > bl->ninsts) at = bl->ninsts; - if (bl->ninsts + k > bl->cap) { - u32 ncap = bl->cap ? 
bl->cap : 8u; - while (ncap < bl->ninsts + k) ncap *= 2u; - Inst* nb = arena_zarray(f->arena, Inst, ncap); - if (bl->insts && at) memcpy(nb, bl->insts, sizeof(Inst) * at); - if (bl->insts && bl->ninsts > at) - memcpy(nb + at + k, bl->insts + at, sizeof(Inst) * (bl->ninsts - at)); - bl->insts = nb; - bl->cap = ncap; - } else { - if (bl->ninsts > at) - memmove(bl->insts + at + k, bl->insts + at, - sizeof(Inst) * (bl->ninsts - at)); - } - for (u32 i = 0; i < k; ++i) memset(&bl->insts[at + i], 0, sizeof(Inst)); - bl->ninsts += k; - return &bl->insts[at]; -} - -void opt_addr_of_global_cse(Func* f) { - if (!f || f->opt_reg_ssa || f->opt_rewritten) return; - if (f->nblocks == 0) return; - - /* Pass 1: index ADDR_OF(global) defs by (sym, addend). */ - AddrCseEntry* entries = NULL; - u32 n_entries = 0; - u32 cap_entries = 0; - for (u32 b = 0; b < f->nblocks; ++b) { - Block* bl = &f->blocks[b]; - for (u32 i = 0; i < bl->ninsts; ++i) { - Inst* in = &bl->insts[i]; - if ((IROp)in->op != IR_ADDR_OF) continue; - if (in->nopnds < 2) continue; - if (in->opnds[0].kind != OPK_REG) continue; - if (in->opnds[1].kind != OPK_GLOBAL) continue; - u32 idx = addr_cse_find_or_add(&entries, &n_entries, &cap_entries, - f->arena, in->opnds[1].v.global.sym, - in->opnds[1].v.global.addend); - AddrCseEntry* e = &entries[idx]; - if (e->count == 0) { - e->addr_type = in->opnds[0].type; - e->cls = in->opnds[0].cls; - } - ++e->count; - } - } - if (!n_entries) return; - - /* Pass 2: for each duplicate key, allocate a canonical PReg. */ - u32 dup_count = 0; - for (u32 i = 0; i < n_entries; ++i) { - if (entries[i].count >= 2) { - entries[i].canonical = - ir_alloc_preg(f, entries[i].addr_type, entries[i].cls); - ++dup_count; - } - } - if (!dup_count) return; - - /* Pass 3: walk again, build per-old-PReg remap and NOP duplicate defs. 
*/ - PReg* remap = arena_zarray(f->arena, PReg, opt_reg_count(f)); - for (u32 b = 0; b < f->nblocks; ++b) { - Block* bl = &f->blocks[b]; - for (u32 i = 0; i < bl->ninsts; ++i) { - Inst* in = &bl->insts[i]; - if ((IROp)in->op != IR_ADDR_OF) continue; - if (in->nopnds < 2) continue; - if (in->opnds[0].kind != OPK_REG) continue; - if (in->opnds[1].kind != OPK_GLOBAL) continue; - u32 idx = addr_cse_find_or_add(&entries, &n_entries, &cap_entries, - f->arena, in->opnds[1].v.global.sym, - in->opnds[1].v.global.addend); - if (entries[idx].canonical == PREG_NONE) continue; /* singleton */ - PReg old = (PReg)in->opnds[0].v.reg; - if (opt_reg_valid(f, old)) remap[old] = entries[idx].canonical; - /* NOP the original def. */ - in->op = IR_NOP; - in->def = VAL_NONE; - in->ndefs = 0; - in->defs = NULL; - in->nopnds = 0; - in->opnds = NULL; - } - } - - /* Pass 4: hoist a single ADDR_OF for each duplicated key to the entry - * block, inserted after any leading IR_PARAM_DECL instructions. */ - if (f->entry >= f->nblocks) return; - Block* entry = &f->blocks[f->entry]; - u32 insert_at = 0; - while (insert_at < entry->ninsts && - (IROp)entry->insts[insert_at].op == IR_PARAM_DECL) - ++insert_at; - Inst* slot = block_insert_at(f, entry, insert_at, dup_count); - u32 w = 0; - for (u32 i = 0; i < n_entries; ++i) { - if (entries[i].canonical == PREG_NONE) continue; - Inst* in = &slot[w++]; - in->op = (u16)IR_ADDR_OF; - in->def = (Val)entries[i].canonical; - in->type = entries[i].addr_type; - in->nopnds = 2; - in->opnds = arena_array(f->arena, Operand, 2); - memset(&in->opnds[0], 0, sizeof(Operand)); - in->opnds[0].kind = OPK_REG; - in->opnds[0].cls = entries[i].cls; - in->opnds[0].type = entries[i].addr_type; - in->opnds[0].v.reg = entries[i].canonical; - memset(&in->opnds[1], 0, sizeof(Operand)); - in->opnds[1].kind = OPK_GLOBAL; - in->opnds[1].cls = entries[i].cls; - in->opnds[1].type = entries[i].addr_type; - in->opnds[1].v.global.sym = entries[i].sym; - in->opnds[1].v.global.addend = 
entries[i].addend; - ir_assign_inst_id(f, in); - } - - /* Pass 5: apply remap to all operand uses in the function. */ - for (u32 b = 0; b < f->nblocks; ++b) { - Block* bl = &f->blocks[b]; - for (u32 i = 0; i < bl->ninsts; ++i) { - addr_cse_apply_to_inst(&bl->insts[i], remap); - } - } - - opt_analysis_invalidate( - f, OPT_ANALYSIS_DEF_USE | OPT_ANALYSIS_DOM | OPT_ANALYSIS_LOOP); -} - static u64 gvn_width_mask(u32 width) { if (width >= 64u) return ~0ull; return (1ull << width) - 1ull; diff --git a/test/opt/cg_ir_lower_test.c b/test/opt/cg_ir_lower_test.c @@ -0,0 +1,199 @@ +#include <cfree/core.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "cg/ir.h" +#include "opt/opt.h" + +#undef Operand +#undef CGFuncDesc +#undef CGParamDesc +#undef CGCallDesc +#undef CGLocalStorage + +static void* h_alloc(CfreeHeap* h, size_t n, size_t a) { + (void)h; + (void)a; + return n ? malloc(n) : NULL; +} + +static void* h_realloc(CfreeHeap* h, void* p, size_t o, size_t n, size_t a) { + (void)h; + (void)o; + (void)a; + return realloc(p, n); +} + +static void h_free(CfreeHeap* h, void* p, size_t n) { + (void)h; + (void)n; + free(p); +} + +static CfreeHeap g_heap = {h_alloc, h_realloc, h_free, NULL}; +static int g_fails; +static int g_checks; + +static void diag_emit(CfreeDiagSink* s, CfreeDiagKind k, CfreeSrcLoc loc, + const char* fmt, va_list ap) { + static const char* names[] = {"note", "warning", "error", "fatal"}; + (void)s; + (void)loc; + fprintf(stderr, "%s: ", names[k]); + vfprintf(stderr, fmt, ap); + fputc('\n', stderr); +} + +static CfreeDiagSink g_diag = {diag_emit, NULL, 0, 0}; + +#define EXPECT(cond, ...) 
\ + do { \ + ++g_checks; \ + if (!(cond)) { \ + ++g_fails; \ + fprintf(stderr, "FAIL %s:%d: ", __FILE__, __LINE__); \ + fprintf(stderr, __VA_ARGS__); \ + fputc('\n', stderr); \ + } \ + } while (0) + +typedef struct TestCtx { + CfreeContext ctx; + Compiler* c; + CfreeCgTypeId i32; +} TestCtx; + +static void tc_init(TestCtx* tc) { + CfreeTarget target; + CfreeCgBuiltinTypes b; + memset(tc, 0, sizeof *tc); + tc->ctx.heap = &g_heap; + tc->ctx.diag = &g_diag; + tc->ctx.now = -1; + memset(&target, 0, sizeof target); + target.arch = CFREE_ARCH_ARM_64; + target.os = CFREE_OS_MACOS; + target.obj = CFREE_OBJ_MACHO; + target.ptr_size = 8; + target.ptr_align = 8; + if (cfree_compiler_new(target, &tc->ctx, (CfreeCompiler**)&tc->c) != + CFREE_OK || + !tc->c) { + fprintf(stderr, "fatal: compiler allocation failed\n"); + abort(); + } + b = cfree_cg_builtin_types(tc->c); + tc->i32 = b.id[CFREE_CG_BUILTIN_I32]; +} + +static void tc_fini(TestCtx* tc) { + cfree_compiler_free(tc->c); + tc->c = NULL; +} + +static Operand local_op(CGLocal local, CfreeCgTypeId type) { + Operand o; + memset(&o, 0, sizeof o); + o.kind = OPK_LOCAL; + o.type = type; + o.v.local = local; + return o; +} + +static Operand imm_op(i64 value, CfreeCgTypeId type) { + Operand o; + memset(&o, 0, sizeof o); + o.kind = OPK_IMM; + o.type = type; + o.v.imm = value; + return o; +} + +static CGLocal add_local(CgIrFunc* f, CfreeCgTypeId type, const char* name) { + CGLocalDesc d; + (void)name; + memset(&d, 0, sizeof d); + d.type = type; + d.size = 4; + d.align = 4; + return cg_ir_func_add_local(f, &d, 0, 0); +} + +static CgIrInst* emit_ops(CgIrFunc* f, CgIrOp op, const Operand* ops, u32 n) { + CgIrInst* in = cg_ir_emit(f, op, (SrcLoc){0, 0, 0}); + in->opnds = cg_ir_dup_operands(f->arena, ops, n); + in->nopnds = n; + return in; +} + +static void converter_builds_cfg_and_pregs(void) { + TestCtx tc; + tc_init(&tc); + + CGFuncDesc fd; + memset(&fd, 0, sizeof fd); + CfreeCgTypeId result_types[1]; + result_types[0] = tc.i32; + 
fd.fn_type = tc.i32; + fd.result_types = result_types; + fd.nresults = 1; + CgIrFunc* cg = cg_ir_func_new(tc.c, &fd); + CGLocal a = add_local(cg, tc.i32, "a"); + CGLocal b = add_local(cg, tc.i32, "b"); + Label done = cg_ir_func_add_label(cg); + + Operand one[] = {local_op(a, tc.i32)}; + CgIrInst* li = emit_ops(cg, CG_IR_LOAD_IMM, one, 1); + li->extra.imm = 1; + + Operand cmp[] = {local_op(a, tc.i32), imm_op(0, tc.i32)}; + CgIrInst* br = emit_ops(cg, CG_IR_CMP_BRANCH, cmp, 2); + CgIrCmpBranchAux* br_aux = arena_znew(cg->arena, CgIrCmpBranchAux); + br_aux->op = CMP_NE; + br_aux->target = done; + br->extra.aux = br_aux; + + Operand two[] = {local_op(b, tc.i32)}; + CgIrInst* li2 = emit_ops(cg, CG_IR_LOAD_IMM, two, 1); + li2->extra.imm = 2; + + CgIrInst* label = cg_ir_emit(cg, CG_IR_LABEL, (SrcLoc){0, 0, 0}); + label->extra.imm = (i64)done; + cg_ir_func_note_label_place(cg, done, (SrcLoc){0, 0, 0}); + + CgIrInst* li3 = emit_ops(cg, CG_IR_LOAD_IMM, two, 1); + li3->extra.imm = 3; + + CgIrRetAux* ret_aux = arena_znew(cg->arena, CgIrRetAux); + CGLocal retv = b; + ret_aux->values = cg_ir_dup_locals(cg->arena, &retv, 1); + ret_aux->nvalues = 1; + CgIrInst* ret = cg_ir_emit(cg, CG_IR_RET, (SrcLoc){0, 0, 0}); + ret->extra.aux = ret_aux; + + Func* f = opt_func_from_cg_ir(tc.c, cg); + EXPECT(f != NULL, "converter returned NULL"); + EXPECT(f->nlocals == 2, "expected 2 locals, got %u", f->nlocals); + EXPECT(f->npregs == 3, "expected two PRegs plus sentinel, got %u", f->npregs); + EXPECT(f->nblocks >= 3, "expected at least 3 blocks, got %u", f->nblocks); + EXPECT(f->blocks[f->entry].nsucc == 2, "entry should branch two ways"); + EXPECT(f->blocks[f->entry].ninsts == 2, "entry should contain load+branch"); + EXPECT(f->blocks[f->entry].insts[0].op == IR_LOAD_IMM, + "first inst should be IR_LOAD_IMM"); + EXPECT(f->blocks[f->entry].insts[0].opnds[0].kind == OPK_REG, + "local value should lower to PReg operand"); + + tc_fini(&tc); +} + +int main(void) { + converter_builds_cfg_and_pregs(); 
+ if (g_fails) { + fprintf(stderr, "cg-ir-lower: %d/%d failed\n", g_fails, g_checks); + return 1; + } + printf("cg-ir-lower: %d checks, 0 failures\n", g_checks); + return 0; +}