kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 53115ba9e2fd1150957b4cbc870ded5295b6682d
parent 3ebd1a10457514d71a637b181416abd8e998a936
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Wed, 27 May 2026 06:19:58 -0700

opt: route inline asm through optimizer; delete direct-replay path

Add an IR_ASM_BLOCK case to pass_native_emit that binds the optimizer's
pre-allocated operand registers to the asm template via a new aa64
NativeTarget asm_block hook. The hook reuses the direct path's clobber-mask
and callee-save/restore helpers (refactored off NativeDirectTarget onto
AANativeTarget) but does not self-allocate: inputs are already live in their
registers and outputs are consumed through the normal use/def data flow, so
it only binds registers and materializes memory-operand bases into scratch.

With asm landed, every function now compiles through the optimizer, so the
direct-replay fallback (opt_func_needs_direct_replay, opt_replay_cg_ir_direct,
the OptReplay machinery and replay_* helpers) and the CFREE_NO_DIRECT_REPLAY
env gate are removed.

Toy R-path green at O1 (and default O0+O1); full toy suite 1333 pass / 0 fail.

Diffstat:
M src/arch/aa64/native.c | 184 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
M src/opt/opt.c | 415 -------------------------------------------------------------------------------
M src/opt/pass_native_emit.c | 24 +++++++++++++++++++++++-
3 files changed, 185 insertions(+), 438 deletions(-)

diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c @@ -2573,6 +2573,11 @@ static void aa_va_arg_native(NativeTarget* t, NativeLoc dst, NativeLoc ap_ptr, static void aa_va_end_native(NativeTarget* t, NativeLoc ap_ptr); static void aa_va_copy_native(NativeTarget* t, NativeLoc dst_ap_ptr, NativeLoc src_ap_ptr); +static void aa_asm_block_native(NativeTarget* t, const char* tmpl, + const AsmConstraint* outs, u32 nout, + NativeLoc* out_locs, const AsmConstraint* ins, + u32 nin, const NativeLoc* in_locs, + const Sym* clobbers, u32 nclob); NativeTarget* aa64_native_target_new(Compiler* c, ObjBuilder* obj, MCEmitter* mc) { @@ -2630,6 +2635,7 @@ NativeTarget* aa64_native_target_new(Compiler* c, ObjBuilder* obj, t->va_end_ = aa_va_end_native; t->va_copy_ = aa_va_copy_native; t->intrinsic = aa_intrinsic; + t->asm_block = aa_asm_block_native; t->file_scope_asm = aa_file_scope_asm; t->trap = aa_trap; t->set_loc = aa_set_loc; @@ -3065,8 +3071,13 @@ AA_UNUSED_FN static int aa_asm_match_index(const char* s) { return n; } +_Noreturn static void aa_asm_panic_at(Compiler* c, SrcLoc loc, + const char* msg) { + compiler_panic(c, loc, "aarch64 inline asm: %s", msg); +} + _Noreturn static void aa_asm_panic(NativeDirectTarget* d, const char* msg) { - compiler_panic(d->base.c, d->loc, "aarch64 inline asm: %s", msg); + aa_asm_panic_at(d->base.c, d->loc, msg); } AA_UNUSED_FN static void aa_asm_bound_reg(Operand* out, CfreeCgTypeId type, @@ -3088,19 +3099,19 @@ AA_UNUSED_FN static void aa_asm_bound_mem(Operand* out, CfreeCgTypeId type, out->v.ind.index = CG_LOCAL_NONE; } -static int aa_asm_parse_reg_clobber(NativeDirectTarget* d, Sym name, +static int aa_asm_parse_reg_clobber(Compiler* c, SrcLoc loc, Sym name, NativeAllocClass* cls_out, Reg* reg_out) { - Slice s = pool_slice(d->base.c->global, name); + Slice s = pool_slice(c->global, name); char buf[16]; uint32_t dwarf; if (!s.s || !s.len) return 0; if (s.len == 2 && s.s[0] == 'c' && s.s[1] == 'c') return 0; if (s.len == 6 && 
memcmp(s.s, "memory", 6) == 0) return 0; - if (s.len >= sizeof buf) aa_asm_panic(d, "clobber name is too long"); + if (s.len >= sizeof buf) aa_asm_panic_at(c, loc, "clobber name is too long"); memcpy(buf, s.s, s.len); buf[s.len] = '\0'; if (aa64_register_index(buf, &dwarf) != 0) - aa_asm_panic(d, "unknown clobber register"); + aa_asm_panic_at(c, loc, "unknown clobber register"); if (dwarf <= 30u) { *cls_out = NATIVE_REG_INT; *reg_out = (Reg)dwarf; @@ -3111,11 +3122,11 @@ static int aa_asm_parse_reg_clobber(NativeDirectTarget* d, Sym name, *reg_out = (Reg)(dwarf - 64u); return 1; } - aa_asm_panic(d, "unsupported clobber register"); + aa_asm_panic_at(c, loc, "unsupported clobber register"); return 0; } -AA_UNUSED_FN static void aa_asm_clobber_masks(NativeDirectTarget* d, +AA_UNUSED_FN static void aa_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers, u32 nclob, u32* int_mask, u32* fp_mask) { *int_mask = 0; @@ -3123,7 +3134,7 @@ AA_UNUSED_FN static void aa_asm_clobber_masks(NativeDirectTarget* d, for (u32 i = 0; i < nclob; ++i) { NativeAllocClass cls; Reg reg; - if (!aa_asm_parse_reg_clobber(d, clobbers[i], &cls, &reg)) continue; + if (!aa_asm_parse_reg_clobber(c, loc, clobbers[i], &cls, &reg)) continue; if (cls == NATIVE_REG_INT) *int_mask |= 1u << reg; else if (cls == NATIVE_REG_FP) @@ -3225,7 +3236,7 @@ typedef struct AAAsmSavedClobber { CfreeCgTypeId type; } AAAsmSavedClobber; -static void aa_asm_save_one(NativeDirectTarget* d, AAAsmSavedClobber* s) { +static void aa_asm_save_one(AANativeTarget* a, AAAsmSavedClobber* s) { NativeFrameSlotDesc desc; NativeAddr addr; NativeLoc reg; @@ -3234,17 +3245,16 @@ static void aa_asm_save_one(NativeDirectTarget* d, AAAsmSavedClobber* s) { desc.size = 8; desc.align = 8; desc.kind = NATIVE_FRAME_SLOT_SAVE; - s->slot = d->native->frame_slot(d->native, &desc); + s->slot = a->base.frame_slot(&a->base, &desc); memset(&addr, 0, sizeof addr); addr.base_kind = NATIVE_ADDR_BASE_FRAME; addr.base.frame = s->slot; 
addr.base_type = s->type; reg = aa_reg_loc(s->type, s->cls, s->reg); - aa_emit_mem(aa_of(d->native), 0, reg, addr, - aa_mem_for_type(d->native, s->type, 8)); + aa_emit_mem(a, 0, reg, addr, aa_mem_for_type(&a->base, s->type, 8)); } -AA_UNUSED_FN static void aa_asm_restore_one(NativeDirectTarget* d, +AA_UNUSED_FN static void aa_asm_restore_one(AANativeTarget* a, const AAAsmSavedClobber* s) { NativeAddr addr; NativeLoc reg = aa_reg_loc(s->type, s->cls, s->reg); @@ -3252,14 +3262,13 @@ AA_UNUSED_FN static void aa_asm_restore_one(NativeDirectTarget* d, addr.base_kind = NATIVE_ADDR_BASE_FRAME; addr.base.frame = s->slot; addr.base_type = s->type; - aa_emit_mem(aa_of(d->native), 1, reg, addr, - aa_mem_for_type(d->native, s->type, 8)); + aa_emit_mem(a, 1, reg, addr, aa_mem_for_type(&a->base, s->type, 8)); } AA_UNUSED_FN static AAAsmSavedClobber* aa_asm_save_callee_clobbers( - NativeDirectTarget* d, u32 int_mask, u32 fp_mask, u32* nsaved_out) { + AANativeTarget* a, u32 int_mask, u32 fp_mask, u32* nsaved_out) { AAAsmSavedClobber* saved = - arena_zarray(d->base.c->tu, AAAsmSavedClobber, 20u); + arena_zarray(a->base.c->tu, AAAsmSavedClobber, 20u); u32 n = 0; CfreeCgTypeId i64 = builtin_id(CFREE_CG_BUILTIN_I64); CfreeCgTypeId f64 = builtin_id(CFREE_CG_BUILTIN_F64); @@ -3268,14 +3277,14 @@ AA_UNUSED_FN static AAAsmSavedClobber* aa_asm_save_callee_clobbers( saved[n].cls = NATIVE_REG_INT; saved[n].reg = r; saved[n].type = i64; - aa_asm_save_one(d, &saved[n++]); + aa_asm_save_one(a, &saved[n++]); } for (Reg r = 8u; r <= 15u; ++r) { if ((fp_mask & (1u << r)) == 0) continue; saved[n].cls = NATIVE_REG_FP; saved[n].reg = r; saved[n].type = f64; - aa_asm_save_one(d, &saved[n++]); + aa_asm_save_one(a, &saved[n++]); } *nsaved_out = n; return saved; @@ -3294,7 +3303,7 @@ static void aa_direct_asm_block(NativeDirectTarget* d, const char* tmpl, u32 nsaved; AA64Asm* a; - aa_asm_clobber_masks(d, clobbers, nclob, &clob_int, &clob_fp); + aa_asm_clobber_masks(d->base.c, d->loc, clobbers, nclob, 
&clob_int, &clob_fp); used_int = clob_int | (1u << AA_TMP0) | (1u << AA_TMP1) | (1u << 18u) | (1u << AA_FP) | (1u << AA_LR) | (1u << AA_SP); used_fp = clob_fp | (1u << 20u) | (1u << 21u); @@ -3364,7 +3373,8 @@ static void aa_direct_asm_block(NativeDirectTarget* d, const char* tmpl, } } - saved = aa_asm_save_callee_clobbers(d, clob_int, clob_fp, &nsaved); + saved = aa_asm_save_callee_clobbers(aa_of(d->native), clob_int, clob_fp, + &nsaved); a = aa64_asm_open(d->base.c); aa64_inline_bind(a, outs, nout, bound_outs, ins, nin, bound_ins, clobbers, nclob); @@ -3380,7 +3390,137 @@ static void aa_direct_asm_block(NativeDirectTarget* d, const char* tmpl, src = aa_reg_loc(bound_outs[i].type, cls, (Reg)bound_outs[i].v.local); aa_direct_store_reg_to_operand(d, out_ops[i], src); } - for (u32 i = nsaved; i > 0; --i) aa_asm_restore_one(d, &saved[i - 1u]); + for (u32 i = nsaved; i > 0; --i) + aa_asm_restore_one(aa_of(d->native), &saved[i - 1u]); +} + +/* ---- NativeTarget (optimizer) asm hook ---- + * + * The optimizer has already allocated every operand register and arranged the + * surrounding data flow (inputs are live in their registers on entry, outputs + * are consumed from their registers on exit; the asm's clobber_mask kept the + * allocator from holding live values in clobbered registers). So unlike the + * direct path this hook does NOT self-allocate registers and does NOT load + * inputs / store outputs -- it only binds the pre-allocated registers to the + * template, materializing memory-operand base addresses into the reserved + * scratch registers and saving/restoring callee-saved registers the asm + * clobbers (the only ABI obligation the allocator cannot discharge itself). 
*/ + +static NativeAddr aa_asm_loc_to_addr(AANativeTarget* a, SrcLoc loc, + NativeLoc src) { + NativeAddr addr; + memset(&addr, 0, sizeof addr); + addr.base_type = src.type; + switch ((NativeLocKind)src.kind) { + case NATIVE_LOC_FRAME: + addr.base_kind = NATIVE_ADDR_BASE_FRAME; + addr.base.frame = src.v.frame; + return addr; + case NATIVE_LOC_ADDR: + return src.v.addr; + case NATIVE_LOC_GLOBAL: + addr.base_kind = NATIVE_ADDR_BASE_GLOBAL; + addr.base.global.sym = src.v.global.sym; + addr.base.global.addend = src.v.global.addend; + return addr; + case NATIVE_LOC_REG: + addr.base_kind = NATIVE_ADDR_BASE_REG; + addr.cls = NATIVE_REG_INT; + addr.base.reg = src.v.reg; + return addr; + default: + aa_asm_panic_at(a->base.c, loc, "unsupported memory asm operand"); + } +} + +/* Resolve a memory-constraint operand to a single base register with zero + * offset, folding any frame/global/offset into a scratch register. At most the + * two reserved scratch registers are used across one asm block. */ +static Reg aa_asm_native_mem_base(AANativeTarget* a, SrcLoc loc, NativeLoc src, + u32* ntmp) { + NativeAddr addr = aa_asm_loc_to_addr(a, loc, src); + u32 base; + i32 off; + Reg dst; + if (addr.index_kind != NATIVE_ADDR_INDEX_NONE) + aa_asm_panic_at(a->base.c, loc, "indexed memory asm operand unsupported"); + aa_addr_base(a, addr, &base, &off); + if (off == 0) return (Reg)base; + if (*ntmp >= 2u) + aa_asm_panic_at(a->base.c, loc, "too many memory asm operands"); + dst = (*ntmp == 0u) ? AA_TMP0 : AA_TMP1; + (*ntmp)++; + aa_emit_add_imm(a, dst, base, off); + return dst; +} + +static void aa_asm_bind_native(AANativeTarget* a, SrcLoc loc, Operand* out, + const char* constraint, CfreeCgTypeId type, + NativeLoc src, u32* ntmp) { + const char* body = aa_asm_constraint_body(constraint); + if (body[0] == 'r' || body[0] == 'w') { + NativeAllocClass cls = (body[0] == 'w') ? 
NATIVE_REG_FP : NATIVE_REG_INT; + if (src.kind != NATIVE_LOC_REG) + aa_asm_panic_at(a->base.c, loc, "register asm operand not in a register"); + aa_asm_bound_reg(out, type, cls, (Reg)src.v.reg); + } else if (body[0] == 'i') { + if (src.kind != NATIVE_LOC_IMM) + aa_asm_panic_at(a->base.c, loc, "immediate asm operand is not immediate"); + memset(out, 0, sizeof *out); + out->kind = OPK_IMM; + out->type = type; + out->v.imm = src.v.imm; + } else if (body[0] == 'm') { + aa_asm_bound_mem(out, type, aa_asm_native_mem_base(a, loc, src, ntmp)); + } else { + aa_asm_panic_at(a->base.c, loc, "unsupported asm constraint"); + } +} + +static void aa_asm_block_native(NativeTarget* t, const char* tmpl, + const AsmConstraint* outs, u32 nout, + NativeLoc* out_locs, const AsmConstraint* ins, + u32 nin, const NativeLoc* in_locs, + const Sym* clobbers, u32 nclob) { + AANativeTarget* a = aa_of(t); + Compiler* c = t->c; + SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0}; + Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL; + Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL; + u32 clob_int, clob_fp, ntmp = 0; + AAAsmSavedClobber* saved; + u32 nsaved; + AA64Asm* asmh; + + aa_asm_clobber_masks(c, loc, clobbers, nclob, &clob_int, &clob_fp); + + for (u32 i = 0; i < nout; ++i) { + CfreeCgTypeId type = outs[i].type ? outs[i].type : out_locs[i].type; + aa_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, out_locs[i], + &ntmp); + } + for (u32 i = 0; i < nin; ++i) { + const char* body = aa_asm_constraint_body(ins[i].str); + int matched = aa_asm_match_index(body); + CfreeCgTypeId type; + if (matched >= 0) { + if ((u32)matched >= nout) + aa_asm_panic_at(c, loc, "matching constraint out of range"); + bound_ins[i] = bound_outs[matched]; + continue; + } + type = ins[i].type ? 
ins[i].type : in_locs[i].type; + aa_asm_bind_native(a, loc, &bound_ins[i], ins[i].str, type, in_locs[i], + &ntmp); + } + + saved = aa_asm_save_callee_clobbers(a, clob_int, clob_fp, &nsaved); + asmh = aa64_asm_open(c); + aa64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers, + nclob); + aa64_asm_run_template(asmh, t->mc, tmpl); + aa64_asm_close(asmh); + for (u32 i = nsaved; i > 0; --i) aa_asm_restore_one(a, &saved[i - 1u]); } static const NativeOps aa_direct_ops = { diff --git a/src/opt/opt.c b/src/opt/opt.c @@ -25,417 +25,6 @@ typedef struct OptImpl { Writer* dump_writer; } OptImpl; -typedef struct OptReplay { - OptImpl* o; - CGLocal* local_map; - u32 nlocals; - Label* label_map; - u32 nlabels; - CGScope* scope_map; - u32 nscopes; -} OptReplay; - -static int opt_type_large_or_aggregate(Compiler* c, CfreeCgTypeId ty) { - if (!ty) return 0; - return cg_type_is_aggregate(c, ty) || abi_cg_sizeof(c->abi, ty) > 8u; -} - -static int opt_func_needs_direct_replay(OptImpl* o, const CgIrFunc* f) { - extern char* getenv(const char*); - if (getenv("CFREE_NO_DIRECT_REPLAY")) return 0; - for (u32 i = 0; i < f->desc.nresults; ++i) - if (opt_type_large_or_aggregate(o->c, f->desc.result_types[i])) return 1; - for (u32 i = 0; i < f->desc.nparams; ++i) - if (opt_type_large_or_aggregate(o->c, f->desc.params[i].type)) return 1; - for (u32 i = 0; i < f->ninsts; ++i) { - const CgIrInst* in = &f->insts[i]; - switch ((CgIrOp)in->op) { - case CG_IR_ASM_BLOCK: - case CG_IR_ALLOCA: - case CG_IR_INTRINSIC: - case CG_IR_VA_START: - case CG_IR_VA_ARG: - case CG_IR_VA_END: - case CG_IR_VA_COPY: - return 1; - case CG_IR_CALL: { - const CgIrCallAux* aux = (const CgIrCallAux*)in->extra.aux; - if (!aux) break; - for (u32 a = 0; a < aux->desc.nargs; ++a) { - CGLocal local = aux->desc.args[a]; - if (local && local <= f->nlocals && - opt_type_large_or_aggregate(o->c, - f->locals[local - 1u].desc.type)) - return 1; - } - for (u32 r = 0; r < aux->desc.nresults; ++r) { - CGLocal local 
= aux->desc.results[r]; - if (local && local <= f->nlocals && - opt_type_large_or_aggregate(o->c, - f->locals[local - 1u].desc.type)) - return 1; - } - break; - } - default: - break; - } - } - return 0; -} - -static Label replay_label(OptReplay* r, Label label, SrcLoc loc) { - if (label == LABEL_NONE) return LABEL_NONE; - if (label > r->nlabels || !r->label_map[label]) - compiler_panic(r->o->c, loc, "opt direct replay: bad label"); - return r->label_map[label]; -} - -static CGLocal replay_local(OptReplay* r, CGLocal local, SrcLoc loc) { - if (local == CG_LOCAL_NONE) return CG_LOCAL_NONE; - if (local > r->nlocals || !r->local_map[local]) - compiler_panic(r->o->c, loc, "opt direct replay: bad local"); - return r->local_map[local]; -} - -static CGScope replay_scope(OptReplay* r, CGScope scope, SrcLoc loc) { - if (scope == CG_SCOPE_NONE) return CG_SCOPE_NONE; - if (scope > r->nscopes || !r->scope_map[scope]) - compiler_panic(r->o->c, loc, "opt direct replay: bad scope"); - return r->scope_map[scope]; -} - -static Operand replay_operand(OptReplay* r, Operand in, SrcLoc loc) { - if (in.kind == OPK_LOCAL) { - in.v.local = replay_local(r, in.v.local, loc); - } else if (in.kind == OPK_INDIRECT) { - in.v.ind.base = replay_local(r, in.v.ind.base, loc); - in.v.ind.index = replay_local(r, in.v.ind.index, loc); - } - return in; -} - -static void replay_operands(OptReplay* r, Operand* dst, const Operand* src, - u32 n, SrcLoc loc) { - for (u32 i = 0; i < n; ++i) dst[i] = replay_operand(r, src[i], loc); -} - -static CGCallDesc replay_call_desc(OptReplay* r, const CGCallDesc* src, - SrcLoc loc) { - CGCallDesc out = *src; - out.callee = replay_operand(r, src->callee, loc); - if (src->nargs) { - CGLocal* args = arena_array(r->o->c->tu, CGLocal, src->nargs); - for (u32 i = 0; i < src->nargs; ++i) - args[i] = replay_local(r, src->args[i], loc); - out.args = args; - } - if (src->nresults) { - CGLocal* results = arena_array(r->o->c->tu, CGLocal, src->nresults); - for (u32 i = 0; i < 
src->nresults; ++i) - results[i] = replay_local(r, src->results[i], loc); - out.results = results; - } - return out; -} - -static void replay_switch(OptReplay* r, const CgIrInst* in) { - const CgIrSwitchAux* src = (const CgIrSwitchAux*)in->extra.aux; - CGSwitchDesc d; - memset(&d, 0, sizeof d); - d.selector = replay_operand(r, in->opnds[0], in->loc); - d.selector_type = src->selector_type; - d.default_label = replay_label(r, src->default_label, in->loc); - d.ncases = src->ncases; - d.hint = src->hint; - d.opt_level = src->opt_level; - if (src->ncases) { - CGSwitchCase* cases = arena_array(r->o->c->tu, CGSwitchCase, src->ncases); - for (u32 i = 0; i < src->ncases; ++i) { - cases[i] = src->cases[i]; - cases[i].label = replay_label(r, src->cases[i].label, in->loc); - } - d.cases = cases; - } - r->o->target->switch_(r->o->target, &d); -} - -static void replay_inst(OptReplay* r, const CgIrInst* in) { - CgTarget* t = r->o->target; - Operand ops[5]; - if (t->set_loc) t->set_loc(t, in->loc); - switch ((CgIrOp)in->op) { - case CG_IR_NOP: - return; - case CG_IR_LABEL: - t->label_place(t, replay_label(r, (Label)in->extra.imm, in->loc)); - return; - case CG_IR_LOAD_IMM: - ops[0] = replay_operand(r, in->opnds[0], in->loc); - t->load_imm(t, ops[0], in->extra.imm); - return; - case CG_IR_LOAD_CONST: - ops[0] = replay_operand(r, in->opnds[0], in->loc); - t->load_const(t, ops[0], in->extra.cbytes); - return; - case CG_IR_COPY: - replay_operands(r, ops, in->opnds, 2, in->loc); - t->copy(t, ops[0], ops[1]); - return; - case CG_IR_LOAD: - replay_operands(r, ops, in->opnds, 2, in->loc); - t->load(t, ops[0], ops[1], in->extra.mem); - return; - case CG_IR_STORE: - replay_operands(r, ops, in->opnds, 2, in->loc); - t->store(t, ops[0], ops[1], in->extra.mem); - return; - case CG_IR_ADDR_OF: - replay_operands(r, ops, in->opnds, 2, in->loc); - t->addr_of(t, ops[0], ops[1]); - return; - case CG_IR_TLS_ADDR_OF: { - const CgIrTlsAux* aux = (const CgIrTlsAux*)in->extra.aux; - ops[0] = 
replay_operand(r, in->opnds[0], in->loc); - t->tls_addr_of(t, ops[0], aux->sym, aux->addend); - return; - } - case CG_IR_AGG_COPY: { - const CgIrAggAux* aux = (const CgIrAggAux*)in->extra.aux; - replay_operands(r, ops, in->opnds, 2, in->loc); - t->copy_bytes(t, ops[0], ops[1], aux->access); - return; - } - case CG_IR_AGG_SET: { - const CgIrAggAux* aux = (const CgIrAggAux*)in->extra.aux; - replay_operands(r, ops, in->opnds, 2, in->loc); - t->set_bytes(t, ops[0], ops[1], aux->access); - return; - } - case CG_IR_BITFIELD_LOAD: { - const CgIrBitFieldAux* aux = (const CgIrBitFieldAux*)in->extra.aux; - replay_operands(r, ops, in->opnds, 2, in->loc); - t->bitfield_load(t, ops[0], ops[1], aux->access); - return; - } - case CG_IR_BITFIELD_STORE: { - const CgIrBitFieldAux* aux = (const CgIrBitFieldAux*)in->extra.aux; - replay_operands(r, ops, in->opnds, 2, in->loc); - t->bitfield_store(t, ops[0], ops[1], aux->access); - return; - } - case CG_IR_BINOP: - replay_operands(r, ops, in->opnds, 3, in->loc); - t->binop(t, (BinOp)in->extra.imm, ops[0], ops[1], ops[2]); - return; - case CG_IR_UNOP: - replay_operands(r, ops, in->opnds, 2, in->loc); - t->unop(t, (UnOp)in->extra.imm, ops[0], ops[1]); - return; - case CG_IR_CMP: - replay_operands(r, ops, in->opnds, 3, in->loc); - t->cmp(t, (CmpOp)in->extra.imm, ops[0], ops[1], ops[2]); - return; - case CG_IR_CONVERT: - replay_operands(r, ops, in->opnds, 2, in->loc); - t->convert(t, (ConvKind)in->extra.imm, ops[0], ops[1]); - return; - case CG_IR_CALL: { - const CgIrCallAux* aux = (const CgIrCallAux*)in->extra.aux; - CGCallDesc d = replay_call_desc(r, &aux->desc, in->loc); - t->call(t, &d); - return; - } - case CG_IR_RET: { - const CgIrRetAux* aux = (const CgIrRetAux*)in->extra.aux; - CGLocal* values = NULL; - if (aux && aux->nvalues) { - values = arena_array(r->o->c->tu, CGLocal, aux->nvalues); - for (u32 i = 0; i < aux->nvalues; ++i) - values[i] = replay_local(r, aux->values[i], in->loc); - } - t->ret(t, values, aux ? 
aux->nvalues : 0u); - return; - } - case CG_IR_BR: - t->jump(t, replay_label(r, (Label)in->extra.imm, in->loc)); - return; - case CG_IR_CMP_BRANCH: { - const CgIrCmpBranchAux* aux = (const CgIrCmpBranchAux*)in->extra.aux; - replay_operands(r, ops, in->opnds, 2, in->loc); - t->cmp_branch(t, aux->op, ops[0], ops[1], - replay_label(r, aux->target, in->loc)); - return; - } - case CG_IR_SWITCH: - replay_switch(r, in); - return; - case CG_IR_INDIRECT_BRANCH: { - const CgIrIndirectAux* aux = (const CgIrIndirectAux*)in->extra.aux; - Label* targets = - arena_array(r->o->c->tu, Label, aux->ntargets ? aux->ntargets : 1u); - for (u32 i = 0; i < aux->ntargets; ++i) - targets[i] = replay_label(r, aux->targets[i], in->loc); - ops[0] = replay_operand(r, in->opnds[0], in->loc); - t->indirect_branch(t, ops[0], targets, aux->ntargets); - return; - } - case CG_IR_LOAD_LABEL_ADDR: - ops[0] = replay_operand(r, in->opnds[0], in->loc); - t->load_label_addr(t, ops[0], - replay_label(r, (Label)in->extra.imm, in->loc)); - return; - case CG_IR_SCOPE_BEGIN: { - const CgIrScopeAux* aux = (const CgIrScopeAux*)in->extra.aux; - CGScopeDesc d = aux->desc; - d.break_label = replay_label(r, d.break_label, in->loc); - d.continue_label = replay_label(r, d.continue_label, in->loc); - d.cond = replay_operand(r, d.cond, in->loc); - r->scope_map[aux->scope] = t->scope_begin(t, &d); - return; - } - case CG_IR_SCOPE_ELSE: - t->scope_else(t, replay_scope(r, (CGScope)in->extra.imm, in->loc)); - return; - case CG_IR_SCOPE_END: - t->scope_end(t, replay_scope(r, (CGScope)in->extra.imm, in->loc)); - return; - case CG_IR_BREAK_TO: - t->break_to(t, replay_scope(r, (CGScope)in->extra.imm, in->loc)); - return; - case CG_IR_CONTINUE_TO: - t->continue_to(t, replay_scope(r, (CGScope)in->extra.imm, in->loc)); - return; - case CG_IR_ALLOCA: - replay_operands(r, ops, in->opnds, 2, in->loc); - t->alloca_(t, ops[0], ops[1], (u32)in->extra.imm); - return; - case CG_IR_VA_START: - ops[0] = replay_operand(r, in->opnds[0], 
in->loc); - t->va_start_(t, ops[0]); - return; - case CG_IR_VA_ARG: - replay_operands(r, ops, in->opnds, 2, in->loc); - t->va_arg_(t, ops[0], ops[1], (CfreeCgTypeId)in->extra.imm); - return; - case CG_IR_VA_END: - ops[0] = replay_operand(r, in->opnds[0], in->loc); - t->va_end_(t, ops[0]); - return; - case CG_IR_VA_COPY: - replay_operands(r, ops, in->opnds, 2, in->loc); - t->va_copy_(t, ops[0], ops[1]); - return; - case CG_IR_ATOMIC_LOAD: { - const CgIrAtomicAux* aux = (const CgIrAtomicAux*)in->extra.aux; - replay_operands(r, ops, in->opnds, 2, in->loc); - t->atomic_load(t, ops[0], ops[1], aux->mem, aux->order); - return; - } - case CG_IR_ATOMIC_STORE: { - const CgIrAtomicAux* aux = (const CgIrAtomicAux*)in->extra.aux; - replay_operands(r, ops, in->opnds, 2, in->loc); - t->atomic_store(t, ops[0], ops[1], aux->mem, aux->order); - return; - } - case CG_IR_ATOMIC_RMW: { - const CgIrAtomicAux* aux = (const CgIrAtomicAux*)in->extra.aux; - replay_operands(r, ops, in->opnds, 3, in->loc); - t->atomic_rmw(t, aux->op, ops[0], ops[1], ops[2], aux->mem, aux->order); - return; - } - case CG_IR_ATOMIC_CAS: { - const CgIrAtomicAux* aux = (const CgIrAtomicAux*)in->extra.aux; - replay_operands(r, ops, in->opnds, 5, in->loc); - t->atomic_cas(t, ops[0], ops[1], ops[2], ops[3], ops[4], aux->mem, - aux->order, aux->failure); - return; - } - case CG_IR_FENCE: - t->fence(t, (MemOrder)in->extra.imm); - return; - case CG_IR_INTRINSIC: { - const CgIrIntrinsicAux* aux = (const CgIrIntrinsicAux*)in->extra.aux; - Operand* dsts = - arena_array(r->o->c->tu, Operand, aux->ndst ? aux->ndst : 1u); - Operand* args = - arena_array(r->o->c->tu, Operand, aux->narg ? 
aux->narg : 1u); - replay_operands(r, dsts, aux->dsts, aux->ndst, in->loc); - replay_operands(r, args, aux->args, aux->narg, in->loc); - t->intrinsic(t, aux->kind, dsts, aux->ndst, args, aux->narg); - return; - } - case CG_IR_ASM_BLOCK: { - const CgIrAsmAux* aux = (const CgIrAsmAux*)in->extra.aux; - Operand* out_ops = - arena_array(r->o->c->tu, Operand, aux->nout ? aux->nout : 1u); - Operand* in_ops = - arena_array(r->o->c->tu, Operand, aux->nin ? aux->nin : 1u); - replay_operands(r, out_ops, aux->out_ops, aux->nout, in->loc); - replay_operands(r, in_ops, aux->in_ops, aux->nin, in->loc); - t->asm_block(t, aux->tmpl, aux->outs, aux->nout, out_ops, aux->ins, - aux->nin, in_ops, aux->clobbers, aux->nclob); - return; - } - case CG_IR_LOCAL_STATIC_DATA_BEGIN: { - const CgIrLocalStaticBeginAux* aux = - (const CgIrLocalStaticBeginAux*)in->extra.aux; - if (!t->local_static_data_begin || - !t->local_static_data_begin(t, &aux->desc)) - compiler_panic(r->o->c, in->loc, - "opt direct replay: local static data unsupported"); - return; - } - case CG_IR_LOCAL_STATIC_DATA_WRITE: { - const CgIrLocalStaticWriteAux* aux = - (const CgIrLocalStaticWriteAux*)in->extra.aux; - t->local_static_data_write(t, aux->has_data ? aux->data : NULL, aux->len); - return; - } - case CG_IR_LOCAL_STATIC_DATA_LABEL_ADDR: { - const CgIrLocalStaticLabelAux* aux = - (const CgIrLocalStaticLabelAux*)in->extra.aux; - t->local_static_data_label_addr(t, replay_label(r, aux->target, in->loc), - aux->addend, aux->width, - aux->address_space); - return; - } - case CG_IR_LOCAL_STATIC_DATA_END: - t->local_static_data_end(t); - return; - } -} - -static void opt_replay_cg_ir_direct(OptImpl* o, const CgIrFunc* f) { - OptReplay r; - memset(&r, 0, sizeof r); - r.o = o; - r.nlocals = f->nlocals; - r.local_map = - arena_zarray(o->c->tu, CGLocal, f->nlocals ? 
f->nlocals + 1u : 1u); - for (u32 i = 0; i < f->nlabels; ++i) - if (f->labels[i].id > r.nlabels) r.nlabels = f->labels[i].id; - r.label_map = arena_zarray(o->c->tu, Label, r.nlabels ? r.nlabels + 1u : 1u); - r.nscopes = f->nscopes; - r.scope_map = - arena_zarray(o->c->tu, CGScope, f->nscopes ? f->nscopes + 1u : 1u); - - o->target->func_begin(o->target, &f->desc); - for (u32 i = 0; i < f->nlabels; ++i) - r.label_map[f->labels[i].id] = o->target->label_new(o->target); - for (u32 i = 0; i < f->nparams; ++i) { - const CgIrParam* p = &f->params[i]; - r.local_map[p->local] = o->target->param(o->target, &p->desc); - } - for (u32 i = 0; i < f->nlocals; ++i) { - const CgIrLocal* l = &f->locals[i]; - if (!r.local_map[l->id]) - r.local_map[l->id] = o->target->local(o->target, &l->desc); - } - for (u32 i = 0; i < f->ninsts; ++i) replay_inst(&r, &f->insts[i]); - o->target->func_end(o->target); -} - static void opt_dbg_dump(OptImpl* o, Func* f, const char* tag) { extern char* getenv(const char*); const char* s = getenv("CFREE_DUMP"); @@ -609,10 +198,6 @@ static void opt_on_func(void* user, CgIrFunc* cg_func) { OptImpl* o = (OptImpl*)user; Func* f; opt_dbg_dump_cg(o, cg_func); - if (opt_func_needs_direct_replay(o, cg_func)) { - opt_replay_cg_ir_direct(o, cg_func); - return; - } metrics_scope_begin(o->c, "opt.o1.cg_ir_lower"); f = opt_func_from_cg_ir(o->c, cg_func); metrics_scope_end(o->c, "opt.o1.cg_ir_lower"); diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c @@ -1171,9 +1171,31 @@ static void emit_inst(NativeEmitCtx* e, u32 block, u32 order_index, Inst* in, mem_for_type(e->c, ty), in->loc); return; } + case IR_ASM_BLOCK: { + IRAsmAux* aux = (IRAsmAux*)in->extra.aux; + NativeLoc* out_locs = + aux && aux->nout ? arena_array(e->f->arena, NativeLoc, aux->nout) + : NULL; + NativeLoc* in_locs = aux && aux->nin + ? 
arena_array(e->f->arena, NativeLoc, aux->nin) + : NULL; + /* The optimizer has already allocated registers for the asm operands and + * placed the input values / consumes the output values through the normal + * use/def data flow. We only convert each operand to its NativeLoc; the + * NativeTarget hook binds the pre-allocated registers to the template and + * saves/restores any callee-saved registers the asm clobbers. */ + for (u32 i = 0; aux && i < aux->nout; ++i) + out_locs[i] = loc_from_operand(e, &aux->out_ops[i], in->loc); + for (u32 i = 0; aux && i < aux->nin; ++i) + in_locs[i] = loc_from_operand(e, &aux->in_ops[i], in->loc); + e->target->asm_block(e->target, aux ? aux->tmpl : "", + aux ? aux->outs : NULL, aux ? aux->nout : 0, out_locs, + aux ? aux->ins : NULL, aux ? aux->nin : 0, in_locs, + aux ? aux->clobbers : NULL, aux ? aux->nclob : 0); + return; + } case IR_BREAK_TO: case IR_CONTINUE_TO: - case IR_ASM_BLOCK: emit_panic(e, in->loc, "operation is not wired to NativeTarget yet"); case IR_FENCE: e->target->fence(e->target, (MemOrder)in->extra.imm);