commit 53115ba9e2fd1150957b4cbc870ded5295b6682d
parent 3ebd1a10457514d71a637b181416abd8e998a936
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Wed, 27 May 2026 06:19:58 -0700
opt: route inline asm through optimizer; delete direct-replay path
Add an IR_ASM_BLOCK case to pass_native_emit that binds the optimizer's
pre-allocated operand registers to the asm template via a new aa64
NativeTarget asm_block hook. The hook reuses the direct path's clobber-mask
and callee-save/restore helpers (refactored off NativeDirectTarget onto
AANativeTarget) but does not self-allocate: inputs are already live in their
registers and outputs are consumed through the normal use/def data flow, so
it only binds registers and materializes memory-operand bases into scratch.
With asm landed, every function now compiles through the optimizer, so the
direct-replay fallback (opt_func_needs_direct_replay, opt_replay_cg_ir_direct,
the OptReplay machinery and replay_* helpers) and the CFREE_NO_DIRECT_REPLAY
env gate are removed.
Toy R-path green at O1 (and default O0+O1); full toy suite 1333 pass / 0 fail.
Diffstat:
3 files changed, 185 insertions(+), 438 deletions(-)
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -2573,6 +2573,11 @@ static void aa_va_arg_native(NativeTarget* t, NativeLoc dst, NativeLoc ap_ptr,
static void aa_va_end_native(NativeTarget* t, NativeLoc ap_ptr);
static void aa_va_copy_native(NativeTarget* t, NativeLoc dst_ap_ptr,
NativeLoc src_ap_ptr);
+static void aa_asm_block_native(NativeTarget* t, const char* tmpl,
+ const AsmConstraint* outs, u32 nout,
+ NativeLoc* out_locs, const AsmConstraint* ins,
+ u32 nin, const NativeLoc* in_locs,
+ const Sym* clobbers, u32 nclob);
NativeTarget* aa64_native_target_new(Compiler* c, ObjBuilder* obj,
MCEmitter* mc) {
@@ -2630,6 +2635,7 @@ NativeTarget* aa64_native_target_new(Compiler* c, ObjBuilder* obj,
t->va_end_ = aa_va_end_native;
t->va_copy_ = aa_va_copy_native;
t->intrinsic = aa_intrinsic;
+ t->asm_block = aa_asm_block_native;
t->file_scope_asm = aa_file_scope_asm;
t->trap = aa_trap;
t->set_loc = aa_set_loc;
@@ -3065,8 +3071,13 @@ AA_UNUSED_FN static int aa_asm_match_index(const char* s) {
return n;
}
+_Noreturn static void aa_asm_panic_at(Compiler* c, SrcLoc loc,
+ const char* msg) {
+ compiler_panic(c, loc, "aarch64 inline asm: %s", msg);
+}
+
_Noreturn static void aa_asm_panic(NativeDirectTarget* d, const char* msg) {
- compiler_panic(d->base.c, d->loc, "aarch64 inline asm: %s", msg);
+ aa_asm_panic_at(d->base.c, d->loc, msg);
}
AA_UNUSED_FN static void aa_asm_bound_reg(Operand* out, CfreeCgTypeId type,
@@ -3088,19 +3099,19 @@ AA_UNUSED_FN static void aa_asm_bound_mem(Operand* out, CfreeCgTypeId type,
out->v.ind.index = CG_LOCAL_NONE;
}
-static int aa_asm_parse_reg_clobber(NativeDirectTarget* d, Sym name,
+static int aa_asm_parse_reg_clobber(Compiler* c, SrcLoc loc, Sym name,
NativeAllocClass* cls_out, Reg* reg_out) {
- Slice s = pool_slice(d->base.c->global, name);
+ Slice s = pool_slice(c->global, name);
char buf[16];
uint32_t dwarf;
if (!s.s || !s.len) return 0;
if (s.len == 2 && s.s[0] == 'c' && s.s[1] == 'c') return 0;
if (s.len == 6 && memcmp(s.s, "memory", 6) == 0) return 0;
- if (s.len >= sizeof buf) aa_asm_panic(d, "clobber name is too long");
+ if (s.len >= sizeof buf) aa_asm_panic_at(c, loc, "clobber name is too long");
memcpy(buf, s.s, s.len);
buf[s.len] = '\0';
if (aa64_register_index(buf, &dwarf) != 0)
- aa_asm_panic(d, "unknown clobber register");
+ aa_asm_panic_at(c, loc, "unknown clobber register");
if (dwarf <= 30u) {
*cls_out = NATIVE_REG_INT;
*reg_out = (Reg)dwarf;
@@ -3111,11 +3122,11 @@ static int aa_asm_parse_reg_clobber(NativeDirectTarget* d, Sym name,
*reg_out = (Reg)(dwarf - 64u);
return 1;
}
- aa_asm_panic(d, "unsupported clobber register");
+ aa_asm_panic_at(c, loc, "unsupported clobber register");
return 0;
}
-AA_UNUSED_FN static void aa_asm_clobber_masks(NativeDirectTarget* d,
+AA_UNUSED_FN static void aa_asm_clobber_masks(Compiler* c, SrcLoc loc,
const Sym* clobbers, u32 nclob,
u32* int_mask, u32* fp_mask) {
*int_mask = 0;
@@ -3123,7 +3134,7 @@ AA_UNUSED_FN static void aa_asm_clobber_masks(NativeDirectTarget* d,
for (u32 i = 0; i < nclob; ++i) {
NativeAllocClass cls;
Reg reg;
- if (!aa_asm_parse_reg_clobber(d, clobbers[i], &cls, &reg)) continue;
+ if (!aa_asm_parse_reg_clobber(c, loc, clobbers[i], &cls, &reg)) continue;
if (cls == NATIVE_REG_INT)
*int_mask |= 1u << reg;
else if (cls == NATIVE_REG_FP)
@@ -3225,7 +3236,7 @@ typedef struct AAAsmSavedClobber {
CfreeCgTypeId type;
} AAAsmSavedClobber;
-static void aa_asm_save_one(NativeDirectTarget* d, AAAsmSavedClobber* s) {
+static void aa_asm_save_one(AANativeTarget* a, AAAsmSavedClobber* s) {
NativeFrameSlotDesc desc;
NativeAddr addr;
NativeLoc reg;
@@ -3234,17 +3245,16 @@ static void aa_asm_save_one(NativeDirectTarget* d, AAAsmSavedClobber* s) {
desc.size = 8;
desc.align = 8;
desc.kind = NATIVE_FRAME_SLOT_SAVE;
- s->slot = d->native->frame_slot(d->native, &desc);
+ s->slot = a->base.frame_slot(&a->base, &desc);
memset(&addr, 0, sizeof addr);
addr.base_kind = NATIVE_ADDR_BASE_FRAME;
addr.base.frame = s->slot;
addr.base_type = s->type;
reg = aa_reg_loc(s->type, s->cls, s->reg);
- aa_emit_mem(aa_of(d->native), 0, reg, addr,
- aa_mem_for_type(d->native, s->type, 8));
+ aa_emit_mem(a, 0, reg, addr, aa_mem_for_type(&a->base, s->type, 8));
}
-AA_UNUSED_FN static void aa_asm_restore_one(NativeDirectTarget* d,
+AA_UNUSED_FN static void aa_asm_restore_one(AANativeTarget* a,
const AAAsmSavedClobber* s) {
NativeAddr addr;
NativeLoc reg = aa_reg_loc(s->type, s->cls, s->reg);
@@ -3252,14 +3262,13 @@ AA_UNUSED_FN static void aa_asm_restore_one(NativeDirectTarget* d,
addr.base_kind = NATIVE_ADDR_BASE_FRAME;
addr.base.frame = s->slot;
addr.base_type = s->type;
- aa_emit_mem(aa_of(d->native), 1, reg, addr,
- aa_mem_for_type(d->native, s->type, 8));
+ aa_emit_mem(a, 1, reg, addr, aa_mem_for_type(&a->base, s->type, 8));
}
AA_UNUSED_FN static AAAsmSavedClobber* aa_asm_save_callee_clobbers(
- NativeDirectTarget* d, u32 int_mask, u32 fp_mask, u32* nsaved_out) {
+ AANativeTarget* a, u32 int_mask, u32 fp_mask, u32* nsaved_out) {
AAAsmSavedClobber* saved =
- arena_zarray(d->base.c->tu, AAAsmSavedClobber, 20u);
+ arena_zarray(a->base.c->tu, AAAsmSavedClobber, 20u);
u32 n = 0;
CfreeCgTypeId i64 = builtin_id(CFREE_CG_BUILTIN_I64);
CfreeCgTypeId f64 = builtin_id(CFREE_CG_BUILTIN_F64);
@@ -3268,14 +3277,14 @@ AA_UNUSED_FN static AAAsmSavedClobber* aa_asm_save_callee_clobbers(
saved[n].cls = NATIVE_REG_INT;
saved[n].reg = r;
saved[n].type = i64;
- aa_asm_save_one(d, &saved[n++]);
+ aa_asm_save_one(a, &saved[n++]);
}
for (Reg r = 8u; r <= 15u; ++r) {
if ((fp_mask & (1u << r)) == 0) continue;
saved[n].cls = NATIVE_REG_FP;
saved[n].reg = r;
saved[n].type = f64;
- aa_asm_save_one(d, &saved[n++]);
+ aa_asm_save_one(a, &saved[n++]);
}
*nsaved_out = n;
return saved;
@@ -3294,7 +3303,7 @@ static void aa_direct_asm_block(NativeDirectTarget* d, const char* tmpl,
u32 nsaved;
AA64Asm* a;
- aa_asm_clobber_masks(d, clobbers, nclob, &clob_int, &clob_fp);
+ aa_asm_clobber_masks(d->base.c, d->loc, clobbers, nclob, &clob_int, &clob_fp);
used_int = clob_int | (1u << AA_TMP0) | (1u << AA_TMP1) | (1u << 18u) |
(1u << AA_FP) | (1u << AA_LR) | (1u << AA_SP);
used_fp = clob_fp | (1u << 20u) | (1u << 21u);
@@ -3364,7 +3373,8 @@ static void aa_direct_asm_block(NativeDirectTarget* d, const char* tmpl,
}
}
- saved = aa_asm_save_callee_clobbers(d, clob_int, clob_fp, &nsaved);
+ saved = aa_asm_save_callee_clobbers(aa_of(d->native), clob_int, clob_fp,
+ &nsaved);
a = aa64_asm_open(d->base.c);
aa64_inline_bind(a, outs, nout, bound_outs, ins, nin, bound_ins, clobbers,
nclob);
@@ -3380,7 +3390,137 @@ static void aa_direct_asm_block(NativeDirectTarget* d, const char* tmpl,
src = aa_reg_loc(bound_outs[i].type, cls, (Reg)bound_outs[i].v.local);
aa_direct_store_reg_to_operand(d, out_ops[i], src);
}
- for (u32 i = nsaved; i > 0; --i) aa_asm_restore_one(d, &saved[i - 1u]);
+ for (u32 i = nsaved; i > 0; --i)
+ aa_asm_restore_one(aa_of(d->native), &saved[i - 1u]);
+}
+
+/* ---- NativeTarget (optimizer) asm hook ----
+ *
+ * The optimizer has already allocated every operand register and arranged the
+ * surrounding data flow (inputs are live in their registers on entry, outputs
+ * are consumed from their registers on exit; the asm's clobber_mask kept the
+ * allocator from holding live values in clobbered registers). So unlike the
+ * direct path this hook does NOT self-allocate registers and does NOT load
+ * inputs / store outputs -- it only binds the pre-allocated registers to the
+ * template, materializing memory-operand base addresses into the reserved
+ * scratch registers and saving/restoring callee-saved registers the asm
+ * clobbers (the only ABI obligation the allocator cannot discharge itself). */
+
+static NativeAddr aa_asm_loc_to_addr(AANativeTarget* a, SrcLoc loc,
+ NativeLoc src) {
+ NativeAddr addr;
+ memset(&addr, 0, sizeof addr);
+ addr.base_type = src.type;
+ switch ((NativeLocKind)src.kind) {
+ case NATIVE_LOC_FRAME:
+ addr.base_kind = NATIVE_ADDR_BASE_FRAME;
+ addr.base.frame = src.v.frame;
+ return addr;
+ case NATIVE_LOC_ADDR:
+ return src.v.addr;
+ case NATIVE_LOC_GLOBAL:
+ addr.base_kind = NATIVE_ADDR_BASE_GLOBAL;
+ addr.base.global.sym = src.v.global.sym;
+ addr.base.global.addend = src.v.global.addend;
+ return addr;
+ case NATIVE_LOC_REG:
+ addr.base_kind = NATIVE_ADDR_BASE_REG;
+ addr.cls = NATIVE_REG_INT;
+ addr.base.reg = src.v.reg;
+ return addr;
+ default:
+ aa_asm_panic_at(a->base.c, loc, "unsupported memory asm operand");
+ }
+}
+
+/* Resolve a memory-constraint operand to a single base register with zero
+ * offset, folding any frame/global/offset into a scratch register. At most the
+ * two reserved scratch registers are used across one asm block. */
+static Reg aa_asm_native_mem_base(AANativeTarget* a, SrcLoc loc, NativeLoc src,
+ u32* ntmp) {
+ NativeAddr addr = aa_asm_loc_to_addr(a, loc, src);
+ u32 base;
+ i32 off;
+ Reg dst;
+ if (addr.index_kind != NATIVE_ADDR_INDEX_NONE)
+ aa_asm_panic_at(a->base.c, loc, "indexed memory asm operand unsupported");
+ aa_addr_base(a, addr, &base, &off);
+ if (off == 0) return (Reg)base;
+ if (*ntmp >= 2u)
+ aa_asm_panic_at(a->base.c, loc, "too many memory asm operands");
+ dst = (*ntmp == 0u) ? AA_TMP0 : AA_TMP1;
+ (*ntmp)++;
+ aa_emit_add_imm(a, dst, base, off);
+ return dst;
+}
+
+static void aa_asm_bind_native(AANativeTarget* a, SrcLoc loc, Operand* out,
+ const char* constraint, CfreeCgTypeId type,
+ NativeLoc src, u32* ntmp) {
+ const char* body = aa_asm_constraint_body(constraint);
+ if (body[0] == 'r' || body[0] == 'w') {
+ NativeAllocClass cls = (body[0] == 'w') ? NATIVE_REG_FP : NATIVE_REG_INT;
+ if (src.kind != NATIVE_LOC_REG)
+ aa_asm_panic_at(a->base.c, loc, "register asm operand not in a register");
+ aa_asm_bound_reg(out, type, cls, (Reg)src.v.reg);
+ } else if (body[0] == 'i') {
+ if (src.kind != NATIVE_LOC_IMM)
+ aa_asm_panic_at(a->base.c, loc, "immediate asm operand is not immediate");
+ memset(out, 0, sizeof *out);
+ out->kind = OPK_IMM;
+ out->type = type;
+ out->v.imm = src.v.imm;
+ } else if (body[0] == 'm') {
+ aa_asm_bound_mem(out, type, aa_asm_native_mem_base(a, loc, src, ntmp));
+ } else {
+ aa_asm_panic_at(a->base.c, loc, "unsupported asm constraint");
+ }
+}
+
+static void aa_asm_block_native(NativeTarget* t, const char* tmpl,
+ const AsmConstraint* outs, u32 nout,
+ NativeLoc* out_locs, const AsmConstraint* ins,
+ u32 nin, const NativeLoc* in_locs,
+ const Sym* clobbers, u32 nclob) {
+ AANativeTarget* a = aa_of(t);
+ Compiler* c = t->c;
+ SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
+ Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL;
+ Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL;
+ u32 clob_int, clob_fp, ntmp = 0;
+ AAAsmSavedClobber* saved;
+ u32 nsaved;
+ AA64Asm* asmh;
+
+ aa_asm_clobber_masks(c, loc, clobbers, nclob, &clob_int, &clob_fp);
+
+ for (u32 i = 0; i < nout; ++i) {
+ CfreeCgTypeId type = outs[i].type ? outs[i].type : out_locs[i].type;
+ aa_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, out_locs[i],
+ &ntmp);
+ }
+ for (u32 i = 0; i < nin; ++i) {
+ const char* body = aa_asm_constraint_body(ins[i].str);
+ int matched = aa_asm_match_index(body);
+ CfreeCgTypeId type;
+ if (matched >= 0) {
+ if ((u32)matched >= nout)
+ aa_asm_panic_at(c, loc, "matching constraint out of range");
+ bound_ins[i] = bound_outs[matched];
+ continue;
+ }
+ type = ins[i].type ? ins[i].type : in_locs[i].type;
+ aa_asm_bind_native(a, loc, &bound_ins[i], ins[i].str, type, in_locs[i],
+ &ntmp);
+ }
+
+ saved = aa_asm_save_callee_clobbers(a, clob_int, clob_fp, &nsaved);
+ asmh = aa64_asm_open(c);
+ aa64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers,
+ nclob);
+ aa64_asm_run_template(asmh, t->mc, tmpl);
+ aa64_asm_close(asmh);
+ for (u32 i = nsaved; i > 0; --i) aa_asm_restore_one(a, &saved[i - 1u]);
}
static const NativeOps aa_direct_ops = {
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -25,417 +25,6 @@ typedef struct OptImpl {
Writer* dump_writer;
} OptImpl;
-typedef struct OptReplay {
- OptImpl* o;
- CGLocal* local_map;
- u32 nlocals;
- Label* label_map;
- u32 nlabels;
- CGScope* scope_map;
- u32 nscopes;
-} OptReplay;
-
-static int opt_type_large_or_aggregate(Compiler* c, CfreeCgTypeId ty) {
- if (!ty) return 0;
- return cg_type_is_aggregate(c, ty) || abi_cg_sizeof(c->abi, ty) > 8u;
-}
-
-static int opt_func_needs_direct_replay(OptImpl* o, const CgIrFunc* f) {
- extern char* getenv(const char*);
- if (getenv("CFREE_NO_DIRECT_REPLAY")) return 0;
- for (u32 i = 0; i < f->desc.nresults; ++i)
- if (opt_type_large_or_aggregate(o->c, f->desc.result_types[i])) return 1;
- for (u32 i = 0; i < f->desc.nparams; ++i)
- if (opt_type_large_or_aggregate(o->c, f->desc.params[i].type)) return 1;
- for (u32 i = 0; i < f->ninsts; ++i) {
- const CgIrInst* in = &f->insts[i];
- switch ((CgIrOp)in->op) {
- case CG_IR_ASM_BLOCK:
- case CG_IR_ALLOCA:
- case CG_IR_INTRINSIC:
- case CG_IR_VA_START:
- case CG_IR_VA_ARG:
- case CG_IR_VA_END:
- case CG_IR_VA_COPY:
- return 1;
- case CG_IR_CALL: {
- const CgIrCallAux* aux = (const CgIrCallAux*)in->extra.aux;
- if (!aux) break;
- for (u32 a = 0; a < aux->desc.nargs; ++a) {
- CGLocal local = aux->desc.args[a];
- if (local && local <= f->nlocals &&
- opt_type_large_or_aggregate(o->c,
- f->locals[local - 1u].desc.type))
- return 1;
- }
- for (u32 r = 0; r < aux->desc.nresults; ++r) {
- CGLocal local = aux->desc.results[r];
- if (local && local <= f->nlocals &&
- opt_type_large_or_aggregate(o->c,
- f->locals[local - 1u].desc.type))
- return 1;
- }
- break;
- }
- default:
- break;
- }
- }
- return 0;
-}
-
-static Label replay_label(OptReplay* r, Label label, SrcLoc loc) {
- if (label == LABEL_NONE) return LABEL_NONE;
- if (label > r->nlabels || !r->label_map[label])
- compiler_panic(r->o->c, loc, "opt direct replay: bad label");
- return r->label_map[label];
-}
-
-static CGLocal replay_local(OptReplay* r, CGLocal local, SrcLoc loc) {
- if (local == CG_LOCAL_NONE) return CG_LOCAL_NONE;
- if (local > r->nlocals || !r->local_map[local])
- compiler_panic(r->o->c, loc, "opt direct replay: bad local");
- return r->local_map[local];
-}
-
-static CGScope replay_scope(OptReplay* r, CGScope scope, SrcLoc loc) {
- if (scope == CG_SCOPE_NONE) return CG_SCOPE_NONE;
- if (scope > r->nscopes || !r->scope_map[scope])
- compiler_panic(r->o->c, loc, "opt direct replay: bad scope");
- return r->scope_map[scope];
-}
-
-static Operand replay_operand(OptReplay* r, Operand in, SrcLoc loc) {
- if (in.kind == OPK_LOCAL) {
- in.v.local = replay_local(r, in.v.local, loc);
- } else if (in.kind == OPK_INDIRECT) {
- in.v.ind.base = replay_local(r, in.v.ind.base, loc);
- in.v.ind.index = replay_local(r, in.v.ind.index, loc);
- }
- return in;
-}
-
-static void replay_operands(OptReplay* r, Operand* dst, const Operand* src,
- u32 n, SrcLoc loc) {
- for (u32 i = 0; i < n; ++i) dst[i] = replay_operand(r, src[i], loc);
-}
-
-static CGCallDesc replay_call_desc(OptReplay* r, const CGCallDesc* src,
- SrcLoc loc) {
- CGCallDesc out = *src;
- out.callee = replay_operand(r, src->callee, loc);
- if (src->nargs) {
- CGLocal* args = arena_array(r->o->c->tu, CGLocal, src->nargs);
- for (u32 i = 0; i < src->nargs; ++i)
- args[i] = replay_local(r, src->args[i], loc);
- out.args = args;
- }
- if (src->nresults) {
- CGLocal* results = arena_array(r->o->c->tu, CGLocal, src->nresults);
- for (u32 i = 0; i < src->nresults; ++i)
- results[i] = replay_local(r, src->results[i], loc);
- out.results = results;
- }
- return out;
-}
-
-static void replay_switch(OptReplay* r, const CgIrInst* in) {
- const CgIrSwitchAux* src = (const CgIrSwitchAux*)in->extra.aux;
- CGSwitchDesc d;
- memset(&d, 0, sizeof d);
- d.selector = replay_operand(r, in->opnds[0], in->loc);
- d.selector_type = src->selector_type;
- d.default_label = replay_label(r, src->default_label, in->loc);
- d.ncases = src->ncases;
- d.hint = src->hint;
- d.opt_level = src->opt_level;
- if (src->ncases) {
- CGSwitchCase* cases = arena_array(r->o->c->tu, CGSwitchCase, src->ncases);
- for (u32 i = 0; i < src->ncases; ++i) {
- cases[i] = src->cases[i];
- cases[i].label = replay_label(r, src->cases[i].label, in->loc);
- }
- d.cases = cases;
- }
- r->o->target->switch_(r->o->target, &d);
-}
-
-static void replay_inst(OptReplay* r, const CgIrInst* in) {
- CgTarget* t = r->o->target;
- Operand ops[5];
- if (t->set_loc) t->set_loc(t, in->loc);
- switch ((CgIrOp)in->op) {
- case CG_IR_NOP:
- return;
- case CG_IR_LABEL:
- t->label_place(t, replay_label(r, (Label)in->extra.imm, in->loc));
- return;
- case CG_IR_LOAD_IMM:
- ops[0] = replay_operand(r, in->opnds[0], in->loc);
- t->load_imm(t, ops[0], in->extra.imm);
- return;
- case CG_IR_LOAD_CONST:
- ops[0] = replay_operand(r, in->opnds[0], in->loc);
- t->load_const(t, ops[0], in->extra.cbytes);
- return;
- case CG_IR_COPY:
- replay_operands(r, ops, in->opnds, 2, in->loc);
- t->copy(t, ops[0], ops[1]);
- return;
- case CG_IR_LOAD:
- replay_operands(r, ops, in->opnds, 2, in->loc);
- t->load(t, ops[0], ops[1], in->extra.mem);
- return;
- case CG_IR_STORE:
- replay_operands(r, ops, in->opnds, 2, in->loc);
- t->store(t, ops[0], ops[1], in->extra.mem);
- return;
- case CG_IR_ADDR_OF:
- replay_operands(r, ops, in->opnds, 2, in->loc);
- t->addr_of(t, ops[0], ops[1]);
- return;
- case CG_IR_TLS_ADDR_OF: {
- const CgIrTlsAux* aux = (const CgIrTlsAux*)in->extra.aux;
- ops[0] = replay_operand(r, in->opnds[0], in->loc);
- t->tls_addr_of(t, ops[0], aux->sym, aux->addend);
- return;
- }
- case CG_IR_AGG_COPY: {
- const CgIrAggAux* aux = (const CgIrAggAux*)in->extra.aux;
- replay_operands(r, ops, in->opnds, 2, in->loc);
- t->copy_bytes(t, ops[0], ops[1], aux->access);
- return;
- }
- case CG_IR_AGG_SET: {
- const CgIrAggAux* aux = (const CgIrAggAux*)in->extra.aux;
- replay_operands(r, ops, in->opnds, 2, in->loc);
- t->set_bytes(t, ops[0], ops[1], aux->access);
- return;
- }
- case CG_IR_BITFIELD_LOAD: {
- const CgIrBitFieldAux* aux = (const CgIrBitFieldAux*)in->extra.aux;
- replay_operands(r, ops, in->opnds, 2, in->loc);
- t->bitfield_load(t, ops[0], ops[1], aux->access);
- return;
- }
- case CG_IR_BITFIELD_STORE: {
- const CgIrBitFieldAux* aux = (const CgIrBitFieldAux*)in->extra.aux;
- replay_operands(r, ops, in->opnds, 2, in->loc);
- t->bitfield_store(t, ops[0], ops[1], aux->access);
- return;
- }
- case CG_IR_BINOP:
- replay_operands(r, ops, in->opnds, 3, in->loc);
- t->binop(t, (BinOp)in->extra.imm, ops[0], ops[1], ops[2]);
- return;
- case CG_IR_UNOP:
- replay_operands(r, ops, in->opnds, 2, in->loc);
- t->unop(t, (UnOp)in->extra.imm, ops[0], ops[1]);
- return;
- case CG_IR_CMP:
- replay_operands(r, ops, in->opnds, 3, in->loc);
- t->cmp(t, (CmpOp)in->extra.imm, ops[0], ops[1], ops[2]);
- return;
- case CG_IR_CONVERT:
- replay_operands(r, ops, in->opnds, 2, in->loc);
- t->convert(t, (ConvKind)in->extra.imm, ops[0], ops[1]);
- return;
- case CG_IR_CALL: {
- const CgIrCallAux* aux = (const CgIrCallAux*)in->extra.aux;
- CGCallDesc d = replay_call_desc(r, &aux->desc, in->loc);
- t->call(t, &d);
- return;
- }
- case CG_IR_RET: {
- const CgIrRetAux* aux = (const CgIrRetAux*)in->extra.aux;
- CGLocal* values = NULL;
- if (aux && aux->nvalues) {
- values = arena_array(r->o->c->tu, CGLocal, aux->nvalues);
- for (u32 i = 0; i < aux->nvalues; ++i)
- values[i] = replay_local(r, aux->values[i], in->loc);
- }
- t->ret(t, values, aux ? aux->nvalues : 0u);
- return;
- }
- case CG_IR_BR:
- t->jump(t, replay_label(r, (Label)in->extra.imm, in->loc));
- return;
- case CG_IR_CMP_BRANCH: {
- const CgIrCmpBranchAux* aux = (const CgIrCmpBranchAux*)in->extra.aux;
- replay_operands(r, ops, in->opnds, 2, in->loc);
- t->cmp_branch(t, aux->op, ops[0], ops[1],
- replay_label(r, aux->target, in->loc));
- return;
- }
- case CG_IR_SWITCH:
- replay_switch(r, in);
- return;
- case CG_IR_INDIRECT_BRANCH: {
- const CgIrIndirectAux* aux = (const CgIrIndirectAux*)in->extra.aux;
- Label* targets =
- arena_array(r->o->c->tu, Label, aux->ntargets ? aux->ntargets : 1u);
- for (u32 i = 0; i < aux->ntargets; ++i)
- targets[i] = replay_label(r, aux->targets[i], in->loc);
- ops[0] = replay_operand(r, in->opnds[0], in->loc);
- t->indirect_branch(t, ops[0], targets, aux->ntargets);
- return;
- }
- case CG_IR_LOAD_LABEL_ADDR:
- ops[0] = replay_operand(r, in->opnds[0], in->loc);
- t->load_label_addr(t, ops[0],
- replay_label(r, (Label)in->extra.imm, in->loc));
- return;
- case CG_IR_SCOPE_BEGIN: {
- const CgIrScopeAux* aux = (const CgIrScopeAux*)in->extra.aux;
- CGScopeDesc d = aux->desc;
- d.break_label = replay_label(r, d.break_label, in->loc);
- d.continue_label = replay_label(r, d.continue_label, in->loc);
- d.cond = replay_operand(r, d.cond, in->loc);
- r->scope_map[aux->scope] = t->scope_begin(t, &d);
- return;
- }
- case CG_IR_SCOPE_ELSE:
- t->scope_else(t, replay_scope(r, (CGScope)in->extra.imm, in->loc));
- return;
- case CG_IR_SCOPE_END:
- t->scope_end(t, replay_scope(r, (CGScope)in->extra.imm, in->loc));
- return;
- case CG_IR_BREAK_TO:
- t->break_to(t, replay_scope(r, (CGScope)in->extra.imm, in->loc));
- return;
- case CG_IR_CONTINUE_TO:
- t->continue_to(t, replay_scope(r, (CGScope)in->extra.imm, in->loc));
- return;
- case CG_IR_ALLOCA:
- replay_operands(r, ops, in->opnds, 2, in->loc);
- t->alloca_(t, ops[0], ops[1], (u32)in->extra.imm);
- return;
- case CG_IR_VA_START:
- ops[0] = replay_operand(r, in->opnds[0], in->loc);
- t->va_start_(t, ops[0]);
- return;
- case CG_IR_VA_ARG:
- replay_operands(r, ops, in->opnds, 2, in->loc);
- t->va_arg_(t, ops[0], ops[1], (CfreeCgTypeId)in->extra.imm);
- return;
- case CG_IR_VA_END:
- ops[0] = replay_operand(r, in->opnds[0], in->loc);
- t->va_end_(t, ops[0]);
- return;
- case CG_IR_VA_COPY:
- replay_operands(r, ops, in->opnds, 2, in->loc);
- t->va_copy_(t, ops[0], ops[1]);
- return;
- case CG_IR_ATOMIC_LOAD: {
- const CgIrAtomicAux* aux = (const CgIrAtomicAux*)in->extra.aux;
- replay_operands(r, ops, in->opnds, 2, in->loc);
- t->atomic_load(t, ops[0], ops[1], aux->mem, aux->order);
- return;
- }
- case CG_IR_ATOMIC_STORE: {
- const CgIrAtomicAux* aux = (const CgIrAtomicAux*)in->extra.aux;
- replay_operands(r, ops, in->opnds, 2, in->loc);
- t->atomic_store(t, ops[0], ops[1], aux->mem, aux->order);
- return;
- }
- case CG_IR_ATOMIC_RMW: {
- const CgIrAtomicAux* aux = (const CgIrAtomicAux*)in->extra.aux;
- replay_operands(r, ops, in->opnds, 3, in->loc);
- t->atomic_rmw(t, aux->op, ops[0], ops[1], ops[2], aux->mem, aux->order);
- return;
- }
- case CG_IR_ATOMIC_CAS: {
- const CgIrAtomicAux* aux = (const CgIrAtomicAux*)in->extra.aux;
- replay_operands(r, ops, in->opnds, 5, in->loc);
- t->atomic_cas(t, ops[0], ops[1], ops[2], ops[3], ops[4], aux->mem,
- aux->order, aux->failure);
- return;
- }
- case CG_IR_FENCE:
- t->fence(t, (MemOrder)in->extra.imm);
- return;
- case CG_IR_INTRINSIC: {
- const CgIrIntrinsicAux* aux = (const CgIrIntrinsicAux*)in->extra.aux;
- Operand* dsts =
- arena_array(r->o->c->tu, Operand, aux->ndst ? aux->ndst : 1u);
- Operand* args =
- arena_array(r->o->c->tu, Operand, aux->narg ? aux->narg : 1u);
- replay_operands(r, dsts, aux->dsts, aux->ndst, in->loc);
- replay_operands(r, args, aux->args, aux->narg, in->loc);
- t->intrinsic(t, aux->kind, dsts, aux->ndst, args, aux->narg);
- return;
- }
- case CG_IR_ASM_BLOCK: {
- const CgIrAsmAux* aux = (const CgIrAsmAux*)in->extra.aux;
- Operand* out_ops =
- arena_array(r->o->c->tu, Operand, aux->nout ? aux->nout : 1u);
- Operand* in_ops =
- arena_array(r->o->c->tu, Operand, aux->nin ? aux->nin : 1u);
- replay_operands(r, out_ops, aux->out_ops, aux->nout, in->loc);
- replay_operands(r, in_ops, aux->in_ops, aux->nin, in->loc);
- t->asm_block(t, aux->tmpl, aux->outs, aux->nout, out_ops, aux->ins,
- aux->nin, in_ops, aux->clobbers, aux->nclob);
- return;
- }
- case CG_IR_LOCAL_STATIC_DATA_BEGIN: {
- const CgIrLocalStaticBeginAux* aux =
- (const CgIrLocalStaticBeginAux*)in->extra.aux;
- if (!t->local_static_data_begin ||
- !t->local_static_data_begin(t, &aux->desc))
- compiler_panic(r->o->c, in->loc,
- "opt direct replay: local static data unsupported");
- return;
- }
- case CG_IR_LOCAL_STATIC_DATA_WRITE: {
- const CgIrLocalStaticWriteAux* aux =
- (const CgIrLocalStaticWriteAux*)in->extra.aux;
- t->local_static_data_write(t, aux->has_data ? aux->data : NULL, aux->len);
- return;
- }
- case CG_IR_LOCAL_STATIC_DATA_LABEL_ADDR: {
- const CgIrLocalStaticLabelAux* aux =
- (const CgIrLocalStaticLabelAux*)in->extra.aux;
- t->local_static_data_label_addr(t, replay_label(r, aux->target, in->loc),
- aux->addend, aux->width,
- aux->address_space);
- return;
- }
- case CG_IR_LOCAL_STATIC_DATA_END:
- t->local_static_data_end(t);
- return;
- }
-}
-
-static void opt_replay_cg_ir_direct(OptImpl* o, const CgIrFunc* f) {
- OptReplay r;
- memset(&r, 0, sizeof r);
- r.o = o;
- r.nlocals = f->nlocals;
- r.local_map =
- arena_zarray(o->c->tu, CGLocal, f->nlocals ? f->nlocals + 1u : 1u);
- for (u32 i = 0; i < f->nlabels; ++i)
- if (f->labels[i].id > r.nlabels) r.nlabels = f->labels[i].id;
- r.label_map = arena_zarray(o->c->tu, Label, r.nlabels ? r.nlabels + 1u : 1u);
- r.nscopes = f->nscopes;
- r.scope_map =
- arena_zarray(o->c->tu, CGScope, f->nscopes ? f->nscopes + 1u : 1u);
-
- o->target->func_begin(o->target, &f->desc);
- for (u32 i = 0; i < f->nlabels; ++i)
- r.label_map[f->labels[i].id] = o->target->label_new(o->target);
- for (u32 i = 0; i < f->nparams; ++i) {
- const CgIrParam* p = &f->params[i];
- r.local_map[p->local] = o->target->param(o->target, &p->desc);
- }
- for (u32 i = 0; i < f->nlocals; ++i) {
- const CgIrLocal* l = &f->locals[i];
- if (!r.local_map[l->id])
- r.local_map[l->id] = o->target->local(o->target, &l->desc);
- }
- for (u32 i = 0; i < f->ninsts; ++i) replay_inst(&r, &f->insts[i]);
- o->target->func_end(o->target);
-}
-
static void opt_dbg_dump(OptImpl* o, Func* f, const char* tag) {
extern char* getenv(const char*);
const char* s = getenv("CFREE_DUMP");
@@ -609,10 +198,6 @@ static void opt_on_func(void* user, CgIrFunc* cg_func) {
OptImpl* o = (OptImpl*)user;
Func* f;
opt_dbg_dump_cg(o, cg_func);
- if (opt_func_needs_direct_replay(o, cg_func)) {
- opt_replay_cg_ir_direct(o, cg_func);
- return;
- }
metrics_scope_begin(o->c, "opt.o1.cg_ir_lower");
f = opt_func_from_cg_ir(o->c, cg_func);
metrics_scope_end(o->c, "opt.o1.cg_ir_lower");
diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c
@@ -1171,9 +1171,31 @@ static void emit_inst(NativeEmitCtx* e, u32 block, u32 order_index, Inst* in,
mem_for_type(e->c, ty), in->loc);
return;
}
+ case IR_ASM_BLOCK: {
+ IRAsmAux* aux = (IRAsmAux*)in->extra.aux;
+ NativeLoc* out_locs =
+ aux && aux->nout ? arena_array(e->f->arena, NativeLoc, aux->nout)
+ : NULL;
+ NativeLoc* in_locs = aux && aux->nin
+ ? arena_array(e->f->arena, NativeLoc, aux->nin)
+ : NULL;
+ /* The optimizer has already allocated registers for the asm operands and
+ * placed the input values / consumes the output values through the normal
+ * use/def data flow. We only convert each operand to its NativeLoc; the
+ * NativeTarget hook binds the pre-allocated registers to the template and
+ * saves/restores any callee-saved registers the asm clobbers. */
+ for (u32 i = 0; aux && i < aux->nout; ++i)
+ out_locs[i] = loc_from_operand(e, &aux->out_ops[i], in->loc);
+ for (u32 i = 0; aux && i < aux->nin; ++i)
+ in_locs[i] = loc_from_operand(e, &aux->in_ops[i], in->loc);
+ e->target->asm_block(e->target, aux ? aux->tmpl : "",
+ aux ? aux->outs : NULL, aux ? aux->nout : 0, out_locs,
+ aux ? aux->ins : NULL, aux ? aux->nin : 0, in_locs,
+ aux ? aux->clobbers : NULL, aux ? aux->nclob : 0);
+ return;
+ }
case IR_BREAK_TO:
case IR_CONTINUE_TO:
- case IR_ASM_BLOCK:
emit_panic(e, in->loc, "operation is not wired to NativeTarget yet");
case IR_FENCE:
e->target->fence(e->target, (MemOrder)in->extra.imm);