commit ca23ef4abbc29a06c4b0da8fda4fa1d37d4acbe8
parent 54d233934315c9eb5df71573041f9e4c6488f000
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Mon, 1 Jun 2026 14:43:29 -0700
arch: fix -O1 inline-asm callee-saved clobber frame panic; add FP constraints + clobber_abi
A function (leaf or non-leaf) whose inline asm clobbers a callee-saved register
panicked on the optimizer path with "frame slot requested after prologue".
The optimizer's plan_frame builds the function's callee-saved set by scanning
MIR operands, but asm clobbers are not operands -- so the clobbered register
was never saved by the prologue, and the per-block fallback (*_asm_save_one)
tried to allocate a spill slot after the frame was already final. rv64/x64/aa64
all hit it.
Fix: forward the asm clobber names (and the clobber-ABI sets, below) through
NativeKnownFrameDesc; each backend resolves them with its own clobber parser and
folds the callee-saved subset into the prologue-saved set using its ABI
predicate (rv64 excludes s0; x64 keeps the reserved-but-callee-saved rbx/r12;
aa64 uses x19..x28 / v8..v15). The optimizer-path asm hook then drops the
per-block spill entirely. machinize's resolve_name is unset on every backend, so
aux->clobber_mask is unreliable; the name-based path sidesteps that.
Also:
- FP register asm constraints: the arch-neutral CG layer rejected 'f'/'x'/'w'
(riscv/x86/aarch64 FP register classes) even though every backend's asm hook
handles them, so =f/=x/=w were unreachable from C. Accept them via
api_asm_is_reg_constraint; the operand's type routes it to the FP class.
- clobber_abi was parsed but dropped (a no-op): thread clobber_abi_sets through
the asm_block vtable -> IRAsmAux -> plan_frame and the O0 direct path, and add
CFREE_CG_ASM_CLOBBER_ABI_CALLEE_SAVED. Each backend expands the set against its
own register file. This is the arch-neutral way to spill the whole callee-saved
file across an asm block, and the basis for a portable toy regression.
- toy: parse clobber_abi(.callee_saved); add case 142 exercising the fold at -O1
across all lanes/arches.
Diffstat:
19 files changed, 318 insertions(+), 53 deletions(-)
diff --git a/include/cfree/cg.h b/include/cfree/cg.h
@@ -978,6 +978,12 @@ typedef enum CfreeCgAsmFlag {
typedef enum CfreeCgAsmClobberAbiSet {
CFREE_CG_ASM_CLOBBER_ABI_NONE = 0,
CFREE_CG_ASM_CLOBBER_ABI_CALLER_SAVED = 1u << 0,
+ /* Every callee-saved register of the target ABI. The compiler preserves them
+ * across the asm block (prologue/epilogue save on the optimizer path, a
+ * per-block spill on the single-pass path) just as it would for named
+ * callee-saved clobbers — an arch-neutral way to say "this asm trashes the
+ * callee-saved register file". */
+ CFREE_CG_ASM_CLOBBER_ABI_CALLEE_SAVED = 1u << 1,
} CfreeCgAsmClobberAbiSet;
typedef struct CfreeCgAsmOperand {
diff --git a/lang/toy/asm.c b/lang/toy/asm.c
@@ -324,6 +324,8 @@ static int toy_parse_asm_clobber_abi(ToyParser* p, uint32_t* clobber_abi_sets) {
if (!toy_parse_attr_dot_name(p, &name)) return 0;
if (toy_sym_is(p, name, "caller_saved"))
*clobber_abi_sets |= CFREE_CG_ASM_CLOBBER_ABI_CALLER_SAVED;
+ else if (toy_sym_is(p, name, "callee_saved"))
+ *clobber_abi_sets |= CFREE_CG_ASM_CLOBBER_ABI_CALLEE_SAVED;
else {
toy_error(p, p->cur.loc, "unknown asm clobber ABI");
return 0;
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -1526,6 +1526,47 @@ static int aa_frame_slot_debug_loc(NativeTarget* t, NativeFrameSlot slot,
return 1;
}
+static void aa_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers,
+ u32 nclob, u32* int_mask, u32* fp_mask);
+/* Defined after aa_classes (below); forward-declared so the frame helper can use
+ * it. Expands CfreeCgAsmClobberAbiSet bits into per-class register masks. */
+static void aa_abi_clobber_masks(u32 abi_sets, u32* int_mask, u32* fp_mask);
+
+/* Build the callee-saved set the prologue must preserve: the allocator-assigned
+ * callee-saved registers (frame->callee_saved_used) plus any an inline-asm block
+ * clobbers. The latter are opaque to the optimizer's operand scan, so it
+ * forwards the raw clobber names (frame->asm_clobbers) and the arch-neutral
+ * clobber-ABI sets (frame->asm_clobber_abi_sets); we resolve both into masks and
+ * keep only the callee-saved ones, per AAPCS64: x19..x28 and the low 64 bits of
+ * v8..v15 (x29/x30 are the frame pointer and link register, handled by the
+ * prologue head, not as ordinary callee-saves). This is the same register
+ * selection the per-block spill used, hoisted into the prologue. */
+static u32 aa_known_callee_saves(NativeTarget* t,
+ const NativeKnownFrameDesc* frame, u32* out,
+ u32 cap) {
+ u32 ncls = frame->ncallee_classes;
+ u32 clob_int = 0, clob_fp = 0, abi_int, abi_fp;
+ if (ncls > cap) ncls = cap;
+ for (u32 c = 0; c < ncls; ++c)
+ out[c] = frame->callee_saved_used ? frame->callee_saved_used[c] : 0u;
+ if (frame->asm_clobbers && frame->nasm_clobbers) {
+ AANativeTarget* a = aa_of(t);
+ SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
+ aa_asm_clobber_masks(t->c, loc, frame->asm_clobbers, frame->nasm_clobbers,
+ &clob_int, &clob_fp);
+ }
+ aa_abi_clobber_masks(frame->asm_clobber_abi_sets, &abi_int, &abi_fp);
+ clob_int |= abi_int;
+ clob_fp |= abi_fp;
+ for (Reg r = 0; r < 32u; ++r) {
+ if (NATIVE_REG_INT < ncls && (clob_int & (1u << r)) && r >= 19u && r <= 28u)
+ out[NATIVE_REG_INT] |= 1u << r;
+ if (NATIVE_REG_FP < ncls && (clob_fp & (1u << r)) && r >= 8u && r <= 15u)
+ out[NATIVE_REG_FP] |= 1u << r;
+ }
+ return ncls;
+}
+
/* Optimizer entry point: the full frame is supplied up front, so the prologue,
* entry saves, slim-form eligibility, allocas, and tail epilogues are all final
* the moment they are emitted — no back-patching (aa_func_end skips the patch
@@ -1542,10 +1583,10 @@ static void aa_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd,
aa_func_begin_common(t, fd);
a->frame.known_frame = 1;
if (frame) {
+ u32 cs[NATIVE_CALL_PLAN_CLASSES];
+ u32 ncs = aa_known_callee_saves(t, frame, cs, NATIVE_CALL_PLAN_CLASSES);
a->frame.has_alloca = frame->has_alloca;
- if (frame->callee_saved_used && frame->ncallee_classes)
- aa_reserve_callee_saves(t, frame->callee_saved_used,
- frame->ncallee_classes);
+ if (ncs) aa_reserve_callee_saves(t, cs, ncs);
for (u32 i = 0; i < frame->nslots; ++i) {
NativeFrameSlot slot = aa_frame_slot(t, &frame->slots[i]);
if (out_slots) out_slots[i] = slot;
@@ -3479,6 +3520,22 @@ static const NativeRegInfo aa_reg_info = {
.nclasses = sizeof aa_classes / sizeof aa_classes[0],
};
+/* Expand the arch-neutral clobber-ABI sets (CfreeCgAsmClobberAbiSet bits) into
+ * this target's per-class caller/callee-saved register masks. Forward-declared
+ * earlier for aa_known_callee_saves; defined here where aa_classes is in scope. */
+static void aa_abi_clobber_masks(u32 abi_sets, u32* int_mask, u32* fp_mask) {
+ *int_mask = 0;
+ *fp_mask = 0;
+ if (abi_sets & CFREE_CG_ASM_CLOBBER_ABI_CALLER_SAVED) {
+ *int_mask |= aa_classes[NATIVE_REG_INT].caller_saved_mask;
+ *fp_mask |= aa_classes[NATIVE_REG_FP].caller_saved_mask;
+ }
+ if (abi_sets & CFREE_CG_ASM_CLOBBER_ABI_CALLEE_SAVED) {
+ *int_mask |= aa_classes[NATIVE_REG_INT].callee_saved_mask;
+ *fp_mask |= aa_classes[NATIVE_REG_FP].callee_saved_mask;
+ }
+}
+
static void aa_va_start_native(NativeTarget* t, NativeLoc ap_ptr);
static void aa_va_arg_native(NativeTarget* t, NativeLoc dst, NativeLoc ap_ptr,
CfreeCgTypeId type);
@@ -4081,9 +4138,8 @@ static int aa_asm_parse_reg_clobber(Compiler* c, SrcLoc loc, Sym name,
return 0;
}
-AA_UNUSED_FN static void aa_asm_clobber_masks(Compiler* c, SrcLoc loc,
- const Sym* clobbers, u32 nclob,
- u32* int_mask, u32* fp_mask) {
+static void aa_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers,
+ u32 nclob, u32* int_mask, u32* fp_mask) {
*int_mask = 0;
*fp_mask = 0;
for (u32 i = 0; i < nclob; ++i) {
@@ -4249,16 +4305,20 @@ static void aa_direct_asm_block(NativeDirectTarget* d, const char* tmpl,
const AsmConstraint* outs, u32 nout,
Operand* out_ops, const AsmConstraint* ins,
u32 nin, const Operand* in_ops,
- const Sym* clobbers, u32 nclob) {
+ const Sym* clobbers, u32 nclob,
+ u32 clobber_abi_sets) {
Operand* bound_outs =
nout ? arena_zarray(d->base.c->tu, Operand, nout) : NULL;
Operand* bound_ins = nin ? arena_zarray(d->base.c->tu, Operand, nin) : NULL;
- u32 clob_int, clob_fp, used_int, used_fp;
+ u32 clob_int, clob_fp, abi_int, abi_fp, used_int, used_fp;
AAAsmSavedClobber* saved;
u32 nsaved;
AA64Asm* a;
aa_asm_clobber_masks(d->base.c, d->loc, clobbers, nclob, &clob_int, &clob_fp);
+ aa_abi_clobber_masks(clobber_abi_sets, &abi_int, &abi_fp);
+ clob_int |= abi_int;
+ clob_fp |= abi_fp;
used_int = clob_int | (1u << AA_TMP0) | (1u << AA_TMP1) | (1u << 18u) |
(1u << AA_FP) | (1u << AA_LR) | (1u << AA_SP);
used_fp = clob_fp | (1u << 20u) | (1u << 21u);
@@ -4442,13 +4502,9 @@ static void aa_asm_block_native(NativeTarget* t, const char* tmpl,
SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL;
Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL;
- u32 clob_int, clob_fp, ntmp = 0;
- AAAsmSavedClobber* saved;
- u32 nsaved;
+ u32 ntmp = 0;
AA64Asm* asmh;
- aa_asm_clobber_masks(c, loc, clobbers, nclob, &clob_int, &clob_fp);
-
for (u32 i = 0; i < nout; ++i) {
CfreeCgTypeId type = outs[i].type ? outs[i].type : out_locs[i].type;
aa_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, out_locs[i],
@@ -4490,13 +4546,14 @@ static void aa_asm_block_native(NativeTarget* t, const char* tmpl,
}
}
- saved = aa_asm_save_callee_clobbers(a, clob_int, clob_fp, &nsaved);
+ /* No per-block callee-saved spill here: plan_frame forwarded the asm clobber
+ * names and clobber-ABI sets; aa_known_callee_saves folded the callee-saved
+ * subset into the saved set, so the prologue/epilogue preserve them. */
asmh = aa64_asm_open(c);
aa64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers,
nclob);
aa64_asm_run_template(asmh, t->mc, tmpl);
aa64_asm_close(asmh);
- for (u32 i = nsaved; i > 0; --i) aa_asm_restore_one(a, &saved[i - 1u]);
}
static const NativeOps aa_direct_ops = {
diff --git a/src/arch/check_target.c b/src/arch/check_target.c
@@ -282,7 +282,7 @@ static void check_asm_block(CgTarget* t, const char* tmpl,
const AsmConstraint* outs, u32 nout,
Operand* out_ops, const AsmConstraint* ins, u32 nin,
const Operand* in_ops, const Sym* clobbers,
- u32 nclob) {
+ u32 nclob, u32 clobber_abi_sets) {
(void)t;
(void)tmpl;
(void)outs;
@@ -293,6 +293,7 @@ static void check_asm_block(CgTarget* t, const char* tmpl,
(void)in_ops;
(void)clobbers;
(void)nclob;
+ (void)clobber_abi_sets;
}
static void check_atomic_load(CgTarget* t, Operand dst, Operand addr,
diff --git a/src/arch/native_target.h b/src/arch/native_target.h
@@ -55,6 +55,21 @@ typedef struct NativeKnownFrameDesc {
* full frame is fixed before the prologue is emitted. NULL / 0 means none. */
const u32* callee_saved_used;
u32 ncallee_classes;
+ /* Union of the clobber register names of every inline-asm block in the body.
+ * Inline-asm clobbers are invisible to the operand scan that builds
+ * callee_saved_used, so the optimizer forwards the raw names here and the
+ * backend resolves them with its own clobber parser, folding the callee-saved
+ * ones into its save set (applying its ABI predicate, which excludes the frame
+ * pointer and keeps any reserved-but-callee-saved scratch such as x64 rbx).
+ * The prologue/epilogue then preserve them, so the asm hook needs no per-block
+ * spill — which on the known-frame path would request a frame slot after the
+ * frame is already final. NULL / 0 when the body contains no inline asm. */
+ const Sym* asm_clobbers;
+ u32 nasm_clobbers;
+ /* Union of CfreeCgAsmClobberAbiSet bits over the body's inline-asm blocks: an
+ * arch-neutral "clobbers the whole caller/callee-saved set" the backend
+ * expands against its own register file, alongside the named asm_clobbers. */
+ u32 asm_clobber_abi_sets;
/* Whether the function body contains a dynamic alloca. The backend needs this
* up front (before the body) to decide prologue/epilogue form, since with a
* known frame the slim-epilogue eligibility is settled at func_begin. */
diff --git a/src/arch/rv64/native.c b/src/arch/rv64/native.c
@@ -1521,6 +1521,63 @@ static void rv_reserve_callee_saves(NativeTarget* t, const u32* used,
native_frame_set_callee_saves(&rv_of(t)->frame, used, nclasses, NULL, 0, 0);
}
+static int rv_reg_is_callee_int(Reg r);
+static int rv_reg_is_callee_fp(Reg r);
+static void rv_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers,
+ u32 nclob, u32* int_mask, u32* fp_mask);
+
+/* Expand the arch-neutral clobber-ABI sets (CfreeCgAsmClobberAbiSet bits) into
+ * this target's per-class caller/callee-saved register masks. */
+static void rv_abi_clobber_masks(u32 abi_sets, u32* int_mask, u32* fp_mask) {
+ *int_mask = 0;
+ *fp_mask = 0;
+ if (abi_sets & CFREE_CG_ASM_CLOBBER_ABI_CALLER_SAVED) {
+ *int_mask |= rv_classes[NATIVE_REG_INT].caller_saved_mask;
+ *fp_mask |= rv_classes[NATIVE_REG_FP].caller_saved_mask;
+ }
+ if (abi_sets & CFREE_CG_ASM_CLOBBER_ABI_CALLEE_SAVED) {
+ *int_mask |= rv_classes[NATIVE_REG_INT].callee_saved_mask;
+ *fp_mask |= rv_classes[NATIVE_REG_FP].callee_saved_mask;
+ }
+}
+
+/* Build the callee-saved set the prologue must preserve: the allocator-assigned
+ * callee-saved registers (frame->callee_saved_used) plus any an inline-asm block
+ * clobbers. The latter are opaque to the optimizer's operand scan, so it
+ * forwards the raw clobber names (frame->asm_clobbers) and the arch-neutral
+ * clobber-ABI sets (frame->asm_clobber_abi_sets); we resolve both into masks and
+ * keep only the callee-saved ones — rv_reg_is_callee_int excludes s0 (the frame
+ * pointer, preserved by the prologue head, not as an ordinary callee-save). This
+ * is the same register selection the per-block spill used, hoisted into the
+ * prologue. Writes up to `cap` per-class masks into `out` and returns the class
+ * count to reserve. */
+static u32 rv_known_callee_saves(NativeTarget* t,
+ const NativeKnownFrameDesc* frame, u32* out,
+ u32 cap) {
+ u32 ncls = frame->ncallee_classes;
+ u32 clob_int = 0, clob_fp = 0, abi_int, abi_fp;
+ if (ncls > cap) ncls = cap;
+ for (u32 c = 0; c < ncls; ++c)
+ out[c] = frame->callee_saved_used ? frame->callee_saved_used[c] : 0u;
+ if (frame->asm_clobbers && frame->nasm_clobbers) {
+ RvNativeTarget* a = rv_of(t);
+ SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
+ rv_asm_clobber_masks(t->c, loc, frame->asm_clobbers, frame->nasm_clobbers,
+ &clob_int, &clob_fp);
+ }
+ rv_abi_clobber_masks(frame->asm_clobber_abi_sets, &abi_int, &abi_fp);
+ clob_int |= abi_int;
+ clob_fp |= abi_fp;
+ for (Reg r = 0; r < 32u; ++r) {
+ if (NATIVE_REG_INT < ncls && (clob_int & (1u << r)) &&
+ rv_reg_is_callee_int(r))
+ out[NATIVE_REG_INT] |= 1u << r;
+ if (NATIVE_REG_FP < ncls && (clob_fp & (1u << r)) && rv_reg_is_callee_fp(r))
+ out[NATIVE_REG_FP] |= 1u << r;
+ }
+ return ncls;
+}
+
static u32 rv_signature_stack_bytes(NativeTarget* t, CfreeCgTypeId fn_type,
int* variadic, u32* nparams);
@@ -1542,10 +1599,10 @@ static void rv_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd,
rv_func_begin_common(t, fd);
a->frame.known_frame = 1;
if (frame) {
+ u32 cs[NATIVE_CALL_PLAN_CLASSES];
+ u32 ncs = rv_known_callee_saves(t, frame, cs, NATIVE_CALL_PLAN_CLASSES);
a->frame.has_alloca = frame->has_alloca;
- if (frame->callee_saved_used && frame->ncallee_classes)
- rv_reserve_callee_saves(t, frame->callee_saved_used,
- frame->ncallee_classes);
+ if (ncs) rv_reserve_callee_saves(t, cs, ncs);
for (i = 0; i < frame->nslots; ++i) {
NativeFrameSlot slot = rv_frame_slot(t, &frame->slots[i]);
if (out_slots) out_slots[i] = slot;
@@ -3200,13 +3257,9 @@ static void rv_asm_block_native(NativeTarget* t, const char* tmpl,
SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL;
Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL;
- u32 clob_int, clob_fp, ntmp = 0;
- RvAsmSavedClobber* saved;
- u32 nsaved, i;
+ u32 ntmp = 0, i;
Rv64Asm* asmh;
- rv_asm_clobber_masks(c, loc, clobbers, nclob, &clob_int, &clob_fp);
-
for (i = 0; i < nout; ++i) {
CfreeCgTypeId type = outs[i].type ? outs[i].type : out_locs[i].type;
rv_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, out_locs[i],
@@ -3239,13 +3292,14 @@ static void rv_asm_block_native(NativeTarget* t, const char* tmpl,
rv_asm_bind_native(a, loc, &bound_ins[i], ins[i].str, type, inloc, &ntmp);
}
- saved = rv_asm_save_callee_clobbers(a, clob_int, clob_fp, &nsaved);
+ /* No per-block callee-saved spill here: plan_frame forwarded the asm clobber
+ * names and clobber-ABI sets; rv_known_callee_saves folded the callee-saved
+ * subset into the saved set, so the prologue/epilogue preserve them. */
asmh = rv64_asm_open(c);
rv64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers,
nclob);
rv64_asm_run_template(asmh, t->mc, tmpl);
rv64_asm_close(asmh);
- for (i = nsaved; i > 0; --i) rv_asm_restore_one(a, &saved[i - 1u]);
}
static void rv_file_scope_asm(NativeTarget* t, const char* src, size_t len) {
/* Top-level __asm__("...") — assemble through the generic .s parser, which
@@ -3476,17 +3530,21 @@ static void rv_direct_asm_block(NativeDirectTarget* d, const char* tmpl,
const AsmConstraint* outs, u32 nout,
Operand* out_ops, const AsmConstraint* ins,
u32 nin, const Operand* in_ops,
- const Sym* clobbers, u32 nclob) {
+ const Sym* clobbers, u32 nclob,
+ u32 clobber_abi_sets) {
RvNativeTarget* a = rv_of(d->native);
Compiler* c = d->base.c;
Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL;
Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL;
- u32 clob_int, clob_fp, used_int, used_fp;
+ u32 clob_int, clob_fp, abi_int, abi_fp, used_int, used_fp;
RvAsmSavedClobber* saved;
u32 nsaved, i;
Rv64Asm* asmh;
rv_asm_clobber_masks(c, d->loc, clobbers, nclob, &clob_int, &clob_fp);
+ rv_abi_clobber_masks(clobber_abi_sets, &abi_int, &abi_fp);
+ clob_int |= abi_int;
+ clob_fp |= abi_fp;
/* Reserve emit scratch (t0/t1/t2/t3), sp/gp/tp/zero/ra and the frame pointer
* so the operand allocator never hands them out. */
used_int = clob_int | (1u << RV_ZERO) | (1u << RV_RA) | (1u << RV_SP) |
diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c
@@ -1634,6 +1634,63 @@ static void x64_reserve_callee_saves(NativeTarget* t, const u32* used,
native_frame_set_callee_saves(&x64_of(t)->frame, used, nclasses, NULL, 0, 0);
}
+static int x64_reg_is_callee_int(const X64ABIRegs* abi, Reg r);
+static int x64_reg_is_callee_fp(const X64ABIRegs* abi, Reg r);
+static void x64_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers,
+ u32 nclob, u32* int_mask, u32* fp_mask);
+
+/* Expand the arch-neutral clobber-ABI sets (CfreeCgAsmClobberAbiSet bits) into
+ * this target's per-class caller/callee-saved register masks. */
+static void x64_abi_clobber_masks(u32 abi_sets, u32* int_mask, u32* fp_mask) {
+ *int_mask = 0;
+ *fp_mask = 0;
+ if (abi_sets & CFREE_CG_ASM_CLOBBER_ABI_CALLER_SAVED) {
+ *int_mask |= x64_classes[NATIVE_REG_INT].caller_saved_mask;
+ *fp_mask |= x64_classes[NATIVE_REG_FP].caller_saved_mask;
+ }
+ if (abi_sets & CFREE_CG_ASM_CLOBBER_ABI_CALLEE_SAVED) {
+ *int_mask |= x64_classes[NATIVE_REG_INT].callee_saved_mask;
+ *fp_mask |= x64_classes[NATIVE_REG_FP].callee_saved_mask;
+ }
+}
+
+/* Build the callee-saved set the prologue must preserve: the allocator-assigned
+ * callee-saved registers (frame->callee_saved_used) plus any an inline-asm block
+ * clobbers. The latter are opaque to the optimizer's operand scan, so it
+ * forwards the raw clobber names (frame->asm_clobbers) and the arch-neutral
+ * clobber-ABI sets (frame->asm_clobber_abi_sets); we resolve both into masks and
+ * keep only the callee-saved ones. x64_reg_is_callee_* follow the live ABI: they
+ * exclude rbp (handled by the prologue head) and keep the reserved-but-callee-
+ * saved scratch rbx/r12 (which the caller still expects preserved). This is the
+ * same register selection the per-block spill used, hoisted into the prologue. */
+static u32 x64_known_callee_saves(NativeTarget* t, const X64ABIRegs* abi,
+ const NativeKnownFrameDesc* frame, u32* out,
+ u32 cap) {
+ u32 ncls = frame->ncallee_classes;
+ u32 clob_int = 0, clob_fp = 0, abi_int, abi_fp;
+ if (ncls > cap) ncls = cap;
+ for (u32 c = 0; c < ncls; ++c)
+ out[c] = frame->callee_saved_used ? frame->callee_saved_used[c] : 0u;
+ if (frame->asm_clobbers && frame->nasm_clobbers) {
+ X64NativeTarget* a = x64_of(t);
+ SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
+ x64_asm_clobber_masks(t->c, loc, frame->asm_clobbers, frame->nasm_clobbers,
+ &clob_int, &clob_fp);
+ }
+ x64_abi_clobber_masks(frame->asm_clobber_abi_sets, &abi_int, &abi_fp);
+ clob_int |= abi_int;
+ clob_fp |= abi_fp;
+ for (Reg r = 0; r < 16u; ++r) {
+ if (NATIVE_REG_INT < ncls && (clob_int & (1u << r)) &&
+ x64_reg_is_callee_int(abi, r))
+ out[NATIVE_REG_INT] |= 1u << r;
+ if (NATIVE_REG_FP < ncls && (clob_fp & (1u << r)) &&
+ x64_reg_is_callee_fp(abi, r))
+ out[NATIVE_REG_FP] |= 1u << r;
+ }
+ return ncls;
+}
+
/* Optimizer entry point: the full frame is supplied up front, so the prologue
* is emitted final the moment it is built — no NOP region, no func_end patch
* (x64_func_end skips patching when known_frame). x64_build_prologue emits the
@@ -1650,10 +1707,11 @@ static void x64_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd,
x64_func_begin_common(t, fd);
a->frame.known_frame = 1;
if (frame) {
+ u32 cs[NATIVE_CALL_PLAN_CLASSES];
+ u32 ncs =
+ x64_known_callee_saves(t, a->abi, frame, cs, NATIVE_CALL_PLAN_CLASSES);
a->frame.has_alloca = frame->has_alloca;
- if (frame->callee_saved_used && frame->ncallee_classes)
- x64_reserve_callee_saves(t, frame->callee_saved_used,
- frame->ncallee_classes);
+ if (ncs) x64_reserve_callee_saves(t, cs, ncs);
for (i = 0; i < frame->nslots; ++i) {
NativeFrameSlot slot = x64_frame_slot(t, &frame->slots[i]);
if (out_slots) out_slots[i] = slot;
@@ -3674,13 +3732,9 @@ static void x64_asm_block_native(NativeTarget* t, const char* tmpl,
SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL;
Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL;
- u32 clob_int, clob_fp, ntmp = 0;
- X64AsmSavedClobber* saved;
- u32 nsaved, i;
+ u32 ntmp = 0, i;
X64Asm* asmh;
- x64_asm_clobber_masks(c, loc, clobbers, nclob, &clob_int, &clob_fp);
-
for (i = 0; i < nout; ++i) {
CfreeCgTypeId type = outs[i].type ? outs[i].type : out_locs[i].type;
x64_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, out_locs[i],
@@ -3711,13 +3765,14 @@ static void x64_asm_block_native(NativeTarget* t, const char* tmpl,
x64_asm_bind_native(a, loc, &bound_ins[i], ins[i].str, type, inloc, &ntmp);
}
- saved = x64_asm_save_callee_clobbers(a, clob_int, clob_fp, &nsaved);
+ /* No per-block callee-saved spill here: plan_frame forwarded the asm clobber
+ * names and clobber-ABI sets; x64_known_callee_saves folded the callee-saved
+ * subset into the saved set, so the prologue/epilogue preserve them. */
asmh = x64_asm_open(c);
x64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers,
nclob);
x64_asm_run_template(asmh, t->mc, tmpl);
x64_asm_close(asmh);
- for (i = nsaved; i > 0; --i) x64_asm_restore_one(a, &saved[i - 1u]);
}
static void x64_file_scope_asm(NativeTarget* t, const char* src, size_t len) {
@@ -3989,17 +4044,21 @@ static void x64_direct_asm_block(NativeDirectTarget* d, const char* tmpl,
const AsmConstraint* outs, u32 nout,
Operand* out_ops, const AsmConstraint* ins,
u32 nin, const Operand* in_ops,
- const Sym* clobbers, u32 nclob) {
+ const Sym* clobbers, u32 nclob,
+ u32 clobber_abi_sets) {
X64NativeTarget* a = x64_of(d->native);
Compiler* c = d->base.c;
Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL;
Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL;
- u32 clob_int, clob_fp, used_int, used_fp;
+ u32 clob_int, clob_fp, abi_int, abi_fp, used_int, used_fp;
X64AsmSavedClobber* saved;
u32 nsaved, i;
X64Asm* asmh;
x64_asm_clobber_masks(c, d->loc, clobbers, nclob, &clob_int, &clob_fp);
+ x64_abi_clobber_masks(clobber_abi_sets, &abi_int, &abi_fp);
+ clob_int |= abi_int;
+ clob_fp |= abi_fp;
/* Reserve emit scratch (rax,r11), driver scratch, sp/bp, and clobbers. */
used_int = clob_int | (1u << X64_RAX) | (1u << X64_R11) | (1u << X64_RSP) |
(1u << X64_RBP) | (1u << X64_RBX) | (1u << X64_R12) |
diff --git a/src/cg/asm.c b/src/cg/asm.c
@@ -29,6 +29,17 @@ int api_asm_is_early_clobber(const char* s) {
return (s[0] == '=' && s[1] == '&') || s[0] == '&';
}
+/* Does this constraint body name a register operand (one that binds to a temp
+ * local, as opposed to 'i' immediate or 'm' memory)? 'r' is the architecture-
+ * neutral general-register class; 'f' (riscv), 'x' (x86 SSE) and 'w' (aarch64
+ * SIMD/FP) are the per-target FP/vector register classes. The temp local's type
+ * selects the actual NativeAllocClass downstream, and the target's asm hook
+ * rejects a letter that does not apply to it, so listing all three here is safe
+ * across backends. */
+int api_asm_is_reg_constraint(char c) {
+ return c == 'r' || c == 'f' || c == 'x' || c == 'w';
+}
+
void api_asm_memory_clobber_sv(CfreeCg* g, ApiSValue* sv, CGLocal local_id) {
(void)g;
(void)sv;
@@ -60,8 +71,8 @@ void cfree_cg_inline_asm(CfreeCg* g, CfreeCgInlineAsm asm_block) {
uint32_t ninputs = asm_block.ninputs;
const CfreeSym* clobbers = asm_block.clobbers;
uint32_t nclobbers = asm_block.nclobbers;
+ uint32_t clobber_abi_sets = asm_block.clobber_abi_sets;
(void)asm_block.flags;
- (void)asm_block.clobber_abi_sets;
if (!g) return;
api_local_const_memory_boundary(g);
T = g->target;
@@ -145,7 +156,10 @@ void cfree_cg_inline_asm(CfreeCg* g, CfreeCgInlineAsm asm_block) {
for (u32 i = 0; i < noutputs; ++i) {
const char* body = api_asm_constraint_body(outs[i].str);
if (api_asm_is_early_clobber(outs[i].str)) continue;
- if (body[0] == 'r') {
+ /* A register constraint binds to a temp local; the local's type selects the
+ * register class (integer vs FP), so the backend hook places an FP-class
+ * output (riscv 'f', x86 'x', aarch64 'w') in an FP register. */
+ if (api_asm_is_reg_constraint(body[0])) {
CfreeCgTypeId oty = outs[i].type ? outs[i].type : fallback_ty;
CGLocal r = api_alloc_temp_local(g, oty);
out_ops[i] = api_op_local(r, oty);
@@ -182,7 +196,7 @@ void cfree_cg_inline_asm(CfreeCg* g, CfreeCgInlineAsm asm_block) {
T->copy(T, bound, src);
}
in_ops[i] = bound;
- } else if (s[0] == 'r') {
+ } else if (api_asm_is_reg_constraint(s[0])) {
in_ops[i] = api_force_local(g, &in_svs[i], ity);
} else if (s[0] == 'i') {
if (!api_sv_op_is(&in_svs[i], OPK_IMM)) {
@@ -216,7 +230,7 @@ void cfree_cg_inline_asm(CfreeCg* g, CfreeCgInlineAsm asm_block) {
CGLocal r;
if (!api_asm_is_early_clobber(outs[i].str)) continue;
body = api_asm_constraint_body(outs[i].str);
- if (body[0] != 'r') {
+ if (!api_asm_is_reg_constraint(body[0])) {
compiler_panic(g->c, g->cur_loc,
"CfreeCg: unsupported early-clobber asm output");
continue;
@@ -250,7 +264,7 @@ void cfree_cg_inline_asm(CfreeCg* g, CfreeCgInlineAsm asm_block) {
}
T->asm_block(T, tmpl_str, outs, noutputs, out_ops, ins, total_inputs, in_ops,
- clobs, nclobbers);
+ clobs, nclobbers, clobber_abi_sets);
for (u32 i = 0; i < total_inputs; ++i) api_release(g, &in_svs[i]);
for (u32 i = 0; i < noutputs; ++i) {
diff --git a/src/cg/cgtarget.h b/src/cg/cgtarget.h
@@ -673,7 +673,7 @@ struct CgTarget {
void (*asm_block)(CgTarget*, const char* tmpl, const AsmConstraint* outs,
u32 nout, Operand* out_ops, const AsmConstraint* ins,
u32 nin, const Operand* in_ops, const Sym* clobbers,
- u32 nclob);
+ u32 nclob, u32 clobber_abi_sets);
/* Optional: handle a top-level `__asm__("...")` block (file scope, not
* inside a function). Backends that leave this NULL fall back to the
diff --git a/src/cg/internal.h b/src/cg/internal.h
@@ -216,6 +216,7 @@ const char* api_sym_cstr(CfreeCg* g, CfreeSym sym);
int api_asm_parse_match_index(const char* s);
const char* api_asm_constraint_body(const char* s);
int api_asm_is_early_clobber(const char* s);
+int api_asm_is_reg_constraint(char c);
void api_asm_memory_clobber_sv(CfreeCg* g, ApiSValue* sv, CGLocal local);
void cfree_cg_inline_asm(CfreeCg* g, CfreeCgInlineAsm asm_block);
void cfree_cg_file_scope_asm(CfreeCg* g, CfreeSlice asm_source);
diff --git a/src/cg/ir.h b/src/cg/ir.h
@@ -169,6 +169,7 @@ typedef struct CgIrAsmAux {
u32 nout;
u32 nin;
u32 nclob;
+ u32 clobber_abi_sets; /* CfreeCgAsmClobberAbiSet bits */
} CgIrAsmAux;
typedef struct CgIrIntrinsicAux {
diff --git a/src/cg/ir_recorder.c b/src/cg/ir_recorder.c
@@ -538,8 +538,8 @@ static void rec_intrinsic(CgTarget* t, IntrinKind kind, Operand* dsts, u32 ndst,
static void rec_asm_block(CgTarget* t, const char* tmpl,
const AsmConstraint* outs, u32 nout, Operand* out_ops,
const AsmConstraint* ins, u32 nin,
- const Operand* in_ops, const Sym* clobbers,
- u32 nclob) {
+ const Operand* in_ops, const Sym* clobbers, u32 nclob,
+ u32 clobber_abi_sets) {
CgIrRecorder* r = rec_of(t);
CgIrFunc* f = require_func(r);
CgIrInst* in = emit(r, CG_IR_ASM_BLOCK);
@@ -558,6 +558,7 @@ static void rec_asm_block(CgTarget* t, const char* tmpl,
aux->nout = nout;
aux->nin = nin;
aux->nclob = nclob;
+ aux->clobber_abi_sets = clobber_abi_sets;
in->extra.aux = aux;
}
diff --git a/src/cg/native_direct_target.c b/src/cg/native_direct_target.c
@@ -1707,15 +1707,15 @@ static void nd_intrinsic(CgTarget* t, IntrinKind kind, Operand* dsts, u32 ndst,
static void nd_asm_block(CgTarget* t, const char* tmpl,
const AsmConstraint* outs, u32 nout, Operand* out_ops,
const AsmConstraint* ins, u32 nin,
- const Operand* in_ops, const Sym* clobbers,
- u32 nclob) {
+ const Operand* in_ops, const Sym* clobbers, u32 nclob,
+ u32 clobber_abi_sets) {
NativeDirectTarget* d = nd_of(t);
nd_flush_all(d);
nd_barrier(d,
NATIVE_DIRECT_BARRIER_INLINE_ASM | NATIVE_DIRECT_BARRIER_MEMORY);
if (d->ops && d->ops->asm_block) {
d->ops->asm_block(d, tmpl, outs, nout, out_ops, ins, nin, in_ops, clobbers,
- nclob);
+ nclob, clobber_abi_sets);
return;
}
nd_panic(d, "target does not emit inline asm");
diff --git a/src/cg/native_direct_target.h b/src/cg/native_direct_target.h
@@ -88,7 +88,7 @@ struct NativeOps {
void (*asm_block)(NativeDirectTarget*, const char* tmpl,
const AsmConstraint* outs, u32 nout, Operand* out_ops,
const AsmConstraint* ins, u32 nin, const Operand* in_ops,
- const Sym* clobbers, u32 nclob);
+ const Sym* clobbers, u32 nclob, u32 clobber_abi_sets);
void (*barrier)(NativeDirectTarget*, u32 flags);
};
diff --git a/src/opt/cg_ir_lower.c b/src/opt/cg_ir_lower.c
@@ -767,6 +767,7 @@ static void lower_asm(CgIrLower* l, Inst* out, const CgIrInst* in) {
aux->nout = src->nout;
aux->nin = src->nin;
aux->nclob = src->nclob;
+ aux->clobber_abi_sets = src->clobber_abi_sets;
aux->out_ops =
src->nout ? arena_array(l->f->arena, OptOperand, src->nout) : NULL;
aux->in_ops =
diff --git a/src/opt/ir.h b/src/opt/ir.h
@@ -440,6 +440,9 @@ typedef struct IRAsmAux {
out_ops; /* nout slots; the wrapped target may fill in REG location */
Operand* in_ops; /* nin slots; recorded by w_asm_block, xlat'd at replay */
u32 nout, nin, nclob;
+ /* CfreeCgAsmClobberAbiSet bits: an arch-neutral "clobbers the whole caller/
+ * callee-saved set" the backend expands against its own register file. */
+ u32 clobber_abi_sets;
/* Filled by opt_machinize from backend register-name resolution. */
u32 clobber_mask[OPT_REG_CLASSES];
i32* out_fixed_regs; /* nout, -1 when unconstrained */
diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c
@@ -1356,6 +1356,9 @@ static void plan_frame(NativeEmitCtx* e, const CGFuncDesc* fd) {
u8 needs_scratch_spill = 0;
u8 has_call = 0;
u8 has_asm = 0;
+ u32 nasm_clob = 0;
+ u32 asm_clobber_abi_sets = 0;
+ Sym* asm_clobbers = NULL;
memset(&frame, 0, sizeof frame);
nclasses = t->reserve_callee_saves
? compute_callee_saved_used(e, used, EMIT_MAX_REG_CLASSES)
@@ -1377,10 +1380,37 @@ static void plan_frame(NativeEmitCtx* e, const CGFuncDesc* fd) {
if (aux && aux->desc.nargs > max_args) max_args = aux->desc.nargs;
} else if ((IROp)in->op == IR_ASM_BLOCK) {
/* Inline asm may clobber the return-address register or the red zone
- * opaquely; disqualifies the frame-eliding tiers (see has_asm). */
+ * opaquely; disqualifies the frame-eliding tiers (see has_asm). Its
+ * callee-saved register clobbers are equally opaque to the operand scan
+ * that builds callee_saved_used; count them now so the backend can fold
+ * them into the saved set (gathered into one Sym list in a second pass). */
+ IRAsmAux* aux = (IRAsmAux*)in->extra.aux;
has_asm = 1;
+ if (aux) {
+ nasm_clob += aux->nclob;
+ asm_clobber_abi_sets |= aux->clobber_abi_sets;
+ }
+ }
+ }
+ }
+ /* Gather the union of every asm block's clobber names. The backend resolves
+ * them with its own clobber parser (machinize's resolve_name is unset on every
+ * backend, so aux->clobber_mask is unreliable here). */
+ if (nasm_clob) {
+ u32 n = 0;
+ asm_clobbers = arena_array(e->f->arena, Sym, nasm_clob);
+ for (u32 b = 0; b < e->f->nblocks; ++b) {
+ Block* bl = &e->f->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* in = &bl->insts[i];
+ IRAsmAux* aux;
+ if ((IROp)in->op != IR_ASM_BLOCK) continue;
+ aux = (IRAsmAux*)in->extra.aux;
+ for (u32 k = 0; aux && k < aux->nclob; ++k)
+ asm_clobbers[n++] = aux->clobbers[k];
}
}
+ nasm_clob = n;
}
if (t->call_stack_bytes) {
NativeLoc* args =
@@ -1436,6 +1466,9 @@ static void plan_frame(NativeEmitCtx* e, const CGFuncDesc* fd) {
frame.needs_scratch_spill = needs_scratch_spill;
frame.is_leaf = !has_call;
frame.has_asm = has_asm;
+ frame.asm_clobbers = asm_clobbers;
+ frame.nasm_clobbers = nasm_clob;
+ frame.asm_clobber_abi_sets = asm_clobber_abi_sets;
t->func_begin_known_frame(t, fd, &frame, out_slots);
for (u32 i = 0; i < e->f->nframe_slots; ++i)
e->slot_map[e->f->frame_slots[i].id] = out_slots[i];
diff --git a/test/toy/cases/142_typed_asm_clobber_abi_callee.expected b/test/toy/cases/142_typed_asm_clobber_abi_callee.expected
@@ -0,0 +1 @@
+42
diff --git a/test/toy/cases/142_typed_asm_clobber_abi_callee.toy b/test/toy/cases/142_typed_asm_clobber_abi_callee.toy
@@ -0,0 +1,12 @@
+fn __user_main(): i64 {
+ @asm<void>(
+ "",
+ outputs(),
+ inputs(),
+ clobber_abi(.callee_saved),
+ flags(.volatile)
+ );
+ return 42;
+}
+
+fn main(): i32 { return __user_main() as i32; }