kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit ca23ef4abbc29a06c4b0da8fda4fa1d37d4acbe8
parent 54d233934315c9eb5df71573041f9e4c6488f000
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Mon,  1 Jun 2026 14:43:29 -0700

arch: fix -O1 inline-asm callee-saved clobber frame panic; add FP constraints + clobber_abi

A function (leaf or non-leaf) with inline asm clobbering a callee-saved register
panicked on the optimizer path with "frame slot requested after prologue".
The optimizer's plan_frame builds the function's callee-saved set by scanning
MIR operands, but asm clobbers are not operands -- so the clobbered register
was never saved by the prologue, and the per-block fallback (*_asm_save_one)
tried to allocate a spill slot after the frame was already final. rv64/x64/aa64
all hit it.

Fix: forward the asm clobber names (and the clobber-ABI sets, below) through
NativeKnownFrameDesc; each backend resolves them with its own clobber parser and
folds the callee-saved subset into the prologue-saved set using its ABI
predicate (rv64 excludes s0; x64 keeps the reserved-but-callee-saved rbx/r12;
aa64 uses x19..x28 / v8..v15). The optimizer-path asm hook then drops the
per-block spill entirely. machinize's resolve_name is unset on every backend, so
aux->clobber_mask is unreliable; the name-based path sidesteps that.

Also:
- FP register asm constraints: the arch-neutral CG layer rejected 'f'/'x'/'w'
  (riscv/x86/aarch64 FP register classes) even though every backend's asm hook
  handles them, so =f/=x/=w were unreachable from C. Accept them via
  api_asm_is_reg_constraint; the operand's type routes it to the FP class.
- clobber_abi was parsed but dropped (a no-op): thread clobber_abi_sets through
  the asm_block vtable -> IRAsmAux -> plan_frame and the O0 direct path, and add
  CFREE_CG_ASM_CLOBBER_ABI_CALLEE_SAVED. Each backend expands the set against its
  own register file. This is the arch-neutral way to spill the whole callee-saved
  file across an asm block, and the basis for a portable toy regression.
- toy: parse clobber_abi(.callee_saved); add case 142 exercising the fold at -O1
  across all lanes/arches.

Diffstat:
Minclude/cfree/cg.h | 6++++++
Mlang/toy/asm.c | 2++
Msrc/arch/aa64/native.c | 87+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
Msrc/arch/check_target.c | 3++-
Msrc/arch/native_target.h | 15+++++++++++++++
Msrc/arch/rv64/native.c | 82+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
Msrc/arch/x64/native.c | 83+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
Msrc/cg/asm.c | 24+++++++++++++++++++-----
Msrc/cg/cgtarget.h | 2+-
Msrc/cg/internal.h | 1+
Msrc/cg/ir.h | 1+
Msrc/cg/ir_recorder.c | 5+++--
Msrc/cg/native_direct_target.c | 6+++---
Msrc/cg/native_direct_target.h | 2+-
Msrc/opt/cg_ir_lower.c | 1+
Msrc/opt/ir.h | 3+++
Msrc/opt/pass_native_emit.c | 35++++++++++++++++++++++++++++++++++-
Atest/toy/cases/142_typed_asm_clobber_abi_callee.expected | 1+
Atest/toy/cases/142_typed_asm_clobber_abi_callee.toy | 12++++++++++++
19 files changed, 318 insertions(+), 53 deletions(-)

diff --git a/include/cfree/cg.h b/include/cfree/cg.h @@ -978,6 +978,12 @@ typedef enum CfreeCgAsmFlag { typedef enum CfreeCgAsmClobberAbiSet { CFREE_CG_ASM_CLOBBER_ABI_NONE = 0, CFREE_CG_ASM_CLOBBER_ABI_CALLER_SAVED = 1u << 0, + /* Every callee-saved register of the target ABI. The compiler preserves them + * across the asm block (prologue/epilogue save on the optimizer path, a + * per-block spill on the single-pass path) just as it would for named + * callee-saved clobbers — an arch-neutral way to say "this asm trashes the + * callee-saved register file". */ + CFREE_CG_ASM_CLOBBER_ABI_CALLEE_SAVED = 1u << 1, } CfreeCgAsmClobberAbiSet; typedef struct CfreeCgAsmOperand { diff --git a/lang/toy/asm.c b/lang/toy/asm.c @@ -324,6 +324,8 @@ static int toy_parse_asm_clobber_abi(ToyParser* p, uint32_t* clobber_abi_sets) { if (!toy_parse_attr_dot_name(p, &name)) return 0; if (toy_sym_is(p, name, "caller_saved")) *clobber_abi_sets |= CFREE_CG_ASM_CLOBBER_ABI_CALLER_SAVED; + else if (toy_sym_is(p, name, "callee_saved")) + *clobber_abi_sets |= CFREE_CG_ASM_CLOBBER_ABI_CALLEE_SAVED; else { toy_error(p, p->cur.loc, "unknown asm clobber ABI"); return 0; diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c @@ -1526,6 +1526,47 @@ static int aa_frame_slot_debug_loc(NativeTarget* t, NativeFrameSlot slot, return 1; } +static void aa_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers, + u32 nclob, u32* int_mask, u32* fp_mask); +/* Defined after aa_classes (below); forward-declared so the frame helper can use + * it. Expands CfreeCgAsmClobberAbiSet bits into per-class register masks. */ +static void aa_abi_clobber_masks(u32 abi_sets, u32* int_mask, u32* fp_mask); + +/* Build the callee-saved set the prologue must preserve: the allocator-assigned + * callee-saved registers (frame->callee_saved_used) plus any an inline-asm block + * clobbers. 
The latter are opaque to the optimizer's operand scan, so it + * forwards the raw clobber names (frame->asm_clobbers) and the arch-neutral + * clobber-ABI sets (frame->asm_clobber_abi_sets); we resolve both into masks and + * keep only the callee-saved ones, per AAPCS64: x19..x28 and the low 64 bits of + * v8..v15 (x29/x30 are the frame pointer and link register, handled by the + * prologue head, not as ordinary callee-saves). This is the same register + * selection the per-block spill used, hoisted into the prologue. */ +static u32 aa_known_callee_saves(NativeTarget* t, + const NativeKnownFrameDesc* frame, u32* out, + u32 cap) { + u32 ncls = frame->ncallee_classes; + u32 clob_int = 0, clob_fp = 0, abi_int, abi_fp; + if (ncls > cap) ncls = cap; + for (u32 c = 0; c < ncls; ++c) + out[c] = frame->callee_saved_used ? frame->callee_saved_used[c] : 0u; + if (frame->asm_clobbers && frame->nasm_clobbers) { + AANativeTarget* a = aa_of(t); + SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0}; + aa_asm_clobber_masks(t->c, loc, frame->asm_clobbers, frame->nasm_clobbers, + &clob_int, &clob_fp); + } + aa_abi_clobber_masks(frame->asm_clobber_abi_sets, &abi_int, &abi_fp); + clob_int |= abi_int; + clob_fp |= abi_fp; + for (Reg r = 0; r < 32u; ++r) { + if (NATIVE_REG_INT < ncls && (clob_int & (1u << r)) && r >= 19u && r <= 28u) + out[NATIVE_REG_INT] |= 1u << r; + if (NATIVE_REG_FP < ncls && (clob_fp & (1u << r)) && r >= 8u && r <= 15u) + out[NATIVE_REG_FP] |= 1u << r; + } + return ncls; +} + /* Optimizer entry point: the full frame is supplied up front, so the prologue, * entry saves, slim-form eligibility, allocas, and tail epilogues are all final * the moment they are emitted — no back-patching (aa_func_end skips the patch @@ -1542,10 +1583,10 @@ static void aa_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd, aa_func_begin_common(t, fd); a->frame.known_frame = 1; if (frame) { + u32 cs[NATIVE_CALL_PLAN_CLASSES]; + u32 ncs = aa_known_callee_saves(t, frame, cs, 
NATIVE_CALL_PLAN_CLASSES); a->frame.has_alloca = frame->has_alloca; - if (frame->callee_saved_used && frame->ncallee_classes) - aa_reserve_callee_saves(t, frame->callee_saved_used, - frame->ncallee_classes); + if (ncs) aa_reserve_callee_saves(t, cs, ncs); for (u32 i = 0; i < frame->nslots; ++i) { NativeFrameSlot slot = aa_frame_slot(t, &frame->slots[i]); if (out_slots) out_slots[i] = slot; @@ -3479,6 +3520,22 @@ static const NativeRegInfo aa_reg_info = { .nclasses = sizeof aa_classes / sizeof aa_classes[0], }; +/* Expand the arch-neutral clobber-ABI sets (CfreeCgAsmClobberAbiSet bits) into + * this target's per-class caller/callee-saved register masks. Forward-declared + * earlier for aa_known_callee_saves; defined here where aa_classes is in scope. */ +static void aa_abi_clobber_masks(u32 abi_sets, u32* int_mask, u32* fp_mask) { + *int_mask = 0; + *fp_mask = 0; + if (abi_sets & CFREE_CG_ASM_CLOBBER_ABI_CALLER_SAVED) { + *int_mask |= aa_classes[NATIVE_REG_INT].caller_saved_mask; + *fp_mask |= aa_classes[NATIVE_REG_FP].caller_saved_mask; + } + if (abi_sets & CFREE_CG_ASM_CLOBBER_ABI_CALLEE_SAVED) { + *int_mask |= aa_classes[NATIVE_REG_INT].callee_saved_mask; + *fp_mask |= aa_classes[NATIVE_REG_FP].callee_saved_mask; + } +} + static void aa_va_start_native(NativeTarget* t, NativeLoc ap_ptr); static void aa_va_arg_native(NativeTarget* t, NativeLoc dst, NativeLoc ap_ptr, CfreeCgTypeId type); @@ -4081,9 +4138,8 @@ static int aa_asm_parse_reg_clobber(Compiler* c, SrcLoc loc, Sym name, return 0; } -AA_UNUSED_FN static void aa_asm_clobber_masks(Compiler* c, SrcLoc loc, - const Sym* clobbers, u32 nclob, - u32* int_mask, u32* fp_mask) { +static void aa_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers, + u32 nclob, u32* int_mask, u32* fp_mask) { *int_mask = 0; *fp_mask = 0; for (u32 i = 0; i < nclob; ++i) { @@ -4249,16 +4305,20 @@ static void aa_direct_asm_block(NativeDirectTarget* d, const char* tmpl, const AsmConstraint* outs, u32 nout, Operand* out_ops, const 
AsmConstraint* ins, u32 nin, const Operand* in_ops, - const Sym* clobbers, u32 nclob) { + const Sym* clobbers, u32 nclob, + u32 clobber_abi_sets) { Operand* bound_outs = nout ? arena_zarray(d->base.c->tu, Operand, nout) : NULL; Operand* bound_ins = nin ? arena_zarray(d->base.c->tu, Operand, nin) : NULL; - u32 clob_int, clob_fp, used_int, used_fp; + u32 clob_int, clob_fp, abi_int, abi_fp, used_int, used_fp; AAAsmSavedClobber* saved; u32 nsaved; AA64Asm* a; aa_asm_clobber_masks(d->base.c, d->loc, clobbers, nclob, &clob_int, &clob_fp); + aa_abi_clobber_masks(clobber_abi_sets, &abi_int, &abi_fp); + clob_int |= abi_int; + clob_fp |= abi_fp; used_int = clob_int | (1u << AA_TMP0) | (1u << AA_TMP1) | (1u << 18u) | (1u << AA_FP) | (1u << AA_LR) | (1u << AA_SP); used_fp = clob_fp | (1u << 20u) | (1u << 21u); @@ -4442,13 +4502,9 @@ static void aa_asm_block_native(NativeTarget* t, const char* tmpl, SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0}; Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL; Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL; - u32 clob_int, clob_fp, ntmp = 0; - AAAsmSavedClobber* saved; - u32 nsaved; + u32 ntmp = 0; AA64Asm* asmh; - aa_asm_clobber_masks(c, loc, clobbers, nclob, &clob_int, &clob_fp); - for (u32 i = 0; i < nout; ++i) { CfreeCgTypeId type = outs[i].type ? outs[i].type : out_locs[i].type; aa_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, out_locs[i], @@ -4490,13 +4546,14 @@ static void aa_asm_block_native(NativeTarget* t, const char* tmpl, } } - saved = aa_asm_save_callee_clobbers(a, clob_int, clob_fp, &nsaved); + /* No per-block callee-saved spill here: plan_frame forwarded the asm clobber + * masks and aa_known_callee_saves folded the callee-saved ones into the + * function's saved set, so the prologue/epilogue already preserve them. 
*/ asmh = aa64_asm_open(c); aa64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers, nclob); aa64_asm_run_template(asmh, t->mc, tmpl); aa64_asm_close(asmh); - for (u32 i = nsaved; i > 0; --i) aa_asm_restore_one(a, &saved[i - 1u]); } static const NativeOps aa_direct_ops = { diff --git a/src/arch/check_target.c b/src/arch/check_target.c @@ -282,7 +282,7 @@ static void check_asm_block(CgTarget* t, const char* tmpl, const AsmConstraint* outs, u32 nout, Operand* out_ops, const AsmConstraint* ins, u32 nin, const Operand* in_ops, const Sym* clobbers, - u32 nclob) { + u32 nclob, u32 clobber_abi_sets) { (void)t; (void)tmpl; (void)outs; @@ -293,6 +293,7 @@ static void check_asm_block(CgTarget* t, const char* tmpl, (void)in_ops; (void)clobbers; (void)nclob; + (void)clobber_abi_sets; } static void check_atomic_load(CgTarget* t, Operand dst, Operand addr, diff --git a/src/arch/native_target.h b/src/arch/native_target.h @@ -55,6 +55,21 @@ typedef struct NativeKnownFrameDesc { * full frame is fixed before the prologue is emitted. NULL / 0 means none. */ const u32* callee_saved_used; u32 ncallee_classes; + /* Union of the clobber register names of every inline-asm block in the body. + * Inline-asm clobbers are invisible to the operand scan that builds + * callee_saved_used, so the optimizer forwards the raw names here and the + * backend resolves them with its own clobber parser, folding the callee-saved + * ones into its save set (applying its ABI predicate, which excludes the frame + * pointer and keeps any reserved-but-callee-saved scratch such as x64 rbx). + * The prologue/epilogue then preserve them, so the asm hook needs no per-block + * spill — which on the known-frame path would request a frame slot after the + * frame is already final. NULL / 0 when the body contains no inline asm. 
*/ + const Sym* asm_clobbers; + u32 nasm_clobbers; + /* Union of CfreeCgAsmClobberAbiSet bits over the body's inline-asm blocks: an + * arch-neutral "clobbers the whole caller/callee-saved set" the backend + * expands against its own register file, alongside the named asm_clobbers. */ + u32 asm_clobber_abi_sets; /* Whether the function body contains a dynamic alloca. The backend needs this * up front (before the body) to decide prologue/epilogue form, since with a * known frame the slim-epilogue eligibility is settled at func_begin. */ diff --git a/src/arch/rv64/native.c b/src/arch/rv64/native.c @@ -1521,6 +1521,63 @@ static void rv_reserve_callee_saves(NativeTarget* t, const u32* used, native_frame_set_callee_saves(&rv_of(t)->frame, used, nclasses, NULL, 0, 0); } +static int rv_reg_is_callee_int(Reg r); +static int rv_reg_is_callee_fp(Reg r); +static void rv_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers, + u32 nclob, u32* int_mask, u32* fp_mask); + +/* Expand the arch-neutral clobber-ABI sets (CfreeCgAsmClobberAbiSet bits) into + * this target's per-class caller/callee-saved register masks. */ +static void rv_abi_clobber_masks(u32 abi_sets, u32* int_mask, u32* fp_mask) { + *int_mask = 0; + *fp_mask = 0; + if (abi_sets & CFREE_CG_ASM_CLOBBER_ABI_CALLER_SAVED) { + *int_mask |= rv_classes[NATIVE_REG_INT].caller_saved_mask; + *fp_mask |= rv_classes[NATIVE_REG_FP].caller_saved_mask; + } + if (abi_sets & CFREE_CG_ASM_CLOBBER_ABI_CALLEE_SAVED) { + *int_mask |= rv_classes[NATIVE_REG_INT].callee_saved_mask; + *fp_mask |= rv_classes[NATIVE_REG_FP].callee_saved_mask; + } +} + +/* Build the callee-saved set the prologue must preserve: the allocator-assigned + * callee-saved registers (frame->callee_saved_used) plus any an inline-asm block + * clobbers. 
The latter are opaque to the optimizer's operand scan, so it + * forwards the raw clobber names (frame->asm_clobbers) and the arch-neutral + * clobber-ABI sets (frame->asm_clobber_abi_sets); we resolve both into masks and + * keep only the callee-saved ones — rv_reg_is_callee_int excludes s0 (the frame + * pointer, preserved by the prologue head, not as an ordinary callee-save). This + * is the same register selection the per-block spill used, hoisted into the + * prologue. Writes up to `cap` per-class masks into `out` and returns the class + * count to reserve. */ +static u32 rv_known_callee_saves(NativeTarget* t, + const NativeKnownFrameDesc* frame, u32* out, + u32 cap) { + u32 ncls = frame->ncallee_classes; + u32 clob_int = 0, clob_fp = 0, abi_int, abi_fp; + if (ncls > cap) ncls = cap; + for (u32 c = 0; c < ncls; ++c) + out[c] = frame->callee_saved_used ? frame->callee_saved_used[c] : 0u; + if (frame->asm_clobbers && frame->nasm_clobbers) { + RvNativeTarget* a = rv_of(t); + SrcLoc loc = a->func ? 
a->func->loc : (SrcLoc){0, 0, 0}; + rv_asm_clobber_masks(t->c, loc, frame->asm_clobbers, frame->nasm_clobbers, + &clob_int, &clob_fp); + } + rv_abi_clobber_masks(frame->asm_clobber_abi_sets, &abi_int, &abi_fp); + clob_int |= abi_int; + clob_fp |= abi_fp; + for (Reg r = 0; r < 32u; ++r) { + if (NATIVE_REG_INT < ncls && (clob_int & (1u << r)) && + rv_reg_is_callee_int(r)) + out[NATIVE_REG_INT] |= 1u << r; + if (NATIVE_REG_FP < ncls && (clob_fp & (1u << r)) && rv_reg_is_callee_fp(r)) + out[NATIVE_REG_FP] |= 1u << r; + } + return ncls; +} + static u32 rv_signature_stack_bytes(NativeTarget* t, CfreeCgTypeId fn_type, int* variadic, u32* nparams); @@ -1542,10 +1599,10 @@ static void rv_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd, rv_func_begin_common(t, fd); a->frame.known_frame = 1; if (frame) { + u32 cs[NATIVE_CALL_PLAN_CLASSES]; + u32 ncs = rv_known_callee_saves(t, frame, cs, NATIVE_CALL_PLAN_CLASSES); a->frame.has_alloca = frame->has_alloca; - if (frame->callee_saved_used && frame->ncallee_classes) - rv_reserve_callee_saves(t, frame->callee_saved_used, - frame->ncallee_classes); + if (ncs) rv_reserve_callee_saves(t, cs, ncs); for (i = 0; i < frame->nslots; ++i) { NativeFrameSlot slot = rv_frame_slot(t, &frame->slots[i]); if (out_slots) out_slots[i] = slot; @@ -3200,13 +3257,9 @@ static void rv_asm_block_native(NativeTarget* t, const char* tmpl, SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0}; Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL; Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL; - u32 clob_int, clob_fp, ntmp = 0; - RvAsmSavedClobber* saved; - u32 nsaved, i; + u32 ntmp = 0, i; Rv64Asm* asmh; - rv_asm_clobber_masks(c, loc, clobbers, nclob, &clob_int, &clob_fp); - for (i = 0; i < nout; ++i) { CfreeCgTypeId type = outs[i].type ? 
outs[i].type : out_locs[i].type; rv_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, out_locs[i], @@ -3239,13 +3292,14 @@ static void rv_asm_block_native(NativeTarget* t, const char* tmpl, rv_asm_bind_native(a, loc, &bound_ins[i], ins[i].str, type, inloc, &ntmp); } - saved = rv_asm_save_callee_clobbers(a, clob_int, clob_fp, &nsaved); + /* No per-block callee-saved spill here: plan_frame forwarded the asm clobber + * masks and rv_known_callee_saves folded the callee-saved ones into the + * function's saved set, so the prologue/epilogue already preserve them. */ asmh = rv64_asm_open(c); rv64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers, nclob); rv64_asm_run_template(asmh, t->mc, tmpl); rv64_asm_close(asmh); - for (i = nsaved; i > 0; --i) rv_asm_restore_one(a, &saved[i - 1u]); } static void rv_file_scope_asm(NativeTarget* t, const char* src, size_t len) { /* Top-level __asm__("...") — assemble through the generic .s parser, which @@ -3476,17 +3530,21 @@ static void rv_direct_asm_block(NativeDirectTarget* d, const char* tmpl, const AsmConstraint* outs, u32 nout, Operand* out_ops, const AsmConstraint* ins, u32 nin, const Operand* in_ops, - const Sym* clobbers, u32 nclob) { + const Sym* clobbers, u32 nclob, + u32 clobber_abi_sets) { RvNativeTarget* a = rv_of(d->native); Compiler* c = d->base.c; Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL; Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL; - u32 clob_int, clob_fp, used_int, used_fp; + u32 clob_int, clob_fp, abi_int, abi_fp, used_int, used_fp; RvAsmSavedClobber* saved; u32 nsaved, i; Rv64Asm* asmh; rv_asm_clobber_masks(c, d->loc, clobbers, nclob, &clob_int, &clob_fp); + rv_abi_clobber_masks(clobber_abi_sets, &abi_int, &abi_fp); + clob_int |= abi_int; + clob_fp |= abi_fp; /* Reserve emit scratch (t0/t1/t2/t3), sp/gp/tp/zero/ra and the frame pointer * so the operand allocator never hands them out. 
*/ used_int = clob_int | (1u << RV_ZERO) | (1u << RV_RA) | (1u << RV_SP) | diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c @@ -1634,6 +1634,63 @@ static void x64_reserve_callee_saves(NativeTarget* t, const u32* used, native_frame_set_callee_saves(&x64_of(t)->frame, used, nclasses, NULL, 0, 0); } +static int x64_reg_is_callee_int(const X64ABIRegs* abi, Reg r); +static int x64_reg_is_callee_fp(const X64ABIRegs* abi, Reg r); +static void x64_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers, + u32 nclob, u32* int_mask, u32* fp_mask); + +/* Expand the arch-neutral clobber-ABI sets (CfreeCgAsmClobberAbiSet bits) into + * this target's per-class caller/callee-saved register masks. */ +static void x64_abi_clobber_masks(u32 abi_sets, u32* int_mask, u32* fp_mask) { + *int_mask = 0; + *fp_mask = 0; + if (abi_sets & CFREE_CG_ASM_CLOBBER_ABI_CALLER_SAVED) { + *int_mask |= x64_classes[NATIVE_REG_INT].caller_saved_mask; + *fp_mask |= x64_classes[NATIVE_REG_FP].caller_saved_mask; + } + if (abi_sets & CFREE_CG_ASM_CLOBBER_ABI_CALLEE_SAVED) { + *int_mask |= x64_classes[NATIVE_REG_INT].callee_saved_mask; + *fp_mask |= x64_classes[NATIVE_REG_FP].callee_saved_mask; + } +} + +/* Build the callee-saved set the prologue must preserve: the allocator-assigned + * callee-saved registers (frame->callee_saved_used) plus any an inline-asm block + * clobbers. The latter are opaque to the optimizer's operand scan, so it + * forwards the raw clobber names (frame->asm_clobbers) and the arch-neutral + * clobber-ABI sets (frame->asm_clobber_abi_sets); we resolve both into masks and + * keep only the callee-saved ones. x64_reg_is_callee_* follow the live ABI: they + * exclude rbp (handled by the prologue head) and keep the reserved-but-callee- + * saved scratch rbx/r12 (which the caller still expects preserved). This is the + * same register selection the per-block spill used, hoisted into the prologue. 
*/ +static u32 x64_known_callee_saves(NativeTarget* t, const X64ABIRegs* abi, + const NativeKnownFrameDesc* frame, u32* out, + u32 cap) { + u32 ncls = frame->ncallee_classes; + u32 clob_int = 0, clob_fp = 0, abi_int, abi_fp; + if (ncls > cap) ncls = cap; + for (u32 c = 0; c < ncls; ++c) + out[c] = frame->callee_saved_used ? frame->callee_saved_used[c] : 0u; + if (frame->asm_clobbers && frame->nasm_clobbers) { + X64NativeTarget* a = x64_of(t); + SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0}; + x64_asm_clobber_masks(t->c, loc, frame->asm_clobbers, frame->nasm_clobbers, + &clob_int, &clob_fp); + } + x64_abi_clobber_masks(frame->asm_clobber_abi_sets, &abi_int, &abi_fp); + clob_int |= abi_int; + clob_fp |= abi_fp; + for (Reg r = 0; r < 16u; ++r) { + if (NATIVE_REG_INT < ncls && (clob_int & (1u << r)) && + x64_reg_is_callee_int(abi, r)) + out[NATIVE_REG_INT] |= 1u << r; + if (NATIVE_REG_FP < ncls && (clob_fp & (1u << r)) && + x64_reg_is_callee_fp(abi, r)) + out[NATIVE_REG_FP] |= 1u << r; + } + return ncls; +} + /* Optimizer entry point: the full frame is supplied up front, so the prologue * is emitted final the moment it is built — no NOP region, no func_end patch * (x64_func_end skips patching when known_frame). 
x64_build_prologue emits the @@ -1650,10 +1707,11 @@ static void x64_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd, x64_func_begin_common(t, fd); a->frame.known_frame = 1; if (frame) { + u32 cs[NATIVE_CALL_PLAN_CLASSES]; + u32 ncs = + x64_known_callee_saves(t, a->abi, frame, cs, NATIVE_CALL_PLAN_CLASSES); a->frame.has_alloca = frame->has_alloca; - if (frame->callee_saved_used && frame->ncallee_classes) - x64_reserve_callee_saves(t, frame->callee_saved_used, - frame->ncallee_classes); + if (ncs) x64_reserve_callee_saves(t, cs, ncs); for (i = 0; i < frame->nslots; ++i) { NativeFrameSlot slot = x64_frame_slot(t, &frame->slots[i]); if (out_slots) out_slots[i] = slot; @@ -3674,13 +3732,9 @@ static void x64_asm_block_native(NativeTarget* t, const char* tmpl, SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0}; Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL; Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL; - u32 clob_int, clob_fp, ntmp = 0; - X64AsmSavedClobber* saved; - u32 nsaved, i; + u32 ntmp = 0, i; X64Asm* asmh; - x64_asm_clobber_masks(c, loc, clobbers, nclob, &clob_int, &clob_fp); - for (i = 0; i < nout; ++i) { CfreeCgTypeId type = outs[i].type ? outs[i].type : out_locs[i].type; x64_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, out_locs[i], @@ -3711,13 +3765,14 @@ static void x64_asm_block_native(NativeTarget* t, const char* tmpl, x64_asm_bind_native(a, loc, &bound_ins[i], ins[i].str, type, inloc, &ntmp); } - saved = x64_asm_save_callee_clobbers(a, clob_int, clob_fp, &nsaved); + /* No per-block callee-saved spill here: plan_frame forwarded the asm clobber + * masks and x64_known_callee_saves folded the callee-saved ones into the + * function's saved set, so the prologue/epilogue already preserve them. 
*/ asmh = x64_asm_open(c); x64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers, nclob); x64_asm_run_template(asmh, t->mc, tmpl); x64_asm_close(asmh); - for (i = nsaved; i > 0; --i) x64_asm_restore_one(a, &saved[i - 1u]); } static void x64_file_scope_asm(NativeTarget* t, const char* src, size_t len) { @@ -3989,17 +4044,21 @@ static void x64_direct_asm_block(NativeDirectTarget* d, const char* tmpl, const AsmConstraint* outs, u32 nout, Operand* out_ops, const AsmConstraint* ins, u32 nin, const Operand* in_ops, - const Sym* clobbers, u32 nclob) { + const Sym* clobbers, u32 nclob, + u32 clobber_abi_sets) { X64NativeTarget* a = x64_of(d->native); Compiler* c = d->base.c; Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL; Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL; - u32 clob_int, clob_fp, used_int, used_fp; + u32 clob_int, clob_fp, abi_int, abi_fp, used_int, used_fp; X64AsmSavedClobber* saved; u32 nsaved, i; X64Asm* asmh; x64_asm_clobber_masks(c, d->loc, clobbers, nclob, &clob_int, &clob_fp); + x64_abi_clobber_masks(clobber_abi_sets, &abi_int, &abi_fp); + clob_int |= abi_int; + clob_fp |= abi_fp; /* Reserve emit scratch (rax,r11), driver scratch, sp/bp, and clobbers. */ used_int = clob_int | (1u << X64_RAX) | (1u << X64_R11) | (1u << X64_RSP) | (1u << X64_RBP) | (1u << X64_RBX) | (1u << X64_R12) | diff --git a/src/cg/asm.c b/src/cg/asm.c @@ -29,6 +29,17 @@ int api_asm_is_early_clobber(const char* s) { return (s[0] == '=' && s[1] == '&') || s[0] == '&'; } +/* Does this constraint body name a register operand (one that binds to a temp + * local, as opposed to 'i' immediate or 'm' memory)? 'r' is the architecture- + * neutral general-register class; 'f' (riscv), 'x' (x86 SSE) and 'w' (aarch64 + * SIMD/FP) are the per-target FP/vector register classes. 
The temp local's type + * selects the actual NativeAllocClass downstream, and the target's asm hook + * rejects a letter that does not apply to it, so listing all three here is safe + * across backends. */ +int api_asm_is_reg_constraint(char c) { + return c == 'r' || c == 'f' || c == 'x' || c == 'w'; +} + void api_asm_memory_clobber_sv(CfreeCg* g, ApiSValue* sv, CGLocal local_id) { (void)g; (void)sv; @@ -60,8 +71,8 @@ void cfree_cg_inline_asm(CfreeCg* g, CfreeCgInlineAsm asm_block) { uint32_t ninputs = asm_block.ninputs; const CfreeSym* clobbers = asm_block.clobbers; uint32_t nclobbers = asm_block.nclobbers; + uint32_t clobber_abi_sets = asm_block.clobber_abi_sets; (void)asm_block.flags; - (void)asm_block.clobber_abi_sets; if (!g) return; api_local_const_memory_boundary(g); T = g->target; @@ -145,7 +156,10 @@ void cfree_cg_inline_asm(CfreeCg* g, CfreeCgInlineAsm asm_block) { for (u32 i = 0; i < noutputs; ++i) { const char* body = api_asm_constraint_body(outs[i].str); if (api_asm_is_early_clobber(outs[i].str)) continue; - if (body[0] == 'r') { + /* A register constraint binds to a temp local; the local's type selects the + * register class (integer vs FP), so the backend hook places an FP-class + * output (riscv 'f', x86 'x', aarch64 'w') in an FP register. */ + if (api_asm_is_reg_constraint(body[0])) { CfreeCgTypeId oty = outs[i].type ? 
outs[i].type : fallback_ty; CGLocal r = api_alloc_temp_local(g, oty); out_ops[i] = api_op_local(r, oty); @@ -182,7 +196,7 @@ void cfree_cg_inline_asm(CfreeCg* g, CfreeCgInlineAsm asm_block) { T->copy(T, bound, src); } in_ops[i] = bound; - } else if (s[0] == 'r') { + } else if (api_asm_is_reg_constraint(s[0])) { in_ops[i] = api_force_local(g, &in_svs[i], ity); } else if (s[0] == 'i') { if (!api_sv_op_is(&in_svs[i], OPK_IMM)) { @@ -216,7 +230,7 @@ void cfree_cg_inline_asm(CfreeCg* g, CfreeCgInlineAsm asm_block) { CGLocal r; if (!api_asm_is_early_clobber(outs[i].str)) continue; body = api_asm_constraint_body(outs[i].str); - if (body[0] != 'r') { + if (!api_asm_is_reg_constraint(body[0])) { compiler_panic(g->c, g->cur_loc, "CfreeCg: unsupported early-clobber asm output"); continue; @@ -250,7 +264,7 @@ void cfree_cg_inline_asm(CfreeCg* g, CfreeCgInlineAsm asm_block) { } T->asm_block(T, tmpl_str, outs, noutputs, out_ops, ins, total_inputs, in_ops, - clobs, nclobbers); + clobs, nclobbers, clobber_abi_sets); for (u32 i = 0; i < total_inputs; ++i) api_release(g, &in_svs[i]); for (u32 i = 0; i < noutputs; ++i) { diff --git a/src/cg/cgtarget.h b/src/cg/cgtarget.h @@ -673,7 +673,7 @@ struct CgTarget { void (*asm_block)(CgTarget*, const char* tmpl, const AsmConstraint* outs, u32 nout, Operand* out_ops, const AsmConstraint* ins, u32 nin, const Operand* in_ops, const Sym* clobbers, - u32 nclob); + u32 nclob, u32 clobber_abi_sets); /* Optional: handle a top-level `__asm__("...")` block (file scope, not * inside a function). 
Backends that leave this NULL fall back to the diff --git a/src/cg/internal.h b/src/cg/internal.h @@ -216,6 +216,7 @@ const char* api_sym_cstr(CfreeCg* g, CfreeSym sym); int api_asm_parse_match_index(const char* s); const char* api_asm_constraint_body(const char* s); int api_asm_is_early_clobber(const char* s); +int api_asm_is_reg_constraint(char c); void api_asm_memory_clobber_sv(CfreeCg* g, ApiSValue* sv, CGLocal local); void cfree_cg_inline_asm(CfreeCg* g, CfreeCgInlineAsm asm_block); void cfree_cg_file_scope_asm(CfreeCg* g, CfreeSlice asm_source); diff --git a/src/cg/ir.h b/src/cg/ir.h @@ -169,6 +169,7 @@ typedef struct CgIrAsmAux { u32 nout; u32 nin; u32 nclob; + u32 clobber_abi_sets; /* CfreeCgAsmClobberAbiSet bits */ } CgIrAsmAux; typedef struct CgIrIntrinsicAux { diff --git a/src/cg/ir_recorder.c b/src/cg/ir_recorder.c @@ -538,8 +538,8 @@ static void rec_intrinsic(CgTarget* t, IntrinKind kind, Operand* dsts, u32 ndst, static void rec_asm_block(CgTarget* t, const char* tmpl, const AsmConstraint* outs, u32 nout, Operand* out_ops, const AsmConstraint* ins, u32 nin, - const Operand* in_ops, const Sym* clobbers, - u32 nclob) { + const Operand* in_ops, const Sym* clobbers, u32 nclob, + u32 clobber_abi_sets) { CgIrRecorder* r = rec_of(t); CgIrFunc* f = require_func(r); CgIrInst* in = emit(r, CG_IR_ASM_BLOCK); @@ -558,6 +558,7 @@ static void rec_asm_block(CgTarget* t, const char* tmpl, aux->nout = nout; aux->nin = nin; aux->nclob = nclob; + aux->clobber_abi_sets = clobber_abi_sets; in->extra.aux = aux; } diff --git a/src/cg/native_direct_target.c b/src/cg/native_direct_target.c @@ -1707,15 +1707,15 @@ static void nd_intrinsic(CgTarget* t, IntrinKind kind, Operand* dsts, u32 ndst, static void nd_asm_block(CgTarget* t, const char* tmpl, const AsmConstraint* outs, u32 nout, Operand* out_ops, const AsmConstraint* ins, u32 nin, - const Operand* in_ops, const Sym* clobbers, - u32 nclob) { + const Operand* in_ops, const Sym* clobbers, u32 nclob, + u32 clobber_abi_sets) { 
NativeDirectTarget* d = nd_of(t); nd_flush_all(d); nd_barrier(d, NATIVE_DIRECT_BARRIER_INLINE_ASM | NATIVE_DIRECT_BARRIER_MEMORY); if (d->ops && d->ops->asm_block) { d->ops->asm_block(d, tmpl, outs, nout, out_ops, ins, nin, in_ops, clobbers, - nclob); + nclob, clobber_abi_sets); return; } nd_panic(d, "target does not emit inline asm"); diff --git a/src/cg/native_direct_target.h b/src/cg/native_direct_target.h @@ -88,7 +88,7 @@ struct NativeOps { void (*asm_block)(NativeDirectTarget*, const char* tmpl, const AsmConstraint* outs, u32 nout, Operand* out_ops, const AsmConstraint* ins, u32 nin, const Operand* in_ops, - const Sym* clobbers, u32 nclob); + const Sym* clobbers, u32 nclob, u32 clobber_abi_sets); void (*barrier)(NativeDirectTarget*, u32 flags); }; diff --git a/src/opt/cg_ir_lower.c b/src/opt/cg_ir_lower.c @@ -767,6 +767,7 @@ static void lower_asm(CgIrLower* l, Inst* out, const CgIrInst* in) { aux->nout = src->nout; aux->nin = src->nin; aux->nclob = src->nclob; + aux->clobber_abi_sets = src->clobber_abi_sets; aux->out_ops = src->nout ? arena_array(l->f->arena, OptOperand, src->nout) : NULL; aux->in_ops = diff --git a/src/opt/ir.h b/src/opt/ir.h @@ -440,6 +440,9 @@ typedef struct IRAsmAux { out_ops; /* nout slots; the wrapped target may fill in REG location */ Operand* in_ops; /* nin slots; recorded by w_asm_block, xlat'd at replay */ u32 nout, nin, nclob; + /* CfreeCgAsmClobberAbiSet bits: an arch-neutral "clobbers the whole caller/ + * callee-saved set" the backend expands against its own register file. */ + u32 clobber_abi_sets; /* Filled by opt_machinize from backend register-name resolution. 
*/ u32 clobber_mask[OPT_REG_CLASSES]; i32* out_fixed_regs; /* nout, -1 when unconstrained */ diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c @@ -1356,6 +1356,9 @@ static void plan_frame(NativeEmitCtx* e, const CGFuncDesc* fd) { u8 needs_scratch_spill = 0; u8 has_call = 0; u8 has_asm = 0; + u32 nasm_clob = 0; + u32 asm_clobber_abi_sets = 0; + Sym* asm_clobbers = NULL; memset(&frame, 0, sizeof frame); nclasses = t->reserve_callee_saves ? compute_callee_saved_used(e, used, EMIT_MAX_REG_CLASSES) @@ -1377,10 +1380,37 @@ static void plan_frame(NativeEmitCtx* e, const CGFuncDesc* fd) { if (aux && aux->desc.nargs > max_args) max_args = aux->desc.nargs; } else if ((IROp)in->op == IR_ASM_BLOCK) { /* Inline asm may clobber the return-address register or the red zone - * opaquely; disqualifies the frame-eliding tiers (see has_asm). */ + * opaquely; disqualifies the frame-eliding tiers (see has_asm). Its + * callee-saved register clobbers are equally opaque to the operand scan + * below; count them now so the backend can fold them into the saved + * set (collected into a single Sym list in a second pass below). */ + IRAsmAux* aux = (IRAsmAux*)in->extra.aux; has_asm = 1; + if (aux) { + nasm_clob += aux->nclob; + asm_clobber_abi_sets |= aux->clobber_abi_sets; + } + } + } + } + /* Gather the union of every asm block's clobber names. The backend resolves + * them with its own clobber parser (machinize's resolve_name is unset on every + * backend, so aux->clobber_mask is unreliable here). 
*/ + if (nasm_clob) { + u32 n = 0; + asm_clobbers = arena_array(e->f->arena, Sym, nasm_clob); + for (u32 b = 0; b < e->f->nblocks; ++b) { + Block* bl = &e->f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* in = &bl->insts[i]; + IRAsmAux* aux; + if ((IROp)in->op != IR_ASM_BLOCK) continue; + aux = (IRAsmAux*)in->extra.aux; + for (u32 k = 0; aux && k < aux->nclob; ++k) + asm_clobbers[n++] = aux->clobbers[k]; } } + nasm_clob = n; } if (t->call_stack_bytes) { NativeLoc* args = @@ -1436,6 +1466,9 @@ static void plan_frame(NativeEmitCtx* e, const CGFuncDesc* fd) { frame.needs_scratch_spill = needs_scratch_spill; frame.is_leaf = !has_call; frame.has_asm = has_asm; + frame.asm_clobbers = asm_clobbers; + frame.nasm_clobbers = nasm_clob; + frame.asm_clobber_abi_sets = asm_clobber_abi_sets; t->func_begin_known_frame(t, fd, &frame, out_slots); for (u32 i = 0; i < e->f->nframe_slots; ++i) e->slot_map[e->f->frame_slots[i].id] = out_slots[i]; diff --git a/test/toy/cases/142_typed_asm_clobber_abi_callee.expected b/test/toy/cases/142_typed_asm_clobber_abi_callee.expected @@ -0,0 +1 @@ +42 diff --git a/test/toy/cases/142_typed_asm_clobber_abi_callee.toy b/test/toy/cases/142_typed_asm_clobber_abi_callee.toy @@ -0,0 +1,12 @@ +fn __user_main(): i64 { + @asm<void>( + "", + outputs(), + inputs(), + clobber_abi(.callee_saved), + flags(.volatile) + ); + return 42; +} + +fn main(): i32 { return __user_main() as i32; }