arch: fix -O1 inline-asm callee-saved clobber frame panic; add FP constraints + clobber_abi - kit

commit ca23ef4abbc29a06c4b0da8fda4fa1d37d4acbe8
parent 54d233934315c9eb5df71573041f9e4c6488f000
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Mon,  1 Jun 2026 14:43:29 -0700

arch: fix -O1 inline-asm callee-saved clobber frame panic; add FP constraints + clobber_abi

A function (leaf or non-leaf) whose inline asm clobbers a callee-saved register
panicked on the optimizer path with "frame slot requested after prologue".
The optimizer's plan_frame builds the function's callee-saved set by scanning
MIR operands, but asm clobbers are not operands -- so the clobbered register
was never saved by the prologue, and the per-block fallback (*_asm_save_one)
tried to allocate a spill slot after the frame was already final. rv64/x64/aa64
all hit it.

Fix: forward the asm clobber names (and the clobber-ABI sets, below) through
NativeKnownFrameDesc; each backend resolves them with its own clobber parser and
folds the callee-saved subset into the prologue-saved set using its ABI
predicate (rv64 excludes s0; x64 keeps the reserved-but-callee-saved rbx/r12;
aa64 uses x19..x28 / v8..v15). The optimizer-path asm hook then drops the
per-block spill entirely. machinize's resolve_name is unset on every backend, so
aux->clobber_mask is unreliable; the name-based path sidesteps that.

Also:
- FP register asm constraints: the arch-neutral CG layer rejected 'f'/'x'/'w'
  (riscv/x86/aarch64 FP register classes) even though every backend's asm hook
  handles them, so =f/=x/=w were unreachable from C. Accept them via
  api_asm_is_reg_constraint; the operand's type routes it to the FP class.
- clobber_abi was parsed but dropped (a no-op): thread clobber_abi_sets through
  the asm_block vtable -> IRAsmAux -> plan_frame and the O0 direct path, and add
  CFREE_CG_ASM_CLOBBER_ABI_CALLEE_SAVED. Each backend expands the set against its
  own register file. This is the arch-neutral way to spill the whole callee-saved
  file across an asm block, and the basis for a portable toy regression.
- toy: parse clobber_abi(.callee_saved); add case 142 exercising the fold at -O1
  across all lanes/arches.

Diffstat:
M include/cfree/cg.h  | 6 ++++++
M lang/toy/asm.c  | 2 ++
M src/arch/aa64/native.c  | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
M src/arch/check_target.c  | 3 ++-
M src/arch/native_target.h  | 15 +++++++++++++++
M src/arch/rv64/native.c  | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
M src/arch/x64/native.c  | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
M src/cg/asm.c  | 24 +++++++++++++++++++-----
M src/cg/cgtarget.h  | 2 +-
M src/cg/internal.h  | 1 +
M src/cg/ir.h  | 1 +
M src/cg/ir_recorder.c  | 5 +++--
M src/cg/native_direct_target.c  | 6 +++---
M src/cg/native_direct_target.h  | 2 +-
M src/opt/cg_ir_lower.c  | 1 +
M src/opt/ir.h  | 3 +++
M src/opt/pass_native_emit.c  | 35 ++++++++++++++++++++++++++++++++++-
A test/toy/cases/142_typed_asm_clobber_abi_callee.expected  | 1 +
A test/toy/cases/142_typed_asm_clobber_abi_callee.toy  | 12 ++++++++++++

19 files changed, 318 insertions(+), 53 deletions(-)
diff --git a/include/cfree/cg.h b/include/cfree/cg.h
@@ -978,6 +978,12 @@ typedef enum CfreeCgAsmFlag {
 typedef enum CfreeCgAsmClobberAbiSet {
   CFREE_CG_ASM_CLOBBER_ABI_NONE = 0,
   CFREE_CG_ASM_CLOBBER_ABI_CALLER_SAVED = 1u << 0,
+  /* Every callee-saved register of the target ABI. The compiler preserves them
+   * across the asm block (prologue/epilogue save on the optimizer path, a
+   * per-block spill on the single-pass path) just as it would for named
+   * callee-saved clobbers — an arch-neutral way to say "this asm trashes the
+   * callee-saved register file". */
+  CFREE_CG_ASM_CLOBBER_ABI_CALLEE_SAVED = 1u << 1,
 } CfreeCgAsmClobberAbiSet;
 
 typedef struct CfreeCgAsmOperand {
diff --git a/lang/toy/asm.c b/lang/toy/asm.c
@@ -324,6 +324,8 @@ static int toy_parse_asm_clobber_abi(ToyParser* p, uint32_t* clobber_abi_sets) {
     if (!toy_parse_attr_dot_name(p, &name)) return 0;
     if (toy_sym_is(p, name, "caller_saved"))
       *clobber_abi_sets |= CFREE_CG_ASM_CLOBBER_ABI_CALLER_SAVED;
+    else if (toy_sym_is(p, name, "callee_saved"))
+      *clobber_abi_sets |= CFREE_CG_ASM_CLOBBER_ABI_CALLEE_SAVED;
     else {
       toy_error(p, p->cur.loc, "unknown asm clobber ABI");
       return 0;
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -1526,6 +1526,47 @@ static int aa_frame_slot_debug_loc(NativeTarget* t, NativeFrameSlot slot,
   return 1;
 }
 
+static void aa_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers,
+                                 u32 nclob, u32* int_mask, u32* fp_mask);
+/* Defined after aa_classes (below); forward-declared so the frame helper can use
+ * it. Expands CfreeCgAsmClobberAbiSet bits into per-class register masks. */
+static void aa_abi_clobber_masks(u32 abi_sets, u32* int_mask, u32* fp_mask);
+
+/* Build the callee-saved set the prologue must preserve: the allocator-assigned
+ * callee-saved registers (frame->callee_saved_used) plus any an inline-asm block
+ * clobbers. The latter are opaque to the optimizer's operand scan, so it
+ * forwards the raw clobber names (frame->asm_clobbers) and the arch-neutral
+ * clobber-ABI sets (frame->asm_clobber_abi_sets); we resolve both into masks and
+ * keep only the callee-saved ones, per AAPCS64: x19..x28 and the low 64 bits of
+ * v8..v15 (x29/x30 are the frame pointer and link register, handled by the
+ * prologue head, not as ordinary callee-saves). This is the same register
+ * selection the per-block spill used, hoisted into the prologue. */
+static u32 aa_known_callee_saves(NativeTarget* t,
+                                 const NativeKnownFrameDesc* frame, u32* out,
+                                 u32 cap) {
+  u32 ncls = frame->ncallee_classes;
+  u32 clob_int = 0, clob_fp = 0, abi_int, abi_fp;
+  if (ncls > cap) ncls = cap;
+  for (u32 c = 0; c < ncls; ++c)
+    out[c] = frame->callee_saved_used ? frame->callee_saved_used[c] : 0u;
+  if (frame->asm_clobbers && frame->nasm_clobbers) {
+    AANativeTarget* a = aa_of(t);
+    SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
+    aa_asm_clobber_masks(t->c, loc, frame->asm_clobbers, frame->nasm_clobbers,
+                         &clob_int, &clob_fp);
+  }
+  aa_abi_clobber_masks(frame->asm_clobber_abi_sets, &abi_int, &abi_fp);
+  clob_int |= abi_int;
+  clob_fp |= abi_fp;
+  for (Reg r = 0; r < 32u; ++r) {
+    if (NATIVE_REG_INT < ncls && (clob_int & (1u << r)) && r >= 19u && r <= 28u)
+      out[NATIVE_REG_INT] |= 1u << r;
+    if (NATIVE_REG_FP < ncls && (clob_fp & (1u << r)) && r >= 8u && r <= 15u)
+      out[NATIVE_REG_FP] |= 1u << r;
+  }
+  return ncls;
+}
+
 /* Optimizer entry point: the full frame is supplied up front, so the prologue,
  * entry saves, slim-form eligibility, allocas, and tail epilogues are all final
  * the moment they are emitted — no back-patching (aa_func_end skips the patch
@@ -1542,10 +1583,10 @@ static void aa_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd,
   aa_func_begin_common(t, fd);
   a->frame.known_frame = 1;
   if (frame) {
+    u32 cs[NATIVE_CALL_PLAN_CLASSES];
+    u32 ncs = aa_known_callee_saves(t, frame, cs, NATIVE_CALL_PLAN_CLASSES);
     a->frame.has_alloca = frame->has_alloca;
-    if (frame->callee_saved_used && frame->ncallee_classes)
-      aa_reserve_callee_saves(t, frame->callee_saved_used,
-                              frame->ncallee_classes);
+    if (ncs) aa_reserve_callee_saves(t, cs, ncs);
     for (u32 i = 0; i < frame->nslots; ++i) {
       NativeFrameSlot slot = aa_frame_slot(t, &frame->slots[i]);
       if (out_slots) out_slots[i] = slot;
@@ -3479,6 +3520,22 @@ static const NativeRegInfo aa_reg_info = {
     .nclasses = sizeof aa_classes / sizeof aa_classes[0],
 };
 
+/* Expand the arch-neutral clobber-ABI sets (CfreeCgAsmClobberAbiSet bits) into
+ * this target's per-class caller/callee-saved register masks. Forward-declared
+ * earlier for aa_known_callee_saves; defined here where aa_classes is in scope. */
+static void aa_abi_clobber_masks(u32 abi_sets, u32* int_mask, u32* fp_mask) {
+  *int_mask = 0;
+  *fp_mask = 0;
+  if (abi_sets & CFREE_CG_ASM_CLOBBER_ABI_CALLER_SAVED) {
+    *int_mask |= aa_classes[NATIVE_REG_INT].caller_saved_mask;
+    *fp_mask |= aa_classes[NATIVE_REG_FP].caller_saved_mask;
+  }
+  if (abi_sets & CFREE_CG_ASM_CLOBBER_ABI_CALLEE_SAVED) {
+    *int_mask |= aa_classes[NATIVE_REG_INT].callee_saved_mask;
+    *fp_mask |= aa_classes[NATIVE_REG_FP].callee_saved_mask;
+  }
+}
+
 static void aa_va_start_native(NativeTarget* t, NativeLoc ap_ptr);
 static void aa_va_arg_native(NativeTarget* t, NativeLoc dst, NativeLoc ap_ptr,
                              CfreeCgTypeId type);
@@ -4081,9 +4138,8 @@ static int aa_asm_parse_reg_clobber(Compiler* c, SrcLoc loc, Sym name,
   return 0;
 }
 
-AA_UNUSED_FN static void aa_asm_clobber_masks(Compiler* c, SrcLoc loc,
-                                              const Sym* clobbers, u32 nclob,
-                                              u32* int_mask, u32* fp_mask) {
+static void aa_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers,
+                                 u32 nclob, u32* int_mask, u32* fp_mask) {
   *int_mask = 0;
   *fp_mask = 0;
   for (u32 i = 0; i < nclob; ++i) {
@@ -4249,16 +4305,20 @@ static void aa_direct_asm_block(NativeDirectTarget* d, const char* tmpl,
                                 const AsmConstraint* outs, u32 nout,
                                 Operand* out_ops, const AsmConstraint* ins,
                                 u32 nin, const Operand* in_ops,
-                                const Sym* clobbers, u32 nclob) {
+                                const Sym* clobbers, u32 nclob,
+                                u32 clobber_abi_sets) {
   Operand* bound_outs =
       nout ? arena_zarray(d->base.c->tu, Operand, nout) : NULL;
   Operand* bound_ins = nin ? arena_zarray(d->base.c->tu, Operand, nin) : NULL;
-  u32 clob_int, clob_fp, used_int, used_fp;
+  u32 clob_int, clob_fp, abi_int, abi_fp, used_int, used_fp;
   AAAsmSavedClobber* saved;
   u32 nsaved;
   AA64Asm* a;
 
   aa_asm_clobber_masks(d->base.c, d->loc, clobbers, nclob, &clob_int, &clob_fp);
+  aa_abi_clobber_masks(clobber_abi_sets, &abi_int, &abi_fp);
+  clob_int |= abi_int;
+  clob_fp |= abi_fp;
   used_int = clob_int | (1u << AA_TMP0) | (1u << AA_TMP1) | (1u << 18u) |
              (1u << AA_FP) | (1u << AA_LR) | (1u << AA_SP);
   used_fp = clob_fp | (1u << 20u) | (1u << 21u);
@@ -4442,13 +4502,9 @@ static void aa_asm_block_native(NativeTarget* t, const char* tmpl,
   SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
   Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL;
   Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL;
-  u32 clob_int, clob_fp, ntmp = 0;
-  AAAsmSavedClobber* saved;
-  u32 nsaved;
+  u32 ntmp = 0;
   AA64Asm* asmh;
 
-  aa_asm_clobber_masks(c, loc, clobbers, nclob, &clob_int, &clob_fp);
-
   for (u32 i = 0; i < nout; ++i) {
     CfreeCgTypeId type = outs[i].type ? outs[i].type : out_locs[i].type;
     aa_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, out_locs[i],
@@ -4490,13 +4546,14 @@ static void aa_asm_block_native(NativeTarget* t, const char* tmpl,
     }
   }
 
-  saved = aa_asm_save_callee_clobbers(a, clob_int, clob_fp, &nsaved);
+  /* No per-block callee-saved spill here: plan_frame forwarded the asm clobber
+   * names and clobber-ABI sets, and aa_known_callee_saves folded the
+   * callee-saved ones into the prologue/epilogue save set. */
   asmh = aa64_asm_open(c);
   aa64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers,
                    nclob);
   aa64_asm_run_template(asmh, t->mc, tmpl);
   aa64_asm_close(asmh);
-  for (u32 i = nsaved; i > 0; --i) aa_asm_restore_one(a, &saved[i - 1u]);
 }
 
 static const NativeOps aa_direct_ops = {
diff --git a/src/arch/check_target.c b/src/arch/check_target.c
@@ -282,7 +282,7 @@ static void check_asm_block(CgTarget* t, const char* tmpl,
                             const AsmConstraint* outs, u32 nout,
                             Operand* out_ops, const AsmConstraint* ins, u32 nin,
                             const Operand* in_ops, const Sym* clobbers,
-                            u32 nclob) {
+                            u32 nclob, u32 clobber_abi_sets) {
   (void)t;
   (void)tmpl;
   (void)outs;
@@ -293,6 +293,7 @@ static void check_asm_block(CgTarget* t, const char* tmpl,
   (void)in_ops;
   (void)clobbers;
   (void)nclob;
+  (void)clobber_abi_sets;
 }
 
 static void check_atomic_load(CgTarget* t, Operand dst, Operand addr,
diff --git a/src/arch/native_target.h b/src/arch/native_target.h
@@ -55,6 +55,21 @@ typedef struct NativeKnownFrameDesc {
    * full frame is fixed before the prologue is emitted. NULL / 0 means none. */
   const u32* callee_saved_used;
   u32 ncallee_classes;
+  /* Union of the clobber register names of every inline-asm block in the body.
+   * Inline-asm clobbers are invisible to the operand scan that builds
+   * callee_saved_used, so the optimizer forwards the raw names here and the
+   * backend resolves them with its own clobber parser, folding the callee-saved
+   * ones into its save set (applying its ABI predicate, which excludes the frame
+   * pointer and keeps any reserved-but-callee-saved scratch such as x64 rbx).
+   * The prologue/epilogue then preserve them, so the asm hook needs no per-block
+   * spill — which on the known-frame path would request a frame slot after the
+   * frame is already final. NULL / 0 when the body contains no inline asm. */
+  const Sym* asm_clobbers;
+  u32 nasm_clobbers;
+  /* Union of CfreeCgAsmClobberAbiSet bits over the body's inline-asm blocks: an
+   * arch-neutral "clobbers the whole caller/callee-saved set" the backend
+   * expands against its own register file, alongside the named asm_clobbers. */
+  u32 asm_clobber_abi_sets;
   /* Whether the function body contains a dynamic alloca. The backend needs this
    * up front (before the body) to decide prologue/epilogue form, since with a
    * known frame the slim-epilogue eligibility is settled at func_begin. */
diff --git a/src/arch/rv64/native.c b/src/arch/rv64/native.c
@@ -1521,6 +1521,63 @@ static void rv_reserve_callee_saves(NativeTarget* t, const u32* used,
   native_frame_set_callee_saves(&rv_of(t)->frame, used, nclasses, NULL, 0, 0);
 }
 
+static int rv_reg_is_callee_int(Reg r);
+static int rv_reg_is_callee_fp(Reg r);
+static void rv_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers,
+                                 u32 nclob, u32* int_mask, u32* fp_mask);
+
+/* Expand the arch-neutral clobber-ABI sets (CfreeCgAsmClobberAbiSet bits) into
+ * this target's per-class caller/callee-saved register masks. */
+static void rv_abi_clobber_masks(u32 abi_sets, u32* int_mask, u32* fp_mask) {
+  *int_mask = 0;
+  *fp_mask = 0;
+  if (abi_sets & CFREE_CG_ASM_CLOBBER_ABI_CALLER_SAVED) {
+    *int_mask |= rv_classes[NATIVE_REG_INT].caller_saved_mask;
+    *fp_mask |= rv_classes[NATIVE_REG_FP].caller_saved_mask;
+  }
+  if (abi_sets & CFREE_CG_ASM_CLOBBER_ABI_CALLEE_SAVED) {
+    *int_mask |= rv_classes[NATIVE_REG_INT].callee_saved_mask;
+    *fp_mask |= rv_classes[NATIVE_REG_FP].callee_saved_mask;
+  }
+}
+
+/* Build the callee-saved set the prologue must preserve: the allocator-assigned
+ * callee-saved registers (frame->callee_saved_used) plus any an inline-asm block
+ * clobbers. The latter are opaque to the optimizer's operand scan, so it
+ * forwards the raw clobber names (frame->asm_clobbers) and the arch-neutral
+ * clobber-ABI sets (frame->asm_clobber_abi_sets); we resolve both into masks and
+ * keep only the callee-saved ones — rv_reg_is_callee_int excludes s0 (the frame
+ * pointer, preserved by the prologue head, not as an ordinary callee-save). This
+ * is the same register selection the per-block spill used, hoisted into the
+ * prologue. Writes up to `cap` per-class masks into `out` and returns the class
+ * count to reserve. */
+static u32 rv_known_callee_saves(NativeTarget* t,
+                                 const NativeKnownFrameDesc* frame, u32* out,
+                                 u32 cap) {
+  u32 ncls = frame->ncallee_classes;
+  u32 clob_int = 0, clob_fp = 0, abi_int, abi_fp;
+  if (ncls > cap) ncls = cap;
+  for (u32 c = 0; c < ncls; ++c)
+    out[c] = frame->callee_saved_used ? frame->callee_saved_used[c] : 0u;
+  if (frame->asm_clobbers && frame->nasm_clobbers) {
+    RvNativeTarget* a = rv_of(t);
+    SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
+    rv_asm_clobber_masks(t->c, loc, frame->asm_clobbers, frame->nasm_clobbers,
+                         &clob_int, &clob_fp);
+  }
+  rv_abi_clobber_masks(frame->asm_clobber_abi_sets, &abi_int, &abi_fp);
+  clob_int |= abi_int;
+  clob_fp |= abi_fp;
+  for (Reg r = 0; r < 32u; ++r) {
+    if (NATIVE_REG_INT < ncls && (clob_int & (1u << r)) &&
+        rv_reg_is_callee_int(r))
+      out[NATIVE_REG_INT] |= 1u << r;
+    if (NATIVE_REG_FP < ncls && (clob_fp & (1u << r)) && rv_reg_is_callee_fp(r))
+      out[NATIVE_REG_FP] |= 1u << r;
+  }
+  return ncls;
+}
+
 static u32 rv_signature_stack_bytes(NativeTarget* t, CfreeCgTypeId fn_type,
                                     int* variadic, u32* nparams);
 
@@ -1542,10 +1599,10 @@ static void rv_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd,
   rv_func_begin_common(t, fd);
   a->frame.known_frame = 1;
   if (frame) {
+    u32 cs[NATIVE_CALL_PLAN_CLASSES];
+    u32 ncs = rv_known_callee_saves(t, frame, cs, NATIVE_CALL_PLAN_CLASSES);
     a->frame.has_alloca = frame->has_alloca;
-    if (frame->callee_saved_used && frame->ncallee_classes)
-      rv_reserve_callee_saves(t, frame->callee_saved_used,
-                              frame->ncallee_classes);
+    if (ncs) rv_reserve_callee_saves(t, cs, ncs);
     for (i = 0; i < frame->nslots; ++i) {
       NativeFrameSlot slot = rv_frame_slot(t, &frame->slots[i]);
       if (out_slots) out_slots[i] = slot;
@@ -3200,13 +3257,9 @@ static void rv_asm_block_native(NativeTarget* t, const char* tmpl,
   SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
   Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL;
   Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL;
-  u32 clob_int, clob_fp, ntmp = 0;
-  RvAsmSavedClobber* saved;
-  u32 nsaved, i;
+  u32 ntmp = 0, i;
   Rv64Asm* asmh;
 
-  rv_asm_clobber_masks(c, loc, clobbers, nclob, &clob_int, &clob_fp);
-
   for (i = 0; i < nout; ++i) {
     CfreeCgTypeId type = outs[i].type ? outs[i].type : out_locs[i].type;
     rv_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, out_locs[i],
@@ -3239,13 +3292,14 @@ static void rv_asm_block_native(NativeTarget* t, const char* tmpl,
     rv_asm_bind_native(a, loc, &bound_ins[i], ins[i].str, type, inloc, &ntmp);
   }
 
-  saved = rv_asm_save_callee_clobbers(a, clob_int, clob_fp, &nsaved);
+  /* No per-block callee-saved spill here: plan_frame forwarded the asm clobber
+   * names and clobber-ABI sets, and rv_known_callee_saves folded the
+   * callee-saved ones into the prologue/epilogue save set. */
   asmh = rv64_asm_open(c);
   rv64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers,
                    nclob);
   rv64_asm_run_template(asmh, t->mc, tmpl);
   rv64_asm_close(asmh);
-  for (i = nsaved; i > 0; --i) rv_asm_restore_one(a, &saved[i - 1u]);
 }
 static void rv_file_scope_asm(NativeTarget* t, const char* src, size_t len) {
   /* Top-level __asm__("...") — assemble through the generic .s parser, which
@@ -3476,17 +3530,21 @@ static void rv_direct_asm_block(NativeDirectTarget* d, const char* tmpl,
                                 const AsmConstraint* outs, u32 nout,
                                 Operand* out_ops, const AsmConstraint* ins,
                                 u32 nin, const Operand* in_ops,
-                                const Sym* clobbers, u32 nclob) {
+                                const Sym* clobbers, u32 nclob,
+                                u32 clobber_abi_sets) {
   RvNativeTarget* a = rv_of(d->native);
   Compiler* c = d->base.c;
   Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL;
   Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL;
-  u32 clob_int, clob_fp, used_int, used_fp;
+  u32 clob_int, clob_fp, abi_int, abi_fp, used_int, used_fp;
   RvAsmSavedClobber* saved;
   u32 nsaved, i;
   Rv64Asm* asmh;
 
   rv_asm_clobber_masks(c, d->loc, clobbers, nclob, &clob_int, &clob_fp);
+  rv_abi_clobber_masks(clobber_abi_sets, &abi_int, &abi_fp);
+  clob_int |= abi_int;
+  clob_fp |= abi_fp;
   /* Reserve emit scratch (t0/t1/t2/t3), sp/gp/tp/zero/ra and the frame pointer
    * so the operand allocator never hands them out. */
   used_int = clob_int | (1u << RV_ZERO) | (1u << RV_RA) | (1u << RV_SP) |
diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c
@@ -1634,6 +1634,63 @@ static void x64_reserve_callee_saves(NativeTarget* t, const u32* used,
   native_frame_set_callee_saves(&x64_of(t)->frame, used, nclasses, NULL, 0, 0);
 }
 
+static int x64_reg_is_callee_int(const X64ABIRegs* abi, Reg r);
+static int x64_reg_is_callee_fp(const X64ABIRegs* abi, Reg r);
+static void x64_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers,
+                                  u32 nclob, u32* int_mask, u32* fp_mask);
+
+/* Expand the arch-neutral clobber-ABI sets (CfreeCgAsmClobberAbiSet bits) into
+ * this target's per-class caller/callee-saved register masks. */
+static void x64_abi_clobber_masks(u32 abi_sets, u32* int_mask, u32* fp_mask) {
+  *int_mask = 0;
+  *fp_mask = 0;
+  if (abi_sets & CFREE_CG_ASM_CLOBBER_ABI_CALLER_SAVED) {
+    *int_mask |= x64_classes[NATIVE_REG_INT].caller_saved_mask;
+    *fp_mask |= x64_classes[NATIVE_REG_FP].caller_saved_mask;
+  }
+  if (abi_sets & CFREE_CG_ASM_CLOBBER_ABI_CALLEE_SAVED) {
+    *int_mask |= x64_classes[NATIVE_REG_INT].callee_saved_mask;
+    *fp_mask |= x64_classes[NATIVE_REG_FP].callee_saved_mask;
+  }
+}
+
+/* Build the callee-saved set the prologue must preserve: the allocator-assigned
+ * callee-saved registers (frame->callee_saved_used) plus any an inline-asm block
+ * clobbers. The latter are opaque to the optimizer's operand scan, so it
+ * forwards the raw clobber names (frame->asm_clobbers) and the arch-neutral
+ * clobber-ABI sets (frame->asm_clobber_abi_sets); we resolve both into masks and
+ * keep only the callee-saved ones. x64_reg_is_callee_* follow the live ABI: they
+ * exclude rbp (handled by the prologue head) and keep the reserved-but-callee-
+ * saved scratch rbx/r12 (which the caller still expects preserved). This is the
+ * same register selection the per-block spill used, hoisted into the prologue. */
+static u32 x64_known_callee_saves(NativeTarget* t, const X64ABIRegs* abi,
+                                  const NativeKnownFrameDesc* frame, u32* out,
+                                  u32 cap) {
+  u32 ncls = frame->ncallee_classes;
+  u32 clob_int = 0, clob_fp = 0, abi_int, abi_fp;
+  if (ncls > cap) ncls = cap;
+  for (u32 c = 0; c < ncls; ++c)
+    out[c] = frame->callee_saved_used ? frame->callee_saved_used[c] : 0u;
+  if (frame->asm_clobbers && frame->nasm_clobbers) {
+    X64NativeTarget* a = x64_of(t);
+    SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
+    x64_asm_clobber_masks(t->c, loc, frame->asm_clobbers, frame->nasm_clobbers,
+                          &clob_int, &clob_fp);
+  }
+  x64_abi_clobber_masks(frame->asm_clobber_abi_sets, &abi_int, &abi_fp);
+  clob_int |= abi_int;
+  clob_fp |= abi_fp;
+  for (Reg r = 0; r < 16u; ++r) {
+    if (NATIVE_REG_INT < ncls && (clob_int & (1u << r)) &&
+        x64_reg_is_callee_int(abi, r))
+      out[NATIVE_REG_INT] |= 1u << r;
+    if (NATIVE_REG_FP < ncls && (clob_fp & (1u << r)) &&
+        x64_reg_is_callee_fp(abi, r))
+      out[NATIVE_REG_FP] |= 1u << r;
+  }
+  return ncls;
+}
+
 /* Optimizer entry point: the full frame is supplied up front, so the prologue
  * is emitted final the moment it is built — no NOP region, no func_end patch
  * (x64_func_end skips patching when known_frame). x64_build_prologue emits the
@@ -1650,10 +1707,11 @@ static void x64_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd,
   x64_func_begin_common(t, fd);
   a->frame.known_frame = 1;
   if (frame) {
+    u32 cs[NATIVE_CALL_PLAN_CLASSES];
+    u32 ncs =
+        x64_known_callee_saves(t, a->abi, frame, cs, NATIVE_CALL_PLAN_CLASSES);
     a->frame.has_alloca = frame->has_alloca;
-    if (frame->callee_saved_used && frame->ncallee_classes)
-      x64_reserve_callee_saves(t, frame->callee_saved_used,
-                               frame->ncallee_classes);
+    if (ncs) x64_reserve_callee_saves(t, cs, ncs);
     for (i = 0; i < frame->nslots; ++i) {
       NativeFrameSlot slot = x64_frame_slot(t, &frame->slots[i]);
       if (out_slots) out_slots[i] = slot;
@@ -3674,13 +3732,9 @@ static void x64_asm_block_native(NativeTarget* t, const char* tmpl,
   SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
   Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL;
   Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL;
-  u32 clob_int, clob_fp, ntmp = 0;
-  X64AsmSavedClobber* saved;
-  u32 nsaved, i;
+  u32 ntmp = 0, i;
   X64Asm* asmh;
 
-  x64_asm_clobber_masks(c, loc, clobbers, nclob, &clob_int, &clob_fp);
-
   for (i = 0; i < nout; ++i) {
     CfreeCgTypeId type = outs[i].type ? outs[i].type : out_locs[i].type;
     x64_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, out_locs[i],
@@ -3711,13 +3765,14 @@ static void x64_asm_block_native(NativeTarget* t, const char* tmpl,
     x64_asm_bind_native(a, loc, &bound_ins[i], ins[i].str, type, inloc, &ntmp);
   }
 
-  saved = x64_asm_save_callee_clobbers(a, clob_int, clob_fp, &nsaved);
+  /* No per-block callee-saved spill here: plan_frame forwarded the asm clobber
+   * names and clobber-ABI sets, and x64_known_callee_saves folded the
+   * callee-saved ones into the prologue/epilogue save set. */
   asmh = x64_asm_open(c);
   x64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers,
                   nclob);
   x64_asm_run_template(asmh, t->mc, tmpl);
   x64_asm_close(asmh);
-  for (i = nsaved; i > 0; --i) x64_asm_restore_one(a, &saved[i - 1u]);
 }
 
 static void x64_file_scope_asm(NativeTarget* t, const char* src, size_t len) {
@@ -3989,17 +4044,21 @@ static void x64_direct_asm_block(NativeDirectTarget* d, const char* tmpl,
                                  const AsmConstraint* outs, u32 nout,
                                  Operand* out_ops, const AsmConstraint* ins,
                                  u32 nin, const Operand* in_ops,
-                                 const Sym* clobbers, u32 nclob) {
+                                 const Sym* clobbers, u32 nclob,
+                                 u32 clobber_abi_sets) {
   X64NativeTarget* a = x64_of(d->native);
   Compiler* c = d->base.c;
   Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL;
   Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL;
-  u32 clob_int, clob_fp, used_int, used_fp;
+  u32 clob_int, clob_fp, abi_int, abi_fp, used_int, used_fp;
   X64AsmSavedClobber* saved;
   u32 nsaved, i;
   X64Asm* asmh;
 
   x64_asm_clobber_masks(c, d->loc, clobbers, nclob, &clob_int, &clob_fp);
+  x64_abi_clobber_masks(clobber_abi_sets, &abi_int, &abi_fp);
+  clob_int |= abi_int;
+  clob_fp |= abi_fp;
   /* Reserve emit scratch (rax,r11), driver scratch, sp/bp, and clobbers. */
   used_int = clob_int | (1u << X64_RAX) | (1u << X64_R11) | (1u << X64_RSP) |
              (1u << X64_RBP) | (1u << X64_RBX) | (1u << X64_R12) |
diff --git a/src/cg/asm.c b/src/cg/asm.c
@@ -29,6 +29,17 @@ int api_asm_is_early_clobber(const char* s) {
   return (s[0] == '=' && s[1] == '&') || s[0] == '&';
 }
 
+/* Does this constraint body name a register operand (one that binds to a temp
+ * local, as opposed to 'i' immediate or 'm' memory)? 'r' is the architecture-
+ * neutral general-register class; 'f' (riscv), 'x' (x86 SSE) and 'w' (aarch64
+ * SIMD/FP) are the per-target FP/vector register classes. The temp local's type
+ * selects the actual NativeAllocClass downstream, and the target's asm hook
+ * rejects a letter that does not apply to it, so listing all three here is safe
+ * across backends. */
+int api_asm_is_reg_constraint(char c) {
+  return c == 'r' || c == 'f' || c == 'x' || c == 'w';
+}
+
 void api_asm_memory_clobber_sv(CfreeCg* g, ApiSValue* sv, CGLocal local_id) {
   (void)g;
   (void)sv;
@@ -60,8 +71,8 @@ void cfree_cg_inline_asm(CfreeCg* g, CfreeCgInlineAsm asm_block) {
   uint32_t ninputs = asm_block.ninputs;
   const CfreeSym* clobbers = asm_block.clobbers;
   uint32_t nclobbers = asm_block.nclobbers;
+  uint32_t clobber_abi_sets = asm_block.clobber_abi_sets;
   (void)asm_block.flags;
-  (void)asm_block.clobber_abi_sets;
   if (!g) return;
   api_local_const_memory_boundary(g);
   T = g->target;
@@ -145,7 +156,10 @@ void cfree_cg_inline_asm(CfreeCg* g, CfreeCgInlineAsm asm_block) {
   for (u32 i = 0; i < noutputs; ++i) {
     const char* body = api_asm_constraint_body(outs[i].str);
     if (api_asm_is_early_clobber(outs[i].str)) continue;
-    if (body[0] == 'r') {
+    /* A register constraint binds to a temp local; the local's type selects the
+     * register class (integer vs FP), so the backend hook places an FP-class
+     * output (riscv 'f', x86 'x', aarch64 'w') in an FP register. */
+    if (api_asm_is_reg_constraint(body[0])) {
       CfreeCgTypeId oty = outs[i].type ? outs[i].type : fallback_ty;
       CGLocal r = api_alloc_temp_local(g, oty);
       out_ops[i] = api_op_local(r, oty);
@@ -182,7 +196,7 @@ void cfree_cg_inline_asm(CfreeCg* g, CfreeCgInlineAsm asm_block) {
         T->copy(T, bound, src);
       }
       in_ops[i] = bound;
-    } else if (s[0] == 'r') {
+    } else if (api_asm_is_reg_constraint(s[0])) {
       in_ops[i] = api_force_local(g, &in_svs[i], ity);
     } else if (s[0] == 'i') {
       if (!api_sv_op_is(&in_svs[i], OPK_IMM)) {
@@ -216,7 +230,7 @@ void cfree_cg_inline_asm(CfreeCg* g, CfreeCgInlineAsm asm_block) {
     CGLocal r;
     if (!api_asm_is_early_clobber(outs[i].str)) continue;
     body = api_asm_constraint_body(outs[i].str);
-    if (body[0] != 'r') {
+    if (!api_asm_is_reg_constraint(body[0])) {
       compiler_panic(g->c, g->cur_loc,
                      "CfreeCg: unsupported early-clobber asm output");
       continue;
@@ -250,7 +264,7 @@ void cfree_cg_inline_asm(CfreeCg* g, CfreeCgInlineAsm asm_block) {
   }
 
   T->asm_block(T, tmpl_str, outs, noutputs, out_ops, ins, total_inputs, in_ops,
-               clobs, nclobbers);
+               clobs, nclobbers, clobber_abi_sets);
 
   for (u32 i = 0; i < total_inputs; ++i) api_release(g, &in_svs[i]);
   for (u32 i = 0; i < noutputs; ++i) {
diff --git a/src/cg/cgtarget.h b/src/cg/cgtarget.h
@@ -673,7 +673,7 @@ struct CgTarget {
   void (*asm_block)(CgTarget*, const char* tmpl, const AsmConstraint* outs,
                     u32 nout, Operand* out_ops, const AsmConstraint* ins,
                     u32 nin, const Operand* in_ops, const Sym* clobbers,
-                    u32 nclob);
+                    u32 nclob, u32 clobber_abi_sets);
 
   /* Optional: handle a top-level `__asm__("...")` block (file scope, not
    * inside a function). Backends that leave this NULL fall back to the
diff --git a/src/cg/internal.h b/src/cg/internal.h
@@ -216,6 +216,7 @@ const char* api_sym_cstr(CfreeCg* g, CfreeSym sym);
 int api_asm_parse_match_index(const char* s);
 const char* api_asm_constraint_body(const char* s);
 int api_asm_is_early_clobber(const char* s);
+int api_asm_is_reg_constraint(char c);
 void api_asm_memory_clobber_sv(CfreeCg* g, ApiSValue* sv, CGLocal local);
 void cfree_cg_inline_asm(CfreeCg* g, CfreeCgInlineAsm asm_block);
 void cfree_cg_file_scope_asm(CfreeCg* g, CfreeSlice asm_source);
diff --git a/src/cg/ir.h b/src/cg/ir.h
@@ -169,6 +169,7 @@ typedef struct CgIrAsmAux {
   u32 nout;
   u32 nin;
   u32 nclob;
+  u32 clobber_abi_sets; /* CfreeCgAsmClobberAbiSet bits */
 } CgIrAsmAux;
 
 typedef struct CgIrIntrinsicAux {
diff --git a/src/cg/ir_recorder.c b/src/cg/ir_recorder.c
@@ -538,8 +538,8 @@ static void rec_intrinsic(CgTarget* t, IntrinKind kind, Operand* dsts, u32 ndst,
 static void rec_asm_block(CgTarget* t, const char* tmpl,
                           const AsmConstraint* outs, u32 nout, Operand* out_ops,
                           const AsmConstraint* ins, u32 nin,
-                          const Operand* in_ops, const Sym* clobbers,
-                          u32 nclob) {
+                          const Operand* in_ops, const Sym* clobbers, u32 nclob,
+                          u32 clobber_abi_sets) {
   CgIrRecorder* r = rec_of(t);
   CgIrFunc* f = require_func(r);
   CgIrInst* in = emit(r, CG_IR_ASM_BLOCK);
@@ -558,6 +558,7 @@ static void rec_asm_block(CgTarget* t, const char* tmpl,
   aux->nout = nout;
   aux->nin = nin;
   aux->nclob = nclob;
+  aux->clobber_abi_sets = clobber_abi_sets;
   in->extra.aux = aux;
 }
 
diff --git a/src/cg/native_direct_target.c b/src/cg/native_direct_target.c
@@ -1707,15 +1707,15 @@ static void nd_intrinsic(CgTarget* t, IntrinKind kind, Operand* dsts, u32 ndst,
 static void nd_asm_block(CgTarget* t, const char* tmpl,
                          const AsmConstraint* outs, u32 nout, Operand* out_ops,
                          const AsmConstraint* ins, u32 nin,
-                         const Operand* in_ops, const Sym* clobbers,
-                         u32 nclob) {
+                         const Operand* in_ops, const Sym* clobbers, u32 nclob,
+                         u32 clobber_abi_sets) {
   NativeDirectTarget* d = nd_of(t);
   nd_flush_all(d);
   nd_barrier(d,
              NATIVE_DIRECT_BARRIER_INLINE_ASM | NATIVE_DIRECT_BARRIER_MEMORY);
   if (d->ops && d->ops->asm_block) {
     d->ops->asm_block(d, tmpl, outs, nout, out_ops, ins, nin, in_ops, clobbers,
-                      nclob);
+                      nclob, clobber_abi_sets);
     return;
   }
   nd_panic(d, "target does not emit inline asm");
diff --git a/src/cg/native_direct_target.h b/src/cg/native_direct_target.h
@@ -88,7 +88,7 @@ struct NativeOps {
   void (*asm_block)(NativeDirectTarget*, const char* tmpl,
                     const AsmConstraint* outs, u32 nout, Operand* out_ops,
                     const AsmConstraint* ins, u32 nin, const Operand* in_ops,
-                    const Sym* clobbers, u32 nclob);
+                    const Sym* clobbers, u32 nclob, u32 clobber_abi_sets);
 
   void (*barrier)(NativeDirectTarget*, u32 flags);
 };
diff --git a/src/opt/cg_ir_lower.c b/src/opt/cg_ir_lower.c
@@ -767,6 +767,7 @@ static void lower_asm(CgIrLower* l, Inst* out, const CgIrInst* in) {
     aux->nout = src->nout;
     aux->nin = src->nin;
     aux->nclob = src->nclob;
+    aux->clobber_abi_sets = src->clobber_abi_sets;
     aux->out_ops =
         src->nout ? arena_array(l->f->arena, OptOperand, src->nout) : NULL;
     aux->in_ops =
diff --git a/src/opt/ir.h b/src/opt/ir.h
@@ -440,6 +440,9 @@ typedef struct IRAsmAux {
       out_ops;     /* nout slots; the wrapped target may fill in REG location */
   Operand* in_ops; /* nin slots; recorded by w_asm_block, xlat'd at replay */
   u32 nout, nin, nclob;
+  /* CfreeCgAsmClobberAbiSet bits: an arch-neutral "clobbers the whole caller/
+   * callee-saved set" the backend expands against its own register file. */
+  u32 clobber_abi_sets;
   /* Filled by opt_machinize from backend register-name resolution. */
   u32 clobber_mask[OPT_REG_CLASSES];
   i32* out_fixed_regs; /* nout, -1 when unconstrained */
diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c
@@ -1356,6 +1356,9 @@ static void plan_frame(NativeEmitCtx* e, const CGFuncDesc* fd) {
   u8 needs_scratch_spill = 0;
   u8 has_call = 0;
   u8 has_asm = 0;
+  u32 nasm_clob = 0;
+  u32 asm_clobber_abi_sets = 0;
+  Sym* asm_clobbers = NULL;
   memset(&frame, 0, sizeof frame);
   nclasses = t->reserve_callee_saves
                  ? compute_callee_saved_used(e, used, EMIT_MAX_REG_CLASSES)
@@ -1377,10 +1380,37 @@ static void plan_frame(NativeEmitCtx* e, const CGFuncDesc* fd) {
         if (aux && aux->desc.nargs > max_args) max_args = aux->desc.nargs;
       } else if ((IROp)in->op == IR_ASM_BLOCK) {
         /* Inline asm may clobber the return-address register or the red zone
-         * opaquely; disqualifies the frame-eliding tiers (see has_asm). */
+         * opaquely; disqualifies the frame-eliding tiers (see has_asm). Its
+         * callee-saved register clobbers are equally opaque to the operand scan
+         * below; count them now so the backend can fold them into the saved
+         * set (collected into a single Sym list in a second pass below). */
+        IRAsmAux* aux = (IRAsmAux*)in->extra.aux;
         has_asm = 1;
+        if (aux) {
+          nasm_clob += aux->nclob;
+          asm_clobber_abi_sets |= aux->clobber_abi_sets;
+        }
+      }
+    }
+  }
+  /* Gather the union of every asm block's clobber names. The backend resolves
+   * them with its own clobber parser (machinize's resolve_name is unset on every
+   * backend, so aux->clobber_mask is unreliable here). */
+  if (nasm_clob) {
+    u32 n = 0;
+    asm_clobbers = arena_array(e->f->arena, Sym, nasm_clob);
+    for (u32 b = 0; b < e->f->nblocks; ++b) {
+      Block* bl = &e->f->blocks[b];
+      for (u32 i = 0; i < bl->ninsts; ++i) {
+        Inst* in = &bl->insts[i];
+        IRAsmAux* aux;
+        if ((IROp)in->op != IR_ASM_BLOCK) continue;
+        aux = (IRAsmAux*)in->extra.aux;
+        for (u32 k = 0; aux && k < aux->nclob; ++k)
+          asm_clobbers[n++] = aux->clobbers[k];
       }
     }
+    nasm_clob = n;
   }
   if (t->call_stack_bytes) {
     NativeLoc* args =
@@ -1436,6 +1466,9 @@ static void plan_frame(NativeEmitCtx* e, const CGFuncDesc* fd) {
   frame.needs_scratch_spill = needs_scratch_spill;
   frame.is_leaf = !has_call;
   frame.has_asm = has_asm;
+  frame.asm_clobbers = asm_clobbers;
+  frame.nasm_clobbers = nasm_clob;
+  frame.asm_clobber_abi_sets = asm_clobber_abi_sets;
   t->func_begin_known_frame(t, fd, &frame, out_slots);
   for (u32 i = 0; i < e->f->nframe_slots; ++i)
     e->slot_map[e->f->frame_slots[i].id] = out_slots[i];
diff --git a/test/toy/cases/142_typed_asm_clobber_abi_callee.expected b/test/toy/cases/142_typed_asm_clobber_abi_callee.expected
@@ -0,0 +1 @@
+42
diff --git a/test/toy/cases/142_typed_asm_clobber_abi_callee.toy b/test/toy/cases/142_typed_asm_clobber_abi_callee.toy
@@ -0,0 +1,12 @@
+fn __user_main(): i64 {
+  @asm<void>(
+    "",
+    outputs(),
+    inputs(),
+    clobber_abi(.callee_saved),
+    flags(.volatile)
+  );
+  return 42;
+}
+
+fn main(): i32 { return __user_main() as i32; }

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log | Files | Refs | README

M	include/cfree/cg.h	|	6	++++++
M	lang/toy/asm.c	|	2	++
M	src/arch/aa64/native.c	|	87	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
M	src/arch/check_target.c	|	3	++-
M	src/arch/native_target.h	|	15	+++++++++++++++
M	src/arch/rv64/native.c	|	82	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
M	src/arch/x64/native.c	|	83	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
M	src/cg/asm.c	|	24	+++++++++++++++++++-----
M	src/cg/cgtarget.h	|	2	+-
M	src/cg/internal.h	|	1	+
M	src/cg/ir.h	|	1	+
M	src/cg/ir_recorder.c	|	5	+++--
M	src/cg/native_direct_target.c	|	6	+++---
M	src/cg/native_direct_target.h	|	2	+-
M	src/opt/cg_ir_lower.c	|	1	+
M	src/opt/ir.h	|	3	+++
M	src/opt/pass_native_emit.c	|	35	++++++++++++++++++++++++++++++++++-
A	test/toy/cases/142_typed_asm_clobber_abi_callee.expected	|	1	+
A	test/toy/cases/142_typed_asm_clobber_abi_callee.toy	|	12	++++++++++++