Fix hard-pinned native inline asm staging - kit

commit c8b9d53d52c4d396b6caed50da9e576413595199
parent 96de8d734d0490c3e537ec6950c325e51015df59
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Sat,  6 Jun 2026 04:15:28 -0700

Fix hard-pinned native inline asm staging

Diffstat:
M doc/plan/TODO.md  | 47 +++++++++--------------------------------------
M src/arch/aa64/native.c  | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
M src/arch/riscv/native.c  | 34 ++++++++++++++++++++++++++++++----
M src/arch/x64/native.c  | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
M src/cg/native_asm.c  | 22 ++++++++++++++++++++++
M src/cg/native_asm.h  | 12 ++++++++++++
M src/opt/pass_native_emit.c  | 22 ++++++++++++++++------

7 files changed, 248 insertions(+), 65 deletions(-)
diff --git a/doc/plan/TODO.md b/doc/plan/TODO.md
@@ -5,44 +5,6 @@ fixed, remove it instead of checking it off or keeping a closed entry.
 
 Add new deferred fixes below as they are discovered.
 
-## x86-64 inline asm: `-g -O1` + a 4-operand register idiom → `too many memory asm operands` (compiler abort)
-
-A register-pinned inline-asm syscall (4 operands: `rax`/`rdi`/`rsi`/`rdx` via GNU
-local register variables + an `"r"`/`"+r"` constraint, the only syscall idiom kit
-accepts — see the entry above) aborts the compiler **only on x86-64 at `-O1` with
-`-g`**. The bracket is exact: `-g -O0` OK, `-O1` (no `-g`) OK, `-O0` OK, and
-aarch64/riscv64 compile it fine at `-g -O1`; only x64 + O1 + -g fails. The
-message is `fatal: x64 inline asm: too many memory asm operands`.
-
-Minimal repro (`kit cc -target x86_64-linux-gnu -g -O1 -c`):
-
-```c
-static long w(int fd, const char* b, unsigned long n) {
-  register long rax __asm__("rax") = 1, rdi __asm__("rdi") = fd;
-  register long rsi __asm__("rsi") = (long)b, rdx __asm__("rdx") = (long)n;
-  __asm__ volatile("syscall" : "+r"(rax) : "r"(rdi), "r"(rsi), "r"(rdx)
-                              : "rcx", "r11", "memory");
-  return rax;
-}
-```
-
-Root cause: the x64 asm lowering stages a *memory-resident* `"r"` operand into a
-scratch register before the asm, but the scratch pool is only **two** registers
-(`X64_TMP_INT` / `X64_TMP_INT2`), and `src/arch/x64/native.c:4014` panics on the
-third. At `-O1 -g` the four pinned `register long` operands are left stack-
-resident at the asm point (the GNU `register asm` hint binds the operand, it does
-not pin residency across statements; the `-g` location tracking perturbs the
-allocator into spilling), so 3+ need staging and it trips. `-O0` keeps them in
-registers, so `ntmp` stays ≤ 2. Fix: when an `"r"` operand carries a hard-
-register pin, load it straight into that pinned register instead of a shared
-scratch temp (no temp needed at all); failing that, stage through more than two
-scratch regs. **Secondary:** the fatal itself does not exit cleanly — under the
-ASan host build `compiler_panic`'s `longjmp` (`src/core/core.c:179`) SEGVs, so the
-diagnostic becomes a SIGABRT/SEGV instead of a clean `fatal:` exit. Found writing
-the WS4 backtrace round-trip (`test/rt/addr2line_prog.c`), whose x86-64 `write`
-sink is exactly this idiom; surfaced by sweeping that test at `-O1` per
-doc/plan/BACKTRACE.md — left red (`test-rt-backtrace`, `x64/O1` lane).
-
 ## setjmp/longjmp miscompiled at `-O1`: the longjmp'd `setjmp` return value is wrong
 
 A textbook setjmp/longjmp round-trip returns the right answer at `-O0` but the
@@ -84,3 +46,12 @@ program loads at an ASLR base, so captured code addresses don't match link-time
 addresses without computing the load slide. `-no-pie` should clear `o->pie` (and
 ideally `-static` without `-pie` should default to non-PIE). Found while making a
 backtrace demo's addresses line up with `kit addr2line`.
+
+## ASan host build: some fatal diagnostics SEGV during `compiler_panic` recovery
+
+While reproducing the former x64 inline-asm fatal on the ASan host build, the
+diagnostic was printed, but then `compiler_panicv`'s `longjmp`
+(`src/core/core.c:187`) faulted in `_longjmp`, turning a user-facing `fatal:`
+into SIGABRT/SEGV. The inline-asm trigger is fixed, but the panic recovery path
+still needs a focused expected-fatal regression test and an audit of the panic
+save/restore lifetime under the sanitized hosted driver.
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -4926,6 +4926,36 @@ static Reg aa_asm_native_mem_base(AANativeTarget* a, SrcLoc loc, NativeLoc src,
   return dst;
 }
 
+static void aa_asm_load_loc_to_reg(AANativeTarget* a, SrcLoc loc, NativeLoc src,
+                                   NativeLoc dst) {
+  NativeTarget* t = &a->base;
+  NativeAllocClass cls = (NativeAllocClass)dst.cls;
+  if (src.kind == NATIVE_LOC_REG) {
+    if (src.v.reg != dst.v.reg || src.cls != dst.cls) t->move(t, dst, src);
+    return;
+  }
+  if (src.kind == NATIVE_LOC_IMM) {
+    if (cls != NATIVE_REG_INT)
+      aa_asm_panic_at(t->c, loc,
+                      "floating-point immediate asm input is unsupported");
+    t->load_imm(t, dst, src.v.imm);
+    return;
+  }
+  aa_emit_mem(a, 1, dst, aa_asm_loc_to_addr(a, loc, src),
+              aa_mem_for_type(t, dst.type, type_size32(t, dst.type)));
+}
+
+static void aa_asm_store_reg_to_loc(AANativeTarget* a, SrcLoc loc,
+                                    NativeLoc dst, NativeLoc src) {
+  NativeTarget* t = &a->base;
+  if (dst.kind == NATIVE_LOC_REG) {
+    if (dst.v.reg != src.v.reg || dst.cls != src.cls) t->move(t, dst, src);
+    return;
+  }
+  aa_emit_mem(a, 0, src, aa_asm_loc_to_addr(a, loc, dst),
+              aa_mem_for_type(t, src.type, type_size32(t, src.type)));
+}
+
 static void aa_asm_bind_native(AANativeTarget* a, SrcLoc loc, Operand* out,
                                const char* constraint, KitCgTypeId type,
                                NativeLoc src, u32* ntmp) {
@@ -4970,13 +5000,29 @@ static void aa_asm_block_native(NativeTarget* t, const char* tmpl,
   SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
   Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL;
   Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL;
+  u8* staged_outs = nout ? arena_zarray(c->tu, u8, nout) : NULL;
   u32 ntmp = 0;
   AA64Asm* asmh;
 
   for (u32 i = 0; i < nout; ++i) {
     KitCgTypeId type = outs[i].type ? outs[i].type : out_locs[i].type;
-    aa_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, out_locs[i],
-                       &ntmp);
+    NativeLoc outloc = out_locs[i];
+    NativeAsmPinnedLoc pinned =
+        native_asm_prepare_pinned_loc(t, outs[i].reg, outs[i].str, type, outloc);
+    if (pinned.has_pin) {
+      if (pinned.pin_status != NATIVE_ASM_REG_PIN_OK)
+        aa_asm_panic_at(c, loc,
+                        native_asm_pin_status_message(pinned.pin_status));
+      if (pinned.wrong_reg)
+        aa_asm_panic_at(c, loc, "hard-register asm operand in wrong register");
+      outloc = pinned.loc;
+      if (pinned.needs_stage) {
+        staged_outs[i] = 1u;
+        if (outs[i].dir == KIT_CG_ASM_INOUT)
+          aa_asm_load_loc_to_reg(a, loc, out_locs[i], outloc);
+      }
+    }
+    aa_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, outloc, &ntmp);
   }
   for (u32 i = 0; i < nin; ++i) {
     const char* body = native_asm_constraint_body(ins[i].str);
@@ -4993,16 +5039,29 @@ static void aa_asm_block_native(NativeTarget* t, const char* tmpl,
       const char* in_body = native_asm_constraint_body(ins[i].str);
       NativeAsmConstraintInfo info;
       NativeLoc inloc = in_locs[i];
+      NativeAsmPinnedLoc pinned =
+          native_asm_prepare_pinned_loc(t, ins[i].reg, ins[i].str, type, inloc);
       /* A register-constrained input whose value is an address-taken local
        * arrives in a frame slot: the optimizer cannot keep an address-taken
        * local live in a register across the block, so the "inputs are already
        * in registers" contract does not hold for it. Load it into a reserved
-       * scratch register (as the direct path does) before binding. Only
-       * unrestricted integer constraints can use this scratch; restricted
-       * register sets must already arrive in an allowed hard register. */
-      if (native_asm_constraint_reg_info(t, ins[i].str, &info) &&
-          info.cls == NATIVE_REG_INT && info.allowed_mask == 0 &&
-          inloc.kind != NATIVE_LOC_REG) {
+       * scratch register (as the direct path does) before binding. With no
+       * hard pin, only unrestricted integer constraints can use this scratch;
+       * restricted register sets must already arrive in an allowed hard
+       * register. */
+      if (pinned.has_pin) {
+        if (pinned.pin_status != NATIVE_ASM_REG_PIN_OK)
+          aa_asm_panic_at(c, loc,
+                          native_asm_pin_status_message(pinned.pin_status));
+        if (pinned.wrong_reg)
+          aa_asm_panic_at(c, loc,
+                          "hard-register asm operand in wrong register");
+        inloc = pinned.loc;
+        if (pinned.needs_stage)
+          aa_asm_load_loc_to_reg(a, loc, in_locs[i], inloc);
+      } else if (native_asm_constraint_reg_info(t, ins[i].str, &info) &&
+                 info.cls == NATIVE_REG_INT && info.allowed_mask == 0 &&
+                 inloc.kind != NATIVE_LOC_REG) {
         Reg r;
         if (ntmp >= 2u) aa_asm_panic_at(c, loc, "too many memory asm operands");
         r = (ntmp == 0u) ? AA_TMP0 : AA_TMP1;
@@ -5024,6 +5083,17 @@ static void aa_asm_block_native(NativeTarget* t, const char* tmpl,
                    nclob);
   aa64_asm_run_template(asmh, t->mc, tmpl);
   aa64_asm_close(asmh);
+
+  for (u32 i = 0; i < nout; ++i) {
+    NativeAllocClass cls;
+    NativeLoc src;
+    if (!staged_outs || !staged_outs[i]) continue;
+    if (bound_outs[i].kind != AA64_INLINE_OPK_REG) continue;
+    cls = bound_outs[i].pad[0] == AA64_INLINE_OPCLS_FP ? NATIVE_REG_FP
+                                                       : NATIVE_REG_INT;
+    src = native_loc_reg(bound_outs[i].type, cls, (Reg)bound_outs[i].v.local);
+    aa_asm_store_reg_to_loc(a, loc, out_locs[i], src);
+  }
 }
 
 static const NativeOps aa_direct_ops = {
diff --git a/src/arch/riscv/native.c b/src/arch/riscv/native.c
@@ -3827,8 +3827,22 @@ static void rv_asm_block_native(NativeTarget* t, const char* tmpl,
     KitCgTypeId type = outs[i].type ? outs[i].type : out_locs[i].type;
     NativeLoc outloc = out_locs[i];
     NativeAsmConstraintInfo info;
-    if (native_asm_constraint_reg_info(t, outs[i].str, &info) &&
-        info.allowed_mask == 0 && outloc.kind != NATIVE_LOC_REG) {
+    NativeAsmPinnedLoc pinned =
+        native_asm_prepare_pinned_loc(t, outs[i].reg, outs[i].str, type, outloc);
+    if (pinned.has_pin) {
+      if (pinned.pin_status != NATIVE_ASM_REG_PIN_OK)
+        rv_asm_panic_at(c, loc,
+                        native_asm_pin_status_message(pinned.pin_status));
+      if (pinned.wrong_reg)
+        rv_asm_panic_at(c, loc, "hard-register asm operand in wrong register");
+      outloc = pinned.loc;
+      if (pinned.needs_stage) {
+        staged_outs[i] = 1u;
+        if (outs[i].dir == KIT_CG_ASM_INOUT)
+          rv_asm_load_loc_to_reg(a, loc, out_locs[i], outloc);
+      }
+    } else if (native_asm_constraint_reg_info(t, outs[i].str, &info) &&
+               info.allowed_mask == 0 && outloc.kind != NATIVE_LOC_REG) {
       Reg r = rv_asm_stage_reg(a, loc, info.cls, &nstage_int, &nstage_fp);
       outloc = native_loc_reg(type, info.cls, r);
       staged_outs[i] = 1u;
@@ -3852,8 +3866,20 @@ static void rv_asm_block_native(NativeTarget* t, const char* tmpl,
     inloc = in_locs[i];
     {
       NativeAsmConstraintInfo info;
-      if (native_asm_constraint_reg_info(t, ins[i].str, &info) &&
-          info.allowed_mask == 0 && inloc.kind != NATIVE_LOC_REG) {
+      NativeAsmPinnedLoc pinned =
+          native_asm_prepare_pinned_loc(t, ins[i].reg, ins[i].str, type, inloc);
+      if (pinned.has_pin) {
+        if (pinned.pin_status != NATIVE_ASM_REG_PIN_OK)
+          rv_asm_panic_at(c, loc,
+                          native_asm_pin_status_message(pinned.pin_status));
+        if (pinned.wrong_reg)
+          rv_asm_panic_at(c, loc,
+                          "hard-register asm operand in wrong register");
+        inloc = pinned.loc;
+        if (pinned.needs_stage)
+          rv_asm_load_loc_to_reg(a, loc, in_locs[i], inloc);
+      } else if (native_asm_constraint_reg_info(t, ins[i].str, &info) &&
+                 info.allowed_mask == 0 && inloc.kind != NATIVE_LOC_REG) {
         Reg r = rv_asm_stage_reg(a, loc, info.cls, &nstage_int, &nstage_fp);
         inloc = native_loc_reg(type, info.cls, r);
         rv_asm_load_loc_to_reg(a, loc, in_locs[i], inloc);
diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c
@@ -4014,6 +4014,36 @@ static Reg x64_asm_native_mem_base(X64NativeTarget* a, SrcLoc loc,
   return dst;
 }
 
+static void x64_asm_load_loc_to_reg(X64NativeTarget* a, SrcLoc loc,
+                                    NativeLoc src, NativeLoc dst) {
+  NativeTarget* t = &a->base;
+  NativeAllocClass cls = (NativeAllocClass)dst.cls;
+  if (src.kind == NATIVE_LOC_REG) {
+    if (src.v.reg != dst.v.reg || src.cls != dst.cls) t->move(t, dst, src);
+    return;
+  }
+  if (src.kind == NATIVE_LOC_IMM) {
+    if (cls != NATIVE_REG_INT)
+      x64_asm_panic_at(t->c, loc,
+                       "floating-point immediate asm input is unsupported");
+    t->load_imm(t, dst, src.v.imm);
+    return;
+  }
+  x64_emit_mem(a, 1, dst, x64_asm_loc_to_addr(a, loc, src),
+               native_mem_for_type(t, dst.type, native_type_size(t, dst.type)));
+}
+
+static void x64_asm_store_reg_to_loc(X64NativeTarget* a, SrcLoc loc,
+                                     NativeLoc dst, NativeLoc src) {
+  NativeTarget* t = &a->base;
+  if (dst.kind == NATIVE_LOC_REG) {
+    if (dst.v.reg != src.v.reg || dst.cls != src.cls) t->move(t, dst, src);
+    return;
+  }
+  x64_emit_mem(a, 0, src, x64_asm_loc_to_addr(a, loc, dst),
+               native_mem_for_type(t, src.type, native_type_size(t, src.type)));
+}
+
 static void x64_asm_bind_native(X64NativeTarget* a, SrcLoc loc, Operand* out,
                                 const char* constraint, KitCgTypeId type,
                                 NativeLoc src, u32* ntmp) {
@@ -4057,12 +4087,29 @@ static void x64_asm_block_native(NativeTarget* t, const char* tmpl,
   SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
   Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL;
   Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL;
+  u8* staged_outs = nout ? arena_zarray(c->tu, u8, nout) : NULL;
   u32 ntmp = 0, i;
   X64Asm* asmh;
 
   for (i = 0; i < nout; ++i) {
     KitCgTypeId type = outs[i].type ? outs[i].type : out_locs[i].type;
-    x64_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, out_locs[i],
+    NativeLoc outloc = out_locs[i];
+    NativeAsmPinnedLoc pinned =
+        native_asm_prepare_pinned_loc(t, outs[i].reg, outs[i].str, type, outloc);
+    if (pinned.has_pin) {
+      if (pinned.pin_status != NATIVE_ASM_REG_PIN_OK)
+        x64_asm_panic_at(c, loc,
+                         native_asm_pin_status_message(pinned.pin_status));
+      if (pinned.wrong_reg)
+        x64_asm_panic_at(c, loc, "hard-register asm operand in wrong register");
+      outloc = pinned.loc;
+      if (pinned.needs_stage) {
+        staged_outs[i] = 1u;
+        if (outs[i].dir == KIT_CG_ASM_INOUT)
+          x64_asm_load_loc_to_reg(a, loc, out_locs[i], outloc);
+      }
+    }
+    x64_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, outloc,
                         &ntmp);
   }
   for (i = 0; i < nin; ++i) {
@@ -4078,14 +4125,28 @@ static void x64_asm_block_native(NativeTarget* t, const char* tmpl,
     }
     type = ins[i].type ? ins[i].type : in_locs[i].type;
     inloc = in_locs[i];
-    if ((body[0] == 'r') && inloc.kind != NATIVE_LOC_REG) {
-      Reg r;
-      if (ntmp >= 2u) x64_asm_panic_at(c, loc, "too many memory asm operands");
-      r = (ntmp == 0u) ? (Reg)X64_TMP_INT : (Reg)X64_TMP_INT2;
-      ntmp++;
-      inloc = native_loc_reg(type, NATIVE_REG_INT, r);
-      x64_emit_mem(a, 1, inloc, x64_asm_loc_to_addr(a, loc, in_locs[i]),
-                   native_mem_for_type(t, type, native_type_size(t, type)));
+    {
+      NativeAsmPinnedLoc pinned =
+          native_asm_prepare_pinned_loc(t, ins[i].reg, ins[i].str, type, inloc);
+      if (pinned.has_pin) {
+        if (pinned.pin_status != NATIVE_ASM_REG_PIN_OK)
+          x64_asm_panic_at(c, loc,
+                           native_asm_pin_status_message(pinned.pin_status));
+        if (pinned.wrong_reg)
+          x64_asm_panic_at(c, loc,
+                           "hard-register asm operand in wrong register");
+        inloc = pinned.loc;
+        if (pinned.needs_stage)
+          x64_asm_load_loc_to_reg(a, loc, in_locs[i], inloc);
+      } else if ((body[0] == 'r') && inloc.kind != NATIVE_LOC_REG) {
+        Reg r;
+        if (ntmp >= 2u) x64_asm_panic_at(c, loc, "too many memory asm operands");
+        r = (ntmp == 0u) ? (Reg)X64_TMP_INT : (Reg)X64_TMP_INT2;
+        ntmp++;
+        inloc = native_loc_reg(type, NATIVE_REG_INT, r);
+        x64_emit_mem(a, 1, inloc, x64_asm_loc_to_addr(a, loc, in_locs[i]),
+                     native_mem_for_type(t, type, native_type_size(t, type)));
+      }
     }
     x64_asm_bind_native(a, loc, &bound_ins[i], ins[i].str, type, inloc, &ntmp);
   }
@@ -4098,6 +4159,17 @@ static void x64_asm_block_native(NativeTarget* t, const char* tmpl,
                   nclob);
   x64_asm_run_template(asmh, t->mc, tmpl);
   x64_asm_close(asmh);
+
+  for (i = 0; i < nout; ++i) {
+    NativeAllocClass cls;
+    NativeLoc src;
+    if (!staged_outs || !staged_outs[i]) continue;
+    if (bound_outs[i].kind != X64_INLINE_OPK_REG) continue;
+    cls = bound_outs[i].pad[0] == X64_INLINE_OPCLS_FP ? NATIVE_REG_FP
+                                                       : NATIVE_REG_INT;
+    src = native_loc_reg(bound_outs[i].type, cls, (Reg)bound_outs[i].v.local);
+    x64_asm_store_reg_to_loc(a, loc, out_locs[i], src);
+  }
 }
 
 /* file_scope_asm + finalize are shared (cg/native_asm.h). */
diff --git a/src/cg/native_asm.c b/src/cg/native_asm.c
@@ -161,6 +161,28 @@ NativeAsmRegPinStatus native_asm_resolve_pin(NativeTarget* t, Sym reg,
   return NATIVE_ASM_REG_PIN_OK;
 }
 
+NativeAsmPinnedLoc native_asm_prepare_pinned_loc(NativeTarget* t, Sym reg,
+                                                 const char* constraint,
+                                                 KitCgTypeId type,
+                                                 NativeLoc loc) {
+  NativeAsmPinnedLoc out;
+  NativeAsmRegPin pin;
+  memset(&out, 0, sizeof out);
+  out.loc = loc;
+  out.pin_status = native_asm_resolve_pin(t, reg, constraint, &pin);
+  if (out.pin_status == NATIVE_ASM_REG_PIN_ABSENT) return out;
+  out.has_pin = 1u;
+  if (out.pin_status != NATIVE_ASM_REG_PIN_OK) return out;
+  if (loc.kind != NATIVE_LOC_REG) {
+    out.loc = native_loc_reg(type, pin.cls, pin.reg);
+    out.needs_stage = 1u;
+    return out;
+  }
+  if ((Reg)loc.v.reg != pin.reg || (NativeAllocClass)loc.cls != pin.cls)
+    out.wrong_reg = 1u;
+  return out;
+}
+
 const char* native_asm_pin_status_message(NativeAsmRegPinStatus st) {
   switch (st) {
     case NATIVE_ASM_REG_PIN_ABSENT:
diff --git a/src/cg/native_asm.h b/src/cg/native_asm.h
@@ -65,6 +65,14 @@ typedef struct NativeAsmConstraintInfo {
   u32 allowed_mask; /* 0 means any valid register in cls. */
 } NativeAsmConstraintInfo;
 
+typedef struct NativeAsmPinnedLoc {
+  NativeLoc loc;
+  NativeAsmRegPinStatus pin_status;
+  u8 has_pin;
+  u8 needs_stage;
+  u8 wrong_reg;
+} NativeAsmPinnedLoc;
+
 int native_asm_constraint_reg_info(NativeTarget* t, const char* constraint,
                                    NativeAsmConstraintInfo* out);
 int native_asm_constraint_is_reg(NativeTarget* t, const char* constraint);
@@ -76,6 +84,10 @@ int native_asm_constraint_is_reg(NativeTarget* t, const char* constraint);
 NativeAsmRegPinStatus native_asm_resolve_pin(NativeTarget* t, Sym reg,
                                              const char* constraint,
                                              NativeAsmRegPin* out);
+NativeAsmPinnedLoc native_asm_prepare_pinned_loc(NativeTarget* t, Sym reg,
+                                                 const char* constraint,
+                                                 KitCgTypeId type,
+                                                 NativeLoc loc);
 const char* native_asm_pin_status_message(NativeAsmRegPinStatus st);
 int native_asm_constraint_reg_class(const char* constraint,
                                     NativeAllocClass* cls_out);
diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c
@@ -1414,13 +1414,18 @@ static void plan_frame(NativeEmitCtx* e, const CGFuncDesc* fd) {
       } else if ((IROp)in->op == IR_ASM_BLOCK) {
         /* Inline asm may clobber the return-address register or the red zone
          * opaquely; disqualifies the frame-eliding tiers (see has_asm). Its
-         * callee-saved register clobbers are equally opaque to the operand scan
-         * below; count them now so the backend can fold them into the saved
-         * set (collected into a single Sym list in a second pass below). */
+         * callee-saved register clobbers and hard-register operand pins are
+         * equally opaque to the operand scan below; count them now so the
+         * backend can fold them into the saved set (collected into a single Sym
+         * list in a second pass below). */
         IRAsmAux* aux = (IRAsmAux*)in->extra.aux;
         has_asm = 1;
         if (aux) {
           nasm_clob += aux->nclob;
+          for (u32 k = 0; k < aux->nout; ++k)
+            if (aux->outs[k].reg) ++nasm_clob;
+          for (u32 k = 0; k < aux->nin; ++k)
+            if (aux->ins[k].reg) ++nasm_clob;
           asm_clobber_abi_sets |= aux->clobber_abi_sets;
         }
       } else if ((IROp)in->op == IR_INTRINSIC) {
@@ -1434,9 +1439,10 @@ static void plan_frame(NativeEmitCtx* e, const CGFuncDesc* fd) {
       }
     }
   }
-  /* Gather the union of every asm block's clobber names. The backend resolves
-   * them with its own clobber parser (machinize's resolve_name is unset on
-   * every backend, so aux->clobber_mask is unreliable here). */
+  /* Gather the union of every asm block's clobber names and hard-register
+   * operand pins. The backend resolves them with its own clobber parser
+   * (machinize's resolve_name is unset on every backend, so aux->clobber_mask is
+   * unreliable here). */
   if (nasm_clob) {
     u32 n = 0;
     asm_clobbers = arena_array(e->f->arena, Sym, nasm_clob);
@@ -1449,6 +1455,10 @@ static void plan_frame(NativeEmitCtx* e, const CGFuncDesc* fd) {
         aux = (IRAsmAux*)in->extra.aux;
         for (u32 k = 0; aux && k < aux->nclob; ++k)
           asm_clobbers[n++] = aux->clobbers[k];
+        for (u32 k = 0; aux && k < aux->nout; ++k)
+          if (aux->outs[k].reg) asm_clobbers[n++] = aux->outs[k].reg;
+        for (u32 k = 0; aux && k < aux->nin; ++k)
+          if (aux->ins[k].reg) asm_clobbers[n++] = aux->ins[k].reg;
       }
     }
     nasm_clob = n;

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log | Files | Refs | README

M	doc/plan/TODO.md	|	47	+++++++++--------------------------------------
M	src/arch/aa64/native.c	|	86	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
M	src/arch/riscv/native.c	|	34	++++++++++++++++++++++++++++++----
M	src/arch/x64/native.c	|	90	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
M	src/cg/native_asm.c	|	22	++++++++++++++++++++++
M	src/cg/native_asm.h	|	12	++++++++++++
M	src/opt/pass_native_emit.c	|	22	++++++++++++++++------