Fix Windows x64/arm64 variadic ABI at O0 and O1 - kit

commit 61a4a4c6f662dced2a9394cb09ee41c1bca529c2
parent 4c4f1db31be66b1fab039b0a1fa8c4f8ab2bdd6f
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu,  4 Jun 2026 21:51:14 -0700

Fix Windows x64/arm64 variadic ABI at O0 and O1

aarch64-windows variadics were broken in both the direct (O0) and
optimizer (O1) paths, and x64 Win64 va_start miscompiled at O1. The
no-VM smoke (build + objdump) hid all of it; these are runtime bugs.

- aarch64-windows: route variadic FP args through GP registers (the
  classifier already remapped named params; aa_param_abi now does the
  same for trailing `...` args) and home x0..x7 into a GP "home area"
  at the top of the frame, contiguous with the incoming stack args, so
  the plain-pointer va_list walks register- then stack-passed varargs
  as one block. Driven by ABIVaListInfo.gp_reg_count; forces the fat
  top-record frame for Windows variadic functions only (non-variadic
  and non-Windows codegen is untouched).
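- x64 Win64: declare RAX clobbered by va_start via the machine-op
  clobber mechanism (new NATIVE_MOP_VA_START), so the allocator stops
  keeping a live value (e.g. a return-coalesced loop accumulator) in
  RAX across the op.
- x64: add a live-ABI caller_saved_mask hook next to callee_saved_mask
  (static caller-saved set minus the live callee-saved set) and route
  the optimizer, direct emission, and inline-asm ABI clobber expansion
  through the live masks, so the register sets follow the selected OS
  ABI (SysV vs Win64) instead of the static register-file table.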

Verified on the Win11 ARM64 VM (aarch64 native + x64 emulated) at O0
and O1: all three COFF Windows smokes pass, plus new coverage for
loop/overflow varargs, named-FP varargs, va_copy, and RSI/RDI +
XMM6-15 preservation. No regressions across cg, opt, isa, parse, asm,
pp, debug, dwarf, smoke, libc, Apple-ARM64, or SysV varargs.

Also fold in the in-flight frontend cleanups owned by this change
(extern-inline suppressed-parse dummy labels; drop the bogus
local-const memory boundary on file-scope asm) and add their missing
test coverage.

Diffstat:
M lang/c/parse/cg_adapter.c  | 14 +++++++++-----
M lang/c/parse/parse_stmt.c  | 54 ++++++++++++++++++++++++++++++++++++++++++++++++------
M mk/test.mk  | 6 ++++++
M src/abi/abi_aapcs64_windows.c  | 6 +++++-
M src/arch/aa64/native.c  | 128 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
M src/arch/native_target.h  | 36 ++++++++++++++++++++++++++++++++----
M src/arch/x64/native.c  | 22 +++++++++++++++++++---
M src/cg/asm.c  | 1 -
M src/cg/native_asm.c  | 9 ++++-----
M src/cg/native_asm.h  | 4 +---
M src/cg/native_direct_target.c  | 10 ++++++----
M src/opt/pass_machinize.c  | 14 ++++++++++----
M src/opt/pass_native_emit.c  | 4 +++-
A test/coff/windows-o1-abi-smoke.sh  | 434 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M test/parse/cases/asm_02_file_scope.c  | 11 ++++++++++-
A test/parse/cases/gnu_inline_control_flow.c  | 36 ++++++++++++++++++++++++++++++++++++
A test/parse/cases/gnu_inline_control_flow.expected  | 1 +

17 files changed, 732 insertions(+), 58 deletions(-)
diff --git a/lang/c/parse/cg_adapter.c b/lang/c/parse/cg_adapter.c
@@ -607,11 +607,15 @@ void pcg_computed_goto(Parser* p, const CGLabel* targets, u32 ntargets) {
 
 /* ---- Control flow ----
  *
- * Labels are pure bookkeeping handles, so pcg_label_new always mints one (it
- * has no codegen side effect to suppress). The placement / jump / branch ops
- * gate on emit; the conditional branches also pop the tested value off the
- * type stack to mirror the CG-side consume. */
-CGLabel pcg_label_new(Parser* p) { return kit_cg_label_new(p->cg); }
+ * Label placement / jump / branch ops gate on emit; the conditional branches
+ * also pop the tested value off the type stack to mirror the CG-side consume.
+ * Suppressed parses, such as C99 `extern inline` bodies, do not open a CG
+ * function, so they use a nonzero dummy label only for semantic bookkeeping
+ * around break/continue/case validation. */
+CGLabel pcg_label_new(Parser* p) {
+  if (!pcg_emit_enabled(p)) return (CGLabel)1;
+  return kit_cg_label_new(p->cg);
+}
 
 void pcg_label_place(Parser* p, CGLabel l) {
   if (pcg_emit_enabled(p)) kit_cg_label_place(p->cg, l);
diff --git a/lang/c/parse/parse_stmt.c b/lang/c/parse/parse_stmt.c
@@ -81,11 +81,21 @@ static void parse_while_stmt(Parser* p) {
    * /parse_continue/etc. keep using their existing raw `pcg_jump` calls —
    * the C target recognizes the labels as the innermost scope's
    * boundaries and emits the structured keywords on its own. */
-  KitCgScope scope = kit_cg_scope_begin(p->cg, KIT_CG_TYPE_NONE);
-  CGLabel L_top = kit_cg_scope_continue_label(p->cg, scope);
-  CGLabel L_end = kit_cg_scope_break_label(p->cg, scope);
   CGLabel saved_break = p->cur_break;
   CGLabel saved_continue = p->cur_continue;
+  KitCgScope scope;
+  CGLabel L_top;
+  CGLabel L_end;
+  int emit = pcg_emit_enabled(p);
+  if (emit) {
+    scope = kit_cg_scope_begin(p->cg, KIT_CG_TYPE_NONE);
+    L_top = kit_cg_scope_continue_label(p->cg, scope);
+    L_end = kit_cg_scope_break_label(p->cg, scope);
+  } else {
+    scope = 0;
+    L_top = (CGLabel)1;
+    L_end = (CGLabel)1;
+  }
   expect_punct(p, '(', "'('");
   parse_expr(p);
   to_rvalue(p);
@@ -93,6 +103,15 @@ static void parse_while_stmt(Parser* p) {
     perr(p, "while condition requires scalar type");
   }
   expect_punct(p, ')', "')'");
+  if (!emit) {
+    pcg_drop(p);
+    p->cur_break = L_end;
+    p->cur_continue = L_top;
+    parse_stmt(p);
+    p->cur_break = saved_break;
+    p->cur_continue = saved_continue;
+    return;
+  }
   pcg_branch_false(p, L_end);
   p->cur_break = L_end;
   p->cur_continue = L_top;
@@ -378,18 +397,29 @@ static void parse_switch_stmt(Parser* p) {
    * chain (unchanged behaviour) and which the C target overrides to
    * emit a real `switch (sel) { case V: goto L_V; …; default: goto
    * L_def; }`. */
-  KitCgScope scope = kit_cg_scope_begin(p->cg, KIT_CG_TYPE_NONE);
-  CGLabel L_dispatch = pcg_label_new(p);
-  CGLabel L_end = kit_cg_scope_break_label(p->cg, scope);
   CGLabel saved_break = p->cur_break;
   SwitchCtx ctx;
   SwitchCtx* saved_switch = p->cur_switch;
+  KitCgScope scope;
+  CGLabel L_dispatch;
+  CGLabel L_end;
+  int emit = pcg_emit_enabled(p);
   FrameSlotDesc fsd;
   const Type* vty;
   CaseEntry* it;
   CaseEntry* prev;
   CaseEntry* head;
 
+  if (emit) {
+    scope = kit_cg_scope_begin(p->cg, KIT_CG_TYPE_NONE);
+    L_dispatch = pcg_label_new(p);
+    L_end = kit_cg_scope_break_label(p->cg, scope);
+  } else {
+    scope = 0;
+    L_dispatch = (CGLabel)1;
+    L_end = (CGLabel)1;
+  }
+
   expect_punct(p, '(', "'('");
   parse_expr(p);
   to_rvalue(p);
@@ -413,6 +443,18 @@ static void parse_switch_stmt(Parser* p) {
   }
   expect_punct(p, ')', "')' after switch expression");
 
+  if (!emit) {
+    pcg_drop(p);
+    memset(&ctx, 0, sizeof ctx);
+    ctx.parent = saved_switch;
+    p->cur_switch = &ctx;
+    p->cur_break = L_end;
+    parse_stmt(p);
+    p->cur_break = saved_break;
+    p->cur_switch = saved_switch;
+    return;
+  }
+
   memset(&ctx, 0, sizeof ctx);
   memset(&fsd, 0, sizeof fsd);
   fsd.type = vty;
diff --git a/mk/test.mk b/mk/test.mk
@@ -59,6 +59,7 @@ TEST_TARGETS = \
     test-cg-api \
     test-coff \
     test-coff-mingw-import \
+    test-coff-windows-o1-abi \
     test-coff-windows-ucrt \
     test-debug \
     test-dbg \
@@ -685,6 +686,10 @@ test-coff-mingw-import: lib $(COFF_IMPORT_MINGW_BIN)
 test-coff-windows-ucrt: bin rt-x86_64-pc-windows rt-aarch64-windows windows-ucrt-sysroots
 	KIT_SYSROOT=$(abspath build/llvm-mingw/20260602/ucrt) bash test/coff/windows-ucrt-hosted-smoke.sh
 	KIT_SYSROOT=$(abspath build/llvm-mingw/20260602/ucrt) bash test/coff/windows-system-dlls-smoke.sh
+	KIT_SYSROOT=$(abspath build/llvm-mingw/20260602/ucrt) bash test/coff/windows-o1-abi-smoke.sh
+
+test-coff-windows-o1-abi: bin rt-x86_64-pc-windows rt-aarch64-windows windows-ucrt-sysroots
+	KIT_SYSROOT=$(abspath build/llvm-mingw/20260602/ucrt) bash test/coff/windows-o1-abi-smoke.sh
 
 # Opt-in: run the COFF/PE hosted smokes against a real Windows 11 ARM64 VM, so
 # their per-program run lanes execute for real instead of self-skipping. On
@@ -707,6 +712,7 @@ test-coff-windows-vm: bin rt-x86_64-pc-windows rt-aarch64-windows windows-ucrt-s
 	bash scripts/windows_vm.sh wait-ssh 600
 	$(_WIN_VM_ENV) bash test/coff/windows-ucrt-hosted-smoke.sh
 	$(_WIN_VM_ENV) bash test/coff/windows-system-dlls-smoke.sh
+	$(_WIN_VM_ENV) bash test/coff/windows-o1-abi-smoke.sh
 
 # The parse/asm/macho harnesses select a cross-target via KIT_TEST_ARCH
 # (default aa64); the link rt dependency is resolved through the shared
diff --git a/src/abi/abi_aapcs64_windows.c b/src/abi/abi_aapcs64_windows.c
@@ -64,5 +64,9 @@ const ABIVtable aapcs64_windows_vtable = {
     .compute_func_info = aapcs64_windows_compute_func_info,
     .va_list_info = {8, 8, ABI_SC_PTR, 0, 0, 0},
     .va_list_layout = {.type = {8, 8, ABI_SC_PTR, 0, 0, 0},
-                       .kind = ABI_VA_LIST_POINTER},
+                       .kind = ABI_VA_LIST_POINTER,
+                       .gp_reg_count = 8,
+                       .fp_reg_count = 0,
+                       .gp_slot_size = 8,
+                       .fp_slot_size = 0},
 };
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -130,14 +130,22 @@ typedef struct AAFrameLayout {
   u32 slot_bytes; /* sum of aa_frame_slot reservations (callee-saves + locals
                    * + spills + sret/variadic) */
   u32 out_stack;  /* max outgoing-arg bytes across all calls in this function */
-  u32 frame_size; /* align16(AA_FRAME_SAVE_SIZE + slot_bytes + out_stack) */
+  u32 top_home;   /* Windows-variadic GP register home area, reserved between
+                   * the saved pair and the incoming stack args so the
+                   * plain-pointer va_list walks register then stack varargs as
+                   * one contiguous block (0 on every other ABI). */
+  u32 frame_size; /* align16(AA_FRAME_SAVE_SIZE + top_home + slot_bytes +
+                   * out_stack) */
 } AAFrameLayout;
 
-static inline AAFrameLayout aa_build_layout(u32 slot_bytes, u32 out_stack) {
+static inline AAFrameLayout aa_build_layout(u32 slot_bytes, u32 out_stack,
+                                            u32 top_home) {
   AAFrameLayout L;
   L.slot_bytes = slot_bytes;
   L.out_stack = out_stack;
-  L.frame_size = align_up_u32(AA_FRAME_SAVE_SIZE + slot_bytes + out_stack, 16u);
+  L.top_home = top_home;
+  L.frame_size =
+      align_up_u32(AA_FRAME_SAVE_SIZE + top_home + slot_bytes + out_stack, 16u);
   return L;
 }
 
@@ -152,7 +160,7 @@ static inline i32 aa_fp_off_saved_lr(void) { return 8; }
 /* SP-relative byte offsets. */
 static inline i32 aa_sp_off_out_arg(u32 byte_off) { return (i32)byte_off; }
 static inline u32 aa_sp_off_saved_pair(const AAFrameLayout* L) {
-  return L->frame_size - AA_FRAME_SAVE_SIZE;
+  return L->frame_size - AA_FRAME_SAVE_SIZE - L->top_home;
 }
 
 /* Frame slots and callee-save records are owned by the shared NativeFrame
@@ -195,6 +203,11 @@ typedef struct AANativeTarget {
    * and unread on the single-pass path, which never sets fp_at_bottom. */
   u32 frame_size_final;
   u32 incoming_stack_size;
+  /* Windows-variadic GP register home area size (gp_reg_count * gp_slot_size,
+   * 64 today; 0 on every other ABI). When nonzero the function takes the fat
+   * top-record layout and homes x0..x7 into [fp + AA_FRAME_SAVE_SIZE ..] so the
+   * plain-pointer va_list can walk register then stack varargs contiguously. */
+  u32 top_home_bytes;
   u32 next_param_int;
   u32 next_param_fp;
   u32 next_param_stack;
@@ -254,7 +267,10 @@ static AANativeTarget* aa_of(NativeTarget* t) { return (AANativeTarget*)t; }
  *     record since frame_size = align16(16+cum_off) >= 16+cum_off).
  *     CFA = fp+frame_size. */
 static inline i32 aa_fp_off_in_arg(const AANativeTarget* a, u32 byte_off) {
-  u32 base = a->fp_at_bottom ? a->frame_size_final : AA_FRAME_SAVE_SIZE;
+  /* top-record incoming args sit above the saved pair and the (usually empty)
+   * Windows-variadic GP home area; bottom-record never carries a home area. */
+  u32 base = a->fp_at_bottom ? a->frame_size_final
+                             : AA_FRAME_SAVE_SIZE + a->top_home_bytes;
   return (i32)(base + byte_off);
 }
 static inline i32 aa_fp_off_slot(const AANativeTarget* a, u32 slot_off) {
@@ -272,7 +288,17 @@ static inline i32 aa_fp_off_tail_out_arg(const AANativeTarget* a,
  * fp+frame_size bottom-record). Named so the CFI emit site stays layout-blind.
  */
 static inline i32 aa_cfa_off(const AANativeTarget* a) {
-  return a->fp_at_bottom ? (i32)a->frame_size_final : (i32)AA_FRAME_SAVE_SIZE;
+  return a->fp_at_bottom
+             ? (i32)a->frame_size_final
+             : (i32)(AA_FRAME_SAVE_SIZE + a->top_home_bytes);
+}
+
+/* fp-relative offset of GP home slot `i` (Windows variadic only). The home area
+ * sits just above the saved pair and just below the incoming stack args, so
+ * slot gp_reg_count coincides with incoming-arg byte 0 (top-record only — a
+ * function with a home area never takes a slim/bottom layout). */
+static inline i32 aa_fp_off_home_slot(u32 i) {
+  return (i32)(AA_FRAME_SAVE_SIZE + i * 8u);
 }
 
 static void aa_panic(AANativeTarget* a, const char* msg) {
@@ -1023,13 +1049,34 @@ static void aa_reserve_variadic_reg_saves(AANativeTarget* a) {
   a->va_vr_slot = a->base.frame_slot(&a->base, &sd);
 }
 
-/* Emit the stores into the variadic register-save area. Slots must already be
- * reserved (aa_reserve_variadic_reg_saves). */
+/* Emit the stores into the variadic register-save area. For AAPCS64 these land
+ * in the reserved gr/vr frame slots (aa_reserve_variadic_reg_saves); for the
+ * Windows GP home area they land in [fp + AA_FRAME_SAVE_SIZE ..], the
+ * top-of-frame block contiguous with the incoming stack args. */
 static void aa_emit_variadic_reg_save_stores(AANativeTarget* a) {
   NativeAddr addr;
   MemAccess mem;
   KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64);
   ABIVaListInfo vai = abi_va_list_layout(a->base.c->abi);
+  if (vai.kind == ABI_VA_LIST_POINTER && a->top_home_bytes) {
+    /* Windows: home x0..x{gp_reg_count-1} so the plain-pointer va_list walks
+     * register then stack varargs as one block. The named leading registers are
+     * homed too (harmless): va_start skips past them. */
+    memset(&mem, 0, sizeof mem);
+    mem.type = i64;
+    mem.size = 8;
+    mem.align = 8;
+    memset(&addr, 0, sizeof addr);
+    addr.base_kind = NATIVE_ADDR_BASE_REG;
+    addr.base.reg = AA_FP;
+    addr.base_type = i64;
+    for (u32 r = 0; r < vai.gp_reg_count && r < 8u; ++r) {
+      NativeLoc src = native_loc_reg(i64, NATIVE_REG_INT, r);
+      addr.offset = aa_fp_off_home_slot(r);
+      aa_emit_mem(a, 0, src, addr, mem);
+    }
+    return;
+  }
   if (vai.kind != ABI_VA_LIST_AAPCS64) return;
   memset(&mem, 0, sizeof mem);
   mem.type = i64;
@@ -1080,6 +1127,18 @@ static void aa_func_begin_common(NativeTarget* t, const CGFuncDesc* fd) {
   a->slim_small_frame = 0;
   a->fp_at_bottom = 0;
   a->frame_size_final = 0;
+  /* Windows variadic functions reserve a GP register home area at the top of
+   * the frame (just below the incoming stack args). The plain-pointer va_list
+   * then walks register-passed then stack-passed varargs as one block. Other
+   * ABIs leave gp_reg_count 0 here: Apple ARM64 routes all varargs to the
+   * stack, AAPCS64 uses a struct va_list with separate reg-save pointers. */
+  {
+    const ABIFuncInfo* fi = abi_cg_func_info(t->c->abi, fd->fn_type);
+    ABIVaListInfo vai = abi_va_list_layout(t->c->abi);
+    a->top_home_bytes = (fi && fi->variadic && vai.kind == ABI_VA_LIST_POINTER)
+                            ? vai.gp_reg_count * vai.gp_slot_size
+                            : 0u;
+  }
   mc->set_section(mc, fd->text_section_id);
   mc->emit_align(mc, 4, 0);
   a->func_start = mc->pos(mc);
@@ -1288,11 +1347,13 @@ static void aa_words_restore_frame(AANativeTarget* a, u32* words, u32 cap,
     return;
   }
   if (*n + 3u > cap) aa_panic(a, "instruction patch too small");
-  /* AAPCS64: fp is the saved-pair address. Reload pair from [fp], then
-   * restore sp to fp + AA_FRAME_SAVE_SIZE (= caller's original sp = CFA). */
+  /* AAPCS64: fp is the saved-pair address. Reload pair from [fp], then restore
+   * sp to fp + CFA-offset (= caller's original sp = CFA). The CFA offset is
+   * AA_FRAME_SAVE_SIZE normally, plus the Windows-variadic GP home area when
+   * present. */
   words[(*n)++] = aa64_add_imm(1, AA_TMP0, AA_FP, 0, 0);
   words[(*n)++] = aa64_ldp64_soff(AA_FP, AA_LR, AA_TMP0, 0);
-  words[(*n)++] = aa64_add_imm(1, AA_SP, AA_TMP0, AA_FRAME_SAVE_SIZE, 0);
+  words[(*n)++] = aa64_add_imm(1, AA_SP, AA_TMP0, (u32)aa_cfa_off(a), 0);
 }
 
 /* Emit callee-save store (save=1) or restore (save=0) words into `words`,
@@ -1488,7 +1549,8 @@ static void aa_apply_patches(AANativeTarget* a, const AAFrameLayout* L) {
 static void aa_func_end(NativeTarget* t) {
   AANativeTarget* a = aa_of(t);
   MCEmitter* mc = t->mc;
-  AAFrameLayout L = aa_build_layout(a->frame.cum_off, a->frame.max_outgoing);
+  AAFrameLayout L =
+      aa_build_layout(a->frame.cum_off, a->frame.max_outgoing, a->top_home_bytes);
   /* known_frame (optimizer): prologue, allocas, and tail epilogues were emitted
    * final and slim eligibility was settled in aa_func_begin_known_frame — there
    * is nothing to patch. Single-pass (NDT): a worst-case prologue region was
@@ -1646,7 +1708,8 @@ static void aa_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd,
    * known, so the prologue immediates and slim-form choice are settled here.
    * frame_size_final must be set before aa_build_prologue_words / entry saves,
    * since the bottom-record offset helpers read it. */
-  L = aa_build_layout(a->frame.cum_off, a->frame.max_outgoing);
+  L = aa_build_layout(a->frame.cum_off, a->frame.max_outgoing,
+                      a->top_home_bytes);
   a->frame_size_final = L.frame_size;
   /* Slim Tier A: no callee-saves, no alloca, no body slots, no outgoing stack
    * args — the whole frame is the 16-byte record. fp_at_bottom: a small frame
@@ -1654,14 +1717,18 @@ static void aa_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd,
    * the bottom (fp = sp) so sp adjustment folds into the pre/post-indexed
    * stp/ldp (frame_size <= 504 keeps the post-index ldp imm in range).
    * Otherwise slim_small_frame keeps the top-record layout but skips the
-   * x17/x10 scratch (out_stack>0 small frames land here). (See aa_func_end for
-   * the single-pass path, which never takes any slim form.) */
+   * x17/x10 scratch (out_stack>0 small frames land here). A Windows-variadic
+   * home area forces the fat top-record layout: it lives above the saved pair,
+   * which neither the slim forms (saved pair at the very top) nor the
+   * bottom-record (saved pair at the very bottom) leave room for. (See
+   * aa_func_end for the single-pass path, which never takes any slim form.) */
   a->slim_prologue = a->frame.ncallee_saves == 0 && !a->frame.has_alloca &&
-                     L.slot_bytes == 0 && L.out_stack == 0;
+                     L.slot_bytes == 0 && L.out_stack == 0 && !a->top_home_bytes;
   a->fp_at_bottom = !a->slim_prologue && !a->frame.has_alloca &&
-                    L.out_stack == 0 && L.frame_size <= 504u;
+                    L.out_stack == 0 && L.frame_size <= 504u &&
+                    !a->top_home_bytes;
   a->slim_small_frame = !a->slim_prologue && !a->fp_at_bottom &&
-                        !a->frame.has_alloca &&
+                        !a->frame.has_alloca && !a->top_home_bytes &&
                         aa_sp_off_saved_pair(&L) <= 504u;
   n = aa_build_prologue_words(a, &L, words, AA_PROLOGUE_WORDS);
   for (u32 i = 0; i < n; ++i) aa_emit32(t->mc, words[i]);
@@ -2516,8 +2583,15 @@ static const ABIArgInfo* aa_param_abi(NativeTarget* t, const ABIFuncInfo* abi,
   scratch->flags = ABI_AF_NONE;
   scratch->nparts = 1;
   scratch->parts = arena_zarray(t->c->tu, ABIArgPart, 1);
+  /* Windows ARM64 routes variadic floating-point arguments through the integer
+   * registers/stack (the classifier's remap_fp_parts_to_int does the same for
+   * the *named* params of a variadic function); the value's bit pattern moves
+   * via fmov x,d. Every other ABI keeps the `...` FP args in v registers. */
   ((ABIArgPart*)scratch->parts)[0].cls =
-      cg_type_is_float(t->c, desc->args[i].type) ? ABI_CLASS_FP : ABI_CLASS_INT;
+      (cg_type_is_float(t->c, desc->args[i].type) &&
+       t->c->target.os != KIT_OS_WINDOWS)
+          ? ABI_CLASS_FP
+          : ABI_CLASS_INT;
   ((ABIArgPart*)scratch->parts)[0].loc = ABI_LOC_REG;
   ((ABIArgPart*)scratch->parts)[0].size = type_size32(t, desc->args[i].type);
   ((ABIArgPart*)scratch->parts)[0].align = type_align32(t, desc->args[i].type);
@@ -2872,7 +2946,8 @@ static void aa_emit_tail_site(NativeTarget* t, NativeLoc callee) {
     /* Frame is final: emit the tail epilogue (callee restores + frame restore +
      * branch) directly, exactly the words aa_apply_patches would patch in but
      * without the reserved NOP padding. */
-    AAFrameLayout L = aa_build_layout(a->frame.cum_off, a->frame.max_outgoing);
+    AAFrameLayout L =
+      aa_build_layout(a->frame.cum_off, a->frame.max_outgoing, a->top_home_bytes);
     u32 words[AA_TAIL_WORDS];
     u32 n = 0;
     aa_words_callee_restores(a, words, AA_TAIL_WORDS, &n);
@@ -4074,6 +4149,19 @@ static void aa_va_start_core(AANativeTarget* a, NativeAddr ap) {
   NativeLoc ptr =
       native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, AA_TMP0);
   if (vai.kind == ABI_VA_LIST_POINTER) {
+    if (a->top_home_bytes) {
+      /* Windows: `va_list = &<first vararg>` inside the contiguous
+       * [GP home area | incoming stack args] block. Named args consume the
+       * leading slots; next_param_int (FP params remapped to GP included) plus
+       * next_param_stack locate the first unnamed slot. Home slot
+       * gp_reg_count coincides with incoming-arg byte 0, so a single formula
+       * spans both regions. */
+      i32 off =
+          aa_fp_off_home_slot(a->next_param_int) + (i32)a->next_param_stack;
+      aa_emit_add_imm(a, AA_TMP0, AA_FP, off);
+      aa_emit_mem(a, 0, ptr, ap, aa_mem_for_type(t, ptr.type, 8));
+      return;
+    }
     /* `va_list = &<first vararg>`. Variadic stack args follow the fixed
      * incoming params in the same caller window. Apple ARM64 compact fixed
      * stack args may leave this cursor at +4, while the first variadic slot
diff --git a/src/arch/native_target.h b/src/arch/native_target.h
@@ -272,6 +272,7 @@ typedef struct NativeFramePatchState {
  * the descriptor keeps the backend from depending on the optimizer IR. */
 typedef enum NativeMachineOpKind {
   NATIVE_MOP_BINOP,
+  NATIVE_MOP_VA_START,
   NATIVE_MOP_VA_ARG,
   NATIVE_MOP_ATOMIC_CAS,
   NATIVE_MOP_ATOMIC_RMW,
@@ -370,12 +371,13 @@ struct NativeTarget {
    * up front. */
   void (*reserve_callee_saves)(NativeTarget*, const u32* used_by_class,
                                u32 nclasses);
-  /* Optional live-ABI callee-saved register mask for a class. Static
+  /* Optional live-ABI caller/callee-saved register masks for a class. Static
    * NativeAllocClassInfo masks describe the target register file, but some
    * targets vary preservation rules by OS ABI (x64 SysV vs Win64 XMM regs).
-   * Direct emission uses this to decide which borrowed scratch/cache registers
-   * must be reported to reserve_callee_saves(). NULL falls back to
-   * NativeAllocClassInfo.callee_saved_mask. */
+   * The optimizer and direct emission use these to keep allocation, call
+   * clobbers, and prologue save sets aligned with the selected ABI. NULL falls
+   * back to NativeAllocClassInfo.{caller,callee}_saved_mask. */
+  u32 (*caller_saved_mask)(NativeTarget*, NativeAllocClass);
   u32 (*callee_saved_mask)(NativeTarget*, NativeAllocClass);
   /* Optional. When set, the optimizer emit path calls this once — after
    * func_begin, reserve_callee_saves, and frame-slot mapping, but before the
@@ -519,6 +521,32 @@ struct NativeTarget {
   void (*destroy)(NativeTarget*);
 };
 
+static inline const NativeAllocClassInfo*
+native_target_class_info(const NativeTarget* t, NativeAllocClass cls) {
+  if (!t || !t->regs) return NULL;
+  for (u32 i = 0; i < t->regs->nclasses; ++i) {
+    const NativeAllocClassInfo* ci = &t->regs->classes[i];
+    if ((NativeAllocClass)ci->cls == cls) return ci;
+  }
+  return NULL;
+}
+
+static inline u32 native_target_caller_saved_mask(NativeTarget* t,
+                                                  NativeAllocClass cls) {
+  const NativeAllocClassInfo* ci;
+  if (t && t->caller_saved_mask) return t->caller_saved_mask(t, cls);
+  ci = native_target_class_info(t, cls);
+  return ci ? ci->caller_saved_mask : 0u;
+}
+
+static inline u32 native_target_callee_saved_mask(NativeTarget* t,
+                                                  NativeAllocClass cls) {
+  const NativeAllocClassInfo* ci;
+  if (t && t->callee_saved_mask) return t->callee_saved_mask(t, cls);
+  ci = native_target_class_info(t, cls);
+  return ci ? ci->callee_saved_mask : 0u;
+}
+
 static inline NativeLoc native_loc_none(void) {
   NativeLoc loc;
   memset(&loc, 0, sizeof loc);
diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c
@@ -1755,21 +1755,29 @@ static int x64_reg_is_callee_fp(const X64ABIRegs* abi, Reg r);
 static u32 x64_live_callee_saved_mask(NativeTarget* t,
                                       NativeAllocClass cls) {
   X64NativeTarget* a = x64_of(t);
+  const X64ABIRegs* abi = a->abi ? a->abi : x64_abi_for_os(t->c->target.os);
   u32 mask = 0;
   for (Reg r = 0; r < 16u; ++r) {
-    if (cls == NATIVE_REG_INT && x64_reg_is_callee_int(a->abi, r))
+    if (cls == NATIVE_REG_INT && x64_reg_is_callee_int(abi, r))
       mask |= 1u << r;
-    if (cls == NATIVE_REG_FP && x64_reg_is_callee_fp(a->abi, r))
+    if (cls == NATIVE_REG_FP && x64_reg_is_callee_fp(abi, r))
       mask |= 1u << r;
   }
   return mask;
 }
 
+static u32 x64_live_caller_saved_mask(NativeTarget* t,
+                                      NativeAllocClass cls) {
+  const NativeAllocClassInfo* ci = native_target_class_info(t, cls);
+  if (!ci) return 0;
+  return ci->caller_saved_mask & ~x64_live_callee_saved_mask(t, cls);
+}
+
 static void x64_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers,
                                   u32 nclob, u32* int_mask, u32* fp_mask);
 
 /* abi_clobber_masks is shared as native_asm_abi_clobber_masks
- * (cg/native_asm.h); it reads the masks from t->regs->classes. */
+ * (cg/native_asm.h); it reads the target's live ABI masks. */
 
 /* Build the callee-saved set the prologue must preserve: the allocator-assigned
  * callee-saved registers (frame->callee_saved_used) plus any an inline-asm
@@ -4049,6 +4057,13 @@ static int x64_machine_op_clobbers(NativeTarget* t, const NativeMachineOp* op,
       mask[NATIVE_REG_INT] =
           (1u << X64_RAX) | (1u << X64_RCX) | (1u << X64_RDX);
       return 1;
+    case NATIVE_MOP_VA_START:
+      /* x64_va_start_core materializes the va_list field values through RAX
+       * (the ap pointer itself lands in the reserved r11 scratch). RAX is the
+       * return register, so the allocator may otherwise keep a live value there
+       * across the op. */
+      mask[NATIVE_REG_INT] = (1u << X64_RAX);
+      return 1;
     case NATIVE_MOP_VA_ARG:
       if (!op->result_is_fp) return 0;
       mask[NATIVE_REG_INT] = (1u << X64_RAX);
@@ -4092,6 +4107,7 @@ NativeTarget* x64_native_target_new(Compiler* c, ObjBuilder* obj,
   /* Non-NULL so the optimizer emit path (plan_frame) computes the callee-saved
    * set; x64_func_begin_known_frame derives the records from the masks. */
   t->reserve_callee_saves = x64_reserve_callee_saves;
+  t->caller_saved_mask = x64_live_caller_saved_mask;
   t->callee_saved_mask = x64_live_callee_saved_mask;
   t->signature_stack_bytes = x64_signature_stack_bytes;
   t->call_stack_bytes = x64_call_stack_bytes;
diff --git a/src/cg/asm.c b/src/cg/asm.c
@@ -324,7 +324,6 @@ void kit_cg_file_scope_asm(KitCg* g, KitSlice asm_source) {
   if (!g || !asm_source.s) return;
   if (g->check_only) return;
   if (g->target && g->target->file_scope_asm) {
-    api_local_const_memory_boundary(g);
     g->target->file_scope_asm(g->target, asm_source.s, asm_source.len);
     return;
   }
diff --git a/src/cg/native_asm.c b/src/cg/native_asm.c
@@ -37,16 +37,15 @@ int native_asm_match_index(const char* s) {
 
 void native_asm_abi_clobber_masks(NativeTarget* t, u32 abi_sets, u32* int_mask,
                                   u32* fp_mask) {
-  const NativeAllocClassInfo* classes = t->regs->classes;
   *int_mask = 0;
   *fp_mask = 0;
   if (abi_sets & KIT_CG_ASM_CLOBBER_ABI_CALLER_SAVED) {
-    *int_mask |= classes[NATIVE_REG_INT].caller_saved_mask;
-    *fp_mask |= classes[NATIVE_REG_FP].caller_saved_mask;
+    *int_mask |= native_target_caller_saved_mask(t, NATIVE_REG_INT);
+    *fp_mask |= native_target_caller_saved_mask(t, NATIVE_REG_FP);
   }
   if (abi_sets & KIT_CG_ASM_CLOBBER_ABI_CALLEE_SAVED) {
-    *int_mask |= classes[NATIVE_REG_INT].callee_saved_mask;
-    *fp_mask |= classes[NATIVE_REG_FP].callee_saved_mask;
+    *int_mask |= native_target_callee_saved_mask(t, NATIVE_REG_INT);
+    *fp_mask |= native_target_callee_saved_mask(t, NATIVE_REG_FP);
   }
 }
 
diff --git a/src/cg/native_asm.h b/src/cg/native_asm.h
@@ -40,9 +40,7 @@ int native_asm_constraint_early(const char* s);
 int native_asm_match_index(const char* s);
 
 /* Expand the arch-neutral clobber-ABI sets (KitCgAsmClobberAbiSet bits) into
- * this target's per-class caller/callee-saved register masks, read straight
- * from the target's register file (t->regs->classes). Byte-identical across the
- * backends apart from which register table they consulted, so it lives here. */
+ * this target's live per-class caller/callee-saved register masks. */
 void native_asm_abi_clobber_masks(NativeTarget* t, u32 abi_sets, u32* int_mask,
                                   u32* fp_mask);
 
diff --git a/src/cg/native_direct_target.c b/src/cg/native_direct_target.c
@@ -109,9 +109,11 @@ static Reg nd_cache_reg_for(NativeDirectTarget* d, CGLocal local,
 static Reg nd_pick_cache_victim(NativeDirectTarget* d, NativeAllocClass cls);
 
 static u32 nd_callee_saved_mask(NativeDirectTarget* d, NativeAllocClass cls) {
-  if (d->native && d->native->callee_saved_mask)
-    return d->native->callee_saved_mask(d->native, cls);
-  return nd_class_info(d, cls)->callee_saved_mask;
+  return native_target_callee_saved_mask(d->native, cls);
+}
+
+static u32 nd_caller_saved_mask(NativeDirectTarget* d, NativeAllocClass cls) {
+  return native_target_caller_saved_mask(d->native, cls);
 }
 
 static void nd_note_reg_used(NativeDirectTarget* d, NativeAllocClass cls,
@@ -544,7 +546,7 @@ static Reg nd_pick_cache_victim(NativeDirectTarget* d, NativeAllocClass cls) {
  * path. */
 static Reg nd_cache_alloc(NativeDirectTarget* d, NativeAllocClass cls) {
   const NativeAllocClassInfo* ci = nd_class_info(d, cls);
-  u32 caller = ci->caller_saved_mask;
+  u32 caller = nd_caller_saved_mask(d, cls);
   Reg victim;
   for (u32 i = 0; i < ci->nallocable; ++i) {
     Reg r = ci->allocable[i];
diff --git a/src/opt/pass_machinize.c b/src/opt/pass_machinize.c
@@ -88,11 +88,14 @@ static void machinize_prepare_insts(Func* f, NativeTarget* target) {
   }
 }
 
-static void collect_class(Func* f, const NativeAllocClassInfo* ci) {
+static void collect_class(Func* f, NativeTarget* target,
+                          const NativeAllocClassInfo* ci) {
   u32 cls = ci->cls;
   if (cls >= OPT_REG_CLASSES) return;
-  f->opt_caller_saved[cls] = ci->caller_saved_mask;
-  f->opt_callee_saved[cls] = ci->callee_saved_mask;
+  f->opt_caller_saved[cls] =
+      native_target_caller_saved_mask(target, (NativeAllocClass)cls);
+  f->opt_callee_saved[cls] =
+      native_target_callee_saved_mask(target, (NativeAllocClass)cls);
   f->opt_reserved_regs[cls] = ci->reserved_mask;
   f->opt_arg_regs[cls] = ci->arg_mask;
   f->opt_ret_regs[cls] = ci->ret_mask;
@@ -116,7 +119,7 @@ static void collect_class(Func* f, const NativeAllocClassInfo* ci) {
 static void machinize_collect_regs(Func* f, NativeTarget* target) {
   if (!target || !target->regs) return;
   for (u32 i = 0; i < target->regs->nclasses; ++i)
-    collect_class(f, &target->regs->classes[i]);
+    collect_class(f, target, &target->regs->classes[i]);
 }
 
 static void machinize_check_overlap(Func* f) {
@@ -157,6 +160,9 @@ static void machinize_inst_clobbers(Func* f, NativeTarget* target) {
           mop.second_is_reg =
               (u8)(in->nopnds > 2u && in->opnds[2].kind == OPK_REG);
           break;
+        case IR_VA_START:
+          mop.kind = NATIVE_MOP_VA_START;
+          break;
         case IR_VA_ARG:
           mop.kind = NATIVE_MOP_VA_ARG;
           mop.result_is_fp = (u8)(in->nopnds > 0u &&
diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c
@@ -1360,7 +1360,9 @@ static u32 compute_callee_saved_used(NativeEmitCtx* e, u32* used, u32 cap) {
   nclasses = ri->nclasses < cap ? ri->nclasses : cap;
   for (u32 i = 0; i < ri->nclasses; ++i) {
     const NativeAllocClassInfo* ci = &ri->classes[i];
-    if (ci->cls < cap) used[ci->cls] &= ci->callee_saved_mask;
+    if (ci->cls < cap)
+      used[ci->cls] &=
+          native_target_callee_saved_mask(t, (NativeAllocClass)ci->cls);
   }
   return nclasses;
 }
diff --git a/test/coff/windows-o1-abi-smoke.sh b/test/coff/windows-o1-abi-smoke.sh
@@ -0,0 +1,434 @@
+#!/usr/bin/env bash
+# test/coff/windows-o1-abi-smoke.sh - focused -O1 Windows ABI coverage for PE.
+#
+# Builds one optimized C program for x86_64-windows and aarch64-windows. The
+# program deliberately exercises ABI shapes that are easy to regress in the
+# optimizer: nonvolatile GPR/FP preservation across calls, mixed int/FP arg
+# assignment, stack arguments, varargs, aggregate return/by-value passing,
+# indirect callbacks, and a UCRT qsort callback.
+set -u
+
+ROOT=${KIT_TEST_ROOT:-$(cd "$(dirname "$0")/../.." && pwd)}
+KIT=${KIT:-"$ROOT/build/kit"}
+SDK=${KIT_SYSROOT:-}
+
+KIT_KIT_DIR="$ROOT/test/lib"
+. "$ROOT/test/lib/kit_sh_kit.sh"
+kit_report_init
+
+LABEL_SUITE=windows-o1-abi-smoke
+
+find_sdk() {
+  local arch=$1
+  local d
+  for d in \
+    "$ROOT"/build/llvm-mingw/*/ucrt/"$arch"-w64-mingw32 \
+    /tmp/llvm-mingw*/llvm-mingw-*-ucrt-*/"$arch"-w64-mingw32 \
+    /tmp/llvm-mingw*/"$arch"-w64-mingw32 \
+    /private/tmp/llvm-mingw*/llvm-mingw-*-ucrt-*/"$arch"-w64-mingw32 \
+    /private/tmp/llvm-mingw*/"$arch"-w64-mingw32; do
+    if [ -d "$d/lib" ] && [ -r "$d/include/windows.h" ]; then
+      printf '%s\n' "$d"
+      return 0
+    fi
+  done
+  return 1
+}
+
+sdk_for_arch() {
+  local arch=$1
+  local base
+  if [ -n "$SDK" ]; then
+    if [ "$(basename "$SDK")" = "$arch-w64-mingw32" ]; then
+      printf '%s\n' "$SDK"
+      return 0
+    fi
+    base=$(dirname "$SDK")
+    if [ -d "$base/$arch-w64-mingw32/lib" ] &&
+       [ -r "$base/$arch-w64-mingw32/include/windows.h" ]; then
+      printf '%s\n' "$base/$arch-w64-mingw32"
+      return 0
+    fi
+  fi
+  find_sdk "$arch"
+}
+
+if [ ! -x "$KIT" ]; then
+  kit_fail "$LABEL_SUITE/kit-present" "kit binary not found: $KIT"
+  kit_summary "$LABEL_SUITE"
+  kit_exit
+fi
+
+TMP=${TMPDIR:-/tmp}
+work=$(mktemp -d "$TMP/kit-windows-o1-abi-smoke.XXXXXX")
+trap 'rm -rf "$work"' EXIT
+
+ABI_C=$work/o1-abi.c
+
+cat >"$ABI_C" <<'SRC'
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+typedef struct Pair {
+  uint64_t a;
+  uint64_t b;
+} Pair;
+
+typedef struct Big {
+  uint64_t a;
+  uint64_t b;
+  uint64_t c;
+  uint64_t d;
+  uint64_t e;
+  double f;
+} Big;
+
+static volatile double g_fp[16] = {
+    1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,
+    9.0,  10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+};
+
+static volatile uint64_t g_int[16] = {
+    11u,  23u,  37u,  41u,  53u,  67u,  71u,  83u,
+    97u,  101u, 109u, 113u, 127u, 131u, 137u, 149u,
+};
+
+int run_fp_probe(void);
+int run_int_probe(void);
+
+__attribute__((noinline)) uint64_t opaque_u64(uint64_t x) {
+  return (x * 33u) ^ 0x123456789abcdef0ull;
+}
+
+__attribute__((noinline)) void touch_fp_pressure(double seed) {
+  double a0 = g_fp[0] + seed;
+  double a1 = g_fp[1] + seed;
+  double a2 = g_fp[2] + seed;
+  double a3 = g_fp[3] + seed;
+  double a4 = g_fp[4] + seed;
+  double a5 = g_fp[5] + seed;
+  double a6 = g_fp[6] + seed;
+  double a7 = g_fp[7] + seed;
+  double a8 = g_fp[8] + seed;
+  double a9 = g_fp[9] + seed;
+  double a10 = g_fp[10] + seed;
+  double a11 = g_fp[11] + seed;
+  uint64_t k = opaque_u64((uint64_t)seed);
+  double sum = a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8 + a9 + a10 + a11;
+  sum += (double)(k & 255u);
+  g_fp[15] = sum;
+}
+
+__attribute__((noinline)) void touch_int_pressure(uint64_t seed) {
+  uint64_t a0 = g_int[0] + seed;
+  uint64_t a1 = g_int[1] + seed;
+  uint64_t a2 = g_int[2] + seed;
+  uint64_t a3 = g_int[3] + seed;
+  uint64_t a4 = g_int[4] + seed;
+  uint64_t a5 = g_int[5] + seed;
+  uint64_t a6 = g_int[6] + seed;
+  uint64_t a7 = g_int[7] + seed;
+  uint64_t a8 = g_int[8] + seed;
+  uint64_t a9 = g_int[9] + seed;
+  uint64_t k = opaque_u64(seed + 7u);
+  g_int[15] = a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8 + a9 + k;
+}
+
+#if defined(__x86_64__)
+__asm__(
+    ".text\n"
+    ".globl run_fp_probe\n"
+    "run_fp_probe:\n"
+    "pushq %rbp\n"
+    "movq %rsp, %rbp\n"
+    "subq $0x30, %rsp\n"
+    "movaps %xmm6, -0x10(%rbp)\n"
+    "movq $0x405edd2f1a9fbe77, %rax\n"
+    "movq %rax, %xmm6\n"
+    "movq $0x401c000000000000, %rax\n"
+    "movq %rax, %xmm0\n"
+    "callq touch_fp_pressure\n"
+    "movq %xmm6, %rax\n"
+    "movq $0x405edd2f1a9fbe77, %rcx\n"
+    "cmpq %rcx, %rax\n"
+    "sete %al\n"
+    "movzbl %al, %eax\n"
+    "movaps -0x10(%rbp), %xmm6\n"
+    "leave\n"
+    "retq\n"
+    ".globl run_int_probe\n"
+    "run_int_probe:\n"
+    "pushq %rbp\n"
+    "movq %rsp, %rbp\n"
+    "subq $0x30, %rsp\n"
+    "movq %r13, -0x8(%rbp)\n"
+    "movq $0xfedcba9876543210, %r13\n"
+    "movq $5, %rcx\n"
+    "callq touch_int_pressure\n"
+    "movq %r13, %rax\n"
+    "movq $0xfedcba9876543210, %rcx\n"
+    "cmpq %rcx, %rax\n"
+    "sete %al\n"
+    "movzbl %al, %eax\n"
+    "movq -0x8(%rbp), %r13\n"
+    "leave\n"
+    "retq\n");
+#elif defined(__aarch64__)
+__asm__(
+    ".text\n"
+    ".globl run_fp_probe\n"
+    "run_fp_probe:\n"
+    "stp x29, x30, [sp, #-32]!\n"
+    "mov x29, sp\n"
+    "str d8, [sp, #16]\n"
+    "mov x9, #0xbe77\n"
+    "movk x9, #0x1a9f, lsl #16\n"
+    "movk x9, #0xdd2f, lsl #32\n"
+    "movk x9, #0x405e, lsl #48\n"
+    "fmov d8, x9\n"
+    "mov x9, #0\n"
+    "movk x9, #0x401c, lsl #48\n"
+    "fmov d0, x9\n"
+    "bl touch_fp_pressure\n"
+    "fmov x10, d8\n"
+    "mov x9, #0xbe77\n"
+    "movk x9, #0x1a9f, lsl #16\n"
+    "movk x9, #0xdd2f, lsl #32\n"
+    "movk x9, #0x405e, lsl #48\n"
+    "cmp x10, x9\n"
+    "cset w0, eq\n"
+    "ldr d8, [sp, #16]\n"
+    "ldp x29, x30, [sp], #32\n"
+    "ret\n"
+    ".globl run_int_probe\n"
+    "run_int_probe:\n"
+    "stp x29, x30, [sp, #-32]!\n"
+    "mov x29, sp\n"
+    "str x19, [sp, #16]\n"
+    "mov x19, #0x3210\n"
+    "movk x19, #0x7654, lsl #16\n"
+    "movk x19, #0xba98, lsl #32\n"
+    "movk x19, #0xfedc, lsl #48\n"
+    "mov x0, #5\n"
+    "bl touch_int_pressure\n"
+    "mov x10, x19\n"
+    "mov x9, #0x3210\n"
+    "movk x9, #0x7654, lsl #16\n"
+    "movk x9, #0xba98, lsl #32\n"
+    "movk x9, #0xfedc, lsl #48\n"
+    "cmp x10, x9\n"
+    "cset w0, eq\n"
+    "ldr x19, [sp, #16]\n"
+    "ldp x29, x30, [sp], #32\n"
+    "ret\n");
+#else
+#error unsupported arch
+#endif
+
+__attribute__((noinline)) uint64_t mixed_args(int a, uint64_t b, double c,
+                                              float d, int e, double f,
+                                              uint64_t g, int h, double i,
+                                              uint64_t j) {
+  return (uint64_t)a + b + (uint64_t)(c * 10.0) + (uint64_t)(d * 10.0f) +
+         (uint64_t)e + (uint64_t)(f * 10.0) + g + (uint64_t)h +
+         (uint64_t)(i * 10.0) + j;
+}
+
+__attribute__((noinline)) uint64_t stack_args(uint64_t a1, uint64_t a2,
+                                              uint64_t a3, uint64_t a4,
+                                              uint64_t a5, uint64_t a6,
+                                              uint64_t a7, uint64_t a8,
+                                              uint64_t a9, uint64_t a10,
+                                              uint64_t a11, uint64_t a12) {
+  return a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8 + a9 + a10 + a11 + a12;
+}
+
+__attribute__((noinline)) Pair make_pair(uint64_t base) {
+  Pair p;
+  p.a = base + 1u;
+  p.b = base + 2u;
+  return p;
+}
+
+__attribute__((noinline)) Big make_big(uint64_t base) {
+  Big b;
+  b.a = base + 1u;
+  b.b = base + 2u;
+  b.c = base + 3u;
+  b.d = base + 4u;
+  b.e = base + 5u;
+  b.f = (double)(base + 6u) + 0.5;
+  return b;
+}
+
+__attribute__((noinline)) uint64_t take_big(Big b, Pair p, uint64_t tail) {
+  return b.a + b.b * 2u + b.c * 3u + b.d * 4u + b.e * 5u +
+         (uint64_t)(b.f * 10.0) + p.a + p.b + tail;
+}
+
+__attribute__((noinline)) uint64_t sum_varargs(int tag, ...) {
+  va_list ap;
+  uint64_t a;
+  double b;
+  uint64_t c;
+  int d;
+  double e;
+  va_start(ap, tag);
+  a = va_arg(ap, uint64_t);
+  b = va_arg(ap, double);
+  c = va_arg(ap, uint64_t);
+  d = va_arg(ap, int);
+  e = va_arg(ap, double);
+  va_end(ap);
+  return (uint64_t)tag + a + (uint64_t)(b * 10.0) + c + (uint64_t)d +
+         (uint64_t)(e * 10.0);
+}
+
+/* Loop over a long, boundary-crossing variadic list mixing integer and
+ * floating-point arguments. Exercises three easy-to-regress shapes at once:
+ * the loop accumulator stays live across va_start (so no scratch register may
+ * clobber it), the >8 arguments overflow the register slots onto the stack (so
+ * the ARM64 GP home area must be contiguous with the incoming stack args), and
+ * the floating-point arguments route through integer slots on ARM64. */
+__attribute__((noinline)) uint64_t loop_varargs(int n, ...) {
+  va_list ap;
+  uint64_t sum = (uint64_t)n;
+  va_start(ap, n);
+  for (int i = 0; i < n; ++i) {
+    if (i & 1)
+      sum += (uint64_t)(va_arg(ap, double) * 2.0);
+    else
+      sum += va_arg(ap, uint64_t);
+  }
+  va_end(ap);
+  return sum;
+}
+
+typedef uint64_t (*MixCallback)(int, double, uint64_t, double, int, int,
+                                uint64_t);
+
+__attribute__((noinline)) uint64_t callback_impl(int a, double b, uint64_t c,
+                                                 double d, int e, int f,
+                                                 uint64_t g) {
+  return (uint64_t)a + (uint64_t)(b * 10.0) + c * 2u +
+         (uint64_t)(d * 10.0) + (uint64_t)e * 3u + (uint64_t)f * 5u + g;
+}
+
+__attribute__((noinline)) uint64_t call_callback(MixCallback cb) {
+  return cb(3, 2.5, 7u, 1.5, 4, 5, 9u);
+}
+
+static int cmp_u32(const void* a, const void* b) {
+  uint32_t aa = *(const uint32_t*)a;
+  uint32_t bb = *(const uint32_t*)b;
+  return (aa > bb) - (aa < bb);
+}
+
+int main(void) {
+  uint32_t vals[6] = {9u, 1u, 7u, 3u, 5u, 2u};
+  Pair p;
+  Big b;
+  if (!run_fp_probe()) return 10;
+  if (!(g_fp[15] > 0.0)) return 11;
+  if (!run_int_probe()) return 20;
+  if (g_int[15] == 0u) return 21;
+  if (mixed_args(1, 2u, 1.5, 2.5f, 3, 3.5, 4u, 5, 4.5, 6u) != 141u)
+    return 30;
+  if (stack_args(1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u) != 78u)
+    return 31;
+  p = make_pair(100u);
+  if (p.a != 101u || p.b != 102u) return 40;
+  b = make_big(10u);
+  if (take_big(b, p, 5u) != 578u) return 41;
+  if (sum_varargs(3, (uint64_t)10u, 2.5, (uint64_t)20u, 7, 1.5) != 80u)
+    return 50;
+  if (loop_varargs(10, (uint64_t)1, 2.0, (uint64_t)3, 4.0, (uint64_t)5, 6.0,
+                   (uint64_t)7, 8.0, (uint64_t)9, 10.0) != 95u)
+    return 51;
+  if (call_callback(callback_impl) != 103u) return 60;
+  qsort(vals, 6u, sizeof vals[0], cmp_u32);
+  if (vals[0] != 1u || vals[1] != 2u || vals[5] != 9u) return 70;
+  return 0;
+}
+SRC
+
+no_legacy_crt_imports() {
+  local name=$1 dump=$2
+  if grep -Eiq 'DLL Name: (msvcrt|ucrt)\.dll' "$dump"; then
+    grep -Ei 'DLL Name: (msvcrt|ucrt)\.dll' "$dump" > "$work/$name.diag"
+    not_ok "$name" "$work/$name.diag"
+  else
+    ok "$name"
+  fi
+}
+
+run_vm_if_available() {
+  local label=$1 arch=$2 exe=$3
+  case "$arch" in
+    x64)
+      if [ -n "${KIT_WINDOWS_VM_X64:-${KIT_WINDOWS_VM_AMD64:-}}" ]; then
+        if "$ROOT/scripts/windows_vm.sh" run x64 "$exe" \
+            > "$work/$label-vm.out" 2> "$work/$label-vm.err"; then
+          ok "$label-vm"
+        else
+          not_ok "$label-vm" "$work/$label-vm.err"
+        fi
+      else
+        skip_test "$label-vm" "KIT_WINDOWS_VM_X64 not set"
+      fi
+      ;;
+    aarch64)
+      if [ -n "${KIT_WINDOWS_VM_AARCH64:-${KIT_WINDOWS_VM_ARM64:-}}" ]; then
+        if "$ROOT/scripts/windows_vm.sh" run aarch64 "$exe" \
+            > "$work/$label-vm.out" 2> "$work/$label-vm.err"; then
+          ok "$label-vm"
+        else
+          not_ok "$label-vm" "$work/$label-vm.err"
+        fi
+      else
+        skip_test "$label-vm" "KIT_WINDOWS_VM_AARCH64 not set"
+      fi
+      ;;
+  esac
+}
+
+ran=0
+for arch in x86_64 aarch64; do
+  case "$arch" in
+    x86_64) target=x86_64-windows; label=x64 ;;
+    aarch64) target=aarch64-windows; label=aarch64 ;;
+  esac
+
+  if ! ARCH_SDK=$(sdk_for_arch "$arch"); then
+    skip_test "$LABEL_SUITE/$label-sysroot" "no $arch llvm-mingw UCRT sysroot"
+    continue
+  fi
+  if [ ! -r "$ARCH_SDK/include/windows.h" ] ||
+     [ ! -r "$ARCH_SDK/lib/libucrt.a" ]; then
+    skip_test "$LABEL_SUITE/$label-sysroot" "invalid UCRT llvm-mingw sysroot: $ARCH_SDK"
+    continue
+  fi
+
+  ran=1
+  exe=$work/o1-abi-$arch.exe
+  dump=$work/o1-abi-$arch.dump
+  run_ok "$label-o1-abi-build" "$KIT" cc -target "$target" --sysroot "$ARCH_SDK" \
+    -O1 "$ABI_C" -o "$exe"
+  if [ -f "$exe" ]; then
+    run_ok "$label-o1-abi-objdump" "$KIT" objdump -p "$exe"
+    if [ -s "$work/$label-o1-abi-objdump.out" ]; then
+      cp "$work/$label-o1-abi-objdump.out" "$dump"
+      no_legacy_crt_imports "$label-o1-abi-no-legacy-crt" "$dump"
+      contains "$label-o1-abi-qsort-import" "$dump" "Name:     qsort"
+    fi
+    run_vm_if_available "$label-o1-abi" "$label" "$exe"
+  fi
+done
+
+if [ "$ran" -eq 0 ]; then
+  skip_test "$LABEL_SUITE" "set KIT_SYSROOT or install llvm-mingw UCRT under /tmp/llvm-mingw*"
+fi
+
+kit_summary "$LABEL_SUITE"
+kit_exit
diff --git a/test/parse/cases/asm_02_file_scope.c b/test/parse/cases/asm_02_file_scope.c
@@ -14,5 +14,14 @@ asm(".data\n"
     ".text\n");
 
 extern int global_asm_before;
+extern int global_asm_tail;
 
-int test_main(void) { return global_asm_before + global_asm_after; }
+int test_main(void) {
+  return global_asm_before + global_asm_after + global_asm_tail;
+}
+
+asm(".data\n"
+    ".globl global_asm_tail\n"
+    "global_asm_tail:\n"
+    ".word 0\n"
+    ".text\n");
diff --git a/test/parse/cases/gnu_inline_control_flow.c b/test/parse/cases/gnu_inline_control_flow.c
@@ -0,0 +1,36 @@
+/* A C99 `extern inline` definition is parsed and semantically validated but
+ * emits no out-of-line code (codegen suppressed). Control-flow statements in
+ * such a body must do their break/continue/case bookkeeping without opening a
+ * CG function — regression for pcg_label_new and the while/switch suppressed
+ * paths, which mint dummy labels under suppression instead of calling the CG
+ * label ops. The suppressed body is intentionally not referenced (an extern
+ * inline has no external definition to link against); test_main stands alone. */
+extern inline int suppressed(int n) {
+  int s = 0;
+  while (n > 0) {
+    if (n == 3) {
+      n--;
+      continue;
+    }
+    switch (n) {
+      case 1:
+        s += 1;
+        break;
+      case 2:
+        s += 2;
+        break;
+      default:
+        s += n;
+        break;
+    }
+    n--;
+  }
+  for (int i = 0; i < 4; i++) {
+    if (i == 1) continue;
+    if (i == 3) break;
+    s += i;
+  }
+  return s;
+}
+
+int test_main(void) { return 7; }
diff --git a/test/parse/cases/gnu_inline_control_flow.expected b/test/parse/cases/gnu_inline_control_flow.expected
@@ -0,0 +1 @@
+7

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log | Files | Refs | README

M	lang/c/parse/cg_adapter.c	|	14	+++++++++-----
M	lang/c/parse/parse_stmt.c	|	54	++++++++++++++++++++++++++++++++++++++++++++++++------
M	mk/test.mk	|	6	++++++
M	src/abi/abi_aapcs64_windows.c	|	6	+++++-
M	src/arch/aa64/native.c	|	128	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
M	src/arch/native_target.h	|	36	++++++++++++++++++++++++++++++++----
M	src/arch/x64/native.c	|	22	+++++++++++++++++++---
M	src/cg/asm.c	|	1	-
M	src/cg/native_asm.c	|	9	++++-----
M	src/cg/native_asm.h	|	4	+---
M	src/cg/native_direct_target.c	|	10	++++++----
M	src/opt/pass_machinize.c	|	14	++++++++++----
M	src/opt/pass_native_emit.c	|	4	+++-
A	test/coff/windows-o1-abi-smoke.sh	|	434	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	test/parse/cases/asm_02_file_scope.c	|	11	++++++++++-
A	test/parse/cases/gnu_inline_control_flow.c	|	36	++++++++++++++++++++++++++++++++++++
A	test/parse/cases/gnu_inline_control_flow.expected	|	1	+