commit 61a4a4c6f662dced2a9394cb09ee41c1bca529c2
parent 4c4f1db31be66b1fab039b0a1fa8c4f8ab2bdd6f
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 4 Jun 2026 21:51:14 -0700
Fix Windows x64/arm64 variadic ABI at O0 and O1
aarch64-windows variadics were broken in both the direct (O0) and
optimizer (O1) paths, and x64 Win64 va_start was miscompiled at O1. The
no-VM smoke (build + objdump, nothing is executed) hid all of it; these
are runtime bugs.
- aarch64-windows: route variadic FP args through GP registers (the
classifier already remapped the named FP params of a variadic function;
aa_param_abi now does the same for trailing `...` args) and home x0..x7
into a GP "home area"
at the top of the frame, contiguous with the incoming stack args, so
the plain-pointer va_list walks register- then stack-passed varargs
as one block. Driven by ABIVaListInfo.gp_reg_count; forces the fat
top-record frame for Windows variadic functions only (non-variadic
and non-Windows codegen is untouched).
- x64 Win64: declare RAX clobbered by va_start via the machine-op
clobber mechanism (new NATIVE_MOP_VA_START), so the allocator stops
keeping a live value (e.g. a return-coalesced loop accumulator) in
RAX across the op.
Verified on the Win11 ARM64 VM (aarch64 native + x64 emulated) at O0
and O1: all three COFF Windows smokes pass, plus new coverage for
loop/overflow varargs, named-FP varargs, va_copy, and RSI/RDI +
XMM6-15 preservation. No regressions across cg/opt/isa/parse/asm/pp/
debug/dwarf/smoke/libc, Apple-ARM64, or SysV varargs.
Also fold in the in-flight frontend cleanups owned by this change
(extern-inline suppressed-parse dummy labels; drop the bogus
local-const memory boundary on file-scope asm) and add their missing
test coverage.
Diffstat:
17 files changed, 732 insertions(+), 58 deletions(-)
diff --git a/lang/c/parse/cg_adapter.c b/lang/c/parse/cg_adapter.c
@@ -607,11 +607,15 @@ void pcg_computed_goto(Parser* p, const CGLabel* targets, u32 ntargets) {
/* ---- Control flow ----
*
- * Labels are pure bookkeeping handles, so pcg_label_new always mints one (it
- * has no codegen side effect to suppress). The placement / jump / branch ops
- * gate on emit; the conditional branches also pop the tested value off the
- * type stack to mirror the CG-side consume. */
-CGLabel pcg_label_new(Parser* p) { return kit_cg_label_new(p->cg); }
+ * Label placement / jump / branch ops gate on emit; the conditional branches
+ * also pop the tested value off the type stack to mirror the CG-side consume.
+ * Suppressed parses, such as C99 `extern inline` bodies, do not open a CG
+ * function, so they use a nonzero dummy label only for semantic bookkeeping
+ * around break/continue/case validation. */
+CGLabel pcg_label_new(Parser* p) {
+ if (!pcg_emit_enabled(p)) return (CGLabel)1;
+ return kit_cg_label_new(p->cg);
+}
void pcg_label_place(Parser* p, CGLabel l) {
if (pcg_emit_enabled(p)) kit_cg_label_place(p->cg, l);
diff --git a/lang/c/parse/parse_stmt.c b/lang/c/parse/parse_stmt.c
@@ -81,11 +81,21 @@ static void parse_while_stmt(Parser* p) {
* /parse_continue/etc. keep using their existing raw `pcg_jump` calls —
* the C target recognizes the labels as the innermost scope's
* boundaries and emits the structured keywords on its own. */
- KitCgScope scope = kit_cg_scope_begin(p->cg, KIT_CG_TYPE_NONE);
- CGLabel L_top = kit_cg_scope_continue_label(p->cg, scope);
- CGLabel L_end = kit_cg_scope_break_label(p->cg, scope);
CGLabel saved_break = p->cur_break;
CGLabel saved_continue = p->cur_continue;
+ KitCgScope scope;
+ CGLabel L_top;
+ CGLabel L_end;
+ int emit = pcg_emit_enabled(p);
+ if (emit) {
+ scope = kit_cg_scope_begin(p->cg, KIT_CG_TYPE_NONE);
+ L_top = kit_cg_scope_continue_label(p->cg, scope);
+ L_end = kit_cg_scope_break_label(p->cg, scope);
+ } else {
+ scope = 0;
+ L_top = (CGLabel)1;
+ L_end = (CGLabel)1;
+ }
expect_punct(p, '(', "'('");
parse_expr(p);
to_rvalue(p);
@@ -93,6 +103,15 @@ static void parse_while_stmt(Parser* p) {
perr(p, "while condition requires scalar type");
}
expect_punct(p, ')', "')'");
+ if (!emit) {
+ pcg_drop(p);
+ p->cur_break = L_end;
+ p->cur_continue = L_top;
+ parse_stmt(p);
+ p->cur_break = saved_break;
+ p->cur_continue = saved_continue;
+ return;
+ }
pcg_branch_false(p, L_end);
p->cur_break = L_end;
p->cur_continue = L_top;
@@ -378,18 +397,29 @@ static void parse_switch_stmt(Parser* p) {
* chain (unchanged behaviour) and which the C target overrides to
* emit a real `switch (sel) { case V: goto L_V; …; default: goto
* L_def; }`. */
- KitCgScope scope = kit_cg_scope_begin(p->cg, KIT_CG_TYPE_NONE);
- CGLabel L_dispatch = pcg_label_new(p);
- CGLabel L_end = kit_cg_scope_break_label(p->cg, scope);
CGLabel saved_break = p->cur_break;
SwitchCtx ctx;
SwitchCtx* saved_switch = p->cur_switch;
+ KitCgScope scope;
+ CGLabel L_dispatch;
+ CGLabel L_end;
+ int emit = pcg_emit_enabled(p);
FrameSlotDesc fsd;
const Type* vty;
CaseEntry* it;
CaseEntry* prev;
CaseEntry* head;
+ if (emit) {
+ scope = kit_cg_scope_begin(p->cg, KIT_CG_TYPE_NONE);
+ L_dispatch = pcg_label_new(p);
+ L_end = kit_cg_scope_break_label(p->cg, scope);
+ } else {
+ scope = 0;
+ L_dispatch = (CGLabel)1;
+ L_end = (CGLabel)1;
+ }
+
expect_punct(p, '(', "'('");
parse_expr(p);
to_rvalue(p);
@@ -413,6 +443,18 @@ static void parse_switch_stmt(Parser* p) {
}
expect_punct(p, ')', "')' after switch expression");
+ if (!emit) {
+ pcg_drop(p);
+ memset(&ctx, 0, sizeof ctx);
+ ctx.parent = saved_switch;
+ p->cur_switch = &ctx;
+ p->cur_break = L_end;
+ parse_stmt(p);
+ p->cur_break = saved_break;
+ p->cur_switch = saved_switch;
+ return;
+ }
+
memset(&ctx, 0, sizeof ctx);
memset(&fsd, 0, sizeof fsd);
fsd.type = vty;
diff --git a/mk/test.mk b/mk/test.mk
@@ -59,6 +59,7 @@ TEST_TARGETS = \
test-cg-api \
test-coff \
test-coff-mingw-import \
+ test-coff-windows-o1-abi \
test-coff-windows-ucrt \
test-debug \
test-dbg \
@@ -685,6 +686,10 @@ test-coff-mingw-import: lib $(COFF_IMPORT_MINGW_BIN)
test-coff-windows-ucrt: bin rt-x86_64-pc-windows rt-aarch64-windows windows-ucrt-sysroots
KIT_SYSROOT=$(abspath build/llvm-mingw/20260602/ucrt) bash test/coff/windows-ucrt-hosted-smoke.sh
KIT_SYSROOT=$(abspath build/llvm-mingw/20260602/ucrt) bash test/coff/windows-system-dlls-smoke.sh
+ KIT_SYSROOT=$(abspath build/llvm-mingw/20260602/ucrt) bash test/coff/windows-o1-abi-smoke.sh
+
+test-coff-windows-o1-abi: bin rt-x86_64-pc-windows rt-aarch64-windows windows-ucrt-sysroots
+ KIT_SYSROOT=$(abspath build/llvm-mingw/20260602/ucrt) bash test/coff/windows-o1-abi-smoke.sh
# Opt-in: run the COFF/PE hosted smokes against a real Windows 11 ARM64 VM, so
# their per-program run lanes execute for real instead of self-skipping. On
@@ -707,6 +712,7 @@ test-coff-windows-vm: bin rt-x86_64-pc-windows rt-aarch64-windows windows-ucrt-s
bash scripts/windows_vm.sh wait-ssh 600
$(_WIN_VM_ENV) bash test/coff/windows-ucrt-hosted-smoke.sh
$(_WIN_VM_ENV) bash test/coff/windows-system-dlls-smoke.sh
+ $(_WIN_VM_ENV) bash test/coff/windows-o1-abi-smoke.sh
# The parse/asm/macho harnesses select a cross-target via KIT_TEST_ARCH
# (default aa64); the link rt dependency is resolved through the shared
diff --git a/src/abi/abi_aapcs64_windows.c b/src/abi/abi_aapcs64_windows.c
@@ -64,5 +64,9 @@ const ABIVtable aapcs64_windows_vtable = {
.compute_func_info = aapcs64_windows_compute_func_info,
.va_list_info = {8, 8, ABI_SC_PTR, 0, 0, 0},
.va_list_layout = {.type = {8, 8, ABI_SC_PTR, 0, 0, 0},
- .kind = ABI_VA_LIST_POINTER},
+ .kind = ABI_VA_LIST_POINTER,
+ .gp_reg_count = 8,
+ .fp_reg_count = 0,
+ .gp_slot_size = 8,
+ .fp_slot_size = 0},
};
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -130,14 +130,22 @@ typedef struct AAFrameLayout {
u32 slot_bytes; /* sum of aa_frame_slot reservations (callee-saves + locals
* + spills + sret/variadic) */
u32 out_stack; /* max outgoing-arg bytes across all calls in this function */
- u32 frame_size; /* align16(AA_FRAME_SAVE_SIZE + slot_bytes + out_stack) */
+ u32 top_home; /* Windows-variadic GP register home area, reserved between
+ * the saved pair and the incoming stack args so the
+ * plain-pointer va_list walks register then stack varargs as
+ * one contiguous block (0 on every other ABI). */
+ u32 frame_size; /* align16(AA_FRAME_SAVE_SIZE + top_home + slot_bytes +
+ * out_stack) */
} AAFrameLayout;
-static inline AAFrameLayout aa_build_layout(u32 slot_bytes, u32 out_stack) {
+static inline AAFrameLayout aa_build_layout(u32 slot_bytes, u32 out_stack,
+ u32 top_home) {
AAFrameLayout L;
L.slot_bytes = slot_bytes;
L.out_stack = out_stack;
- L.frame_size = align_up_u32(AA_FRAME_SAVE_SIZE + slot_bytes + out_stack, 16u);
+ L.top_home = top_home;
+ L.frame_size =
+ align_up_u32(AA_FRAME_SAVE_SIZE + top_home + slot_bytes + out_stack, 16u);
return L;
}
@@ -152,7 +160,7 @@ static inline i32 aa_fp_off_saved_lr(void) { return 8; }
/* SP-relative byte offsets. */
static inline i32 aa_sp_off_out_arg(u32 byte_off) { return (i32)byte_off; }
static inline u32 aa_sp_off_saved_pair(const AAFrameLayout* L) {
- return L->frame_size - AA_FRAME_SAVE_SIZE;
+ return L->frame_size - AA_FRAME_SAVE_SIZE - L->top_home;
}
/* Frame slots and callee-save records are owned by the shared NativeFrame
@@ -195,6 +203,11 @@ typedef struct AANativeTarget {
* and unread on the single-pass path, which never sets fp_at_bottom. */
u32 frame_size_final;
u32 incoming_stack_size;
+ /* Windows-variadic GP register home area size (gp_reg_count * gp_slot_size,
+ * 64 today; 0 on every other ABI). When nonzero the function takes the fat
+ * top-record layout and homes x0..x7 into [fp + AA_FRAME_SAVE_SIZE ..] so the
+ * plain-pointer va_list can walk register then stack varargs contiguously. */
+ u32 top_home_bytes;
u32 next_param_int;
u32 next_param_fp;
u32 next_param_stack;
@@ -254,7 +267,10 @@ static AANativeTarget* aa_of(NativeTarget* t) { return (AANativeTarget*)t; }
* record since frame_size = align16(16+cum_off) >= 16+cum_off).
* CFA = fp+frame_size. */
static inline i32 aa_fp_off_in_arg(const AANativeTarget* a, u32 byte_off) {
- u32 base = a->fp_at_bottom ? a->frame_size_final : AA_FRAME_SAVE_SIZE;
+ /* top-record incoming args sit above the saved pair and the (usually empty)
+ * Windows-variadic GP home area; bottom-record never carries a home area. */
+ u32 base = a->fp_at_bottom ? a->frame_size_final
+ : AA_FRAME_SAVE_SIZE + a->top_home_bytes;
return (i32)(base + byte_off);
}
static inline i32 aa_fp_off_slot(const AANativeTarget* a, u32 slot_off) {
@@ -272,7 +288,17 @@ static inline i32 aa_fp_off_tail_out_arg(const AANativeTarget* a,
* fp+frame_size bottom-record). Named so the CFI emit site stays layout-blind.
*/
static inline i32 aa_cfa_off(const AANativeTarget* a) {
- return a->fp_at_bottom ? (i32)a->frame_size_final : (i32)AA_FRAME_SAVE_SIZE;
+ return a->fp_at_bottom
+ ? (i32)a->frame_size_final
+ : (i32)(AA_FRAME_SAVE_SIZE + a->top_home_bytes);
+}
+
+/* fp-relative offset of GP home slot `i` (Windows variadic only). The home area
+ * sits just above the saved pair and just below the incoming stack args, so
+ * slot gp_reg_count coincides with incoming-arg byte 0 (top-record only — a
+ * function with a home area never takes a slim/bottom layout). */
+static inline i32 aa_fp_off_home_slot(u32 i) {
+ return (i32)(AA_FRAME_SAVE_SIZE + i * 8u);
}
static void aa_panic(AANativeTarget* a, const char* msg) {
@@ -1023,13 +1049,34 @@ static void aa_reserve_variadic_reg_saves(AANativeTarget* a) {
a->va_vr_slot = a->base.frame_slot(&a->base, &sd);
}
-/* Emit the stores into the variadic register-save area. Slots must already be
- * reserved (aa_reserve_variadic_reg_saves). */
+/* Emit the stores into the variadic register-save area. For AAPCS64 these land
+ * in the reserved gr/vr frame slots (aa_reserve_variadic_reg_saves); for the
+ * Windows GP home area they land in [fp + AA_FRAME_SAVE_SIZE ..], the
+ * top-of-frame block contiguous with the incoming stack args. */
static void aa_emit_variadic_reg_save_stores(AANativeTarget* a) {
NativeAddr addr;
MemAccess mem;
KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64);
ABIVaListInfo vai = abi_va_list_layout(a->base.c->abi);
+ if (vai.kind == ABI_VA_LIST_POINTER && a->top_home_bytes) {
+ /* Windows: home x0..x{gp_reg_count-1} so the plain-pointer va_list walks
+ * register then stack varargs as one block. The named leading registers are
+ * homed too (harmless): va_start skips past them. */
+ memset(&mem, 0, sizeof mem);
+ mem.type = i64;
+ mem.size = 8;
+ mem.align = 8;
+ memset(&addr, 0, sizeof addr);
+ addr.base_kind = NATIVE_ADDR_BASE_REG;
+ addr.base.reg = AA_FP;
+ addr.base_type = i64;
+ for (u32 r = 0; r < vai.gp_reg_count && r < 8u; ++r) {
+ NativeLoc src = native_loc_reg(i64, NATIVE_REG_INT, r);
+ addr.offset = aa_fp_off_home_slot(r);
+ aa_emit_mem(a, 0, src, addr, mem);
+ }
+ return;
+ }
if (vai.kind != ABI_VA_LIST_AAPCS64) return;
memset(&mem, 0, sizeof mem);
mem.type = i64;
@@ -1080,6 +1127,18 @@ static void aa_func_begin_common(NativeTarget* t, const CGFuncDesc* fd) {
a->slim_small_frame = 0;
a->fp_at_bottom = 0;
a->frame_size_final = 0;
+ /* Windows variadic functions reserve a GP register home area at the top of
+ * the frame (just below the incoming stack args). The plain-pointer va_list
+ * then walks register-passed then stack-passed varargs as one block. Every
+ * other ABI gets top_home_bytes == 0: Apple ARM64 routes all varargs to the
+ * stack (gp_reg_count 0), and AAPCS64's struct va_list with separate
+ * reg-save pointers is not ABI_VA_LIST_POINTER. */
+ {
+ const ABIFuncInfo* fi = abi_cg_func_info(t->c->abi, fd->fn_type);
+ ABIVaListInfo vai = abi_va_list_layout(t->c->abi);
+ a->top_home_bytes = (fi && fi->variadic && vai.kind == ABI_VA_LIST_POINTER)
+ ? vai.gp_reg_count * vai.gp_slot_size
+ : 0u;
+ }
mc->set_section(mc, fd->text_section_id);
mc->emit_align(mc, 4, 0);
a->func_start = mc->pos(mc);
@@ -1288,11 +1347,13 @@ static void aa_words_restore_frame(AANativeTarget* a, u32* words, u32 cap,
return;
}
if (*n + 3u > cap) aa_panic(a, "instruction patch too small");
- /* AAPCS64: fp is the saved-pair address. Reload pair from [fp], then
- * restore sp to fp + AA_FRAME_SAVE_SIZE (= caller's original sp = CFA). */
+ /* AAPCS64: fp is the saved-pair address. Reload pair from [fp], then restore
+ * sp to fp + CFA-offset (= caller's original sp = CFA). The CFA offset is
+ * AA_FRAME_SAVE_SIZE normally, plus the Windows-variadic GP home area when
+ * present. */
words[(*n)++] = aa64_add_imm(1, AA_TMP0, AA_FP, 0, 0);
words[(*n)++] = aa64_ldp64_soff(AA_FP, AA_LR, AA_TMP0, 0);
- words[(*n)++] = aa64_add_imm(1, AA_SP, AA_TMP0, AA_FRAME_SAVE_SIZE, 0);
+ words[(*n)++] = aa64_add_imm(1, AA_SP, AA_TMP0, (u32)aa_cfa_off(a), 0);
}
/* Emit callee-save store (save=1) or restore (save=0) words into `words`,
@@ -1488,7 +1549,8 @@ static void aa_apply_patches(AANativeTarget* a, const AAFrameLayout* L) {
static void aa_func_end(NativeTarget* t) {
AANativeTarget* a = aa_of(t);
MCEmitter* mc = t->mc;
- AAFrameLayout L = aa_build_layout(a->frame.cum_off, a->frame.max_outgoing);
+ AAFrameLayout L =
+ aa_build_layout(a->frame.cum_off, a->frame.max_outgoing, a->top_home_bytes);
/* known_frame (optimizer): prologue, allocas, and tail epilogues were emitted
* final and slim eligibility was settled in aa_func_begin_known_frame — there
* is nothing to patch. Single-pass (NDT): a worst-case prologue region was
@@ -1646,7 +1708,8 @@ static void aa_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd,
* known, so the prologue immediates and slim-form choice are settled here.
* frame_size_final must be set before aa_build_prologue_words / entry saves,
* since the bottom-record offset helpers read it. */
- L = aa_build_layout(a->frame.cum_off, a->frame.max_outgoing);
+ L = aa_build_layout(a->frame.cum_off, a->frame.max_outgoing,
+ a->top_home_bytes);
a->frame_size_final = L.frame_size;
/* Slim Tier A: no callee-saves, no alloca, no body slots, no outgoing stack
* args — the whole frame is the 16-byte record. fp_at_bottom: a small frame
@@ -1654,14 +1717,18 @@ static void aa_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd,
* the bottom (fp = sp) so sp adjustment folds into the pre/post-indexed
* stp/ldp (frame_size <= 504 keeps the post-index ldp imm in range).
* Otherwise slim_small_frame keeps the top-record layout but skips the
- * x17/x10 scratch (out_stack>0 small frames land here). (See aa_func_end for
- * the single-pass path, which never takes any slim form.) */
+ * x17/x10 scratch (out_stack>0 small frames land here). A Windows-variadic
+ * home area forces the fat top-record layout: it lives above the saved pair,
+ * which neither the slim forms (saved pair at the very top) nor the
+ * bottom-record (saved pair at the very bottom) leave room for. (See
+ * aa_func_end for the single-pass path, which never takes any slim form.) */
a->slim_prologue = a->frame.ncallee_saves == 0 && !a->frame.has_alloca &&
- L.slot_bytes == 0 && L.out_stack == 0;
+ L.slot_bytes == 0 && L.out_stack == 0 && !a->top_home_bytes;
a->fp_at_bottom = !a->slim_prologue && !a->frame.has_alloca &&
- L.out_stack == 0 && L.frame_size <= 504u;
+ L.out_stack == 0 && L.frame_size <= 504u &&
+ !a->top_home_bytes;
a->slim_small_frame = !a->slim_prologue && !a->fp_at_bottom &&
- !a->frame.has_alloca &&
+ !a->frame.has_alloca && !a->top_home_bytes &&
aa_sp_off_saved_pair(&L) <= 504u;
n = aa_build_prologue_words(a, &L, words, AA_PROLOGUE_WORDS);
for (u32 i = 0; i < n; ++i) aa_emit32(t->mc, words[i]);
@@ -2516,8 +2583,15 @@ static const ABIArgInfo* aa_param_abi(NativeTarget* t, const ABIFuncInfo* abi,
scratch->flags = ABI_AF_NONE;
scratch->nparts = 1;
scratch->parts = arena_zarray(t->c->tu, ABIArgPart, 1);
+ /* Windows ARM64 routes variadic floating-point arguments through the integer
+ * registers/stack (the classifier's remap_fp_parts_to_int does the same for
+ * the *named* params of a variadic function); the value's bit pattern moves
+ * via fmov x,d. Every other ABI keeps the `...` FP args in v registers. */
((ABIArgPart*)scratch->parts)[0].cls =
- cg_type_is_float(t->c, desc->args[i].type) ? ABI_CLASS_FP : ABI_CLASS_INT;
+ (cg_type_is_float(t->c, desc->args[i].type) &&
+ t->c->target.os != KIT_OS_WINDOWS)
+ ? ABI_CLASS_FP
+ : ABI_CLASS_INT;
((ABIArgPart*)scratch->parts)[0].loc = ABI_LOC_REG;
((ABIArgPart*)scratch->parts)[0].size = type_size32(t, desc->args[i].type);
((ABIArgPart*)scratch->parts)[0].align = type_align32(t, desc->args[i].type);
@@ -2872,7 +2946,8 @@ static void aa_emit_tail_site(NativeTarget* t, NativeLoc callee) {
/* Frame is final: emit the tail epilogue (callee restores + frame restore +
* branch) directly, exactly the words aa_apply_patches would patch in but
* without the reserved NOP padding. */
- AAFrameLayout L = aa_build_layout(a->frame.cum_off, a->frame.max_outgoing);
+ AAFrameLayout L =
+ aa_build_layout(a->frame.cum_off, a->frame.max_outgoing, a->top_home_bytes);
u32 words[AA_TAIL_WORDS];
u32 n = 0;
aa_words_callee_restores(a, words, AA_TAIL_WORDS, &n);
@@ -4074,6 +4149,19 @@ static void aa_va_start_core(AANativeTarget* a, NativeAddr ap) {
NativeLoc ptr =
native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, AA_TMP0);
if (vai.kind == ABI_VA_LIST_POINTER) {
+ if (a->top_home_bytes) {
+ /* Windows: `va_list = &<first vararg>` inside the contiguous
+ * [GP home area | incoming stack args] block. Named args consume the
+ * leading slots; next_param_int (FP params remapped to GP included) plus
+ * next_param_stack locate the first unnamed slot. Home slot
+ * gp_reg_count coincides with incoming-arg byte 0, so a single formula
+ * spans both regions. */
+ i32 off =
+ aa_fp_off_home_slot(a->next_param_int) + (i32)a->next_param_stack;
+ aa_emit_add_imm(a, AA_TMP0, AA_FP, off);
+ aa_emit_mem(a, 0, ptr, ap, aa_mem_for_type(t, ptr.type, 8));
+ return;
+ }
/* `va_list = &<first vararg>`. Variadic stack args follow the fixed
* incoming params in the same caller window. Apple ARM64 compact fixed
* stack args may leave this cursor at +4, while the first variadic slot
diff --git a/src/arch/native_target.h b/src/arch/native_target.h
@@ -272,6 +272,7 @@ typedef struct NativeFramePatchState {
* the descriptor keeps the backend from depending on the optimizer IR. */
typedef enum NativeMachineOpKind {
NATIVE_MOP_BINOP,
+ NATIVE_MOP_VA_START,
NATIVE_MOP_VA_ARG,
NATIVE_MOP_ATOMIC_CAS,
NATIVE_MOP_ATOMIC_RMW,
@@ -370,12 +371,13 @@ struct NativeTarget {
* up front. */
void (*reserve_callee_saves)(NativeTarget*, const u32* used_by_class,
u32 nclasses);
- /* Optional live-ABI callee-saved register mask for a class. Static
+ /* Optional live-ABI caller/callee-saved register masks for a class. Static
* NativeAllocClassInfo masks describe the target register file, but some
* targets vary preservation rules by OS ABI (x64 SysV vs Win64 XMM regs).
- * Direct emission uses this to decide which borrowed scratch/cache registers
- * must be reported to reserve_callee_saves(). NULL falls back to
- * NativeAllocClassInfo.callee_saved_mask. */
+ * The optimizer and direct emission use these to keep allocation, call
+ * clobbers, and prologue save sets aligned with the selected ABI. NULL falls
+ * back to NativeAllocClassInfo.{caller,callee}_saved_mask. */
+ u32 (*caller_saved_mask)(NativeTarget*, NativeAllocClass);
u32 (*callee_saved_mask)(NativeTarget*, NativeAllocClass);
/* Optional. When set, the optimizer emit path calls this once — after
* func_begin, reserve_callee_saves, and frame-slot mapping, but before the
@@ -519,6 +521,32 @@ struct NativeTarget {
void (*destroy)(NativeTarget*);
};
+static inline const NativeAllocClassInfo*
+native_target_class_info(const NativeTarget* t, NativeAllocClass cls) {
+ if (!t || !t->regs) return NULL;
+ for (u32 i = 0; i < t->regs->nclasses; ++i) {
+ const NativeAllocClassInfo* ci = &t->regs->classes[i];
+ if ((NativeAllocClass)ci->cls == cls) return ci;
+ }
+ return NULL;
+}
+
+static inline u32 native_target_caller_saved_mask(NativeTarget* t,
+ NativeAllocClass cls) {
+ const NativeAllocClassInfo* ci;
+ if (t && t->caller_saved_mask) return t->caller_saved_mask(t, cls);
+ ci = native_target_class_info(t, cls);
+ return ci ? ci->caller_saved_mask : 0u;
+}
+
+static inline u32 native_target_callee_saved_mask(NativeTarget* t,
+ NativeAllocClass cls) {
+ const NativeAllocClassInfo* ci;
+ if (t && t->callee_saved_mask) return t->callee_saved_mask(t, cls);
+ ci = native_target_class_info(t, cls);
+ return ci ? ci->callee_saved_mask : 0u;
+}
+
static inline NativeLoc native_loc_none(void) {
NativeLoc loc;
memset(&loc, 0, sizeof loc);
diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c
@@ -1755,21 +1755,29 @@ static int x64_reg_is_callee_fp(const X64ABIRegs* abi, Reg r);
static u32 x64_live_callee_saved_mask(NativeTarget* t,
NativeAllocClass cls) {
X64NativeTarget* a = x64_of(t);
+ const X64ABIRegs* abi = a->abi ? a->abi : x64_abi_for_os(t->c->target.os);
u32 mask = 0;
for (Reg r = 0; r < 16u; ++r) {
- if (cls == NATIVE_REG_INT && x64_reg_is_callee_int(a->abi, r))
+ if (cls == NATIVE_REG_INT && x64_reg_is_callee_int(abi, r))
mask |= 1u << r;
- if (cls == NATIVE_REG_FP && x64_reg_is_callee_fp(a->abi, r))
+ if (cls == NATIVE_REG_FP && x64_reg_is_callee_fp(abi, r))
mask |= 1u << r;
}
return mask;
}
+static u32 x64_live_caller_saved_mask(NativeTarget* t,
+ NativeAllocClass cls) {
+ const NativeAllocClassInfo* ci = native_target_class_info(t, cls);
+ if (!ci) return 0;
+ return ci->caller_saved_mask & ~x64_live_callee_saved_mask(t, cls);
+}
+
static void x64_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers,
u32 nclob, u32* int_mask, u32* fp_mask);
/* abi_clobber_masks is shared as native_asm_abi_clobber_masks
- * (cg/native_asm.h); it reads the masks from t->regs->classes. */
+ * (cg/native_asm.h); it reads the target's live ABI masks. */
/* Build the callee-saved set the prologue must preserve: the allocator-assigned
* callee-saved registers (frame->callee_saved_used) plus any an inline-asm
@@ -4049,6 +4057,13 @@ static int x64_machine_op_clobbers(NativeTarget* t, const NativeMachineOp* op,
mask[NATIVE_REG_INT] =
(1u << X64_RAX) | (1u << X64_RCX) | (1u << X64_RDX);
return 1;
+ case NATIVE_MOP_VA_START:
+ /* x64_va_start_core materializes the va_list field values through RAX
+ * (the ap pointer itself lands in the reserved r11 scratch). RAX is the
+ * return register, so the allocator may otherwise keep a live value there
+ * across the op. */
+ mask[NATIVE_REG_INT] = (1u << X64_RAX);
+ return 1;
case NATIVE_MOP_VA_ARG:
if (!op->result_is_fp) return 0;
mask[NATIVE_REG_INT] = (1u << X64_RAX);
@@ -4092,6 +4107,7 @@ NativeTarget* x64_native_target_new(Compiler* c, ObjBuilder* obj,
/* Non-NULL so the optimizer emit path (plan_frame) computes the callee-saved
* set; x64_func_begin_known_frame derives the records from the masks. */
t->reserve_callee_saves = x64_reserve_callee_saves;
+ t->caller_saved_mask = x64_live_caller_saved_mask;
t->callee_saved_mask = x64_live_callee_saved_mask;
t->signature_stack_bytes = x64_signature_stack_bytes;
t->call_stack_bytes = x64_call_stack_bytes;
diff --git a/src/cg/asm.c b/src/cg/asm.c
@@ -324,7 +324,6 @@ void kit_cg_file_scope_asm(KitCg* g, KitSlice asm_source) {
if (!g || !asm_source.s) return;
if (g->check_only) return;
if (g->target && g->target->file_scope_asm) {
- api_local_const_memory_boundary(g);
g->target->file_scope_asm(g->target, asm_source.s, asm_source.len);
return;
}
diff --git a/src/cg/native_asm.c b/src/cg/native_asm.c
@@ -37,16 +37,15 @@ int native_asm_match_index(const char* s) {
void native_asm_abi_clobber_masks(NativeTarget* t, u32 abi_sets, u32* int_mask,
u32* fp_mask) {
- const NativeAllocClassInfo* classes = t->regs->classes;
*int_mask = 0;
*fp_mask = 0;
if (abi_sets & KIT_CG_ASM_CLOBBER_ABI_CALLER_SAVED) {
- *int_mask |= classes[NATIVE_REG_INT].caller_saved_mask;
- *fp_mask |= classes[NATIVE_REG_FP].caller_saved_mask;
+ *int_mask |= native_target_caller_saved_mask(t, NATIVE_REG_INT);
+ *fp_mask |= native_target_caller_saved_mask(t, NATIVE_REG_FP);
}
if (abi_sets & KIT_CG_ASM_CLOBBER_ABI_CALLEE_SAVED) {
- *int_mask |= classes[NATIVE_REG_INT].callee_saved_mask;
- *fp_mask |= classes[NATIVE_REG_FP].callee_saved_mask;
+ *int_mask |= native_target_callee_saved_mask(t, NATIVE_REG_INT);
+ *fp_mask |= native_target_callee_saved_mask(t, NATIVE_REG_FP);
}
}
diff --git a/src/cg/native_asm.h b/src/cg/native_asm.h
@@ -40,9 +40,7 @@ int native_asm_constraint_early(const char* s);
int native_asm_match_index(const char* s);
/* Expand the arch-neutral clobber-ABI sets (KitCgAsmClobberAbiSet bits) into
- * this target's per-class caller/callee-saved register masks, read straight
- * from the target's register file (t->regs->classes). Byte-identical across the
- * backends apart from which register table they consulted, so it lives here. */
+ * this target's live per-class caller/callee-saved register masks. */
void native_asm_abi_clobber_masks(NativeTarget* t, u32 abi_sets, u32* int_mask,
u32* fp_mask);
diff --git a/src/cg/native_direct_target.c b/src/cg/native_direct_target.c
@@ -109,9 +109,11 @@ static Reg nd_cache_reg_for(NativeDirectTarget* d, CGLocal local,
static Reg nd_pick_cache_victim(NativeDirectTarget* d, NativeAllocClass cls);
static u32 nd_callee_saved_mask(NativeDirectTarget* d, NativeAllocClass cls) {
- if (d->native && d->native->callee_saved_mask)
- return d->native->callee_saved_mask(d->native, cls);
- return nd_class_info(d, cls)->callee_saved_mask;
+ return native_target_callee_saved_mask(d->native, cls);
+}
+
+static u32 nd_caller_saved_mask(NativeDirectTarget* d, NativeAllocClass cls) {
+ return native_target_caller_saved_mask(d->native, cls);
}
static void nd_note_reg_used(NativeDirectTarget* d, NativeAllocClass cls,
@@ -544,7 +546,7 @@ static Reg nd_pick_cache_victim(NativeDirectTarget* d, NativeAllocClass cls) {
* path. */
static Reg nd_cache_alloc(NativeDirectTarget* d, NativeAllocClass cls) {
const NativeAllocClassInfo* ci = nd_class_info(d, cls);
- u32 caller = ci->caller_saved_mask;
+ u32 caller = nd_caller_saved_mask(d, cls);
Reg victim;
for (u32 i = 0; i < ci->nallocable; ++i) {
Reg r = ci->allocable[i];
diff --git a/src/opt/pass_machinize.c b/src/opt/pass_machinize.c
@@ -88,11 +88,14 @@ static void machinize_prepare_insts(Func* f, NativeTarget* target) {
}
}
-static void collect_class(Func* f, const NativeAllocClassInfo* ci) {
+static void collect_class(Func* f, NativeTarget* target,
+ const NativeAllocClassInfo* ci) {
u32 cls = ci->cls;
if (cls >= OPT_REG_CLASSES) return;
- f->opt_caller_saved[cls] = ci->caller_saved_mask;
- f->opt_callee_saved[cls] = ci->callee_saved_mask;
+ f->opt_caller_saved[cls] =
+ native_target_caller_saved_mask(target, (NativeAllocClass)cls);
+ f->opt_callee_saved[cls] =
+ native_target_callee_saved_mask(target, (NativeAllocClass)cls);
f->opt_reserved_regs[cls] = ci->reserved_mask;
f->opt_arg_regs[cls] = ci->arg_mask;
f->opt_ret_regs[cls] = ci->ret_mask;
@@ -116,7 +119,7 @@ static void collect_class(Func* f, const NativeAllocClassInfo* ci) {
static void machinize_collect_regs(Func* f, NativeTarget* target) {
if (!target || !target->regs) return;
for (u32 i = 0; i < target->regs->nclasses; ++i)
- collect_class(f, &target->regs->classes[i]);
+ collect_class(f, target, &target->regs->classes[i]);
}
static void machinize_check_overlap(Func* f) {
@@ -157,6 +160,9 @@ static void machinize_inst_clobbers(Func* f, NativeTarget* target) {
mop.second_is_reg =
(u8)(in->nopnds > 2u && in->opnds[2].kind == OPK_REG);
break;
+ case IR_VA_START:
+ mop.kind = NATIVE_MOP_VA_START;
+ break;
case IR_VA_ARG:
mop.kind = NATIVE_MOP_VA_ARG;
mop.result_is_fp = (u8)(in->nopnds > 0u &&
diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c
@@ -1360,7 +1360,9 @@ static u32 compute_callee_saved_used(NativeEmitCtx* e, u32* used, u32 cap) {
nclasses = ri->nclasses < cap ? ri->nclasses : cap;
for (u32 i = 0; i < ri->nclasses; ++i) {
const NativeAllocClassInfo* ci = &ri->classes[i];
- if (ci->cls < cap) used[ci->cls] &= ci->callee_saved_mask;
+ if (ci->cls < cap)
+ used[ci->cls] &=
+ native_target_callee_saved_mask(t, (NativeAllocClass)ci->cls);
}
return nclasses;
}
diff --git a/test/coff/windows-o1-abi-smoke.sh b/test/coff/windows-o1-abi-smoke.sh
@@ -0,0 +1,434 @@
+#!/usr/bin/env bash
+# test/coff/windows-o1-abi-smoke.sh - focused -O1 Windows ABI coverage for PE.
+#
+# Builds one optimized C program for x86_64-windows and aarch64-windows. The
+# program deliberately exercises ABI shapes that are easy to regress in the
+# optimizer: nonvolatile GPR/FP preservation across calls, mixed int/FP arg
+# assignment, stack arguments, varargs, aggregate return/by-value passing,
+# indirect callbacks, and a UCRT qsort callback.
+set -u
+
+ROOT=${KIT_TEST_ROOT:-$(cd "$(dirname "$0")/../.." && pwd)}
+KIT=${KIT:-"$ROOT/build/kit"}
+SDK=${KIT_SYSROOT:-}
+
+KIT_KIT_DIR="$ROOT/test/lib"
+. "$ROOT/test/lib/kit_sh_kit.sh"
+kit_report_init
+
+LABEL_SUITE=windows-o1-abi-smoke
+
+# Print the first llvm-mingw sysroot for arch $1 that has lib/ and windows.h.
+find_sdk() {
+  local triple=$1-w64-mingw32 dir
+  for dir in \
+      "$ROOT"/build/llvm-mingw/*/ucrt/"$triple" \
+      /tmp/llvm-mingw*/llvm-mingw-*-ucrt-*/"$triple" \
+      /tmp/llvm-mingw*/"$triple" \
+      /private/tmp/llvm-mingw*/llvm-mingw-*-ucrt-*/"$triple" \
+      /private/tmp/llvm-mingw*/"$triple"; do
+    # Candidates are tried in order; anything that is not a sysroot is passed over.
+    [ -d "$dir/lib" ] && [ -r "$dir/include/windows.h" ] || continue
+    printf '%s\n' "$dir"
+    return 0
+  done
+  return 1
+}
+
+sdk_for_arch() {
+ local arch=$1
+ local base
+ if [ -n "$SDK" ]; then
+ if [ "$(basename "$SDK")" = "$arch-w64-mingw32" ]; then
+ printf '%s\n' "$SDK"
+ return 0
+ fi
+ base=$(dirname "$SDK")
+ if [ -d "$base/$arch-w64-mingw32/lib" ] &&
+ [ -r "$base/$arch-w64-mingw32/include/windows.h" ]; then
+ printf '%s\n' "$base/$arch-w64-mingw32"
+ return 0
+ fi
+ fi
+ find_sdk "$arch"
+}
+
+if [ ! -x "$KIT" ]; then
+ kit_fail "$LABEL_SUITE/kit-present" "kit binary not found: $KIT"
+ kit_summary "$LABEL_SUITE"
+ kit_exit
+fi
+
+TMP=${TMPDIR:-/tmp}
+work=$(mktemp -d "$TMP/kit-windows-o1-abi-smoke.XXXXXX")
+trap 'rm -rf "$work"' EXIT
+
+ABI_C=$work/o1-abi.c
+
+cat >"$ABI_C" <<'SRC'
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+typedef struct Pair {
+ uint64_t a;
+ uint64_t b;
+} Pair;
+
+typedef struct Big {
+ uint64_t a;
+ uint64_t b;
+ uint64_t c;
+ uint64_t d;
+ uint64_t e;
+ double f;
+} Big;
+
+static volatile double g_fp[16] = {
+ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0,
+ 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+};
+
+static volatile uint64_t g_int[16] = {
+ 11u, 23u, 37u, 41u, 53u, 67u, 71u, 83u,
+ 97u, 101u, 109u, 113u, 127u, 131u, 137u, 149u,
+};
+
+int run_fp_probe(void);
+int run_int_probe(void);
+
+__attribute__((noinline)) uint64_t opaque_u64(uint64_t x) {
+ return (x * 33u) ^ 0x123456789abcdef0ull;
+}
+
+__attribute__((noinline)) void touch_fp_pressure(double seed) {
+ double a0 = g_fp[0] + seed;
+ double a1 = g_fp[1] + seed;
+ double a2 = g_fp[2] + seed;
+ double a3 = g_fp[3] + seed;
+ double a4 = g_fp[4] + seed;
+ double a5 = g_fp[5] + seed;
+ double a6 = g_fp[6] + seed;
+ double a7 = g_fp[7] + seed;
+ double a8 = g_fp[8] + seed;
+ double a9 = g_fp[9] + seed;
+ double a10 = g_fp[10] + seed;
+ double a11 = g_fp[11] + seed;
+ uint64_t k = opaque_u64((uint64_t)seed);
+ double sum = a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8 + a9 + a10 + a11;
+ sum += (double)(k & 255u);
+ g_fp[15] = sum;
+}
+
+__attribute__((noinline)) void touch_int_pressure(uint64_t seed) {
+ uint64_t a0 = g_int[0] + seed;
+ uint64_t a1 = g_int[1] + seed;
+ uint64_t a2 = g_int[2] + seed;
+ uint64_t a3 = g_int[3] + seed;
+ uint64_t a4 = g_int[4] + seed;
+ uint64_t a5 = g_int[5] + seed;
+ uint64_t a6 = g_int[6] + seed;
+ uint64_t a7 = g_int[7] + seed;
+ uint64_t a8 = g_int[8] + seed;
+ uint64_t a9 = g_int[9] + seed;
+ uint64_t k = opaque_u64(seed + 7u);
+ g_int[15] = a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8 + a9 + k;
+}
+
+#if defined(__x86_64__)
+__asm__(
+ ".text\n"
+ ".globl run_fp_probe\n"
+ "run_fp_probe:\n"
+ "pushq %rbp\n"
+ "movq %rsp, %rbp\n"
+ "subq $0x30, %rsp\n"
+ "movaps %xmm6, -0x10(%rbp)\n"
+ "movq $0x405edd2f1a9fbe77, %rax\n"
+ "movq %rax, %xmm6\n"
+ "movq $0x401c000000000000, %rax\n"
+ "movq %rax, %xmm0\n"
+ "callq touch_fp_pressure\n"
+ "movq %xmm6, %rax\n"
+ "movq $0x405edd2f1a9fbe77, %rcx\n"
+ "cmpq %rcx, %rax\n"
+ "sete %al\n"
+ "movzbl %al, %eax\n"
+ "movaps -0x10(%rbp), %xmm6\n"
+ "leave\n"
+ "retq\n"
+ ".globl run_int_probe\n"
+ "run_int_probe:\n"
+ "pushq %rbp\n"
+ "movq %rsp, %rbp\n"
+ "subq $0x30, %rsp\n"
+ "movq %r13, -0x8(%rbp)\n"
+ "movq $0xfedcba9876543210, %r13\n"
+ "movq $5, %rcx\n"
+ "callq touch_int_pressure\n"
+ "movq %r13, %rax\n"
+ "movq $0xfedcba9876543210, %rcx\n"
+ "cmpq %rcx, %rax\n"
+ "sete %al\n"
+ "movzbl %al, %eax\n"
+ "movq -0x8(%rbp), %r13\n"
+ "leave\n"
+ "retq\n");
+#elif defined(__aarch64__)
+__asm__(
+ ".text\n"
+ ".globl run_fp_probe\n"
+ "run_fp_probe:\n"
+ "stp x29, x30, [sp, #-32]!\n"
+ "mov x29, sp\n"
+ "str d8, [sp, #16]\n"
+ "mov x9, #0xbe77\n"
+ "movk x9, #0x1a9f, lsl #16\n"
+ "movk x9, #0xdd2f, lsl #32\n"
+ "movk x9, #0x405e, lsl #48\n"
+ "fmov d8, x9\n"
+ "mov x9, #0\n"
+ "movk x9, #0x401c, lsl #48\n"
+ "fmov d0, x9\n"
+ "bl touch_fp_pressure\n"
+ "fmov x10, d8\n"
+ "mov x9, #0xbe77\n"
+ "movk x9, #0x1a9f, lsl #16\n"
+ "movk x9, #0xdd2f, lsl #32\n"
+ "movk x9, #0x405e, lsl #48\n"
+ "cmp x10, x9\n"
+ "cset w0, eq\n"
+ "ldr d8, [sp, #16]\n"
+ "ldp x29, x30, [sp], #32\n"
+ "ret\n"
+ ".globl run_int_probe\n"
+ "run_int_probe:\n"
+ "stp x29, x30, [sp, #-32]!\n"
+ "mov x29, sp\n"
+ "str x19, [sp, #16]\n"
+ "mov x19, #0x3210\n"
+ "movk x19, #0x7654, lsl #16\n"
+ "movk x19, #0xba98, lsl #32\n"
+ "movk x19, #0xfedc, lsl #48\n"
+ "mov x0, #5\n"
+ "bl touch_int_pressure\n"
+ "mov x10, x19\n"
+ "mov x9, #0x3210\n"
+ "movk x9, #0x7654, lsl #16\n"
+ "movk x9, #0xba98, lsl #32\n"
+ "movk x9, #0xfedc, lsl #48\n"
+ "cmp x10, x9\n"
+ "cset w0, eq\n"
+ "ldr x19, [sp, #16]\n"
+ "ldp x29, x30, [sp], #32\n"
+ "ret\n");
+#else
+#error unsupported arch
+#endif
+
+__attribute__((noinline)) uint64_t mixed_args(int a, uint64_t b, double c,
+ float d, int e, double f,
+ uint64_t g, int h, double i,
+ uint64_t j) {
+ return (uint64_t)a + b + (uint64_t)(c * 10.0) + (uint64_t)(d * 10.0f) +
+ (uint64_t)e + (uint64_t)(f * 10.0) + g + (uint64_t)h +
+ (uint64_t)(i * 10.0) + j;
+}
+
+__attribute__((noinline)) uint64_t stack_args(uint64_t a1, uint64_t a2,
+ uint64_t a3, uint64_t a4,
+ uint64_t a5, uint64_t a6,
+ uint64_t a7, uint64_t a8,
+ uint64_t a9, uint64_t a10,
+ uint64_t a11, uint64_t a12) {
+ return a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8 + a9 + a10 + a11 + a12;
+}
+
+__attribute__((noinline)) Pair make_pair(uint64_t base) {
+ Pair p;
+ p.a = base + 1u;
+ p.b = base + 2u;
+ return p;
+}
+
+__attribute__((noinline)) Big make_big(uint64_t base) {
+ Big b;
+ b.a = base + 1u;
+ b.b = base + 2u;
+ b.c = base + 3u;
+ b.d = base + 4u;
+ b.e = base + 5u;
+ b.f = (double)(base + 6u) + 0.5;
+ return b;
+}
+
+__attribute__((noinline)) uint64_t take_big(Big b, Pair p, uint64_t tail) {
+ return b.a + b.b * 2u + b.c * 3u + b.d * 4u + b.e * 5u +
+ (uint64_t)(b.f * 10.0) + p.a + p.b + tail;
+}
+
+__attribute__((noinline)) uint64_t sum_varargs(int tag, ...) {
+ va_list ap;
+ uint64_t a;
+ double b;
+ uint64_t c;
+ int d;
+ double e;
+ va_start(ap, tag);
+ a = va_arg(ap, uint64_t);
+ b = va_arg(ap, double);
+ c = va_arg(ap, uint64_t);
+ d = va_arg(ap, int);
+ e = va_arg(ap, double);
+ va_end(ap);
+ return (uint64_t)tag + a + (uint64_t)(b * 10.0) + c + (uint64_t)d +
+ (uint64_t)(e * 10.0);
+}
+
+/* Loop over a long, boundary-crossing variadic list mixing integer and
+ * floating-point arguments. Exercises three easy-to-regress shapes at once:
+ * the loop accumulator stays live across va_start (so no scratch register may
+ * clobber it), the >8 arguments overflow the register slots onto the stack (so
+ * the ARM64 GP home area must be contiguous with the incoming stack args), and
+ * the floating-point arguments route through integer slots on ARM64. */
+__attribute__((noinline)) uint64_t loop_varargs(int n, ...) {
+ va_list ap;
+ uint64_t sum = (uint64_t)n;
+ va_start(ap, n);
+ for (int i = 0; i < n; ++i) {
+ if (i & 1)
+ sum += (uint64_t)(va_arg(ap, double) * 2.0);
+ else
+ sum += va_arg(ap, uint64_t);
+ }
+ va_end(ap);
+ return sum;
+}
+
+typedef uint64_t (*MixCallback)(int, double, uint64_t, double, int, int,
+ uint64_t);
+
+__attribute__((noinline)) uint64_t callback_impl(int a, double b, uint64_t c,
+ double d, int e, int f,
+ uint64_t g) {
+ return (uint64_t)a + (uint64_t)(b * 10.0) + c * 2u +
+ (uint64_t)(d * 10.0) + (uint64_t)e * 3u + (uint64_t)f * 5u + g;
+}
+
+__attribute__((noinline)) uint64_t call_callback(MixCallback cb) {
+ return cb(3, 2.5, 7u, 1.5, 4, 5, 9u);
+}
+
+static int cmp_u32(const void* a, const void* b) {
+ uint32_t aa = *(const uint32_t*)a;
+ uint32_t bb = *(const uint32_t*)b;
+ return (aa > bb) - (aa < bb);
+}
+
+int main(void) {
+ uint32_t vals[6] = {9u, 1u, 7u, 3u, 5u, 2u};
+ Pair p;
+ Big b;
+ if (!run_fp_probe()) return 10;
+ if (!(g_fp[15] > 0.0)) return 11;
+ if (!run_int_probe()) return 20;
+ if (g_int[15] == 0u) return 21;
+ if (mixed_args(1, 2u, 1.5, 2.5f, 3, 3.5, 4u, 5, 4.5, 6u) != 141u)
+ return 30;
+ if (stack_args(1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u) != 78u)
+ return 31;
+ p = make_pair(100u);
+ if (p.a != 101u || p.b != 102u) return 40;
+ b = make_big(10u);
+ if (take_big(b, p, 5u) != 578u) return 41;
+ if (sum_varargs(3, (uint64_t)10u, 2.5, (uint64_t)20u, 7, 1.5) != 80u)
+ return 50;
+ if (loop_varargs(10, (uint64_t)1, 2.0, (uint64_t)3, 4.0, (uint64_t)5, 6.0,
+ (uint64_t)7, 8.0, (uint64_t)9, 10.0) != 95u)
+ return 51;
+ if (call_callback(callback_impl) != 103u) return 60;
+ qsort(vals, 6u, sizeof vals[0], cmp_u32);
+ if (vals[0] != 1u || vals[1] != 2u || vals[5] != 9u) return 70;
+ return 0;
+}
+SRC
+
+# Fail test $1 if the import dump $2 lists msvcrt.dll or ucrt.dll as a DLL name.
+no_legacy_crt_imports() {
+  local name=$1 dump=$2 diag=$work/$1.diag
+  if grep -Ei 'DLL Name: (msvcrt|ucrt)\.dll' "$dump" > "$diag"; then
+    not_ok "$name" "$diag"
+  else
+    ok "$name"
+  fi
+}
+
+run_vm_if_available() {
+ local label=$1 arch=$2 exe=$3
+ case "$arch" in
+ x64)
+ if [ -n "${KIT_WINDOWS_VM_X64:-${KIT_WINDOWS_VM_AMD64:-}}" ]; then
+ if "$ROOT/scripts/windows_vm.sh" run x64 "$exe" \
+ > "$work/$label-vm.out" 2> "$work/$label-vm.err"; then
+ ok "$label-vm"
+ else
+ not_ok "$label-vm" "$work/$label-vm.err"
+ fi
+ else
+ skip_test "$label-vm" "KIT_WINDOWS_VM_X64 not set"
+ fi
+ ;;
+ aarch64)
+ if [ -n "${KIT_WINDOWS_VM_AARCH64:-${KIT_WINDOWS_VM_ARM64:-}}" ]; then
+ if "$ROOT/scripts/windows_vm.sh" run aarch64 "$exe" \
+ > "$work/$label-vm.out" 2> "$work/$label-vm.err"; then
+ ok "$label-vm"
+ else
+ not_ok "$label-vm" "$work/$label-vm.err"
+ fi
+ else
+ skip_test "$label-vm" "KIT_WINDOWS_VM_AARCH64 not set"
+ fi
+ ;;
+ esac
+}
+
+ran=0
+for arch in x86_64 aarch64; do
+ case "$arch" in
+ x86_64) target=x86_64-windows; label=x64 ;;
+ aarch64) target=aarch64-windows; label=aarch64 ;;
+ esac
+ # A missing or incomplete sysroot skips this arch rather than failing it.
+ if ! ARCH_SDK=$(sdk_for_arch "$arch"); then
+ skip_test "$LABEL_SUITE/$label-sysroot" "no $arch llvm-mingw UCRT sysroot"
+ continue
+ fi
+ if [ ! -r "$ARCH_SDK/include/windows.h" ] ||
+ [ ! -r "$ARCH_SDK/lib/libucrt.a" ]; then
+ skip_test "$LABEL_SUITE/$label-sysroot" "invalid UCRT llvm-mingw sysroot: $ARCH_SDK"
+ continue
+ fi
+ # Build at -O1, inspect the PE imports, then hand the exe to the VM runner.
+ ran=1
+ exe=$work/o1-abi-$arch.exe
+ dump=$work/o1-abi-$arch.dump
+ run_ok "$label-o1-abi-build" "$KIT" cc -target "$target" --sysroot "$ARCH_SDK" \
+ -O1 "$ABI_C" -o "$exe"
+ if [ -f "$exe" ]; then
+ run_ok "$label-o1-abi-objdump" "$KIT" objdump -p "$exe"
+ if [ -s "$work/$label-o1-abi-objdump.out" ]; then
+ cp "$work/$label-o1-abi-objdump.out" "$dump"
+ no_legacy_crt_imports "$label-o1-abi-no-legacy-crt" "$dump"
+ contains "$label-o1-abi-qsort-import" "$dump" "Name: qsort"
+ fi
+ run_vm_if_available "$label-o1-abi" "$label" "$exe"
+ fi
+done
+
+if [ "$ran" -eq 0 ]; then
+ skip_test "$LABEL_SUITE" "set KIT_SYSROOT or install llvm-mingw UCRT under /tmp/llvm-mingw*"
+fi
+
+kit_summary "$LABEL_SUITE"
+kit_exit
diff --git a/test/parse/cases/asm_02_file_scope.c b/test/parse/cases/asm_02_file_scope.c
@@ -14,5 +14,14 @@ asm(".data\n"
".text\n");
extern int global_asm_before;
+extern int global_asm_tail;
-int test_main(void) { return global_asm_before + global_asm_after; }
+int test_main(void) {
+ return global_asm_before + global_asm_after + global_asm_tail;
+}
+
+asm(".data\n"
+ ".globl global_asm_tail\n"
+ "global_asm_tail:\n"
+ ".word 0\n"
+ ".text\n");
diff --git a/test/parse/cases/gnu_inline_control_flow.c b/test/parse/cases/gnu_inline_control_flow.c
@@ -0,0 +1,36 @@
+/* A GNU89 `extern inline` definition is parsed and semantically validated but
+ * emits no out-of-line code (codegen suppressed). Control-flow statements in
+ * such a body must do their break/continue/case bookkeeping without opening a
+ * CG function — regression for pcg_label_new and the while/switch suppressed
+ * paths, which mint dummy labels under suppression instead of calling the CG
+ * label ops. The suppressed body is intentionally not referenced (an extern
+ * inline has no external definition to link against); test_main stands alone. */
+extern inline int suppressed(int n) {
+ int s = 0;
+ while (n > 0) {
+ if (n == 3) {
+ n--;
+ continue;
+ }
+ switch (n) {
+ case 1:
+ s += 1;
+ break;
+ case 2:
+ s += 2;
+ break;
+ default:
+ s += n;
+ break;
+ }
+ n--;
+ }
+ for (int i = 0; i < 4; i++) {
+ if (i == 1) continue;
+ if (i == 3) break;
+ s += i;
+ }
+ return s;
+}
+
+int test_main(void) { return 7; }
diff --git a/test/parse/cases/gnu_inline_control_flow.expected b/test/parse/cases/gnu_inline_control_flow.expected
@@ -0,0 +1 @@
+7