kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 61a4a4c6f662dced2a9394cb09ee41c1bca529c2
parent 4c4f1db31be66b1fab039b0a1fa8c4f8ab2bdd6f
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu,  4 Jun 2026 21:51:14 -0700

Fix Windows x64/arm64 variadic ABI at O0 and O1

aarch64-windows variadics were broken in both the direct (O0) and
optimizer (O1) paths, and x64 Win64 va_start miscompiled at O1. The
no-VM smoke (build + objdump) hid all of it; these are runtime bugs.

- aarch64-windows: route variadic FP args through GP registers (the
  classifier already remapped named params; aa_param_abi now does the
  same for trailing `...` args) and home x0..x7 into a GP "home area"
  at the top of the frame, contiguous with the incoming stack args, so
  the plain-pointer va_list walks register- then stack-passed varargs
  as one block. Driven by ABIVaListInfo.gp_reg_count; forces the fat
  top-record frame for Windows variadic functions only (non-variadic
  and non-Windows codegen is untouched).
- x64 Win64: declare RAX clobbered by va_start via the machine-op
  clobber mechanism (new NATIVE_MOP_VA_START), so the allocator stops
  keeping a live value (e.g. a return-coalesced loop accumulator) in
  RAX across the op.

Verified on the Win11 ARM64 VM (aarch64 native + x64 emulated) at O0
and O1: all three COFF Windows smokes pass, plus new coverage for
loop/overflow varargs, named-FP varargs, va_copy, and RSI/RDI +
XMM6-15 preservation. No regressions across cg/opt/isa/parse/asm/pp/
debug/dwarf/smoke/libc, Apple-ARM64, or SysV varargs.

Also fold in the in-flight frontend cleanups owned by this change
(extern-inline suppressed-parse dummy labels; drop the bogus
local-const memory boundary on file-scope asm) and add their missing
test coverage.

Diffstat:
M lang/c/parse/cg_adapter.c | 14 +++++++++-----
M lang/c/parse/parse_stmt.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++------
M mk/test.mk | 6 ++++++
M src/abi/abi_aapcs64_windows.c | 6 +++++-
M src/arch/aa64/native.c | 128 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
M src/arch/native_target.h | 36 ++++++++++++++++++++++++++++++++----
M src/arch/x64/native.c | 22 +++++++++++++++++++---
M src/cg/asm.c | 1 -
M src/cg/native_asm.c | 9 ++++-----
M src/cg/native_asm.h | 4 +---
M src/cg/native_direct_target.c | 10 ++++++----
M src/opt/pass_machinize.c | 14 ++++++++++----
M src/opt/pass_native_emit.c | 4 +++-
A test/coff/windows-o1-abi-smoke.sh | 434 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M test/parse/cases/asm_02_file_scope.c | 11 ++++++++++-
A test/parse/cases/gnu_inline_control_flow.c | 36 ++++++++++++++++++++++++++++++++++++
A test/parse/cases/gnu_inline_control_flow.expected | 1 +
17 files changed, 732 insertions(+), 58 deletions(-)

diff --git a/lang/c/parse/cg_adapter.c b/lang/c/parse/cg_adapter.c @@ -607,11 +607,15 @@ void pcg_computed_goto(Parser* p, const CGLabel* targets, u32 ntargets) { /* ---- Control flow ---- * - * Labels are pure bookkeeping handles, so pcg_label_new always mints one (it - * has no codegen side effect to suppress). The placement / jump / branch ops - * gate on emit; the conditional branches also pop the tested value off the - * type stack to mirror the CG-side consume. */ -CGLabel pcg_label_new(Parser* p) { return kit_cg_label_new(p->cg); } + * Label placement / jump / branch ops gate on emit; the conditional branches + * also pop the tested value off the type stack to mirror the CG-side consume. + * Suppressed parses, such as C99 `extern inline` bodies, do not open a CG + * function, so they use a nonzero dummy label only for semantic bookkeeping + * around break/continue/case validation. */ +CGLabel pcg_label_new(Parser* p) { + if (!pcg_emit_enabled(p)) return (CGLabel)1; + return kit_cg_label_new(p->cg); +} void pcg_label_place(Parser* p, CGLabel l) { if (pcg_emit_enabled(p)) kit_cg_label_place(p->cg, l); diff --git a/lang/c/parse/parse_stmt.c b/lang/c/parse/parse_stmt.c @@ -81,11 +81,21 @@ static void parse_while_stmt(Parser* p) { * /parse_continue/etc. keep using their existing raw `pcg_jump` calls — * the C target recognizes the labels as the innermost scope's * boundaries and emits the structured keywords on its own. 
*/ - KitCgScope scope = kit_cg_scope_begin(p->cg, KIT_CG_TYPE_NONE); - CGLabel L_top = kit_cg_scope_continue_label(p->cg, scope); - CGLabel L_end = kit_cg_scope_break_label(p->cg, scope); CGLabel saved_break = p->cur_break; CGLabel saved_continue = p->cur_continue; + KitCgScope scope; + CGLabel L_top; + CGLabel L_end; + int emit = pcg_emit_enabled(p); + if (emit) { + scope = kit_cg_scope_begin(p->cg, KIT_CG_TYPE_NONE); + L_top = kit_cg_scope_continue_label(p->cg, scope); + L_end = kit_cg_scope_break_label(p->cg, scope); + } else { + scope = 0; + L_top = (CGLabel)1; + L_end = (CGLabel)1; + } expect_punct(p, '(', "'('"); parse_expr(p); to_rvalue(p); @@ -93,6 +103,15 @@ static void parse_while_stmt(Parser* p) { perr(p, "while condition requires scalar type"); } expect_punct(p, ')', "')'"); + if (!emit) { + pcg_drop(p); + p->cur_break = L_end; + p->cur_continue = L_top; + parse_stmt(p); + p->cur_break = saved_break; + p->cur_continue = saved_continue; + return; + } pcg_branch_false(p, L_end); p->cur_break = L_end; p->cur_continue = L_top; @@ -378,18 +397,29 @@ static void parse_switch_stmt(Parser* p) { * chain (unchanged behaviour) and which the C target overrides to * emit a real `switch (sel) { case V: goto L_V; …; default: goto * L_def; }`. 
*/ - KitCgScope scope = kit_cg_scope_begin(p->cg, KIT_CG_TYPE_NONE); - CGLabel L_dispatch = pcg_label_new(p); - CGLabel L_end = kit_cg_scope_break_label(p->cg, scope); CGLabel saved_break = p->cur_break; SwitchCtx ctx; SwitchCtx* saved_switch = p->cur_switch; + KitCgScope scope; + CGLabel L_dispatch; + CGLabel L_end; + int emit = pcg_emit_enabled(p); FrameSlotDesc fsd; const Type* vty; CaseEntry* it; CaseEntry* prev; CaseEntry* head; + if (emit) { + scope = kit_cg_scope_begin(p->cg, KIT_CG_TYPE_NONE); + L_dispatch = pcg_label_new(p); + L_end = kit_cg_scope_break_label(p->cg, scope); + } else { + scope = 0; + L_dispatch = (CGLabel)1; + L_end = (CGLabel)1; + } + expect_punct(p, '(', "'('"); parse_expr(p); to_rvalue(p); @@ -413,6 +443,18 @@ static void parse_switch_stmt(Parser* p) { } expect_punct(p, ')', "')' after switch expression"); + if (!emit) { + pcg_drop(p); + memset(&ctx, 0, sizeof ctx); + ctx.parent = saved_switch; + p->cur_switch = &ctx; + p->cur_break = L_end; + parse_stmt(p); + p->cur_break = saved_break; + p->cur_switch = saved_switch; + return; + } + memset(&ctx, 0, sizeof ctx); memset(&fsd, 0, sizeof fsd); fsd.type = vty; diff --git a/mk/test.mk b/mk/test.mk @@ -59,6 +59,7 @@ TEST_TARGETS = \ test-cg-api \ test-coff \ test-coff-mingw-import \ + test-coff-windows-o1-abi \ test-coff-windows-ucrt \ test-debug \ test-dbg \ @@ -685,6 +686,10 @@ test-coff-mingw-import: lib $(COFF_IMPORT_MINGW_BIN) test-coff-windows-ucrt: bin rt-x86_64-pc-windows rt-aarch64-windows windows-ucrt-sysroots KIT_SYSROOT=$(abspath build/llvm-mingw/20260602/ucrt) bash test/coff/windows-ucrt-hosted-smoke.sh KIT_SYSROOT=$(abspath build/llvm-mingw/20260602/ucrt) bash test/coff/windows-system-dlls-smoke.sh + KIT_SYSROOT=$(abspath build/llvm-mingw/20260602/ucrt) bash test/coff/windows-o1-abi-smoke.sh + +test-coff-windows-o1-abi: bin rt-x86_64-pc-windows rt-aarch64-windows windows-ucrt-sysroots + KIT_SYSROOT=$(abspath build/llvm-mingw/20260602/ucrt) bash test/coff/windows-o1-abi-smoke.sh 
# Opt-in: run the COFF/PE hosted smokes against a real Windows 11 ARM64 VM, so # their per-program run lanes execute for real instead of self-skipping. On @@ -707,6 +712,7 @@ test-coff-windows-vm: bin rt-x86_64-pc-windows rt-aarch64-windows windows-ucrt-s bash scripts/windows_vm.sh wait-ssh 600 $(_WIN_VM_ENV) bash test/coff/windows-ucrt-hosted-smoke.sh $(_WIN_VM_ENV) bash test/coff/windows-system-dlls-smoke.sh + $(_WIN_VM_ENV) bash test/coff/windows-o1-abi-smoke.sh # The parse/asm/macho harnesses select a cross-target via KIT_TEST_ARCH # (default aa64); the link rt dependency is resolved through the shared diff --git a/src/abi/abi_aapcs64_windows.c b/src/abi/abi_aapcs64_windows.c @@ -64,5 +64,9 @@ const ABIVtable aapcs64_windows_vtable = { .compute_func_info = aapcs64_windows_compute_func_info, .va_list_info = {8, 8, ABI_SC_PTR, 0, 0, 0}, .va_list_layout = {.type = {8, 8, ABI_SC_PTR, 0, 0, 0}, - .kind = ABI_VA_LIST_POINTER}, + .kind = ABI_VA_LIST_POINTER, + .gp_reg_count = 8, + .fp_reg_count = 0, + .gp_slot_size = 8, + .fp_slot_size = 0}, }; diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c @@ -130,14 +130,22 @@ typedef struct AAFrameLayout { u32 slot_bytes; /* sum of aa_frame_slot reservations (callee-saves + locals * + spills + sret/variadic) */ u32 out_stack; /* max outgoing-arg bytes across all calls in this function */ - u32 frame_size; /* align16(AA_FRAME_SAVE_SIZE + slot_bytes + out_stack) */ + u32 top_home; /* Windows-variadic GP register home area, reserved between + * the saved pair and the incoming stack args so the + * plain-pointer va_list walks register then stack varargs as + * one contiguous block (0 on every other ABI). 
*/ + u32 frame_size; /* align16(AA_FRAME_SAVE_SIZE + top_home + slot_bytes + + * out_stack) */ } AAFrameLayout; -static inline AAFrameLayout aa_build_layout(u32 slot_bytes, u32 out_stack) { +static inline AAFrameLayout aa_build_layout(u32 slot_bytes, u32 out_stack, + u32 top_home) { AAFrameLayout L; L.slot_bytes = slot_bytes; L.out_stack = out_stack; - L.frame_size = align_up_u32(AA_FRAME_SAVE_SIZE + slot_bytes + out_stack, 16u); + L.top_home = top_home; + L.frame_size = + align_up_u32(AA_FRAME_SAVE_SIZE + top_home + slot_bytes + out_stack, 16u); return L; } @@ -152,7 +160,7 @@ static inline i32 aa_fp_off_saved_lr(void) { return 8; } /* SP-relative byte offsets. */ static inline i32 aa_sp_off_out_arg(u32 byte_off) { return (i32)byte_off; } static inline u32 aa_sp_off_saved_pair(const AAFrameLayout* L) { - return L->frame_size - AA_FRAME_SAVE_SIZE; + return L->frame_size - AA_FRAME_SAVE_SIZE - L->top_home; } /* Frame slots and callee-save records are owned by the shared NativeFrame @@ -195,6 +203,11 @@ typedef struct AANativeTarget { * and unread on the single-pass path, which never sets fp_at_bottom. */ u32 frame_size_final; u32 incoming_stack_size; + /* Windows-variadic GP register home area size (gp_reg_count * gp_slot_size, + * 64 today; 0 on every other ABI). When nonzero the function takes the fat + * top-record layout and homes x0..x7 into [fp + AA_FRAME_SAVE_SIZE ..] so the + * plain-pointer va_list can walk register then stack varargs contiguously. */ + u32 top_home_bytes; u32 next_param_int; u32 next_param_fp; u32 next_param_stack; @@ -254,7 +267,10 @@ static AANativeTarget* aa_of(NativeTarget* t) { return (AANativeTarget*)t; } * record since frame_size = align16(16+cum_off) >= 16+cum_off). * CFA = fp+frame_size. */ static inline i32 aa_fp_off_in_arg(const AANativeTarget* a, u32 byte_off) { - u32 base = a->fp_at_bottom ? 
a->frame_size_final : AA_FRAME_SAVE_SIZE; + /* top-record incoming args sit above the saved pair and the (usually empty) + * Windows-variadic GP home area; bottom-record never carries a home area. */ + u32 base = a->fp_at_bottom ? a->frame_size_final + : AA_FRAME_SAVE_SIZE + a->top_home_bytes; return (i32)(base + byte_off); } static inline i32 aa_fp_off_slot(const AANativeTarget* a, u32 slot_off) { @@ -272,7 +288,17 @@ static inline i32 aa_fp_off_tail_out_arg(const AANativeTarget* a, * fp+frame_size bottom-record). Named so the CFI emit site stays layout-blind. */ static inline i32 aa_cfa_off(const AANativeTarget* a) { - return a->fp_at_bottom ? (i32)a->frame_size_final : (i32)AA_FRAME_SAVE_SIZE; + return a->fp_at_bottom + ? (i32)a->frame_size_final + : (i32)(AA_FRAME_SAVE_SIZE + a->top_home_bytes); +} + +/* fp-relative offset of GP home slot `i` (Windows variadic only). The home area + * sits just above the saved pair and just below the incoming stack args, so + * slot gp_reg_count coincides with incoming-arg byte 0 (top-record only — a + * function with a home area never takes a slim/bottom layout). */ +static inline i32 aa_fp_off_home_slot(u32 i) { + return (i32)(AA_FRAME_SAVE_SIZE + i * 8u); } static void aa_panic(AANativeTarget* a, const char* msg) { @@ -1023,13 +1049,34 @@ static void aa_reserve_variadic_reg_saves(AANativeTarget* a) { a->va_vr_slot = a->base.frame_slot(&a->base, &sd); } -/* Emit the stores into the variadic register-save area. Slots must already be - * reserved (aa_reserve_variadic_reg_saves). */ +/* Emit the stores into the variadic register-save area. For AAPCS64 these land + * in the reserved gr/vr frame slots (aa_reserve_variadic_reg_saves); for the + * Windows GP home area they land in [fp + AA_FRAME_SAVE_SIZE ..], the + * top-of-frame block contiguous with the incoming stack args. 
*/ static void aa_emit_variadic_reg_save_stores(AANativeTarget* a) { NativeAddr addr; MemAccess mem; KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64); ABIVaListInfo vai = abi_va_list_layout(a->base.c->abi); + if (vai.kind == ABI_VA_LIST_POINTER && a->top_home_bytes) { + /* Windows: home x0..x{gp_reg_count-1} so the plain-pointer va_list walks + * register then stack varargs as one block. The named leading registers are + * homed too (harmless): va_start skips past them. */ + memset(&mem, 0, sizeof mem); + mem.type = i64; + mem.size = 8; + mem.align = 8; + memset(&addr, 0, sizeof addr); + addr.base_kind = NATIVE_ADDR_BASE_REG; + addr.base.reg = AA_FP; + addr.base_type = i64; + for (u32 r = 0; r < vai.gp_reg_count && r < 8u; ++r) { + NativeLoc src = native_loc_reg(i64, NATIVE_REG_INT, r); + addr.offset = aa_fp_off_home_slot(r); + aa_emit_mem(a, 0, src, addr, mem); + } + return; + } if (vai.kind != ABI_VA_LIST_AAPCS64) return; memset(&mem, 0, sizeof mem); mem.type = i64; @@ -1080,6 +1127,18 @@ static void aa_func_begin_common(NativeTarget* t, const CGFuncDesc* fd) { a->slim_small_frame = 0; a->fp_at_bottom = 0; a->frame_size_final = 0; + /* Windows variadic functions reserve a GP register home area at the top of + * the frame (just below the incoming stack args). The plain-pointer va_list + * then walks register-passed then stack-passed varargs as one block. Other + * ABIs leave gp_reg_count 0 here: Apple ARM64 routes all varargs to the + * stack, AAPCS64 uses a struct va_list with separate reg-save pointers. */ + { + const ABIFuncInfo* fi = abi_cg_func_info(t->c->abi, fd->fn_type); + ABIVaListInfo vai = abi_va_list_layout(t->c->abi); + a->top_home_bytes = (fi && fi->variadic && vai.kind == ABI_VA_LIST_POINTER) + ? 
vai.gp_reg_count * vai.gp_slot_size + : 0u; + } mc->set_section(mc, fd->text_section_id); mc->emit_align(mc, 4, 0); a->func_start = mc->pos(mc); @@ -1288,11 +1347,13 @@ static void aa_words_restore_frame(AANativeTarget* a, u32* words, u32 cap, return; } if (*n + 3u > cap) aa_panic(a, "instruction patch too small"); - /* AAPCS64: fp is the saved-pair address. Reload pair from [fp], then - * restore sp to fp + AA_FRAME_SAVE_SIZE (= caller's original sp = CFA). */ + /* AAPCS64: fp is the saved-pair address. Reload pair from [fp], then restore + * sp to fp + CFA-offset (= caller's original sp = CFA). The CFA offset is + * AA_FRAME_SAVE_SIZE normally, plus the Windows-variadic GP home area when + * present. */ words[(*n)++] = aa64_add_imm(1, AA_TMP0, AA_FP, 0, 0); words[(*n)++] = aa64_ldp64_soff(AA_FP, AA_LR, AA_TMP0, 0); - words[(*n)++] = aa64_add_imm(1, AA_SP, AA_TMP0, AA_FRAME_SAVE_SIZE, 0); + words[(*n)++] = aa64_add_imm(1, AA_SP, AA_TMP0, (u32)aa_cfa_off(a), 0); } /* Emit callee-save store (save=1) or restore (save=0) words into `words`, @@ -1488,7 +1549,8 @@ static void aa_apply_patches(AANativeTarget* a, const AAFrameLayout* L) { static void aa_func_end(NativeTarget* t) { AANativeTarget* a = aa_of(t); MCEmitter* mc = t->mc; - AAFrameLayout L = aa_build_layout(a->frame.cum_off, a->frame.max_outgoing); + AAFrameLayout L = + aa_build_layout(a->frame.cum_off, a->frame.max_outgoing, a->top_home_bytes); /* known_frame (optimizer): prologue, allocas, and tail epilogues were emitted * final and slim eligibility was settled in aa_func_begin_known_frame — there * is nothing to patch. Single-pass (NDT): a worst-case prologue region was @@ -1646,7 +1708,8 @@ static void aa_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd, * known, so the prologue immediates and slim-form choice are settled here. * frame_size_final must be set before aa_build_prologue_words / entry saves, * since the bottom-record offset helpers read it. 
*/ - L = aa_build_layout(a->frame.cum_off, a->frame.max_outgoing); + L = aa_build_layout(a->frame.cum_off, a->frame.max_outgoing, + a->top_home_bytes); a->frame_size_final = L.frame_size; /* Slim Tier A: no callee-saves, no alloca, no body slots, no outgoing stack * args — the whole frame is the 16-byte record. fp_at_bottom: a small frame @@ -1654,14 +1717,18 @@ static void aa_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd, * the bottom (fp = sp) so sp adjustment folds into the pre/post-indexed * stp/ldp (frame_size <= 504 keeps the post-index ldp imm in range). * Otherwise slim_small_frame keeps the top-record layout but skips the - * x17/x10 scratch (out_stack>0 small frames land here). (See aa_func_end for - * the single-pass path, which never takes any slim form.) */ + * x17/x10 scratch (out_stack>0 small frames land here). A Windows-variadic + * home area forces the fat top-record layout: it lives above the saved pair, + * which neither the slim forms (saved pair at the very top) nor the + * bottom-record (saved pair at the very bottom) leave room for. (See + * aa_func_end for the single-pass path, which never takes any slim form.) 
*/ a->slim_prologue = a->frame.ncallee_saves == 0 && !a->frame.has_alloca && - L.slot_bytes == 0 && L.out_stack == 0; + L.slot_bytes == 0 && L.out_stack == 0 && !a->top_home_bytes; a->fp_at_bottom = !a->slim_prologue && !a->frame.has_alloca && - L.out_stack == 0 && L.frame_size <= 504u; + L.out_stack == 0 && L.frame_size <= 504u && + !a->top_home_bytes; a->slim_small_frame = !a->slim_prologue && !a->fp_at_bottom && - !a->frame.has_alloca && + !a->frame.has_alloca && !a->top_home_bytes && aa_sp_off_saved_pair(&L) <= 504u; n = aa_build_prologue_words(a, &L, words, AA_PROLOGUE_WORDS); for (u32 i = 0; i < n; ++i) aa_emit32(t->mc, words[i]); @@ -2516,8 +2583,15 @@ static const ABIArgInfo* aa_param_abi(NativeTarget* t, const ABIFuncInfo* abi, scratch->flags = ABI_AF_NONE; scratch->nparts = 1; scratch->parts = arena_zarray(t->c->tu, ABIArgPart, 1); + /* Windows ARM64 routes variadic floating-point arguments through the integer + * registers/stack (the classifier's remap_fp_parts_to_int does the same for + * the *named* params of a variadic function); the value's bit pattern moves + * via fmov x,d. Every other ABI keeps the `...` FP args in v registers. */ ((ABIArgPart*)scratch->parts)[0].cls = - cg_type_is_float(t->c, desc->args[i].type) ? ABI_CLASS_FP : ABI_CLASS_INT; + (cg_type_is_float(t->c, desc->args[i].type) && + t->c->target.os != KIT_OS_WINDOWS) + ? ABI_CLASS_FP + : ABI_CLASS_INT; ((ABIArgPart*)scratch->parts)[0].loc = ABI_LOC_REG; ((ABIArgPart*)scratch->parts)[0].size = type_size32(t, desc->args[i].type); ((ABIArgPart*)scratch->parts)[0].align = type_align32(t, desc->args[i].type); @@ -2872,7 +2946,8 @@ static void aa_emit_tail_site(NativeTarget* t, NativeLoc callee) { /* Frame is final: emit the tail epilogue (callee restores + frame restore + * branch) directly, exactly the words aa_apply_patches would patch in but * without the reserved NOP padding. 
*/ - AAFrameLayout L = aa_build_layout(a->frame.cum_off, a->frame.max_outgoing); + AAFrameLayout L = + aa_build_layout(a->frame.cum_off, a->frame.max_outgoing, a->top_home_bytes); u32 words[AA_TAIL_WORDS]; u32 n = 0; aa_words_callee_restores(a, words, AA_TAIL_WORDS, &n); @@ -4074,6 +4149,19 @@ static void aa_va_start_core(AANativeTarget* a, NativeAddr ap) { NativeLoc ptr = native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, AA_TMP0); if (vai.kind == ABI_VA_LIST_POINTER) { + if (a->top_home_bytes) { + /* Windows: `va_list = &<first vararg>` inside the contiguous + * [GP home area | incoming stack args] block. Named args consume the + * leading slots; next_param_int (FP params remapped to GP included) plus + * next_param_stack locate the first unnamed slot. Home slot + * gp_reg_count coincides with incoming-arg byte 0, so a single formula + * spans both regions. */ + i32 off = + aa_fp_off_home_slot(a->next_param_int) + (i32)a->next_param_stack; + aa_emit_add_imm(a, AA_TMP0, AA_FP, off); + aa_emit_mem(a, 0, ptr, ap, aa_mem_for_type(t, ptr.type, 8)); + return; + } /* `va_list = &<first vararg>`. Variadic stack args follow the fixed * incoming params in the same caller window. Apple ARM64 compact fixed * stack args may leave this cursor at +4, while the first variadic slot diff --git a/src/arch/native_target.h b/src/arch/native_target.h @@ -272,6 +272,7 @@ typedef struct NativeFramePatchState { * the descriptor keeps the backend from depending on the optimizer IR. */ typedef enum NativeMachineOpKind { NATIVE_MOP_BINOP, + NATIVE_MOP_VA_START, NATIVE_MOP_VA_ARG, NATIVE_MOP_ATOMIC_CAS, NATIVE_MOP_ATOMIC_RMW, @@ -370,12 +371,13 @@ struct NativeTarget { * up front. */ void (*reserve_callee_saves)(NativeTarget*, const u32* used_by_class, u32 nclasses); - /* Optional live-ABI callee-saved register mask for a class. Static + /* Optional live-ABI caller/callee-saved register masks for a class. 
Static * NativeAllocClassInfo masks describe the target register file, but some * targets vary preservation rules by OS ABI (x64 SysV vs Win64 XMM regs). - * Direct emission uses this to decide which borrowed scratch/cache registers - * must be reported to reserve_callee_saves(). NULL falls back to - * NativeAllocClassInfo.callee_saved_mask. */ + * The optimizer and direct emission use these to keep allocation, call + * clobbers, and prologue save sets aligned with the selected ABI. NULL falls + * back to NativeAllocClassInfo.{caller,callee}_saved_mask. */ + u32 (*caller_saved_mask)(NativeTarget*, NativeAllocClass); u32 (*callee_saved_mask)(NativeTarget*, NativeAllocClass); /* Optional. When set, the optimizer emit path calls this once — after * func_begin, reserve_callee_saves, and frame-slot mapping, but before the @@ -519,6 +521,32 @@ struct NativeTarget { void (*destroy)(NativeTarget*); }; +static inline const NativeAllocClassInfo* +native_target_class_info(const NativeTarget* t, NativeAllocClass cls) { + if (!t || !t->regs) return NULL; + for (u32 i = 0; i < t->regs->nclasses; ++i) { + const NativeAllocClassInfo* ci = &t->regs->classes[i]; + if ((NativeAllocClass)ci->cls == cls) return ci; + } + return NULL; +} + +static inline u32 native_target_caller_saved_mask(NativeTarget* t, + NativeAllocClass cls) { + const NativeAllocClassInfo* ci; + if (t && t->caller_saved_mask) return t->caller_saved_mask(t, cls); + ci = native_target_class_info(t, cls); + return ci ? ci->caller_saved_mask : 0u; +} + +static inline u32 native_target_callee_saved_mask(NativeTarget* t, + NativeAllocClass cls) { + const NativeAllocClassInfo* ci; + if (t && t->callee_saved_mask) return t->callee_saved_mask(t, cls); + ci = native_target_class_info(t, cls); + return ci ? 
ci->callee_saved_mask : 0u; +} + static inline NativeLoc native_loc_none(void) { NativeLoc loc; memset(&loc, 0, sizeof loc); diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c @@ -1755,21 +1755,29 @@ static int x64_reg_is_callee_fp(const X64ABIRegs* abi, Reg r); static u32 x64_live_callee_saved_mask(NativeTarget* t, NativeAllocClass cls) { X64NativeTarget* a = x64_of(t); + const X64ABIRegs* abi = a->abi ? a->abi : x64_abi_for_os(t->c->target.os); u32 mask = 0; for (Reg r = 0; r < 16u; ++r) { - if (cls == NATIVE_REG_INT && x64_reg_is_callee_int(a->abi, r)) + if (cls == NATIVE_REG_INT && x64_reg_is_callee_int(abi, r)) mask |= 1u << r; - if (cls == NATIVE_REG_FP && x64_reg_is_callee_fp(a->abi, r)) + if (cls == NATIVE_REG_FP && x64_reg_is_callee_fp(abi, r)) mask |= 1u << r; } return mask; } +static u32 x64_live_caller_saved_mask(NativeTarget* t, + NativeAllocClass cls) { + const NativeAllocClassInfo* ci = native_target_class_info(t, cls); + if (!ci) return 0; + return ci->caller_saved_mask & ~x64_live_callee_saved_mask(t, cls); +} + static void x64_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers, u32 nclob, u32* int_mask, u32* fp_mask); /* abi_clobber_masks is shared as native_asm_abi_clobber_masks - * (cg/native_asm.h); it reads the masks from t->regs->classes. */ + * (cg/native_asm.h); it reads the target's live ABI masks. */ /* Build the callee-saved set the prologue must preserve: the allocator-assigned * callee-saved registers (frame->callee_saved_used) plus any an inline-asm @@ -4049,6 +4057,13 @@ static int x64_machine_op_clobbers(NativeTarget* t, const NativeMachineOp* op, mask[NATIVE_REG_INT] = (1u << X64_RAX) | (1u << X64_RCX) | (1u << X64_RDX); return 1; + case NATIVE_MOP_VA_START: + /* x64_va_start_core materializes the va_list field values through RAX + * (the ap pointer itself lands in the reserved r11 scratch). RAX is the + * return register, so the allocator may otherwise keep a live value there + * across the op. 
*/ + mask[NATIVE_REG_INT] = (1u << X64_RAX); + return 1; case NATIVE_MOP_VA_ARG: if (!op->result_is_fp) return 0; mask[NATIVE_REG_INT] = (1u << X64_RAX); @@ -4092,6 +4107,7 @@ NativeTarget* x64_native_target_new(Compiler* c, ObjBuilder* obj, /* Non-NULL so the optimizer emit path (plan_frame) computes the callee-saved * set; x64_func_begin_known_frame derives the records from the masks. */ t->reserve_callee_saves = x64_reserve_callee_saves; + t->caller_saved_mask = x64_live_caller_saved_mask; t->callee_saved_mask = x64_live_callee_saved_mask; t->signature_stack_bytes = x64_signature_stack_bytes; t->call_stack_bytes = x64_call_stack_bytes; diff --git a/src/cg/asm.c b/src/cg/asm.c @@ -324,7 +324,6 @@ void kit_cg_file_scope_asm(KitCg* g, KitSlice asm_source) { if (!g || !asm_source.s) return; if (g->check_only) return; if (g->target && g->target->file_scope_asm) { - api_local_const_memory_boundary(g); g->target->file_scope_asm(g->target, asm_source.s, asm_source.len); return; } diff --git a/src/cg/native_asm.c b/src/cg/native_asm.c @@ -37,16 +37,15 @@ int native_asm_match_index(const char* s) { void native_asm_abi_clobber_masks(NativeTarget* t, u32 abi_sets, u32* int_mask, u32* fp_mask) { - const NativeAllocClassInfo* classes = t->regs->classes; *int_mask = 0; *fp_mask = 0; if (abi_sets & KIT_CG_ASM_CLOBBER_ABI_CALLER_SAVED) { - *int_mask |= classes[NATIVE_REG_INT].caller_saved_mask; - *fp_mask |= classes[NATIVE_REG_FP].caller_saved_mask; + *int_mask |= native_target_caller_saved_mask(t, NATIVE_REG_INT); + *fp_mask |= native_target_caller_saved_mask(t, NATIVE_REG_FP); } if (abi_sets & KIT_CG_ASM_CLOBBER_ABI_CALLEE_SAVED) { - *int_mask |= classes[NATIVE_REG_INT].callee_saved_mask; - *fp_mask |= classes[NATIVE_REG_FP].callee_saved_mask; + *int_mask |= native_target_callee_saved_mask(t, NATIVE_REG_INT); + *fp_mask |= native_target_callee_saved_mask(t, NATIVE_REG_FP); } } diff --git a/src/cg/native_asm.h b/src/cg/native_asm.h @@ -40,9 +40,7 @@ int 
native_asm_constraint_early(const char* s); int native_asm_match_index(const char* s); /* Expand the arch-neutral clobber-ABI sets (KitCgAsmClobberAbiSet bits) into - * this target's per-class caller/callee-saved register masks, read straight - * from the target's register file (t->regs->classes). Byte-identical across the - * backends apart from which register table they consulted, so it lives here. */ + * this target's live per-class caller/callee-saved register masks. */ void native_asm_abi_clobber_masks(NativeTarget* t, u32 abi_sets, u32* int_mask, u32* fp_mask); diff --git a/src/cg/native_direct_target.c b/src/cg/native_direct_target.c @@ -109,9 +109,11 @@ static Reg nd_cache_reg_for(NativeDirectTarget* d, CGLocal local, static Reg nd_pick_cache_victim(NativeDirectTarget* d, NativeAllocClass cls); static u32 nd_callee_saved_mask(NativeDirectTarget* d, NativeAllocClass cls) { - if (d->native && d->native->callee_saved_mask) - return d->native->callee_saved_mask(d->native, cls); - return nd_class_info(d, cls)->callee_saved_mask; + return native_target_callee_saved_mask(d->native, cls); +} + +static u32 nd_caller_saved_mask(NativeDirectTarget* d, NativeAllocClass cls) { + return native_target_caller_saved_mask(d->native, cls); } static void nd_note_reg_used(NativeDirectTarget* d, NativeAllocClass cls, @@ -544,7 +546,7 @@ static Reg nd_pick_cache_victim(NativeDirectTarget* d, NativeAllocClass cls) { * path. 
*/ static Reg nd_cache_alloc(NativeDirectTarget* d, NativeAllocClass cls) { const NativeAllocClassInfo* ci = nd_class_info(d, cls); - u32 caller = ci->caller_saved_mask; + u32 caller = nd_caller_saved_mask(d, cls); Reg victim; for (u32 i = 0; i < ci->nallocable; ++i) { Reg r = ci->allocable[i]; diff --git a/src/opt/pass_machinize.c b/src/opt/pass_machinize.c @@ -88,11 +88,14 @@ static void machinize_prepare_insts(Func* f, NativeTarget* target) { } } -static void collect_class(Func* f, const NativeAllocClassInfo* ci) { +static void collect_class(Func* f, NativeTarget* target, + const NativeAllocClassInfo* ci) { u32 cls = ci->cls; if (cls >= OPT_REG_CLASSES) return; - f->opt_caller_saved[cls] = ci->caller_saved_mask; - f->opt_callee_saved[cls] = ci->callee_saved_mask; + f->opt_caller_saved[cls] = + native_target_caller_saved_mask(target, (NativeAllocClass)cls); + f->opt_callee_saved[cls] = + native_target_callee_saved_mask(target, (NativeAllocClass)cls); f->opt_reserved_regs[cls] = ci->reserved_mask; f->opt_arg_regs[cls] = ci->arg_mask; f->opt_ret_regs[cls] = ci->ret_mask; @@ -116,7 +119,7 @@ static void collect_class(Func* f, const NativeAllocClassInfo* ci) { static void machinize_collect_regs(Func* f, NativeTarget* target) { if (!target || !target->regs) return; for (u32 i = 0; i < target->regs->nclasses; ++i) - collect_class(f, &target->regs->classes[i]); + collect_class(f, target, &target->regs->classes[i]); } static void machinize_check_overlap(Func* f) { @@ -157,6 +160,9 @@ static void machinize_inst_clobbers(Func* f, NativeTarget* target) { mop.second_is_reg = (u8)(in->nopnds > 2u && in->opnds[2].kind == OPK_REG); break; + case IR_VA_START: + mop.kind = NATIVE_MOP_VA_START; + break; case IR_VA_ARG: mop.kind = NATIVE_MOP_VA_ARG; mop.result_is_fp = (u8)(in->nopnds > 0u && diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c @@ -1360,7 +1360,9 @@ static u32 compute_callee_saved_used(NativeEmitCtx* e, u32* used, u32 cap) { nclasses = ri->nclasses < 
cap ? ri->nclasses : cap; for (u32 i = 0; i < ri->nclasses; ++i) { const NativeAllocClassInfo* ci = &ri->classes[i]; - if (ci->cls < cap) used[ci->cls] &= ci->callee_saved_mask; + if (ci->cls < cap) + used[ci->cls] &= + native_target_callee_saved_mask(t, (NativeAllocClass)ci->cls); } return nclasses; } diff --git a/test/coff/windows-o1-abi-smoke.sh b/test/coff/windows-o1-abi-smoke.sh @@ -0,0 +1,434 @@ +#!/usr/bin/env bash +# test/coff/windows-o1-abi-smoke.sh - focused -O1 Windows ABI coverage for PE. +# +# Builds one optimized C program for x86_64-windows and aarch64-windows. The +# program deliberately exercises ABI shapes that are easy to regress in the +# optimizer: nonvolatile GPR/FP preservation across calls, mixed int/FP arg +# assignment, stack arguments, varargs, aggregate return/by-value passing, +# indirect callbacks, and a UCRT qsort callback. +set -u + +ROOT=${KIT_TEST_ROOT:-$(cd "$(dirname "$0")/../.." && pwd)} +KIT=${KIT:-"$ROOT/build/kit"} +SDK=${KIT_SYSROOT:-} + +KIT_KIT_DIR="$ROOT/test/lib" +. "$ROOT/test/lib/kit_sh_kit.sh" +kit_report_init + +LABEL_SUITE=windows-o1-abi-smoke + +find_sdk() { + local arch=$1 + local d + for d in \ + "$ROOT"/build/llvm-mingw/*/ucrt/"$arch"-w64-mingw32 \ + /tmp/llvm-mingw*/llvm-mingw-*-ucrt-*/"$arch"-w64-mingw32 \ + /tmp/llvm-mingw*/"$arch"-w64-mingw32 \ + /private/tmp/llvm-mingw*/llvm-mingw-*-ucrt-*/"$arch"-w64-mingw32 \ + /private/tmp/llvm-mingw*/"$arch"-w64-mingw32; do + if [ -d "$d/lib" ] && [ -r "$d/include/windows.h" ]; then + printf '%s\n' "$d" + return 0 + fi + done + return 1 +} + +sdk_for_arch() { + local arch=$1 + local base + if [ -n "$SDK" ]; then + if [ "$(basename "$SDK")" = "$arch-w64-mingw32" ]; then + printf '%s\n' "$SDK" + return 0 + fi + base=$(dirname "$SDK") + if [ -d "$base/$arch-w64-mingw32/lib" ] && + [ -r "$base/$arch-w64-mingw32/include/windows.h" ]; then + printf '%s\n' "$base/$arch-w64-mingw32" + return 0 + fi + fi + find_sdk "$arch" +} + +if [ ! 
-x "$KIT" ]; then + kit_fail "$LABEL_SUITE/kit-present" "kit binary not found: $KIT" + kit_summary "$LABEL_SUITE" + kit_exit +fi + +TMP=${TMPDIR:-/tmp} +work=$(mktemp -d "$TMP/kit-windows-o1-abi-smoke.XXXXXX") +trap 'rm -rf "$work"' EXIT + +ABI_C=$work/o1-abi.c + +cat >"$ABI_C" <<'SRC' +#include <stdarg.h> +#include <stdint.h> +#include <stdlib.h> + +typedef struct Pair { + uint64_t a; + uint64_t b; +} Pair; + +typedef struct Big { + uint64_t a; + uint64_t b; + uint64_t c; + uint64_t d; + uint64_t e; + double f; +} Big; + +static volatile double g_fp[16] = { + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, +}; + +static volatile uint64_t g_int[16] = { + 11u, 23u, 37u, 41u, 53u, 67u, 71u, 83u, + 97u, 101u, 109u, 113u, 127u, 131u, 137u, 149u, +}; + +int run_fp_probe(void); +int run_int_probe(void); + +__attribute__((noinline)) uint64_t opaque_u64(uint64_t x) { + return (x * 33u) ^ 0x123456789abcdef0ull; +} + +__attribute__((noinline)) void touch_fp_pressure(double seed) { + double a0 = g_fp[0] + seed; + double a1 = g_fp[1] + seed; + double a2 = g_fp[2] + seed; + double a3 = g_fp[3] + seed; + double a4 = g_fp[4] + seed; + double a5 = g_fp[5] + seed; + double a6 = g_fp[6] + seed; + double a7 = g_fp[7] + seed; + double a8 = g_fp[8] + seed; + double a9 = g_fp[9] + seed; + double a10 = g_fp[10] + seed; + double a11 = g_fp[11] + seed; + uint64_t k = opaque_u64((uint64_t)seed); + double sum = a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8 + a9 + a10 + a11; + sum += (double)(k & 255u); + g_fp[15] = sum; +} + +__attribute__((noinline)) void touch_int_pressure(uint64_t seed) { + uint64_t a0 = g_int[0] + seed; + uint64_t a1 = g_int[1] + seed; + uint64_t a2 = g_int[2] + seed; + uint64_t a3 = g_int[3] + seed; + uint64_t a4 = g_int[4] + seed; + uint64_t a5 = g_int[5] + seed; + uint64_t a6 = g_int[6] + seed; + uint64_t a7 = g_int[7] + seed; + uint64_t a8 = g_int[8] + seed; + uint64_t a9 = g_int[9] + seed; + uint64_t k = opaque_u64(seed + 7u); + 
g_int[15] = a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8 + a9 + k; +} + +#if defined(__x86_64__) +__asm__( + ".text\n" + ".globl run_fp_probe\n" + "run_fp_probe:\n" + "pushq %rbp\n" + "movq %rsp, %rbp\n" + "subq $0x30, %rsp\n" + "movaps %xmm6, -0x10(%rbp)\n" + "movq $0x405edd2f1a9fbe77, %rax\n" + "movq %rax, %xmm6\n" + "movq $0x401c000000000000, %rax\n" + "movq %rax, %xmm0\n" + "callq touch_fp_pressure\n" + "movq %xmm6, %rax\n" + "movq $0x405edd2f1a9fbe77, %rcx\n" + "cmpq %rcx, %rax\n" + "sete %al\n" + "movzbl %al, %eax\n" + "movaps -0x10(%rbp), %xmm6\n" + "leave\n" + "retq\n" + ".globl run_int_probe\n" + "run_int_probe:\n" + "pushq %rbp\n" + "movq %rsp, %rbp\n" + "subq $0x30, %rsp\n" + "movq %r13, -0x8(%rbp)\n" + "movq $0xfedcba9876543210, %r13\n" + "movq $5, %rcx\n" + "callq touch_int_pressure\n" + "movq %r13, %rax\n" + "movq $0xfedcba9876543210, %rcx\n" + "cmpq %rcx, %rax\n" + "sete %al\n" + "movzbl %al, %eax\n" + "movq -0x8(%rbp), %r13\n" + "leave\n" + "retq\n"); +#elif defined(__aarch64__) +__asm__( + ".text\n" + ".globl run_fp_probe\n" + "run_fp_probe:\n" + "stp x29, x30, [sp, #-32]!\n" + "mov x29, sp\n" + "str d8, [sp, #16]\n" + "mov x9, #0xbe77\n" + "movk x9, #0x1a9f, lsl #16\n" + "movk x9, #0xdd2f, lsl #32\n" + "movk x9, #0x405e, lsl #48\n" + "fmov d8, x9\n" + "mov x9, #0\n" + "movk x9, #0x401c, lsl #48\n" + "fmov d0, x9\n" + "bl touch_fp_pressure\n" + "fmov x10, d8\n" + "mov x9, #0xbe77\n" + "movk x9, #0x1a9f, lsl #16\n" + "movk x9, #0xdd2f, lsl #32\n" + "movk x9, #0x405e, lsl #48\n" + "cmp x10, x9\n" + "cset w0, eq\n" + "ldr d8, [sp, #16]\n" + "ldp x29, x30, [sp], #32\n" + "ret\n" + ".globl run_int_probe\n" + "run_int_probe:\n" + "stp x29, x30, [sp, #-32]!\n" + "mov x29, sp\n" + "str x19, [sp, #16]\n" + "mov x19, #0x3210\n" + "movk x19, #0x7654, lsl #16\n" + "movk x19, #0xba98, lsl #32\n" + "movk x19, #0xfedc, lsl #48\n" + "mov x0, #5\n" + "bl touch_int_pressure\n" + "mov x10, x19\n" + "mov x9, #0x3210\n" + "movk x9, #0x7654, lsl #16\n" + "movk x9, 
#0xba98, lsl #32\n" + "movk x9, #0xfedc, lsl #48\n" + "cmp x10, x9\n" + "cset w0, eq\n" + "ldr x19, [sp, #16]\n" + "ldp x29, x30, [sp], #32\n" + "ret\n"); +#else +#error unsupported arch +#endif + +__attribute__((noinline)) uint64_t mixed_args(int a, uint64_t b, double c, + float d, int e, double f, + uint64_t g, int h, double i, + uint64_t j) { + return (uint64_t)a + b + (uint64_t)(c * 10.0) + (uint64_t)(d * 10.0f) + + (uint64_t)e + (uint64_t)(f * 10.0) + g + (uint64_t)h + + (uint64_t)(i * 10.0) + j; +} + +__attribute__((noinline)) uint64_t stack_args(uint64_t a1, uint64_t a2, + uint64_t a3, uint64_t a4, + uint64_t a5, uint64_t a6, + uint64_t a7, uint64_t a8, + uint64_t a9, uint64_t a10, + uint64_t a11, uint64_t a12) { + return a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8 + a9 + a10 + a11 + a12; +} + +__attribute__((noinline)) Pair make_pair(uint64_t base) { + Pair p; + p.a = base + 1u; + p.b = base + 2u; + return p; +} + +__attribute__((noinline)) Big make_big(uint64_t base) { + Big b; + b.a = base + 1u; + b.b = base + 2u; + b.c = base + 3u; + b.d = base + 4u; + b.e = base + 5u; + b.f = (double)(base + 6u) + 0.5; + return b; +} + +__attribute__((noinline)) uint64_t take_big(Big b, Pair p, uint64_t tail) { + return b.a + b.b * 2u + b.c * 3u + b.d * 4u + b.e * 5u + + (uint64_t)(b.f * 10.0) + p.a + p.b + tail; +} + +__attribute__((noinline)) uint64_t sum_varargs(int tag, ...) { + va_list ap; + uint64_t a; + double b; + uint64_t c; + int d; + double e; + va_start(ap, tag); + a = va_arg(ap, uint64_t); + b = va_arg(ap, double); + c = va_arg(ap, uint64_t); + d = va_arg(ap, int); + e = va_arg(ap, double); + va_end(ap); + return (uint64_t)tag + a + (uint64_t)(b * 10.0) + c + (uint64_t)d + + (uint64_t)(e * 10.0); +} + +/* Loop over a long, boundary-crossing variadic list mixing integer and + * floating-point arguments. 
Exercises three easy-to-regress shapes at once: + * the loop accumulator stays live across va_start (so no scratch register may + * clobber it), the >8 arguments overflow the register slots onto the stack (so + * the ARM64 GP home area must be contiguous with the incoming stack args), and + * the floating-point arguments route through integer slots on ARM64. */ +__attribute__((noinline)) uint64_t loop_varargs(int n, ...) { + va_list ap; + uint64_t sum = (uint64_t)n; + va_start(ap, n); + for (int i = 0; i < n; ++i) { + if (i & 1) + sum += (uint64_t)(va_arg(ap, double) * 2.0); + else + sum += va_arg(ap, uint64_t); + } + va_end(ap); + return sum; +} + +typedef uint64_t (*MixCallback)(int, double, uint64_t, double, int, int, + uint64_t); + +__attribute__((noinline)) uint64_t callback_impl(int a, double b, uint64_t c, + double d, int e, int f, + uint64_t g) { + return (uint64_t)a + (uint64_t)(b * 10.0) + c * 2u + + (uint64_t)(d * 10.0) + (uint64_t)e * 3u + (uint64_t)f * 5u + g; +} + +__attribute__((noinline)) uint64_t call_callback(MixCallback cb) { + return cb(3, 2.5, 7u, 1.5, 4, 5, 9u); +} + +static int cmp_u32(const void* a, const void* b) { + uint32_t aa = *(const uint32_t*)a; + uint32_t bb = *(const uint32_t*)b; + return (aa > bb) - (aa < bb); +} + +int main(void) { + uint32_t vals[6] = {9u, 1u, 7u, 3u, 5u, 2u}; + Pair p; + Big b; + if (!run_fp_probe()) return 10; + if (!(g_fp[15] > 0.0)) return 11; + if (!run_int_probe()) return 20; + if (g_int[15] == 0u) return 21; + if (mixed_args(1, 2u, 1.5, 2.5f, 3, 3.5, 4u, 5, 4.5, 6u) != 141u) + return 30; + if (stack_args(1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u) != 78u) + return 31; + p = make_pair(100u); + if (p.a != 101u || p.b != 102u) return 40; + b = make_big(10u); + if (take_big(b, p, 5u) != 578u) return 41; + if (sum_varargs(3, (uint64_t)10u, 2.5, (uint64_t)20u, 7, 1.5) != 80u) + return 50; + if (loop_varargs(10, (uint64_t)1, 2.0, (uint64_t)3, 4.0, (uint64_t)5, 6.0, + (uint64_t)7, 8.0, (uint64_t)9, 10.0) != 
95u) + return 51; + if (call_callback(callback_impl) != 103u) return 60; + qsort(vals, 6u, sizeof vals[0], cmp_u32); + if (vals[0] != 1u || vals[1] != 2u || vals[5] != 9u) return 70; + return 0; +} +SRC + +no_legacy_crt_imports() { + local name=$1 dump=$2 + if grep -Eiq 'DLL Name: (msvcrt|ucrt)\.dll' "$dump"; then + grep -Ei 'DLL Name: (msvcrt|ucrt)\.dll' "$dump" > "$work/$name.diag" + not_ok "$name" "$work/$name.diag" + else + ok "$name" + fi +} + +run_vm_if_available() { + local label=$1 arch=$2 exe=$3 + case "$arch" in + x64) + if [ -n "${KIT_WINDOWS_VM_X64:-${KIT_WINDOWS_VM_AMD64:-}}" ]; then + if "$ROOT/scripts/windows_vm.sh" run x64 "$exe" \ + > "$work/$label-vm.out" 2> "$work/$label-vm.err"; then + ok "$label-vm" + else + not_ok "$label-vm" "$work/$label-vm.err" + fi + else + skip_test "$label-vm" "KIT_WINDOWS_VM_X64 not set" + fi + ;; + aarch64) + if [ -n "${KIT_WINDOWS_VM_AARCH64:-${KIT_WINDOWS_VM_ARM64:-}}" ]; then + if "$ROOT/scripts/windows_vm.sh" run aarch64 "$exe" \ + > "$work/$label-vm.out" 2> "$work/$label-vm.err"; then + ok "$label-vm" + else + not_ok "$label-vm" "$work/$label-vm.err" + fi + else + skip_test "$label-vm" "KIT_WINDOWS_VM_AARCH64 not set" + fi + ;; + esac +} + +ran=0 +for arch in x86_64 aarch64; do + case "$arch" in + x86_64) target=x86_64-windows; label=x64 ;; + aarch64) target=aarch64-windows; label=aarch64 ;; + esac + + if ! ARCH_SDK=$(sdk_for_arch "$arch"); then + skip_test "$LABEL_SUITE/$label-sysroot" "no $arch llvm-mingw UCRT sysroot" + continue + fi + if [ ! -r "$ARCH_SDK/include/windows.h" ] || + [ ! 
-r "$ARCH_SDK/lib/libucrt.a" ]; then + skip_test "$LABEL_SUITE/$label-sysroot" "invalid UCRT llvm-mingw sysroot: $ARCH_SDK" + continue + fi + + ran=1 + exe=$work/o1-abi-$arch.exe + dump=$work/o1-abi-$arch.dump + run_ok "$label-o1-abi-build" "$KIT" cc -target "$target" --sysroot "$ARCH_SDK" \ + -O1 "$ABI_C" -o "$exe" + if [ -f "$exe" ]; then + run_ok "$label-o1-abi-objdump" "$KIT" objdump -p "$exe" + if [ -s "$work/$label-o1-abi-objdump.out" ]; then + cp "$work/$label-o1-abi-objdump.out" "$dump" + no_legacy_crt_imports "$label-o1-abi-no-legacy-crt" "$dump" + contains "$label-o1-abi-qsort-import" "$dump" "Name: qsort" + fi + run_vm_if_available "$label-o1-abi" "$label" "$exe" + fi +done + +if [ "$ran" -eq 0 ]; then + skip_test "$LABEL_SUITE" "set KIT_SYSROOT or install llvm-mingw UCRT under /tmp/llvm-mingw*" +fi + +kit_summary "$LABEL_SUITE" +kit_exit diff --git a/test/parse/cases/asm_02_file_scope.c b/test/parse/cases/asm_02_file_scope.c @@ -14,5 +14,14 @@ asm(".data\n" ".text\n"); extern int global_asm_before; +extern int global_asm_tail; -int test_main(void) { return global_asm_before + global_asm_after; } +int test_main(void) { + return global_asm_before + global_asm_after + global_asm_tail; +} + +asm(".data\n" + ".globl global_asm_tail\n" + "global_asm_tail:\n" + ".word 0\n" + ".text\n"); diff --git a/test/parse/cases/gnu_inline_control_flow.c b/test/parse/cases/gnu_inline_control_flow.c @@ -0,0 +1,36 @@ +/* A C99 `extern inline` definition is parsed and semantically validated but + * emits no out-of-line code (codegen suppressed). Control-flow statements in + * such a body must do their break/continue/case bookkeeping without opening a + * CG function — regression for pcg_label_new and the while/switch suppressed + * paths, which mint dummy labels under suppression instead of calling the CG + * label ops. The suppressed body is intentionally not referenced (an extern + * inline has no external definition to link against); test_main stands alone. 
*/ +extern inline int suppressed(int n) { + int s = 0; + while (n > 0) { + if (n == 3) { + n--; + continue; + } + switch (n) { + case 1: + s += 1; + break; + case 2: + s += 2; + break; + default: + s += n; + break; + } + n--; + } + for (int i = 0; i < 4; i++) { + if (i == 1) continue; + if (i == 3) break; + s += i; + } + return s; +} + +int test_main(void) { return 7; } diff --git a/test/parse/cases/gnu_inline_control_flow.expected b/test/parse/cases/gnu_inline_control_flow.expected @@ -0,0 +1 @@ +7