kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 6ea2c9a48aa1a8670eb9a9005cd63d8b7c002a28
parent 6ccc62a9ce1e2feed3f6d085b011571cccfc8447
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Wed,  3 Jun 2026 20:38:12 -0700

Fix Apple ARM64 variadic stack slots

Diffstat:
Msrc/abi/abi.h | 4++++
Msrc/abi/abi_apple_arm64.c | 11++++-------
Msrc/arch/aa64/native.c | 75++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
Mtest/api/abi_classify_test.c | 29+++++++++++++++++++++++++++++
Mtest/toy/cases/133_varargs_mixed_types.toy | 11+++++++++++
5 files changed, 106 insertions(+), 24 deletions(-)

diff --git a/src/abi/abi.h b/src/abi/abi.h @@ -128,6 +128,10 @@ typedef struct ABIFuncInfo { * Zero means the backend default. Apple ARM64 uses 4-byte compact slots for * stack arguments such as int32; AAPCS64 uses 8-byte slots. */ u8 stack_arg_min_align; + /* Minimum stack slot size/alignment for variadic arguments forced to the + * stack. Zero means use stack_arg_min_align/backend default. Apple ARM64 + * keeps fixed stack args compact but uses 8-byte variadic slots. */ + u8 vararg_stack_arg_min_align; u32 vararg_gp_offset; u32 vararg_fp_offset; u32 vararg_overflow_offset; diff --git a/src/abi/abi_apple_arm64.c b/src/abi/abi_apple_arm64.c @@ -15,14 +15,10 @@ * ABIFuncInfo.vararg_on_stack trait that the cg backend reads * when synthesizing variadic-arg ABIArgInfos. * - * 3. Stack-arg promotion — small integer arguments passed on the + * 3. Stack-arg promotion — fixed small integer arguments passed on the * stack are promoted to 4 bytes minimum (`char`/`short` occupy 4 - * stack bytes). Like (2), the divergence is in stack-slot - * assignment, not in classification, and lives in cg. The - * current kit cg path uses 8-byte stack stride for every arg, - * which is wider than either ABI requires but ABI-safe — the - * narrower Apple-specific layout becomes a concern only when - * cross-checking against clang-emitted callers/callees. */ + * stack bytes). Variadic arguments still occupy 8-byte slots so a + * plain `char*` va_list can advance uniformly. */ #include "abi/abi_internal.h" #include "core/core.h" @@ -40,6 +36,7 @@ static ABIFuncInfo* apple_arm64_compute_func_info(TargetABI* a, ABIFuncInfo* info = aapcs64_compute_func_info(a, fn); info->vararg_on_stack = 1; info->stack_arg_min_align = 4; + info->vararg_stack_arg_min_align = 8; return info; } diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c @@ -2525,28 +2525,53 @@ static const ABIArgInfo* aa_param_abi(NativeTarget* t, const ABIFuncInfo* abi, return scratch; } -/* Stack footprint of a single argument part. AAPCS64 uses 8-byte slots; - * Apple ARM64 uses compact 4-byte slots for stack-passed int32-sized values. */ +/* Stack footprint of a single argument part. AAPCS64 uses 8-byte slots. Apple + * ARM64 uses compact 4-byte slots for fixed stack-passed int32-sized values, + * but its forced stack variadics still use 8-byte slots. */ static u32 aa_stack_arg_min_align(const ABIFuncInfo* abi) { return (abi && abi->stack_arg_min_align) ? abi->stack_arg_min_align : 8u; } +static u32 aa_vararg_stack_arg_min_align(const ABIFuncInfo* abi) { + if (abi && abi->vararg_stack_arg_min_align) + return abi->vararg_stack_arg_min_align; + return aa_stack_arg_min_align(abi); +} + +static u32 aa_vararg_stack_start(const ABIFuncInfo* abi, u32 cursor) { + return align_up_u32(cursor, aa_vararg_stack_arg_min_align(abi)); +} + /* Natural stack alignment of a part, capped at 16 (binary128). */ -static u32 aa_part_stack_align(const ABIFuncInfo* abi, - const ABIArgPart* part) { - u32 min_align = aa_stack_arg_min_align(abi); +static u32 aa_part_stack_align_min(u32 min_align, const ABIArgPart* part) { u32 al = part->align ? part->align : 8u; if (al < min_align) al = min_align; if (al > 16u) al = 16u; return al; } +static u32 aa_part_stack_align(const ABIFuncInfo* abi, + const ABIArgPart* part) { + return aa_part_stack_align_min(aa_stack_arg_min_align(abi), part); +} + +static u32 aa_part_vararg_stack_align(const ABIFuncInfo* abi, + const ABIArgPart* part) { + return aa_part_stack_align_min(aa_vararg_stack_arg_min_align(abi), part); +} + static u32 aa_part_stack_size(const ABIFuncInfo* abi, const ABIArgPart* part) { return align_up_u32(part->size ? part->size : 8u, aa_part_stack_align(abi, part)); } +static u32 aa_part_vararg_stack_size(const ABIFuncInfo* abi, + const ABIArgPart* part) { + return align_up_u32(part->size ? part->size : 8u, + aa_part_vararg_stack_align(abi, part)); +} + /* The scalar type used to move one ABI part through a register. Aggregate * args/results are split into parts; each part must move at its own width, not * the (possibly >8-byte) aggregate width. */ @@ -2568,16 +2593,17 @@ static KitCgTypeId aa_part_scalar_type(const ABIArgPart* part) { } } -static u32 aa_class_stack_size(const ABIFuncInfo* abi, const ABIArgInfo* ai) { +static u32 aa_class_vararg_stack_size(const ABIFuncInfo* abi, + const ABIArgInfo* ai) { u32 total = 0; + u32 min_align = aa_vararg_stack_arg_min_align(abi); if (!ai || ai->kind == ABI_ARG_IGNORE) return 0; if (ai->kind == ABI_ARG_INDIRECT) return 8u; for (u32 p = 0; p < ai->nparts; ++p) { - total = align_up_u32(total, aa_part_stack_align(abi, &ai->parts[p])); - total += aa_part_stack_size(abi, &ai->parts[p]); + total = align_up_u32(total, aa_part_vararg_stack_align(abi, &ai->parts[p])); + total += aa_part_vararg_stack_size(abi, &ai->parts[p]); } - return align_up_u32(total ? total : aa_stack_arg_min_align(abi), - aa_stack_arg_min_align(abi)); + return align_up_u32(total ? total : min_align, min_align); } static u32 aa_call_stack_size(NativeTarget* t, const NativeCallDesc* desc) { @@ -2590,7 +2616,8 @@ static u32 aa_call_stack_size(NativeTarget* t, const NativeCallDesc* desc) { abi && abi->variadic && abi->vararg_on_stack && i >= abi->nparams; if (ai->kind == ABI_ARG_IGNORE) continue; if (force_stack) { - stack += aa_class_stack_size(abi, ai); + stack = aa_vararg_stack_start(abi, stack); + stack += aa_class_vararg_stack_size(abi, ai); continue; } if (ai->kind == ABI_ARG_INDIRECT) { @@ -2732,8 +2759,9 @@ static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc, if (force_stack) { NativeLoc tmpreg = native_loc_reg(desc->args[i].type, NATIVE_REG_INT, AA_TMP0); - u32 n = aa_class_stack_size(abi, ai); + u32 n = aa_class_vararg_stack_size(abi, ai); u32 off = 0; + stack = aa_vararg_stack_start(abi, stack); while (off < n) { u32 chunk = (n - off > 8u) ? 8u : (n - off); aa_load_part(t, tmpreg, desc->args[i], off, chunk); @@ -4040,15 +4068,18 @@ static u32 aa_va_base_reg(AANativeTarget* a, NativeAddr ap) { * va_list pointer opaquely. `ap` addresses the va_list object itself. */ static void aa_va_start_core(AANativeTarget* a, NativeAddr ap) { NativeTarget* t = &a->base; + const ABIFuncInfo* abi = + a->func ? abi_cg_func_info(t->c->abi, a->func->fn_type) : NULL; ABIVaListInfo vai = abi_va_list_layout(t->c->abi); NativeLoc ptr = native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, AA_TMP0); if (vai.kind == ABI_VA_LIST_POINTER) { /* `va_list = &<first vararg>`. Variadic stack args follow the fixed - * incoming params in the same caller window, so the offset is the - * current next_param_stack cursor. */ - aa_emit_add_imm(a, AA_TMP0, AA_FP, - aa_fp_off_in_arg(a, a->next_param_stack)); + * incoming params in the same caller window. Apple ARM64 compact fixed + * stack args may leave this cursor at +4, while the first variadic slot + * starts at the next 8-byte boundary. */ + u32 stack = aa_vararg_stack_start(abi, a->next_param_stack); + aa_emit_add_imm(a, AA_TMP0, AA_FP, aa_fp_off_in_arg(a, stack)); aa_emit_mem(a, 0, ptr, ap, aa_mem_for_type(t, ptr.type, 8)); return; } @@ -4115,7 +4146,17 @@ static void aa_va_arg_core(AANativeTarget* a, NativeLoc dst, NativeAddr ap, if (vai.kind == ABI_VA_LIST_POINTER) { aa_emit_mem(a, 1, cur, ap, ptr_mem); src = aa_reg_addr(type, AA_TMP0, 0); - aa_emit_add_imm(a, AA_TMP1, AA_TMP0, 8); + { + const ABIFuncInfo* abi = + a->func ? abi_cg_func_info(t->c->abi, a->func->fn_type) : NULL; + ABIArgPart part; + memset(&part, 0, sizeof part); + part.cls = cg_type_is_float(t->c, type) ? ABI_CLASS_FP : ABI_CLASS_INT; + part.size = type_size32(t, type); + part.align = type_align32(t, type); + aa_emit_add_imm(a, AA_TMP1, AA_TMP0, + (i32)aa_part_vararg_stack_size(abi, &part)); + } aa_emit_mem(a, 0, native_loc_reg(cur.type, NATIVE_REG_INT, AA_TMP1), ap, ptr_mem); aa_emit_mem(a, 1, val, src, val_mem); diff --git a/test/api/abi_classify_test.c b/test/api/abi_classify_test.c @@ -538,6 +538,34 @@ static void test_aarch64_windows_variadic(void) { kit_compiler_free(c); } +static void test_apple_arm64_stack_traits(void) { + KitCompiler* c = new_compiler(KIT_ARCH_ARM_64, KIT_OS_MACOS, KIT_OBJ_MACHO); + KitCgBuiltinTypes bi = kit_cg_builtin_types(c); + KitCgTypeId i32 = bi.id[KIT_CG_BUILTIN_I32]; + KitCgTypeId args[1] = {i32}; + const ABIFuncInfo* fi = + classify_fn_n(c, bi.id[KIT_CG_BUILTIN_VOID], args, 1, 1); + + EXPECT(fi->vararg_on_stack == 1, + "apple arm64 variadic: vararg_on_stack=%u want 1", + (unsigned)fi->vararg_on_stack); + EXPECT(fi->stack_arg_min_align == 4, + "apple arm64 fixed stack min=%u want 4", + (unsigned)fi->stack_arg_min_align); + EXPECT(fi->vararg_stack_arg_min_align == 8, + "apple arm64 vararg stack min=%u want 8", + (unsigned)fi->vararg_stack_arg_min_align); + { + ABITypeInfo vi = abi_va_list_info(((Compiler*)c)->abi); + EXPECT(vi.size == 8, "apple arm64 va_list size=%u want 8", + (unsigned)vi.size); + EXPECT(vi.scalar_kind == ABI_SC_PTR, + "apple arm64 va_list scalar_kind=%u want ABI_SC_PTR (%u)", + (unsigned)vi.scalar_kind, (unsigned)ABI_SC_PTR); + } + kit_compiler_free(c); +} + int main(void) { kit_unit_init(&g_u); check_target(KIT_ARCH_X86_64, KIT_OS_LINUX, KIT_OBJ_ELF); @@ -548,6 +576,7 @@ int main(void) { check_target(KIT_ARCH_ARM_64, KIT_OS_WINDOWS, KIT_OBJ_COFF); test_win64_specifics(); test_aarch64_windows_variadic(); + test_apple_arm64_stack_traits(); kit_unit_summary(&g_u, "abi_classify_test"); return kit_unit_status(&g_u); } diff --git a/test/toy/cases/133_varargs_mixed_types.toy b/test/toy/cases/133_varargs_mixed_types.toy @@ -35,6 +35,15 @@ fn no_varargs(n: i64, ...): i64 { return n * (2 as i64); } +fn adjacent_i32(n: i64, ...): i64 { + var ap: va_list; + @va_start(ap); + let a: i32 = @va_arg<i32>(ap); + let b: i32 = @va_arg<i32>(ap); + @va_end(ap); + return n + ((a as i64) * (10 as i64)) + (b as i64); +} + fn __user_main(): i64 { let s: i64 = sum3(10, 1, 2, 3 as i32); if s != (16 as i64) { return 1; } @@ -44,6 +53,8 @@ fn __user_main(): i64 { if c != (7 + 11 + 11) { return 3; } let z: i64 = no_varargs(21); if z != (42 as i64) { return 4; } + let p: i64 = adjacent_i32(0, 11 as i32, 22 as i32); + if p != (132 as i64) { return 5; } return 42; }