kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 532cc5f1ebd1b7bed82472244cd4cb8e0a378a90
parent 82ec9ebef078a59b33a21a88ab8ddbd476f7e038
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Sun, 10 May 2026 19:41:07 -0700

arch/aa64: GOT addressing for extern data on Mach-O; build hosted shim with cfree

Teach the AArch64 backend to materialize undefined externs via the GOT
indirection sequence (ADR_GOT_PAGE + LD64_GOT_LO12_NC) on object formats
that bind dylib imports through __DATA,__got — i.e. Mach-O. With the
direct ADRP + ADD/LDR sequence we emit elsewhere, dyld has nowhere to
land the runtime fixup for libSystem-imported globals (__stdinp,
__stdoutp, __stderrp, etc.).

Policy stays out of the backend: obj_format_extern_via_got(Compiler*) in
src/obj/obj_secnames.c is the one place that names CFREE_OBJ_MACHO. The
backend reads it through use_got_for_sym, keyed on section_id ==
OBJ_SEC_NONE (the canonical "undefined external" marker per obj.h).
ELF and other formats keep the direct path.

In the same spirit, lift the lone other Apple-OS check out of the
backend: ABIFuncInfo gains a vararg_on_stack trait that
apple_arm64_compute_func_info sets, so emit_arg_value reads
fi->vararg_on_stack instead of branching on target.os. The AArch64
backend now contains zero CFREE_OS_* references.

Makefile: hosted-macos shim is now built by cfree-cc itself. The
previous clang dependency existed solely because we couldn't emit GOT
relocs; that constraint is gone.

Verified:
  - otool -rv on the cfree-built shim shows GOTLDP / GOTLDPOF against
    ___stdinp / ___stdoutp / ___stderrp (was PAGE21 / PAGOF12).
  - test/libc/run.sh: 7/7 pass, 2 skip (unsupported surface).
  - test-cg + test-link: 119/119 pass.

Diffstat:
M Makefile | 17 ++++++++---------
M src/abi/abi.h | 5 +++++
M src/abi/abi_apple_arm64.c | 15 ++++++++++-----
M src/arch/aarch64.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------
M src/obj/obj.h | 16 ++++++++++++++++
M src/obj/obj_secnames.c | 4 ++++
6 files changed, 141 insertions(+), 29 deletions(-)

diff --git a/Makefile b/Makefile @@ -57,22 +57,21 @@ rt: rt-aarch64-linux # rt/include/libc/ to whatever the platform libc actually exports. macOS # variant is the only one wired today. # -# Built with clang for now, not cfree-cc: the shim reads libSystem-imported -# global variables (__stdinp, __stdoutp, __stderrp), and cfree's AArch64 -# codegen always emits direct ADRP+LDR (R_AARCH64_ADR_PREL_PG_HI21 + -# LDST64_ABS_LO12_NC) for extern globals. Mach-O dylib imports require -# GOT_LOAD_PAGE21 / GOT_LO12_NC so the load can route through a chained -# fixup. clang emits GOT_LOAD unconditionally on this target; until cfree -# matches, the shim has to be built by clang. +# Built by cfree itself. The shim reads libSystem-imported global +# variables (__stdinp, __stdoutp, __stderrp), which require Mach-O +# GOT_LOAD_PAGE21 / GOT_LO12_NC relocations so dyld can route the load +# through a non-lazy pointer in __DATA,__got; the AArch64 backend picks +# that sequence automatically for undefined externs on Mach-O targets +# (see use_got_for_sym in src/arch/aarch64.c). HOSTED_MACOS_AR = build/libcfree_hosted_macos.a HOSTED_MACOS_SRC = rt/lib/cfree_hosted/macos.c HOSTED_MACOS_OBJ = build/cfree_hosted/macos.o hosted-macos: $(HOSTED_MACOS_AR) -$(HOSTED_MACOS_OBJ): $(HOSTED_MACOS_SRC) +$(HOSTED_MACOS_OBJ): $(HOSTED_MACOS_SRC) $(BIN) @mkdir -p $(dir $@) - $(CC) $(HOST_SYSROOT_CFLAGS) -arch arm64 -ffreestanding -c $< -o $@ + $(BIN) cc -target aarch64-darwin -c $< -o $@ $(HOSTED_MACOS_AR): $(HOSTED_MACOS_OBJ) $(BIN) @rm -f $@ diff --git a/src/abi/abi.h b/src/abi/abi.h @@ -97,6 +97,11 @@ typedef struct ABIFuncInfo { u16 nparams; u8 variadic; u8 has_sret; + /* True when the trailing `...` portion of a variadic call must be + * routed to the stack exclusively, bypassing the GPR/FPR arg pools. + * Apple ARM64 sets this; AAPCS64 / SysV-x64 leave it 0 (variadics + * use the same register routing as fixed args). 
*/ + u8 vararg_on_stack; u32 vararg_gp_offset; u32 vararg_fp_offset; u32 vararg_overflow_offset; diff --git a/src/abi/abi_apple_arm64.c b/src/abi/abi_apple_arm64.c @@ -11,9 +11,9 @@ * on the stack (no v0-v7 / x0-x7 routing for the `...` portion of * the arglist). This is a *call-site* divergence — the fixed * params classify identically to AAPCS64, so compute_func_info - * remains an AAPCS64 alias. The stack routing is enforced inside - * the cg backend (src/arch/aarch64.c::emit_arg_value) by keying on - * target.os when synthesizing the variadic-arg ABIArgInfo. + * delegates to the AAPCS64 classifier and then sets the + * ABIFuncInfo.vararg_on_stack trait that the cg backend reads + * when synthesizing variadic-arg ABIArgInfos. * * 3. Stack-arg promotion — small integer arguments passed on the * stack are promoted to 4 bytes minimum (`char`/`short` occupy 4 @@ -35,8 +35,13 @@ static ABIFuncInfo* apple_arm64_compute_func_info(TargetABI* a, const Type* fn) { /* Phase 2: spell out the Darwin variadic / stack-arg-promotion * deltas. For now the AAPCS64 classifier produces ABI-correct - * output for the fixed-args-only programs in the v1 cg suite. */ - return aapcs64_compute_func_info(a, fn); + * output for the fixed-args-only programs in the v1 cg suite, + * and we layer on the vararg-on-stack trait so the cg backend + * routes the `...` portion to the stack without keying on the + * target OS itself. */ + ABIFuncInfo* info = aapcs64_compute_func_info(a, fn); + info->vararg_on_stack = 1; + return info; } static const Type* apple_arm64_va_list_type(TargetABI* a, Pool* p) { diff --git a/src/arch/aarch64.c b/src/arch/aarch64.c @@ -1264,10 +1264,54 @@ static RelocKind ldst_lo12_reloc_for(u32 nbytes) { } } -/* Materialize &sym+addend into `dst_reg` via ADRP + ADD (LO12_NC). */ +/* Forward decl: addend fixup after a GOT load lands here when the + * addend doesn't fit in a single imm12. Defined just below. 
*/ +static void emit_addr_adjust(MCEmitter* mc, u32 Rd, u32 base, i32 off); + +/* True when the symbol must be reached via a GOT indirection slot at + * this site: an undefined external on a target format that binds extern + * data through __got / non-lazy pointers (Mach-O today). The policy + * lives behind obj_format_extern_via_got so the backend never names a + * specific OS/format. + * + * The "is undefined" test keys on section_id == OBJ_SEC_NONE — the + * canonical marker per obj.h. SK_UNDEF as a kind is reserved for + * symbols whose kind isn't known yet; the decl pass mints externs + * with their intended SK_OBJ / SK_FUNC kind plus OBJ_SEC_NONE. */ +static int use_got_for_sym(CGTarget* t, ObjSymId sym) { + const ObjSym* s; + if (!obj_format_extern_via_got(t->c)) return 0; + s = obj_symbol_get(t->obj, sym); + return s && s->section_id == OBJ_SEC_NONE; +} + +/* Emit `ADRP dst, sym@GOTPAGE ; LDR Xdst, [dst, #sym@GOTPAGEOFF]`, + * leaving the runtime address of `sym` in `dst_reg`. Addends are + * deliberately omitted from the GOT relocs — most loaders disallow + * nonzero addends on GOT-load fixups — so callers add any displacement + * with a follow-on ADD/LDUR/STUR. */ +static void emit_got_load_addr(CGTarget* t, u32 dst_reg, ObjSymId sym) { + MCEmitter* mc = t->mc; + u32 sec = mc->section_id; + u32 adrp_pos = mc->pos(mc); + emit32(mc, aa64_adrp_base(dst_reg)); + mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_GOT_PAGE, sym, 0, 0, 0); + u32 ldr_pos = mc->pos(mc); + emit32(mc, aa64_ldr_uimm(/*size=*/3, dst_reg, dst_reg, 0)); + mc->emit_reloc_at(mc, sec, ldr_pos, R_AARCH64_LD64_GOT_LO12_NC, sym, 0, 0, 0); +} + +/* Materialize &sym+addend into `dst_reg` via ADRP + ADD (LO12_NC), or + * ADRP + LDR-from-GOT + (optional) ADD when the symbol must route + * through an indirection slot. 
*/ static void emit_global_addr(CGTarget* t, u32 dst_reg, ObjSymId sym, i64 addend) { MCEmitter* mc = t->mc; + if (use_got_for_sym(t, sym)) { + emit_got_load_addr(t, dst_reg, sym); + if (addend) emit_addr_adjust(mc, dst_reg, dst_reg, (i32)addend); + return; + } u32 sec = mc->section_id; u32 adrp_pos = mc->pos(mc); emit32(mc, aa64_adrp_base(dst_reg)); @@ -1358,12 +1402,26 @@ static void aa_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma) { u32 sidx = size_idx_for_bytes(sz); /* OPK_GLOBAL: ADRP scratch, sym ; LDR Wd, [scratch, #:lo12:sym]. - * The LO12_NC reloc requires the scaled-offset LDR encoding, not LDUR. */ + * The LO12_NC reloc requires the scaled-offset LDR encoding, not LDUR. + * + * Extern-via-GOT path: ADRP scratch, sym@GOTPAGE ; + * LDR Xscratch, [scratch, #:gotoff:sym] ; LDUR Wd, [scratch, #addend] + * The GOT load returns the symbol's runtime address; we then read the + * value at +addend with a plain LDUR (no reloc, addend baked in). */ if (addr.kind == OPK_GLOBAL) { MCEmitter* mc = t->mc; u32 sec = mc->section_id; ObjSymId sym = addr.v.global.sym; i64 add = addr.v.global.addend; + if (use_got_for_sym(t, sym)) { + emit_got_load_addr(t, /*dst=*/9, sym); + if (dst.cls == RC_FP) { + emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), 9, (i32)add)); + } else { + emit32(mc, aa64_ldur(sidx, reg_num(dst), 9, (i32)add)); + } + return; + } u32 adrp_pos = mc->pos(mc); emit32(mc, aa64_adrp_base(/*Rd=*/9)); mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym, add, @@ -1393,7 +1451,11 @@ static void aa_store(CGTarget* t, Operand addr, Operand src, MemAccess ma) { /* OPK_GLOBAL: ADRP scratch, sym ; STR Wt, [scratch, #:lo12:sym]. * For OPK_IMM source, materialize the value first into x9, then use - * x10 for the global base so the two scratches don't collide. */ + * x10 for the global base so the two scratches don't collide. 
+ * + * Extern-via-GOT path: load the symbol's runtime address into the + * base scratch via emit_got_load_addr, then STUR with addend baked + * into the imm9 (no reloc on the store). */ if (addr.kind == OPK_GLOBAL) { MCEmitter* mc = t->mc; u32 sec = mc->section_id; @@ -1413,6 +1475,15 @@ static void aa_store(CGTarget* t, Operand addr, Operand src, MemAccess ma) { src_reg = reg_num(src); } u32 base = (src.kind == OPK_IMM) ? 10u : 9u; + if (use_got_for_sym(t, sym)) { + emit_got_load_addr(t, base, sym); + if (src_is_fp) { + emit32(mc, aa64_stur_fp(sidx, src_reg, base, (i32)add)); + } else { + emit32(mc, aa64_stur(sidx, src_reg, base, (i32)add)); + } + return; + } u32 adrp_pos = mc->pos(mc); emit32(mc, aa64_adrp_base(base)); mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym, add, @@ -1474,17 +1545,27 @@ static void aa_addr_of(CGTarget* t, Operand dst, Operand lv) { } if (lv.kind == OPK_GLOBAL) { /* ADRP Xd, sym ; ADD Xd, Xd, #:lo12:sym (with addend baked into both - * relocations). Used to materialize a function or data pointer. */ + * relocations). Used to materialize a function or data pointer. + * + * Extern-via-GOT path: load the address from the GOT slot and then + * apply the addend with a plain ADD/SUB (GOT relocs disallow addends). 
*/ u32 rd = reg_num(dst); + ObjSymId sym = lv.v.global.sym; + i64 addend = lv.v.global.addend; + if (use_got_for_sym(t, sym)) { + emit_got_load_addr(t, rd, sym); + if (addend) emit_addr_adjust(t->mc, rd, rd, (i32)addend); + return; + } u32 sec = t->mc->section_id; u32 adrp_pos = t->mc->pos(t->mc); emit32(t->mc, aa64_adrp_base(rd)); - t->mc->emit_reloc_at(t->mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, - lv.v.global.sym, lv.v.global.addend, 0, 0); + t->mc->emit_reloc_at(t->mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym, + addend, 0, 0); u32 add_pos = t->mc->pos(t->mc); emit32(t->mc, aa64_add_imm(1, rd, rd, 0, 0)); - t->mc->emit_reloc_at(t->mc, sec, add_pos, R_AARCH64_ADD_ABS_LO12_NC, - lv.v.global.sym, lv.v.global.addend, 0, 0); + t->mc->emit_reloc_at(t->mc, sec, add_pos, R_AARCH64_ADD_ABS_LO12_NC, sym, + addend, 0, 0); return; } aa_panic(t, "addr_of"); @@ -1929,17 +2010,19 @@ static void aa_convert(CGTarget* t, ConvKind k, Operand dst, Operand src) { * For BYVAL/INDIRECT the caller's `storage` is the address of the source * data; we either load chunks into the next register pair (DIRECT * aggregate) or pass the address itself (INDIRECT). */ -static void emit_arg_value(CGTarget* t, const CGABIValue* av, u32* next_int, - u32* next_fp, u32* stack_off) { +static void emit_arg_value(CGTarget* t, const ABIFuncInfo* fi, + const CGABIValue* av, u32* next_int, u32* next_fp, + u32* stack_off) { AAImpl* a = impl_of(t); /* Synthesize a one-part DIRECT ABIArgInfo for var args (av->abi is NULL * past the fixed-param count). AAPCS64 routes var args through the same * register/stack rules as fixed scalars, so this matches what * abi_func_info would have produced. * - * Apple ARM64 (Darwin) diverges: variadic args go on the stack only. - * Detect the synthesized-vararg case and bump the next-int / next-fp - * cursors past the register pool so the part below routes to stack. */ + * Apple ARM64 diverges: variadic args go on the stack only. 
The + * ABIFuncInfo.vararg_on_stack trait carries that policy out of the + * backend — we bump the next-int / next-fp cursors past the register + * pool so the part below falls through to stack placement. */ ABIArgInfo va_ai; ABIArgPart va_pt; const ABIArgInfo* ai = av->abi; @@ -1955,7 +2038,7 @@ static void emit_arg_value(CGTarget* t, const CGABIValue* av, u32* next_int, va_pt.align = sz; va_pt.src_offset = 0; ai = &va_ai; - if (t->c->target.os == CFREE_OS_MACOS) { + if (fi && fi->vararg_on_stack) { *next_int = 8; *next_fp = 8; } @@ -2084,7 +2167,7 @@ static void aa_call(CGTarget* t, const CGCallDesc* d) { } for (u32 i = 0; i < d->nargs; ++i) { - emit_arg_value(t, &d->args[i], &next_int, &next_fp, &stack_off); + emit_arg_value(t, d->abi, &d->args[i], &next_int, &next_fp, &stack_off); } /* Track outgoing-arg high-water mark, 16-aligned. */ diff --git a/src/obj/obj.h b/src/obj/obj.h @@ -407,6 +407,22 @@ Sym obj_secname_preinit_array(Compiler*); Sym obj_secname_tdata(Compiler*); Sym obj_secname_tbss(Compiler*); +/* ---- format-aware codegen policy ---- + * + * Backends consult these predicates instead of branching on + * target.os / target.obj directly, so the OS/format knowledge stays + * concentrated in src/obj/ and a future format lands as one case here + * rather than fan-out in every CGTarget. */ + +/* True when references to undefined external symbols must be + * materialized via an indirection slot (GOT / non-lazy pointer) + * rather than direct page+offset addressing. Mach-O: yes — dyld + * binds dylib imports through __DATA,__got at runtime, and the + * direct PAGE21/PAGEOFF12 fixups can't carry that binding. ELF + * static link: no — the linker resolves SK_UNDEFs at link time and + * patches the direct ADRP/ADD bytes in place. 
*/ +int obj_format_extern_via_got(const Compiler*); + /* ---- file format emitters ---- */ void emit_elf(Compiler*, ObjBuilder*, Writer*); void emit_coff(Compiler*, ObjBuilder*, Writer*); diff --git a/src/obj/obj_secnames.c b/src/obj/obj_secnames.c @@ -89,3 +89,7 @@ Sym obj_secname_tbss(Compiler* c) { return secname_panic_unimpl(c, ".tbss"); } } + +int obj_format_extern_via_got(const Compiler* c) { + return c->target.obj == CFREE_OBJ_MACHO; +}