kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit bba729146433c34115f295fd31bbab9a5e816e5c
parent 923c2ef63261ed6e24eee8c65c9ff7f5153345e5
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Mon, 11 May 2026 17:44:59 -0700

jit/macho: cfree-owned TLV thunk so _Thread_local works under cfree run

The Mach-O JIT lane previously segfaulted on the first TLV access:
the access sequence calls through descriptor[+0], which dyld would
normally rewrite to a per-key thunk, but JIT-mmap'd memory is not
covered by dyld's TLV walk, so descriptor[+0] still held an unbound
_tlv_bootstrap.

Install a cfree-owned thunk at link-jit time instead.  The thunk
(src/jit/tlv_thunk_aarch64.S) preserves all caller-saved GPR/SIMD
registers and calls a host get_block function whose pointer occupies
the first 8 bytes of the ctx stored in descriptor[+8]; the host
vtable (CfreeJitTls) is implemented by both the driver and the test
harness on top of pthread_key + aligned_alloc.

Three smaller fixes ride along:
  - Apply Apple ld's TLVP LDR-to-ADD relaxation in link_jit so the
    descriptor address ends up directly in x0 (there is no
    __thread_ptrs slot in the JIT image to indirect through).
  - link_set_jit_mode lets link_resolve_undefs tolerate the non-weak
    __tlv_bootstrap undef that clang-produced inputs carry.
  - cfree_jit_from_image patches descriptor[+0/+8/+16] in a
    reloc-driven pass after applying normal relocs; the per-image
    TLS ctx is released in cfree_jit_free.

Tests: make test-elf 37/37, make test-link 122/122,
make test-link CFREE_TEST_OBJ=macho 103/103 (was 102/102).

Diffstat:
D doc/MACHO.md | 74 --------------------------------------------------------------------------
M driver/driver.h | 5 +++--
M driver/env.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M include/cfree.h | 34 ++++++++++++++++++++++++++++++++++
M src/api/pipeline.c | 1 +
A src/jit/tlv_thunk.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
A src/jit/tlv_thunk_aarch64.S | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/jit/tlv_thunk_stub.c | 24 ++++++++++++++++++++++++
M src/link/link.c | 5 +++++
M src/link/link.h | 5 +++++
M src/link/link_internal.h | 8 ++++++++
M src/link/link_jit.c | 155 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/link/link_resolve.c | 17 +++++++++++++++++
M test/lib_deps.allowlist | 3 +++
D test/link/cases/36_tls_basic/j_targets | 3 ---
M test/link/harness/jit_runner.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
16 files changed, 626 insertions(+), 79 deletions(-)

diff --git a/doc/MACHO.md b/doc/MACHO.md @@ -1,74 +0,0 @@ -# MACHO — Mach-O open issues - -Running ledger of Mach-O-specific gaps still open against the -`aa64-macho` lane. Resolved items have been pruned from this doc; -`git log` is the historical record. - -State (2026-05-11): - - make test-elf # 37/37 - make test-link # 122/122 (ELF baseline) - make test-link CFREE_TEST_OBJ=macho # 102/102 (Path E + J) - -What still doesn't work: - -1. **`cfree run` on macOS with `_Thread_local`** — segfaults at the - first TLV access. §1 below. -2. **`36_tls_basic/J` on `aa64-macho`** — same lane, kept on the - `j_targets` excludelist. §1. -3. **`33_ifunc_in_init/E` on `aa64-macho`** — IFUNC has no Mach-O - representation; permanent `e_targets` exclusion, not a fix target. - -ELF lanes are the regression guardrail; every Mach-O change must keep -`make test-elf` and `make test-link` (ELF) green. - ---- - -## 1. TLV in the JIT lane - -`cfree cc -c` → `cfree_link_exe` (or Apple `ld`) → dyld is green for -`_Thread_local` end-to-end. `cfree run` is not: the JIT linker -(`src/link/link_jit.c`) handles TLSLE but not TLVP, and there is no -runtime that fills the role dyld plays for a normal Mach-O image -(allocate a pthread_key, install a per-descriptor thunk into -descriptor[0], record the key in descriptor[1]). - -Concretely, when codegen emits the Mach-O TLV access sequence - - adrp x0, sym@TLVPPAGE - ldr x0, [x0, sym@TLVPPAGEOFF] - ldr x1, [x0] ; descriptor[0] - blr x1 ; thunk(x0=desc) → x0 = TLV addr - -the JIT applies the TLVP relocs trivially (the descriptor exists in -the JIT image), but descriptor[0] is whatever `R_ABS64` against -`__tlv_bootstrap` resolved to. Two viable directions: - -- **dlsym `_tlv_bootstrap`.** `driver_dlsym_resolver` can already - find it on a macOS host. 
Open question: dyld's pthread_key / - descriptor-rewrite pass only walks `S_THREAD_LOCAL_VARIABLES` - sections it owns; JIT-mmap'd memory isn't in that walk, so - `_tlv_bootstrap` would run unparameterized. Likely insufficient on - its own. - -- **cfree-owned thunk.** Allocate one `pthread_key_t` per JIT image, - write our thunk's address into every descriptor[0], and have the - thunk consult descriptor[1]/[2] to find (or lazy-init) per-thread - storage. No libSystem dependency; cleaner for the freestanding - goal. This is the recommended fix. - -Either way the JIT linker needs a new pass parallel to its -`reloc_is_tlsle` branch that materializes per-descriptor runtime -state before the entry call. - ---- - -## 2. Cosmetic divergences worth flagging - -These do not block tests but are documented so a future "why doesn't -our output byte-match clang's?" doesn't re-debug them: - -- TLVP_LOAD_PAGEOFF12 LDR relaxation. Apple `ld` relaxes the LDR to - an ADD when the descriptor is in-image and drops the indirect - `__thread_ptrs` slot entirely. Our linker keeps the LDR + slot. - dyld accepts both shapes. 
diff --git a/driver/driver.h b/driver/driver.h @@ -73,8 +73,9 @@ typedef struct DriverEnv { CfreeDiagSink* diag; CfreeFileIO file_io; const CfreeExecMem* execmem; - const CfreeDbgOs* dbg_os; /* NULL unless `cfree dbg` paths run */ - int64_t now; /* unix seconds; -1 = unknown */ + const CfreeDbgOs* dbg_os; /* NULL unless `cfree dbg` paths run */ + const CfreeJitTls* jit_tls; /* NULL unless `cfree run` w/ TLV paths run */ + int64_t now; /* unix seconds; -1 = unknown */ } DriverEnv; void driver_env_init(DriverEnv*); diff --git a/driver/env.c b/driver/env.c @@ -801,6 +801,120 @@ static int dbg_guarded_copy(void* user, void* dst, const void* src, size_t n) { static CfreeDbgOs g_dbg_os_posix; +/* ---------------- jit_tls (pthread-key backed) ---------------- */ +/* Backs CfreeJitTls for `cfree run` on Mach-O targets: every JIT image + * with TLS gets one pthread_key, the per-thread block is allocated + * lazily on first access, and freed via the key's destructor when the + * thread exits. + * + * The ctx layout is fixed by the contract in src/jit/tlv_thunk.h: the + * first 8 bytes MUST be a function pointer the asm thunk calls with + * x0 = ctx and expects back an x0 = TLS block. We satisfy this by + * making `get_block` the first field. */ +typedef struct JitTlsCtx { + void* (*get_block)(void* ctx); /* first; matches tlv_thunk's expectation */ + pthread_key_t key; + size_t image_size; + size_t image_filesz; + size_t align; + void* init_bytes; /* heap-owned copy of init bytes, or NULL if all BSS */ +} JitTlsCtx; + +static void jit_tls_thread_dtor(void* block) { + /* POSIX pthread_key destructor: called when a thread that touched the + * TLV exits. `block` is the void* set by pthread_setspecific, never + * NULL (POSIX skips the destructor if it was NULL). */ + free(block); +} + +static void* jit_tls_alloc_block(JitTlsCtx* ctx) { + /* macOS aligned_alloc requires alignment >= sizeof(void*); bump + * smaller request alignments up. 
Size must be a multiple of + * alignment too. */ + size_t a = ctx->align ? ctx->align : sizeof(void*); + if (a < sizeof(void*)) a = sizeof(void*); + size_t sz = (ctx->image_size + a - 1u) & ~(a - 1u); + if (sz == 0) sz = a; /* zero-size TLS image still needs a non-NULL block */ + void* block = aligned_alloc(a, sz); + if (!block) return NULL; + if (ctx->image_filesz && ctx->init_bytes) + memcpy(block, ctx->init_bytes, ctx->image_filesz); + if (ctx->image_size > ctx->image_filesz) + memset((char*)block + ctx->image_filesz, 0, + ctx->image_size - ctx->image_filesz); + return block; +} + +/* The thunk-callable entry; the asm trampoline calls this with x0=ctx + * and expects x0 back = TLS block base. */ +static void* jit_tls_get_block(void* ctx_v) { + JitTlsCtx* ctx = (JitTlsCtx*)ctx_v; + void* block = pthread_getspecific(ctx->key); + if (block) return block; + block = jit_tls_alloc_block(ctx); + if (!block) { + /* OOM inside a TLV access has no clean recovery: the thunk's caller + * is mid-expression and can't observe failure. Abort, matching + * how dyld treats failures inside _tlv_bootstrap. */ + fprintf(stderr, + "cfree run: out of memory allocating per-thread TLS block\n"); + abort(); + } + if (pthread_setspecific(ctx->key, block) != 0) { + fprintf(stderr, "cfree run: pthread_setspecific failed in TLV thunk\n"); + abort(); + } + return block; +} + +static void* jit_tls_ctx_new(void* user, const void* init_bytes, + size_t image_filesz, size_t image_size, + size_t align) { + (void)user; + JitTlsCtx* ctx = (JitTlsCtx*)malloc(sizeof(*ctx)); + if (!ctx) return NULL; + ctx->get_block = jit_tls_get_block; + ctx->image_size = image_size; + ctx->image_filesz = image_filesz; + ctx->align = align ? 
align : sizeof(void*); + ctx->init_bytes = NULL; + if (image_filesz && init_bytes) { + ctx->init_bytes = malloc(image_filesz); + if (!ctx->init_bytes) { + free(ctx); + return NULL; + } + memcpy(ctx->init_bytes, init_bytes, image_filesz); + } + if (pthread_key_create(&ctx->key, jit_tls_thread_dtor) != 0) { + free(ctx->init_bytes); + free(ctx); + return NULL; + } + return ctx; +} + +static void jit_tls_ctx_destroy(void* user, void* ctx_v) { + JitTlsCtx* ctx = (JitTlsCtx*)ctx_v; + (void)user; + if (!ctx) return; + /* Free the calling thread's block (POSIX won't run our destructor for + * it; pthread_key_delete also doesn't fire destructors for live + * threads). Other threads' blocks are reaped when those threads + * exit, since their stored pointers remain reachable via the key + * value already snapshotted into TSD before delete. */ + void* my_block = pthread_getspecific(ctx->key); + if (my_block) { + pthread_setspecific(ctx->key, NULL); + free(my_block); + } + pthread_key_delete(ctx->key); + free(ctx->init_bytes); + free(ctx); +} + +static CfreeJitTls g_jit_tls_posix; + /* ---------------- writer (fd-backed) ---------------- */ typedef struct DriverFdWriter { @@ -999,6 +1113,11 @@ void driver_env_init(DriverEnv* e) { g_dbg_os_posix.user = NULL; e->dbg_os = &g_dbg_os_posix; + g_jit_tls_posix.ctx_new = jit_tls_ctx_new; + g_jit_tls_posix.ctx_destroy = jit_tls_ctx_destroy; + g_jit_tls_posix.user = NULL; + e->jit_tls = &g_jit_tls_posix; + /* Reproducible-build precedent: SOURCE_DATE_EPOCH wins over wall clock. * If neither is set or the env value doesn't parse, advertise -1 ("no * clock") and pp falls back to C11 placeholders. 
*/ @@ -1027,6 +1146,7 @@ CfreeEnv driver_env_to_cfree(const DriverEnv* e) { ce.diag = e->diag; ce.execmem = e->execmem; ce.dbg_os = e->dbg_os; + ce.jit_tls = e->jit_tls; ce.now = e->now; return ce; } diff --git a/include/cfree.h b/include/cfree.h @@ -355,12 +355,46 @@ typedef struct CfreeDbgOs { void* user; } CfreeDbgOs; +/* Host vtable for the JIT TLV thunk on Mach-O targets. + * + * `cfree run` on macOS-aarch64 needs to service Mach-O thread-local + * descriptor calls — there's no dyld in the JIT image to allocate the + * pthread key and rewrite descriptor[0] to a per-image thunk. libcfree + * provides the asm thunk (caller-save-preserving) but cannot itself + * include <pthread.h>, so the per-thread plumbing is plumbed in via this + * vtable. NULL is fine on hosts that never JIT TLV code. + * + * ctx_new — called once per JIT image at link time. Receives the + * TLS image: `image_size` bytes total, `image_filesz` + * of which are initialized from `init_bytes`, aligned + * to `align`. Returns an opaque ctx pointer that the + * thunk reads from descriptor[+8]. + * + * The returned ctx MUST satisfy a binary contract: the + * first 8 bytes contain a function pointer of type + * `void* (*)(void* ctx)` that returns the per-thread + * TLS block (allocating + seeding on first call from + * each thread). This is what the thunk calls; placing + * it inside the ctx lets the thunk avoid loading + * process-global state. + * + * ctx_destroy — called from cfree_jit_free. Implementations should + * delete the pthread_key (POSIX runs per-thread + * destructors then) and release the ctx storage. 
*/ +typedef struct CfreeJitTls { + void* (*ctx_new)(void* user, const void* init_bytes, size_t image_filesz, + size_t image_size, size_t align); + void (*ctx_destroy)(void* user, void* ctx); + void* user; +} CfreeJitTls; + typedef struct CfreeEnv { CfreeHeap* heap; const CfreeFileIO* file_io; /* may be NULL for purely in-memory pipelines */ CfreeDiagSink* diag; const CfreeExecMem* execmem; /* NULL ok unless JIT/emu paths run */ const CfreeDbgOs* dbg_os; /* NULL ok unless `cfree dbg` paths run */ + const CfreeJitTls* jit_tls; /* NULL ok unless JIT TLV paths run */ /* Unix seconds since 1970-01-01 UTC, or negative for "no clock". Used * by the preprocessor for __DATE__ / __TIME__ (negative → C11 §6.10.8.1 * placeholders). The host decides the policy (SOURCE_DATE_EPOCH, diff --git a/src/api/pipeline.c b/src/api/pipeline.c @@ -465,6 +465,7 @@ int cfree_link_jit(CfreeCompiler* c, const CfreeLinkOptions* opts, } linker = build_linker(c, &opts->inputs); link_set_gc_sections(linker, opts->gc_sections); + link_set_jit_mode(linker, 1); image = link_resolve(linker); /* deferred-cleanup-registered */ *out_jit = cfree_jit_from_image(image); /* undefers + transfers ownership */ link_free(linker); diff --git a/src/jit/tlv_thunk.h b/src/jit/tlv_thunk.h @@ -0,0 +1,50 @@ +#ifndef CFREE_JIT_TLV_THUNK_H +#define CFREE_JIT_TLV_THUNK_H + +/* The JIT-time TLV thunk for Mach-O thread-local access. + * + * cfree's codegen emits the Apple TLV access sequence whenever a Mach-O + * target dereferences a `_Thread_local`: + * + * adrp x0, sym@TLVPPAGE + * ldr x0, [x0, sym@TLVPPAGEOFF] ; x0 = descriptor + * ldr x1, [x0] ; x1 = descriptor[+0] (thunk*) + * blr x1 ; thunk(x0=desc) -> x0=TLV addr + * + * The thunk's ABI is custom: x0 in/out as the descriptor / per-thread + * TLV address, every other GPR and SIMD register preserved. 
In an AOT + * Mach-O image dyld rewrites descriptor[+0] to a libdyld-supplied thunk + * after allocating a pthread key per descriptor; the JIT image is never + * walked by dyld, so we install our own thunk and patch every + * descriptor's slot[0]/[1]/[2] from `cfree_jit_from_image`. + * + * Per-image descriptor convention (post-patch): + * [+0] : &cfree_jit_tlv_thunk (entry below) + * [+8] : opaque CfreeJitTls ctx pointer (per JIT image) + * [+16] : byte offset within the per-thread TLS block (image-relative) + * + * Contract on the ctx pointer (set by CfreeJitTls.ctx_new): its first 8 + * bytes are a function pointer `void* (*get_block)(void* ctx)` that + * returns the calling thread's TLS block (lazy-allocating + seeding + * from the image's init bytes on first per-thread call). + * + * The thunk does roughly: + * + * void* thunk(void* desc) { + * void* ctx = *(void**)((u8*)desc + 8); + * void* (*get_block)(void*) = *(void**)ctx; + * void* base = get_block(ctx); + * return (u8*)base + *(u64*)((u8*)desc + 16); + * } + * + * Calling `get_block` from a context that must preserve x1..x18 / q0..q7 + * is the reason the thunk is implemented in asm — a normal C function + * call would clobber caller-saved regs the JITed access sequence has no + * idea about. */ + +/* Declared as a function for &-of, but its calling convention is the + * custom one described above: callers must come through the access + * sequence, not a plain C call. */ +void cfree_jit_tlv_thunk(void); + +#endif diff --git a/src/jit/tlv_thunk_aarch64.S b/src/jit/tlv_thunk_aarch64.S @@ -0,0 +1,117 @@ +/* The Mach-O TLV thunk for the JIT path. + * + * Called via: + * ldr x1, [x0] ; x0 = descriptor, x1 = thunk addr + * blr x1 + * with the contract "x0 in/out as descriptor -> TLV addr, every other + * GPR/SIMD register preserved". 
No C frame at entry: the access + * sequence is mid-expression in JITed code, so we must save and restore + * everything caller-saved before/after calling out to the host + * `get_block` helper. + * + * See src/jit/tlv_thunk.h for the descriptor layout and ctx contract. */ + +#if defined(__aarch64__) + +#if defined(__APPLE__) +#define CFREE_TLV_THUNK_SYM _cfree_jit_tlv_thunk +#else +#define CFREE_TLV_THUNK_SYM cfree_jit_tlv_thunk +#endif + + .text + .p2align 2 + .globl CFREE_TLV_THUNK_SYM +CFREE_TLV_THUNK_SYM: + /* Frame layout (544 bytes, 16-byte aligned): + * sp+ 0 .. sp+127 : x1-x16 (eight stp pairs) + * sp+128 : x17 + * sp+136 : scratch slot for descriptor pointer + * sp+144 : x29 (FP) + * sp+152 : x30 (LR) + * sp+160 .. sp+543 : v0-v7, v16-v31 (24 q regs) + * + * x18 is platform-reserved on Apple aarch64 (don't touch). v8-v15 + * are callee-saved by ABI so the host's get_block won't perturb + * them; we skip them. */ + sub sp, sp, #544 + + stp x1, x2, [sp, # 0] + stp x3, x4, [sp, # 16] + stp x5, x6, [sp, # 32] + stp x7, x8, [sp, # 48] + stp x9, x10, [sp, # 64] + stp x11, x12, [sp, # 80] + stp x13, x14, [sp, # 96] + stp x15, x16, [sp, #112] + str x17, [sp, #128] + + stp x29, x30, [sp, #144] + add x29, sp, #144 + + stp q0, q1, [sp, #160] + stp q2, q3, [sp, #192] + stp q4, q5, [sp, #224] + stp q6, q7, [sp, #256] + stp q16, q17, [sp, #288] + stp q18, q19, [sp, #320] + stp q20, q21, [sp, #352] + stp q22, q23, [sp, #384] + stp q24, q25, [sp, #416] + stp q26, q27, [sp, #448] + stp q28, q29, [sp, #480] + stp q30, q31, [sp, #512] + + /* Stash desc; we'll need [desc + 16] (the byte offset) after the + * call, but the call clobbers x0. */ + str x0, [sp, #136] + + /* ctx = *(desc + 8); get_block = *ctx. */ + ldr x1, [x0, #8] + ldr x16, [x1] + + /* x0 = ctx, call get_block(ctx) -> x0 = block base. */ + mov x0, x1 + blr x16 + + /* x0 = base; load offset and combine. */ + ldr x1, [sp, #136] + ldr x1, [x1, #16] + add x0, x0, x1 + + /* Restore SIMD. 
*/ + ldp q0, q1, [sp, #160] + ldp q2, q3, [sp, #192] + ldp q4, q5, [sp, #224] + ldp q6, q7, [sp, #256] + ldp q16, q17, [sp, #288] + ldp q18, q19, [sp, #320] + ldp q20, q21, [sp, #352] + ldp q22, q23, [sp, #384] + ldp q24, q25, [sp, #416] + ldp q26, q27, [sp, #448] + ldp q28, q29, [sp, #480] + ldp q30, q31, [sp, #512] + + /* Restore FP/LR and GPRs (last so the temps we used above don't + * leak out). */ + ldp x29, x30, [sp, #144] + + ldp x1, x2, [sp, # 0] + ldp x3, x4, [sp, # 16] + ldp x5, x6, [sp, # 32] + ldp x7, x8, [sp, # 48] + ldp x9, x10, [sp, # 64] + ldp x11, x12, [sp, # 80] + ldp x13, x14, [sp, # 96] + ldp x15, x16, [sp, #112] + ldr x17, [sp, #128] + + add sp, sp, #544 + ret + +#if defined(__linux__) && defined(__ELF__) + .section .note.GNU-stack,"",%progbits +#endif + +#endif /* __aarch64__ */ diff --git a/src/jit/tlv_thunk_stub.c b/src/jit/tlv_thunk_stub.c @@ -0,0 +1,24 @@ +/* Stub `cfree_jit_tlv_thunk` for non-aarch64 hosts. The asm version + * lives in tlv_thunk_aarch64.S; on hosts that can't run the JIT image's + * Mach-O code anyway (the JIT runs in-process, so target arch must + * equal host arch), the symbol exists only so taking its address in + * src/link/link_jit.c links cleanly. */ + +#include "jit/tlv_thunk.h" + +#if !defined(__aarch64__) + +void cfree_jit_tlv_thunk(void) { + /* Reachable only if a non-aarch64 host somehow attempted to JIT an + * aarch64 Mach-O image with TLVs — should have been rejected long + * before any access reaches the thunk. */ + __builtin_trap(); +} + +#else + +/* Real implementation lives in tlv_thunk_aarch64.S; keep the TU + * non-empty for -Wempty-translation-unit. */ +extern void cfree_jit_tlv_thunk(void); + +#endif diff --git a/src/link/link.c b/src/link/link.c @@ -336,6 +336,11 @@ void link_set_emit_static_exe(Linker* l, int enable) { l->emit_static_exe = enable ? 1 : 0; } +void link_set_jit_mode(Linker* l, int enable) { + if (!l) return; + l->jit_mode = enable ? 
1 : 0; +} + void link_set_pie(Linker* l, int enable) { if (!l) return; l->emit_pie = enable ? 1 : 0; diff --git a/src/link/link.h b/src/link/link.h @@ -167,6 +167,11 @@ void link_set_gc_sections(Linker*, int enable); * .igot.plt slots get filled before user code runs. The JIT pre- * resolves slots in-process and doesn't need the ctor. */ void link_set_emit_static_exe(Linker*, int enable); +/* Mark this link as the in-process JIT lane (set by cfree_link_jit). + * Lets link_resolve tolerate platform undefs the JIT image patches + * post-link (currently: Mach-O `__tlv_bootstrap`). Leaves AOT lanes + * untouched. */ +void link_set_jit_mode(Linker*, int enable); /* Mark this link as producing a position-independent ET_DYN exe (-pie). * Triggers Phase 4 layout_dyn pass (synthetic .interp/.dynsym/.dynstr/ diff --git a/src/link/link_internal.h b/src/link/link_internal.h @@ -97,6 +97,14 @@ struct Linker { * JIT path leaves this 0 — slots are pre-resolved in-process by * link_jit.c, no ctor needed. */ int emit_static_exe; + /* In-process JIT lane (set by cfree_link_jit). Currently used to + * tolerate undef `__tlv_bootstrap` on Mach-O inputs — the JIT image + * has no dyld, descriptor[+0] is rewritten to cfree's thunk during + * cfree_jit_from_image, so the symbol's resolved value doesn't + * matter. Without this, clang-produced .o files (which emit + * `__tlv_bootstrap` as a plain non-weak undef) would panic at + * link_resolve_undefs. */ + int jit_mode; /* PIE / ET_DYN output. Set by cfree_link_exe when opts->pie or any * DSO input is present. Triggers layout_dyn (Phase 4) and the * dynamic ELF emit path (Phase 6). 
*/ diff --git a/src/link/link_jit.c b/src/link/link_jit.c @@ -16,6 +16,7 @@ #include "core/heap.h" #include "core/pool.h" #include "core/util.h" +#include "jit/tlv_thunk.h" #include "link/link.h" #include "link/link_internal.h" #include "obj/obj.h" @@ -70,6 +71,13 @@ struct CfreeJit { * means "not yet built"; view_built distinguishes "tried and gave up" * (multi-input v1, etc.) from "untried". */ CfreeObjFile* view; + /* Mach-O TLV runtime state. Lazily allocated by jit_patch_tlv_descriptors + * when the image contains any in-image TLV descriptor. `tls_vtable` is + * borrowed from CfreeEnv (lives across the env's lifetime); `tls_ctx` + * is owned by us and freed via tls_vtable->ctx_destroy in + * cfree_jit_free. */ + const CfreeJitTls* tls_vtable; + void* tls_ctx; u8 view_built; u8 pad[7]; }; @@ -138,6 +146,115 @@ static uintptr_t vaddr_to_write(const LinkImage* img, return 0; } +/* Walk every TLV descriptor and overwrite its three slots with + * (thunk_addr, ctx, per-thread offset). See src/jit/tlv_thunk.h for + * the descriptor contract. Iteration is reloc-driven: every descriptor + * carries one R_ABS64 against the cached `__tlv_bootstrap` undef extern + * at its base, and one R_ABS64 against the storage symbol at base+16. + * The +16 reloc gives us the storage symbol's image vaddr; subtracting + * img->tls_vaddr yields the per-thread byte offset our thunk adds to the + * caller's TLS block on every access. */ +static void jit_patch_tlv_descriptors(CfreeJit* jit) { + LinkImage* img = jit->image; + Compiler* c = jit->c; + if (c->target.obj != CFREE_OBJ_MACHO) return; + if (img->tls_memsz == 0) return; + + /* Find every LinkSymId whose interned name is __tlv_bootstrap. The + * symbol is emitted as a weak-undef per TU (one ObjBuilder appends + * one undef sym); link_resolve_symbols does NOT fan undef externs + * into img->globals (only definitions go there), so we iterate + * img->syms directly. 
One reloc with target = any of these IDs + * marks a descriptor's +0 slot. + * + * Bitmap over LinkSymId so the inner reloc test is O(1). */ + Sym tlv_name = pool_intern_cstr(c->global, "__tlv_bootstrap"); + u32 nsyms = LinkSyms_count(&img->syms); + Heap* h = (Heap*)c->env->heap; + u8* is_tlv_bootstrap = (u8*)h->alloc(h, nsyms + 1u, 1u); + if (!is_tlv_bootstrap) + compiler_panic(c, no_loc(), "cfree_jit: oom on tlv-bootstrap bitmap"); + memset(is_tlv_bootstrap, 0, nsyms + 1u); + int any_tlv = 0; + for (u32 si = 0; si < nsyms; ++si) { + LinkSymbol* s = LinkSyms_at(&img->syms, si); + if (s && s->name == tlv_name) { + is_tlv_bootstrap[si + 1u] = 1u; + any_tlv = 1; + } + } + if (!any_tlv) { + h->free(h, is_tlv_bootstrap, nsyms + 1u); + return; + } + + const CfreeJitTls* tls = c->env ? c->env->jit_tls : NULL; + if (!tls || !tls->ctx_new || !tls->ctx_destroy) + compiler_panic(c, no_loc(), + "cfree_jit: image needs TLV thunk but env->jit_tls is NULL " + "or incomplete"); + + /* Snapshot the TLS image's init bytes from the write alias. The + * write alias of any SF_TLS section stays readable for the lifetime + * of CfreeJit; ctx_new is expected to copy what it needs. */ + const u8* init_bytes = NULL; + if (img->tls_filesz) { + init_bytes = (const u8*)vaddr_to_write(img, jit->segs, img->tls_vaddr); + if (!init_bytes) + compiler_panic(c, no_loc(), + "cfree_jit: tls_vaddr does not map to any segment"); + } + size_t align = img->tls_align ? 
(size_t)img->tls_align : 1u; + void* ctx = tls->ctx_new(tls->user, init_bytes, (size_t)img->tls_filesz, + (size_t)img->tls_memsz, align); + if (!ctx) + compiler_panic(c, no_loc(), "cfree_jit: jit_tls->ctx_new returned NULL"); + + jit->tls_vtable = tls; + jit->tls_ctx = ctx; + + uintptr_t thunk_addr = (uintptr_t)&cfree_jit_tlv_thunk; + u32 nrel = LinkRelocs_count(&img->relocs); + for (u32 i = 0; i < nrel; ++i) { + const LinkRelocApply* r = LinkRelocs_at(&img->relocs, i); + if (r->target == LINK_SYM_NONE || !is_tlv_bootstrap[r->target]) continue; + if (r->kind != R_ABS64) continue; + + u64 desc_vaddr = r->write_vaddr; + + /* Locate the paired +16 reloc. Quadratic, but reloc counts are + * small (one reloc pair per TLV var) so the inner scan amortizes. */ + const LinkRelocApply* r16 = NULL; + for (u32 j = 0; j < nrel; ++j) { + const LinkRelocApply* q = LinkRelocs_at(&img->relocs, j); + if (q->kind == R_ABS64 && q->write_vaddr == desc_vaddr + 16u) { + r16 = q; + break; + } + } + if (!r16 || r16->target == LINK_SYM_NONE) + compiler_panic(c, no_loc(), + "cfree_jit: TLV descriptor missing data-symbol reloc"); + + const LinkSymbol* data_sym = LinkSyms_at(&img->syms, r16->target - 1); + if (!data_sym || !data_sym->defined) + compiler_panic(c, no_loc(), + "cfree_jit: TLV descriptor data symbol is undefined"); + u64 offset_in_image = + (data_sym->vaddr + (u64)r16->addend) - img->tls_vaddr; + + u8* write = (u8*)vaddr_to_write(img, jit->segs, desc_vaddr); + if (!write) + compiler_panic(c, no_loc(), + "cfree_jit: TLV descriptor vaddr does not map"); + wr_u64_le(write + 0u, (u64)thunk_addr); + wr_u64_le(write + 8u, (u64)(uintptr_t)ctx); + wr_u64_le(write + 16u, offset_in_image); + } + + h->free(h, is_tlv_bootstrap, nsyms + 1u); +} + CfreeJit* cfree_jit_from_image(LinkImage* img) { Compiler* c; Heap* heap; @@ -265,6 +382,26 @@ CfreeJit* cfree_jit_from_image(LinkImage* img) { continue; } } + /* Apple ld's "LDR -> ADD" TLVP relaxation, mandatory in the JIT. 
+ * + * The Mach-O TLV access sequence loads `x0 = *thread_ptrs_slot`, + * where the slot's content is the descriptor's vaddr. The AOT + * Mach-O writer synthesizes a __thread_ptrs section to hold those + * slots; the JIT path does not (and doesn't need to: every TLV + * descriptor is in-image, so the load is one indirection too many). + * + * Rewrite `ldr xd, [xn, #imm12]` -> `add xd, xn, #imm12` so the + * descriptor address ends up directly in x0. Encoding shifts: + * LDR (uimm12, 64-bit): bits [31:22] = 1111100101 (0x3E5) + * ADD (immediate, 64-bit, sh=0): bits [31:22] = 1001000100 (0x244) + * Rn[9:5] / Rd|Rt[4:0] / imm12[21:10] stay in the same positions. */ + if (r->kind == R_AARCH64_TLVP_LOAD_PAGEOFF12) { + u64 v = ((u64)S + (u64)r->addend) & 0xfffu; + u32 instr = rd_u32_le(P_bytes); + instr = 0x91000000u | (instr & 0x3ffu) | ((u32)v << 10); + wr_u32_le(P_bytes, instr); + continue; + } link_reloc_apply(c, r->kind, P_bytes, S, r->addend, P); } @@ -330,6 +467,8 @@ CfreeJit* cfree_jit_from_image(LinkImage* img) { jit->nsegs = img->nsegments; jit->view = NULL; jit->view_built = 0u; + jit->tls_vtable = NULL; + jit->tls_ctx = NULL; /* Take ownership of the image: undefer it from the compiler so a * future panic doesn't reap something we still hold. */ @@ -338,6 +477,12 @@ CfreeJit* cfree_jit_from_image(LinkImage* img) { img->deferred = NULL; } + /* Mach-O TLV descriptor pass: install our thunk into descriptor[+0], + * stash the per-image TLS ctx in [+8], and overwrite [+16] with the + * per-thread byte offset. No-op on ELF (no TLV descriptors) and on + * Mach-O images that have neither TLS nor TLV access sites. */ + jit_patch_tlv_descriptors(jit); + /* Run .init_array constructors in forward order. 
*/ { typedef void (*VoidFn)(void); @@ -367,6 +512,16 @@ void cfree_jit_free(CfreeJit* jit) { cfree_obj_close(jit->view); jit->view = NULL; } + /* TLV ctx: pthread_key_delete inside ctx_destroy triggers POSIX + * per-thread destructors for blocks any live thread might still hold, + * so this needs to run before we release segments referenced by those + * destructors. (Currently the destructor only frees the block, but + * we keep the ordering invariant either way.) */ + if (jit->tls_vtable && jit->tls_ctx) { + jit->tls_vtable->ctx_destroy(jit->tls_vtable->user, jit->tls_ctx); + jit->tls_ctx = NULL; + jit->tls_vtable = NULL; + } /* segs[] are views into master — release master only. */ if (mem && mem->release && jit->master.size) { mem->release(mem->user, &jit->master); diff --git a/src/link/link_resolve.c b/src/link/link_resolve.c @@ -229,6 +229,23 @@ void link_resolve_undefs(Linker* l, LinkImage* img) { s->defined = 1; continue; } + /* JIT lane: Mach-O inputs (including clang-produced .o files) + * carry a non-weak undef `__tlv_bootstrap` on every TLV var. + * cfree_jit_from_image rewrites every descriptor's slot[0] to our + * thunk, so the resolved value never gets read — but we still need + * resolve_undefs to not panic. Treat the symbol as weak-undef + * (vaddr = 0, SK_ABS) in JIT mode only; AOT lanes keep the strict + * "undefined external" semantics. */ + if (l->jit_mode && s->name != 0) { + size_t nlen; + const char* nm = pool_str(l->c->global, s->name, &nlen); + if (nm && nlen == 15u && memcmp(nm, "__tlv_bootstrap", 15u) == 0) { + s->kind = SK_ABS; + s->vaddr = 0; + s->defined = 1; + continue; + } + } { size_t namelen; const char* nm = s->name ? 
pool_str(l->c->global, s->name, &namelen) diff --git a/test/lib_deps.allowlist b/test/lib_deps.allowlist @@ -2,9 +2,12 @@ ___memcpy_chk ___memset_chk ___stack_chk_fail ___stack_chk_guard +_bzero _longjmp _memcmp _memcpy _memset _setjmp +_strcmp _strlen +_strstr diff --git a/test/link/cases/36_tls_basic/j_targets b/test/link/cases/36_tls_basic/j_targets @@ -1,3 +0,0 @@ -aa64-elf -rv64-elf -x64-elf diff --git a/test/link/harness/jit_runner.c b/test/link/harness/jit_runner.c @@ -249,6 +249,89 @@ static void* extern_resolver(void* user, const char* name) { return &g_extern_default_value; } +/* ---- jit_tls (pthread-key backed) ---- + * Mirrors driver/env.c's CfreeJitTls implementation so 36_tls_basic/J on + * aa64-macho can resolve TLV accesses without depending on the driver + * binary. The ctx layout is fixed by src/jit/tlv_thunk.h: first 8 bytes + * MUST be a function pointer `void* (*)(void* ctx)` that returns the + * per-thread block. */ +#include <pthread.h> +typedef struct JitTlsCtx { + void* (*get_block)(void* ctx); + pthread_key_t key; + size_t image_size; + size_t image_filesz; + size_t align; + void* init_bytes; +} JitTlsCtx; +static void jit_tls_thread_dtor(void* block) { free(block); } +static void* jit_tls_alloc_block(JitTlsCtx* ctx) { + /* macOS aligned_alloc requires alignment >= sizeof(void*) (8); bump + * smaller request alignments up. Round size to a multiple of align + * (aligned_alloc requires this). */ + size_t a = ctx->align ? 
ctx->align : sizeof(void*); + if (a < sizeof(void*)) a = sizeof(void*); + size_t sz = (ctx->image_size + a - 1u) & ~(a - 1u); + if (sz == 0) sz = a; + void* block = aligned_alloc(a, sz); + if (!block) return NULL; + if (ctx->image_filesz && ctx->init_bytes) + memcpy(block, ctx->init_bytes, ctx->image_filesz); + if (ctx->image_size > ctx->image_filesz) + memset((char*)block + ctx->image_filesz, 0, + ctx->image_size - ctx->image_filesz); + return block; +} +static void* jit_tls_get_block(void* ctx_v) { + JitTlsCtx* ctx = (JitTlsCtx*)ctx_v; + void* block = pthread_getspecific(ctx->key); + if (block) return block; + block = jit_tls_alloc_block(ctx); + if (!block) abort(); + if (pthread_setspecific(ctx->key, block) != 0) abort(); + return block; +} +static void* jit_tls_ctx_new(void* user, const void* init_bytes, + size_t image_filesz, size_t image_size, + size_t align) { + (void)user; + JitTlsCtx* ctx = (JitTlsCtx*)malloc(sizeof(*ctx)); + if (!ctx) return NULL; + ctx->get_block = jit_tls_get_block; + ctx->image_size = image_size; + ctx->image_filesz = image_filesz; + ctx->align = align ? 
align : sizeof(void*); + ctx->init_bytes = NULL; + if (image_filesz && init_bytes) { + ctx->init_bytes = malloc(image_filesz); + if (!ctx->init_bytes) { + free(ctx); + return NULL; + } + memcpy(ctx->init_bytes, init_bytes, image_filesz); + } + if (pthread_key_create(&ctx->key, jit_tls_thread_dtor) != 0) { + free(ctx->init_bytes); + free(ctx); + return NULL; + } + return ctx; +} +static void jit_tls_ctx_destroy(void* user, void* ctx_v) { + JitTlsCtx* ctx = (JitTlsCtx*)ctx_v; + (void)user; + if (!ctx) return; + void* my = pthread_getspecific(ctx->key); + if (my) { + pthread_setspecific(ctx->key, NULL); + free(my); + } + pthread_key_delete(ctx->key); + free(ctx->init_bytes); + free(ctx); +} +static CfreeJitTls g_jit_tls = {jit_tls_ctx_new, jit_tls_ctx_destroy, NULL}; + int main(int argc, char** argv) { { long ps = sysconf(_SC_PAGESIZE); @@ -324,6 +407,7 @@ int main(int argc, char** argv) { env.heap = &g_heap; env.diag = &g_diag; env.execmem = &g_execmem; + env.jit_tls = &g_jit_tls; env.now = -1; CfreeCompiler* c = cfree_compiler_new(target, &env);