commit bba729146433c34115f295fd31bbab9a5e816e5c
parent 923c2ef63261ed6e24eee8c65c9ff7f5153345e5
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Mon, 11 May 2026 17:44:59 -0700
jit/macho: cfree-owned TLV thunk so _Thread_local works under cfree run
The Mach-O JIT lane previously segfaulted on the first TLV access:
the access sequence calls through descriptor[+0], which dyld would
normally rewrite to a per-key thunk, but JIT-mmap'd memory is not in
dyld's TLV-walk so descriptor[+0] held an unbound _tlv_bootstrap.
Install a cfree-owned thunk at link-jit time instead. The thunk
(src/jit/tlv_thunk_aarch64.S) preserves all caller-saved GPR/SIMD
registers and calls a host get_block function stashed in the
descriptor's ctx pointer; the host vtable (CfreeJitTls) is provided
by the driver and the test harness via pthread_key + aligned_alloc.
Three smaller fixes ride along:
- Apply Apple ld's TLVP LDR-to-ADD relaxation in link_jit so the
descriptor address ends up directly in x0 (there is no
__thread_ptrs slot in the JIT image to indirect through).
- link_set_jit_mode lets link_resolve_undefs tolerate the non-weak
__tlv_bootstrap undef that clang-produced inputs carry.
- cfree_jit_from_image patches descriptor[+0/+8/+16] in a reloc-driven
pass after applying the normal relocs; cfree_jit_free releases the
per-image TLS ctx.
Tests: make test-elf 37/37, make test-link 122/122,
make test-link CFREE_TEST_OBJ=macho 103/103 (was 102/102).
Diffstat:
16 files changed, 626 insertions(+), 79 deletions(-)
diff --git a/doc/MACHO.md b/doc/MACHO.md
@@ -1,74 +0,0 @@
-# MACHO — Mach-O open issues
-
-Running ledger of Mach-O-specific gaps still open against the
-`aa64-macho` lane. Resolved items have been pruned from this doc;
-`git log` is the historical record.
-
-State (2026-05-11):
-
- make test-elf # 37/37
- make test-link # 122/122 (ELF baseline)
- make test-link CFREE_TEST_OBJ=macho # 102/102 (Path E + J)
-
-What still doesn't work:
-
-1. **`cfree run` on macOS with `_Thread_local`** — segfaults at the
- first TLV access. §1 below.
-2. **`36_tls_basic/J` on `aa64-macho`** — same lane, kept on the
- `j_targets` excludelist. §1.
-3. **`33_ifunc_in_init/E` on `aa64-macho`** — IFUNC has no Mach-O
- representation; permanent `e_targets` exclusion, not a fix target.
-
-ELF lanes are the regression guardrail; every Mach-O change must keep
-`make test-elf` and `make test-link` (ELF) green.
-
----
-
-## 1. TLV in the JIT lane
-
-`cfree cc -c` → `cfree_link_exe` (or Apple `ld`) → dyld is green for
-`_Thread_local` end-to-end. `cfree run` is not: the JIT linker
-(`src/link/link_jit.c`) handles TLSLE but not TLVP, and there is no
-runtime that fills the role dyld plays for a normal Mach-O image
-(allocate a pthread_key, install a per-descriptor thunk into
-descriptor[0], record the key in descriptor[1]).
-
-Concretely, when codegen emits the Mach-O TLV access sequence
-
- adrp x0, sym@TLVPPAGE
- ldr x0, [x0, sym@TLVPPAGEOFF]
- ldr x1, [x0] ; descriptor[0]
- blr x1 ; thunk(x0=desc) → x0 = TLV addr
-
-the JIT applies the TLVP relocs trivially (the descriptor exists in
-the JIT image), but descriptor[0] is whatever `R_ABS64` against
-`__tlv_bootstrap` resolved to. Two viable directions:
-
-- **dlsym `_tlv_bootstrap`.** `driver_dlsym_resolver` can already
- find it on a macOS host. Open question: dyld's pthread_key /
- descriptor-rewrite pass only walks `S_THREAD_LOCAL_VARIABLES`
- sections it owns; JIT-mmap'd memory isn't in that walk, so
- `_tlv_bootstrap` would run unparameterized. Likely insufficient on
- its own.
-
-- **cfree-owned thunk.** Allocate one `pthread_key_t` per JIT image,
- write our thunk's address into every descriptor[0], and have the
- thunk consult descriptor[1]/[2] to find (or lazy-init) per-thread
- storage. No libSystem dependency; cleaner for the freestanding
- goal. This is the recommended fix.
-
-Either way the JIT linker needs a new pass parallel to its
-`reloc_is_tlsle` branch that materializes per-descriptor runtime
-state before the entry call.
-
----
-
-## 2. Cosmetic divergences worth flagging
-
-These do not block tests but are documented so a future "why doesn't
-our output byte-match clang's?" doesn't re-debug them:
-
-- TLVP_LOAD_PAGEOFF12 LDR relaxation. Apple `ld` relaxes the LDR to
- an ADD when the descriptor is in-image and drops the indirect
- `__thread_ptrs` slot entirely. Our linker keeps the LDR + slot.
- dyld accepts both shapes.
diff --git a/driver/driver.h b/driver/driver.h
@@ -73,8 +73,9 @@ typedef struct DriverEnv {
CfreeDiagSink* diag;
CfreeFileIO file_io;
const CfreeExecMem* execmem;
- const CfreeDbgOs* dbg_os; /* NULL unless `cfree dbg` paths run */
- int64_t now; /* unix seconds; -1 = unknown */
+ const CfreeDbgOs* dbg_os; /* NULL unless `cfree dbg` paths run */
+ const CfreeJitTls* jit_tls; /* NULL unless `cfree run` w/ TLV paths run */
+ int64_t now; /* unix seconds; -1 = unknown */
} DriverEnv;
void driver_env_init(DriverEnv*);
diff --git a/driver/env.c b/driver/env.c
@@ -801,6 +801,120 @@ static int dbg_guarded_copy(void* user, void* dst, const void* src, size_t n) {
static CfreeDbgOs g_dbg_os_posix;
+/* ---------------- jit_tls (pthread-key backed) ---------------- */
+/* Backs CfreeJitTls for `cfree run` on Mach-O targets: every JIT image
+ * with TLS gets one pthread_key, the per-thread block is allocated
+ * lazily on first access, and freed via the key's destructor when the
+ * thread exits (only while the key still exists: POSIX stops calling a
+ * key's destructor once pthread_key_delete has run on it).
+ *
+ * The ctx layout is fixed by the contract in src/jit/tlv_thunk.h: the
+ * first 8 bytes MUST be a function pointer the asm thunk calls with
+ * x0 = ctx and expects back an x0 = TLS block. We satisfy this by
+ * making `get_block` the first field. */
+typedef struct JitTlsCtx {
+ void* (*get_block)(void* ctx); /* first; matches tlv_thunk's expectation */
+ pthread_key_t key; /* created in jit_tls_ctx_new; per-thread value is that thread's block */
+ size_t image_size; /* total bytes of one per-thread TLS block */
+ size_t image_filesz; /* leading bytes seeded from init_bytes; the rest is zero-filled */
+ size_t align; /* requested block alignment; 0 is stored as sizeof(void*) */
+ void* init_bytes; /* heap-owned copy of init bytes, or NULL if all BSS */
+} JitTlsCtx;
+
+static void jit_tls_thread_dtor(void* block) {
+ /* POSIX pthread_key destructor (registered by pthread_key_create in
+ * jit_tls_ctx_new): called when a thread that touched the TLV exits.
+ * `block` is the void* set by pthread_setspecific, never NULL (POSIX
+ * skips the destructor if it was NULL). The block came from
+ * aligned_alloc in jit_tls_alloc_block, so free() is the matching
+ * release. */
+ free(block);
+}
+
+static void* jit_tls_alloc_block(JitTlsCtx* ctx) {
+  /* Allocate and seed one thread's TLS block for this image.  Returns
+   * NULL on failure; the caller treats that as fatal.
+   *
+   * macOS aligned_alloc requires alignment >= sizeof(void*); bump
+   * smaller request alignments up.  Size must be a multiple of
+   * alignment too.  The round-up mask below is only correct for a
+   * power-of-two alignment, so anything else is rejected up front. */
+  size_t a = ctx->align ? ctx->align : sizeof(void*);
+  if (a < sizeof(void*)) a = sizeof(void*);
+  if ((a & (a - 1u)) != 0) return NULL;
+  if (ctx->image_size > (size_t)-1 - (a - 1u)) return NULL; /* round-up would wrap */
+  size_t sz = (ctx->image_size + a - 1u) & ~(a - 1u);
+  if (sz == 0) sz = a; /* zero-size TLS image still needs a non-NULL block */
+  void* block = aligned_alloc(a, sz);
+  if (!block) return NULL;
+  /* Zero the whole allocation first: that covers the BSS tail, the
+   * alignment padding, and the initialized range too when the ctx has
+   * no init bytes to copy (init_bytes == NULL with image_filesz != 0). */
+  memset(block, 0, sz);
+  /* Never copy past the image, even if image_filesz > image_size. */
+  size_t ninit = ctx->image_filesz < ctx->image_size ? ctx->image_filesz
+                                                     : ctx->image_size;
+  if (ninit && ctx->init_bytes) memcpy(block, ctx->init_bytes, ninit);
+  return block;
+}
+
+/* Thunk-facing entry point.  The asm trampoline passes x0 = ctx and
+ * takes the pointer returned in x0 as the calling thread's TLS block
+ * base. */
+static void* jit_tls_get_block(void* ctx_v) {
+  JitTlsCtx* self = (JitTlsCtx*)ctx_v;
+  void* mine = pthread_getspecific(self->key);
+  if (mine == NULL) {
+    /* First access from this thread: build the block, then publish it
+     * under the image's key so later accesses take the fast path. */
+    mine = jit_tls_alloc_block(self);
+    if (mine == NULL) {
+      /* The thunk's caller is mid-expression and cannot observe a
+       * failure, so there is nothing to return to: abort, matching how
+       * dyld treats failures inside _tlv_bootstrap. */
+      fprintf(stderr,
+              "cfree run: out of memory allocating per-thread TLS block\n");
+      abort();
+    }
+    if (pthread_setspecific(self->key, mine) != 0) {
+      fprintf(stderr, "cfree run: pthread_setspecific failed in TLV thunk\n");
+      abort();
+    }
+  }
+  return mine;
+}
+
+/* CfreeJitTls.ctx_new: build the per-image ctx.  Copies the first
+ * `image_filesz` bytes of `init_bytes` (when both are non-zero) so the
+ * ctx does not depend on the caller's buffer, records the sizes and
+ * alignment (0 becomes sizeof(void*)), and creates the pthread key whose
+ * destructor frees a thread's block.  Returns NULL if any allocation or
+ * the key creation fails; nothing is leaked on that path. */
+static void* jit_tls_ctx_new(void* user, const void* init_bytes,
+                             size_t image_filesz, size_t image_size,
+                             size_t align) {
+  JitTlsCtx* fresh;
+  void* seed = NULL;
+  (void)user;
+
+  fresh = (JitTlsCtx*)malloc(sizeof(*fresh));
+  if (fresh == NULL) return NULL;
+
+  if (init_bytes != NULL && image_filesz != 0) {
+    seed = malloc(image_filesz);
+    if (seed == NULL) goto fail;
+    memcpy(seed, init_bytes, image_filesz);
+  }
+
+  fresh->get_block = jit_tls_get_block;
+  fresh->image_size = image_size;
+  fresh->image_filesz = image_filesz;
+  fresh->align = (align != 0) ? align : sizeof(void*);
+  fresh->init_bytes = seed;
+
+  if (pthread_key_create(&fresh->key, jit_tls_thread_dtor) != 0) goto fail;
+  return fresh;
+
+fail:
+  free(seed); /* free(NULL) is a no-op */
+  free(fresh);
+  return NULL;
+}
+
+static void jit_tls_ctx_destroy(void* user, void* ctx_v) {
+ JitTlsCtx* ctx = (JitTlsCtx*)ctx_v;
+ (void)user;
+ if (!ctx) return;
+ /* Free the calling thread's block by hand: pthread_key_delete does
+ * not run destructors, and POSIX no longer calls a key's destructor
+ * at thread exit once the key has been deleted.
+ *
+ * NOTE(review): for the same reason, blocks that OTHER threads
+ * allocated through this key are not reaped by anything after the
+ * delete below; they leak unless those threads exited beforehand.
+ * Reclaiming them would need a per-ctx list of live blocks; confirm
+ * whether multi-threaded JIT images are expected before adding one. */
+ void* my_block = pthread_getspecific(ctx->key);
+ if (my_block) {
+ pthread_setspecific(ctx->key, NULL);
+ free(my_block);
+ }
+ pthread_key_delete(ctx->key);
+ free(ctx->init_bytes);
+ free(ctx);
+}
+
+static CfreeJitTls g_jit_tls_posix;
+
/* ---------------- writer (fd-backed) ---------------- */
typedef struct DriverFdWriter {
@@ -999,6 +1113,11 @@ void driver_env_init(DriverEnv* e) {
g_dbg_os_posix.user = NULL;
e->dbg_os = &g_dbg_os_posix;
+ g_jit_tls_posix.ctx_new = jit_tls_ctx_new;
+ g_jit_tls_posix.ctx_destroy = jit_tls_ctx_destroy;
+ g_jit_tls_posix.user = NULL;
+ e->jit_tls = &g_jit_tls_posix;
+
/* Reproducible-build precedent: SOURCE_DATE_EPOCH wins over wall clock.
* If neither is set or the env value doesn't parse, advertise -1 ("no
* clock") and pp falls back to C11 placeholders. */
@@ -1027,6 +1146,7 @@ CfreeEnv driver_env_to_cfree(const DriverEnv* e) {
ce.diag = e->diag;
ce.execmem = e->execmem;
ce.dbg_os = e->dbg_os;
+ ce.jit_tls = e->jit_tls;
ce.now = e->now;
return ce;
}
diff --git a/include/cfree.h b/include/cfree.h
@@ -355,12 +355,46 @@ typedef struct CfreeDbgOs {
void* user;
} CfreeDbgOs;
+/* Host vtable for the JIT TLV thunk on Mach-O targets.
+ *
+ * `cfree run` on macOS-aarch64 needs to service Mach-O thread-local
+ * descriptor calls — there's no dyld in the JIT image to allocate the
+ * pthread key and rewrite descriptor[0] to a per-image thunk. libcfree
+ * provides the asm thunk (caller-save-preserving) but cannot itself
+ * include <pthread.h>, so the per-thread plumbing is supplied through
+ * this vtable. NULL is fine on hosts that never JIT TLV code.
+ *
+ * ctx_new — called once per JIT image at link time. Receives the
+ *           TLS image: `image_size` bytes total, `image_filesz`
+ *           of which are initialized from `init_bytes`, aligned
+ *           to `align`. Returns an opaque ctx pointer that the
+ *           thunk reads from descriptor[+8].
+ *
+ *           The returned ctx MUST satisfy a binary contract: the
+ *           first 8 bytes contain a function pointer of type
+ *           `void* (*)(void* ctx)` that returns the per-thread
+ *           TLS block (allocating + seeding on first call from
+ *           each thread). This is what the thunk calls; placing
+ *           it inside the ctx lets the thunk avoid loading
+ *           process-global state.
+ *
+ * ctx_destroy — called from cfree_jit_free. Implementations should
+ *           release whatever per-thread blocks they can still
+ *           reach, delete the pthread_key, and release the ctx
+ *           storage. Note that pthread_key_delete does NOT run
+ *           per-thread destructors, and POSIX never calls them
+ *           for that key afterwards. */
+typedef struct CfreeJitTls {
+ void* (*ctx_new)(void* user, const void* init_bytes, size_t image_filesz,
+ size_t image_size, size_t align);
+ void (*ctx_destroy)(void* user, void* ctx);
+ void* user; /* passed back as the first argument of both callbacks */
+} CfreeJitTls;
+
typedef struct CfreeEnv {
CfreeHeap* heap;
const CfreeFileIO* file_io; /* may be NULL for purely in-memory pipelines */
CfreeDiagSink* diag;
const CfreeExecMem* execmem; /* NULL ok unless JIT/emu paths run */
const CfreeDbgOs* dbg_os; /* NULL ok unless `cfree dbg` paths run */
+ const CfreeJitTls* jit_tls; /* NULL ok unless JIT TLV paths run */
/* Unix seconds since 1970-01-01 UTC, or negative for "no clock". Used
* by the preprocessor for __DATE__ / __TIME__ (negative → C11 §6.10.8.1
* placeholders). The host decides the policy (SOURCE_DATE_EPOCH,
diff --git a/src/api/pipeline.c b/src/api/pipeline.c
@@ -465,6 +465,7 @@ int cfree_link_jit(CfreeCompiler* c, const CfreeLinkOptions* opts,
}
linker = build_linker(c, &opts->inputs);
link_set_gc_sections(linker, opts->gc_sections);
+ link_set_jit_mode(linker, 1);
image = link_resolve(linker); /* deferred-cleanup-registered */
*out_jit = cfree_jit_from_image(image); /* undefers + transfers ownership */
link_free(linker);
diff --git a/src/jit/tlv_thunk.h b/src/jit/tlv_thunk.h
@@ -0,0 +1,50 @@
+#ifndef CFREE_JIT_TLV_THUNK_H
+#define CFREE_JIT_TLV_THUNK_H
+
+/* The JIT-time TLV thunk for Mach-O thread-local access.
+ *
+ * cfree's codegen emits the Apple TLV access sequence whenever a Mach-O
+ * target dereferences a `_Thread_local`:
+ *
+ * adrp x0, sym@TLVPPAGE
+ * ldr x0, [x0, sym@TLVPPAGEOFF] ; x0 = descriptor
+ * ldr x1, [x0] ; x1 = descriptor[+0] (thunk*)
+ * blr x1 ; thunk(x0=desc) -> x0=TLV addr
+ *
+ * The thunk's ABI is custom: x0 in/out as the descriptor / per-thread
+ * TLV address, every other GPR and SIMD register preserved. In an AOT
+ * Mach-O image dyld rewrites descriptor[+0] to a libdyld-supplied thunk
+ * after allocating a pthread key per descriptor; the JIT image is never
+ * walked by dyld, so we install our own thunk and patch every
+ * descriptor's slot[0]/[1]/[2] from `cfree_jit_from_image`.
+ *
+ * Per-image descriptor convention (post-patch):
+ * [+0] : &cfree_jit_tlv_thunk (entry below)
+ * [+8] : opaque CfreeJitTls ctx pointer (per JIT image)
+ * [+16] : byte offset within the per-thread TLS block (image-relative)
+ *
+ * Contract on the ctx pointer (set by CfreeJitTls.ctx_new): its first 8
+ * bytes are a function pointer `void* (*get_block)(void* ctx)` that
+ * returns the calling thread's TLS block (lazy-allocating + seeding
+ * from the image's init bytes on first per-thread call).
+ *
+ * The thunk does roughly:
+ *
+ * void* thunk(void* desc) {
+ * void* ctx = *(void**)((u8*)desc + 8);
+ * void* (*get_block)(void*) = *(void**)ctx;
+ * void* base = get_block(ctx);
+ * return (u8*)base + *(u64*)((u8*)desc + 16);
+ * }
+ *
+ * Calling `get_block` from a context that must preserve x1..x17 and
+ * v0..v7 / v16..v31 (x18 is left untouched; v8..v15 are left to the
+ * callee) is the reason the thunk is implemented in asm — a normal C call
+ * would clobber caller-saved regs the JITed access sequence has no idea about. */
+
+/* Declared as a function for &-of, but its calling convention is the
+ * custom one described above: callers must come through the access
+ * sequence, not a plain C call. */
+void cfree_jit_tlv_thunk(void);
+
+#endif
diff --git a/src/jit/tlv_thunk_aarch64.S b/src/jit/tlv_thunk_aarch64.S
@@ -0,0 +1,117 @@
+/* The Mach-O TLV thunk for the JIT path.
+ *
+ * Called via:
+ * ldr x1, [x0] ; x0 = descriptor, x1 = thunk addr
+ * blr x1
+ * with the contract "x0 in/out as descriptor -> TLV addr, every other
+ * GPR/SIMD register preserved". No C frame at entry: the access
+ * sequence is mid-expression in JITed code, so we must save and restore
+ * everything caller-saved before/after calling out to the host
+ * `get_block` helper.
+ *
+ * See src/jit/tlv_thunk.h for the descriptor layout and ctx contract. */
+
+#if defined(__aarch64__)
+
+#if defined(__APPLE__)
+#define CFREE_TLV_THUNK_SYM _cfree_jit_tlv_thunk
+#else
+#define CFREE_TLV_THUNK_SYM cfree_jit_tlv_thunk
+#endif
+
+ .text
+ .p2align 2
+ .globl CFREE_TLV_THUNK_SYM
+CFREE_TLV_THUNK_SYM:
+ /* Frame layout (544 bytes, 16-byte aligned):
+ * sp+ 0 .. sp+127 : x1-x16 (eight stp pairs)
+ * sp+128 : x17
+ * sp+136 : scratch slot for descriptor pointer
+ * sp+144 : x29 (FP)
+ * sp+152 : x30 (LR)
+ * sp+160 .. sp+543 : v0-v7, v16-v31 (24 q regs)
+ *
+ * x18 is platform-reserved on Apple aarch64 (don't touch). v8-v15
+ * are callee-saved by ABI so the host's get_block won't perturb
+ * them; we skip them.
+ *
+ * NOTE(review): AAPCS64 only makes the low 64 bits of v8-v15
+ * callee-saved, and the condition flags (NZCV) are not saved here
+ * either. Confirm codegen never keeps a full 128-bit q8-q15 value
+ * or live flags across a TLV access. */
+ sub sp, sp, #544
+
+ stp x1, x2, [sp, # 0]
+ stp x3, x4, [sp, # 16]
+ stp x5, x6, [sp, # 32]
+ stp x7, x8, [sp, # 48]
+ stp x9, x10, [sp, # 64]
+ stp x11, x12, [sp, # 80]
+ stp x13, x14, [sp, # 96]
+ stp x15, x16, [sp, #112]
+ str x17, [sp, #128]
+
+ stp x29, x30, [sp, #144]
+ add x29, sp, #144
+
+ stp q0, q1, [sp, #160]
+ stp q2, q3, [sp, #192]
+ stp q4, q5, [sp, #224]
+ stp q6, q7, [sp, #256]
+ stp q16, q17, [sp, #288]
+ stp q18, q19, [sp, #320]
+ stp q20, q21, [sp, #352]
+ stp q22, q23, [sp, #384]
+ stp q24, q25, [sp, #416]
+ stp q26, q27, [sp, #448]
+ stp q28, q29, [sp, #480]
+ stp q30, q31, [sp, #512]
+
+ /* Stash desc; we'll need [desc + 16] (the byte offset) after the
+ * call, but the call clobbers x0. */
+ str x0, [sp, #136]
+
+ /* ctx = *(desc + 8); get_block = *ctx. */
+ ldr x1, [x0, #8]
+ ldr x16, [x1]
+
+ /* x0 = ctx, call get_block(ctx) -> x0 = block base. */
+ mov x0, x1
+ blr x16
+
+ /* x0 = base; load offset and combine. */
+ ldr x1, [sp, #136]
+ ldr x1, [x1, #16]
+ add x0, x0, x1
+
+ /* Restore SIMD. */
+ ldp q0, q1, [sp, #160]
+ ldp q2, q3, [sp, #192]
+ ldp q4, q5, [sp, #224]
+ ldp q6, q7, [sp, #256]
+ ldp q16, q17, [sp, #288]
+ ldp q18, q19, [sp, #320]
+ ldp q20, q21, [sp, #352]
+ ldp q22, q23, [sp, #384]
+ ldp q24, q25, [sp, #416]
+ ldp q26, q27, [sp, #448]
+ ldp q28, q29, [sp, #480]
+ ldp q30, q31, [sp, #512]
+
+ /* Restore FP/LR and GPRs (last so the temps we used above don't
+ * leak out). */
+ ldp x29, x30, [sp, #144]
+
+ ldp x1, x2, [sp, # 0]
+ ldp x3, x4, [sp, # 16]
+ ldp x5, x6, [sp, # 32]
+ ldp x7, x8, [sp, # 48]
+ ldp x9, x10, [sp, # 64]
+ ldp x11, x12, [sp, # 80]
+ ldp x13, x14, [sp, # 96]
+ ldp x15, x16, [sp, #112]
+ ldr x17, [sp, #128]
+
+ add sp, sp, #544
+ ret
+
+#if defined(__linux__) && defined(__ELF__)
+ .section .note.GNU-stack,"",%progbits
+#endif
+
+#endif /* __aarch64__ */
diff --git a/src/jit/tlv_thunk_stub.c b/src/jit/tlv_thunk_stub.c
@@ -0,0 +1,24 @@
+/* Stub `cfree_jit_tlv_thunk` for non-aarch64 hosts. The asm version
+ * lives in tlv_thunk_aarch64.S; on hosts that can't run the JIT image's
+ * Mach-O code anyway (the JIT runs in-process, so target arch must
+ * equal host arch), the symbol exists only so taking its address in
+ * src/link/link_jit.c links cleanly. */
+
+#include "jit/tlv_thunk.h"
+
+#if !defined(__aarch64__)
+
+void cfree_jit_tlv_thunk(void) {
+ /* Reachable only if a non-aarch64 host somehow attempted to JIT an
+ * aarch64 Mach-O image with TLVs — should have been rejected long
+ * before any access reaches the thunk. Trap rather than return: the
+ * access sequence expects a TLV address back and has no error path. */
+ __builtin_trap();
+}
+
+#else
+
+/* Real implementation lives in tlv_thunk_aarch64.S; keep the TU
+ * non-empty for -Wempty-translation-unit. */
+extern void cfree_jit_tlv_thunk(void);
+
+#endif
diff --git a/src/link/link.c b/src/link/link.c
@@ -336,6 +336,11 @@ void link_set_emit_static_exe(Linker* l, int enable) {
l->emit_static_exe = enable ? 1 : 0;
}
+/* Switch the linker's in-process JIT lane flag on or off.  Any non-zero
+ * `enable` is stored as 1; a NULL linker is ignored. */
+void link_set_jit_mode(Linker* l, int enable) {
+  if (l) l->jit_mode = (enable != 0);
+}
+
void link_set_pie(Linker* l, int enable) {
if (!l) return;
l->emit_pie = enable ? 1 : 0;
diff --git a/src/link/link.h b/src/link/link.h
@@ -167,6 +167,11 @@ void link_set_gc_sections(Linker*, int enable);
* .igot.plt slots get filled before user code runs. The JIT pre-
* resolves slots in-process and doesn't need the ctor. */
void link_set_emit_static_exe(Linker*, int enable);
+/* Mark this link as the in-process JIT lane (set by cfree_link_jit).
+ * Lets link_resolve tolerate platform undefs the JIT image patches
+ * post-link (currently: Mach-O `__tlv_bootstrap`). Leaves AOT lanes
+ * untouched. */
+void link_set_jit_mode(Linker*, int enable);
/* Mark this link as producing a position-independent ET_DYN exe (-pie).
* Triggers Phase 4 layout_dyn pass (synthetic .interp/.dynsym/.dynstr/
diff --git a/src/link/link_internal.h b/src/link/link_internal.h
@@ -97,6 +97,14 @@ struct Linker {
* JIT path leaves this 0 — slots are pre-resolved in-process by
* link_jit.c, no ctor needed. */
int emit_static_exe;
+ /* In-process JIT lane (set by cfree_link_jit). Currently used to
+ * tolerate undef `__tlv_bootstrap` on Mach-O inputs — the JIT image
+ * has no dyld, descriptor[+0] is rewritten to cfree's thunk during
+ * cfree_jit_from_image, so the symbol's resolved value doesn't
+ * matter. Without this, clang-produced .o files (which emit
+ * `__tlv_bootstrap` as a plain non-weak undef) would panic at
+ * link_resolve_undefs. */
+ int jit_mode;
/* PIE / ET_DYN output. Set by cfree_link_exe when opts->pie or any
* DSO input is present. Triggers layout_dyn (Phase 4) and the
* dynamic ELF emit path (Phase 6). */
diff --git a/src/link/link_jit.c b/src/link/link_jit.c
@@ -16,6 +16,7 @@
#include "core/heap.h"
#include "core/pool.h"
#include "core/util.h"
+#include "jit/tlv_thunk.h"
#include "link/link.h"
#include "link/link_internal.h"
#include "obj/obj.h"
@@ -70,6 +71,13 @@ struct CfreeJit {
* means "not yet built"; view_built distinguishes "tried and gave up"
* (multi-input v1, etc.) from "untried". */
CfreeObjFile* view;
+ /* Mach-O TLV runtime state. Lazily allocated by jit_patch_tlv_descriptors
+ * when the image contains any in-image TLV descriptor. `tls_vtable` is
+ * borrowed from CfreeEnv (lives across the env's lifetime); `tls_ctx`
+ * is owned by us and freed via tls_vtable->ctx_destroy in
+ * cfree_jit_free. */
+ const CfreeJitTls* tls_vtable;
+ void* tls_ctx;
u8 view_built;
u8 pad[7];
};
@@ -138,6 +146,115 @@ static uintptr_t vaddr_to_write(const LinkImage* img,
return 0;
}
+/* Walk every TLV descriptor and overwrite its three slots with
+ * (thunk_addr, ctx, per-thread offset).  See src/jit/tlv_thunk.h for
+ * the descriptor contract.  Iteration is reloc-driven: every descriptor
+ * carries one R_ABS64 against the `__tlv_bootstrap` undef extern at its
+ * base, and one R_ABS64 against the storage symbol at base+16.  The +16
+ * reloc gives us the storage symbol's image vaddr; subtracting
+ * img->tls_vaddr yields the per-thread byte offset our thunk adds to the
+ * caller's TLS block on every access.
+ *
+ * Returns without touching anything unless the target is Mach-O, the
+ * image has a non-empty TLS range, and some symbol is named
+ * `__tlv_bootstrap`.  Panics via compiler_panic on OOM, on a missing or
+ * incomplete env->jit_tls vtable, when ctx_new fails, or when a
+ * descriptor is malformed.  On success the vtable and ctx are stored in
+ * jit->tls_vtable / jit->tls_ctx. */
+static void jit_patch_tlv_descriptors(CfreeJit* jit) {
+  LinkImage* img = jit->image;
+  Compiler* c = jit->c;
+  if (c->target.obj != CFREE_OBJ_MACHO) return;
+  if (img->tls_memsz == 0) return;
+
+  /* Find every LinkSymId whose interned name is __tlv_bootstrap.  Each
+   * TU brings its own undef for it (one ObjBuilder appends one undef
+   * sym; clang-produced objects carry it as a plain non-weak undef that
+   * link_resolve_undefs tolerates in JIT mode).  link_resolve_symbols
+   * does NOT fan undef externs into img->globals (only definitions go
+   * there), so we iterate img->syms directly.  One reloc with target =
+   * any of these IDs marks a descriptor's +0 slot.
+   *
+   * Byte map indexed by LinkSymId (= syms index + 1) so the inner reloc
+   * test is O(1). */
+  Sym tlv_name = pool_intern_cstr(c->global, "__tlv_bootstrap");
+  u32 nsyms = LinkSyms_count(&img->syms);
+  Heap* h = (Heap*)c->env->heap;
+  u8* is_tlv_bootstrap = (u8*)h->alloc(h, nsyms + 1u, 1u);
+  if (!is_tlv_bootstrap)
+    compiler_panic(c, no_loc(), "cfree_jit: oom on tlv-bootstrap bitmap");
+  memset(is_tlv_bootstrap, 0, nsyms + 1u);
+  int any_tlv = 0;
+  for (u32 si = 0; si < nsyms; ++si) {
+    LinkSymbol* s = LinkSyms_at(&img->syms, si);
+    if (s && s->name == tlv_name) {
+      is_tlv_bootstrap[si + 1u] = 1u;
+      any_tlv = 1;
+    }
+  }
+  if (!any_tlv) {
+    h->free(h, is_tlv_bootstrap, nsyms + 1u);
+    return;
+  }
+
+  /* c->env was already dereferenced for the heap above, so it is not
+   * re-tested for NULL here. */
+  const CfreeJitTls* tls = c->env->jit_tls;
+  if (!tls || !tls->ctx_new || !tls->ctx_destroy)
+    compiler_panic(c, no_loc(),
+                   "cfree_jit: image needs TLV thunk but env->jit_tls is NULL "
+                   "or incomplete");
+
+  /* Snapshot the TLS image's init bytes from the write alias.  The
+   * write alias of any SF_TLS section stays readable for the lifetime
+   * of CfreeJit; ctx_new is expected to copy what it needs. */
+  const u8* init_bytes = NULL;
+  if (img->tls_filesz) {
+    init_bytes = (const u8*)vaddr_to_write(img, jit->segs, img->tls_vaddr);
+    if (!init_bytes)
+      compiler_panic(c, no_loc(),
+                     "cfree_jit: tls_vaddr does not map to any segment");
+  }
+  size_t align = img->tls_align ? (size_t)img->tls_align : 1u;
+  void* ctx = tls->ctx_new(tls->user, init_bytes, (size_t)img->tls_filesz,
+                           (size_t)img->tls_memsz, align);
+  if (!ctx)
+    compiler_panic(c, no_loc(), "cfree_jit: jit_tls->ctx_new returned NULL");
+
+  /* Record ownership before patching so cfree_jit_free can destroy the
+   * ctx. */
+  jit->tls_vtable = tls;
+  jit->tls_ctx = ctx;
+
+  uintptr_t thunk_addr = (uintptr_t)&cfree_jit_tlv_thunk;
+  u32 nrel = LinkRelocs_count(&img->relocs);
+  for (u32 i = 0; i < nrel; ++i) {
+    const LinkRelocApply* r = LinkRelocs_at(&img->relocs, i);
+    /* Ids past the symbol table would index outside the byte map. */
+    if (r->target == LINK_SYM_NONE || r->target > nsyms) continue;
+    if (!is_tlv_bootstrap[r->target]) continue;
+    if (r->kind != R_ABS64) continue;
+
+    u64 desc_vaddr = r->write_vaddr;
+
+    /* Locate the paired +16 reloc.  Quadratic, but reloc counts are
+     * small (one reloc pair per TLV var) so the inner scan amortizes. */
+    const LinkRelocApply* r16 = NULL;
+    for (u32 j = 0; j < nrel; ++j) {
+      const LinkRelocApply* q = LinkRelocs_at(&img->relocs, j);
+      if (q->kind == R_ABS64 && q->write_vaddr == desc_vaddr + 16u) {
+        r16 = q;
+        break;
+      }
+    }
+    if (!r16 || r16->target == LINK_SYM_NONE)
+      compiler_panic(c, no_loc(),
+                     "cfree_jit: TLV descriptor missing data-symbol reloc");
+
+    const LinkSymbol* data_sym = LinkSyms_at(&img->syms, r16->target - 1);
+    if (!data_sym || !data_sym->defined)
+      compiler_panic(c, no_loc(),
+                     "cfree_jit: TLV descriptor data symbol is undefined");
+    u64 offset_in_image =
+        (data_sym->vaddr + (u64)r16->addend) - img->tls_vaddr;
+    /* The thunk adds this offset to the per-thread block unchecked, so
+     * reject anything outside the TLS image here.  An address below
+     * tls_vaddr wraps to a huge unsigned value, so one compare covers
+     * both ends; offset == tls_memsz (one past the end) is allowed. */
+    if (offset_in_image > (u64)img->tls_memsz)
+      compiler_panic(c, no_loc(),
+                     "cfree_jit: TLV descriptor data symbol is outside the "
+                     "TLS image");
+
+    u8* write = (u8*)vaddr_to_write(img, jit->segs, desc_vaddr);
+    if (!write)
+      compiler_panic(c, no_loc(),
+                     "cfree_jit: TLV descriptor vaddr does not map");
+    wr_u64_le(write + 0u, (u64)thunk_addr);
+    wr_u64_le(write + 8u, (u64)(uintptr_t)ctx);
+    wr_u64_le(write + 16u, offset_in_image);
+  }
+
+  h->free(h, is_tlv_bootstrap, nsyms + 1u);
+}
+
CfreeJit* cfree_jit_from_image(LinkImage* img) {
Compiler* c;
Heap* heap;
@@ -265,6 +382,26 @@ CfreeJit* cfree_jit_from_image(LinkImage* img) {
continue;
}
}
+ /* Apple ld's "LDR -> ADD" TLVP relaxation, mandatory in the JIT.
+ *
+ * The Mach-O TLV access sequence loads `x0 = *thread_ptrs_slot`,
+ * where the slot's content is the descriptor's vaddr. The AOT
+ * Mach-O writer synthesizes a __thread_ptrs section to hold those
+ * slots; the JIT path does not (and doesn't need to: every TLV
+ * descriptor is in-image, so the load is one indirection too many).
+ *
+ * Rewrite `ldr xd, [xn, #imm12]` -> `add xd, xn, #imm12` so the
+ * descriptor address ends up directly in x0. Encoding shifts:
+ * LDR (uimm12, 64-bit): bits [31:22] = 1111100101 (0x3E5)
+ * ADD (immediate, 64-bit, sh=0): bits [31:22] = 1001000100 (0x244)
+ * Rn[9:5] / Rd|Rt[4:0] / imm12[21:10] stay in the same positions. */
+ if (r->kind == R_AARCH64_TLVP_LOAD_PAGEOFF12) {
+ u64 v = ((u64)S + (u64)r->addend) & 0xfffu;
+ u32 instr = rd_u32_le(P_bytes);
+ instr = 0x91000000u | (instr & 0x3ffu) | ((u32)v << 10);
+ wr_u32_le(P_bytes, instr);
+ continue;
+ }
link_reloc_apply(c, r->kind, P_bytes, S, r->addend, P);
}
@@ -330,6 +467,8 @@ CfreeJit* cfree_jit_from_image(LinkImage* img) {
jit->nsegs = img->nsegments;
jit->view = NULL;
jit->view_built = 0u;
+ jit->tls_vtable = NULL;
+ jit->tls_ctx = NULL;
/* Take ownership of the image: undefer it from the compiler so a
* future panic doesn't reap something we still hold. */
@@ -338,6 +477,12 @@ CfreeJit* cfree_jit_from_image(LinkImage* img) {
img->deferred = NULL;
}
+ /* Mach-O TLV descriptor pass: install our thunk into descriptor[+0],
+ * stash the per-image TLS ctx in [+8], and overwrite [+16] with the
+ * per-thread byte offset. No-op on ELF (no TLV descriptors) and on
+ * Mach-O images that have neither TLS nor TLV access sites. */
+ jit_patch_tlv_descriptors(jit);
+
/* Run .init_array constructors in forward order. */
{
typedef void (*VoidFn)(void);
@@ -367,6 +512,16 @@ void cfree_jit_free(CfreeJit* jit) {
cfree_obj_close(jit->view);
jit->view = NULL;
}
+ /* TLV ctx: ctx_destroy deletes the host's pthread key and frees its
+ * per-image state. pthread_key_delete never runs per-thread
+ * destructors (and none run for the key afterwards), so nothing here
+ * reads the segments; we still tear the ctx down before releasing
+ * them so no thunk-reachable state outlives the image. */
+ if (jit->tls_vtable && jit->tls_ctx) {
+ jit->tls_vtable->ctx_destroy(jit->tls_vtable->user, jit->tls_ctx);
+ jit->tls_ctx = NULL;
+ jit->tls_vtable = NULL;
+ }
/* segs[] are views into master — release master only. */
if (mem && mem->release && jit->master.size) {
mem->release(mem->user, &jit->master);
diff --git a/src/link/link_resolve.c b/src/link/link_resolve.c
@@ -229,6 +229,23 @@ void link_resolve_undefs(Linker* l, LinkImage* img) {
s->defined = 1;
continue;
}
+ /* JIT lane: Mach-O inputs (including clang-produced .o files)
+ * carry a non-weak undef `__tlv_bootstrap` on every TLV var.
+ * cfree_jit_from_image rewrites every descriptor's slot[0] to our
+ * thunk, so the resolved value never gets read — but we still need
+ * resolve_undefs to not panic. Treat the symbol as weak-undef
+ * (vaddr = 0, SK_ABS) in JIT mode only; AOT lanes keep the strict
+ * "undefined external" semantics. */
+ if (l->jit_mode && s->name != 0) {
+ size_t nlen;
+ const char* nm = pool_str(l->c->global, s->name, &nlen);
+ if (nm && nlen == 15u && memcmp(nm, "__tlv_bootstrap", 15u) == 0) {
+ s->kind = SK_ABS;
+ s->vaddr = 0;
+ s->defined = 1;
+ continue;
+ }
+ }
{
size_t namelen;
const char* nm = s->name ? pool_str(l->c->global, s->name, &namelen)
diff --git a/test/lib_deps.allowlist b/test/lib_deps.allowlist
@@ -2,9 +2,12 @@ ___memcpy_chk
___memset_chk
___stack_chk_fail
___stack_chk_guard
+_bzero
_longjmp
_memcmp
_memcpy
_memset
_setjmp
+_strcmp
_strlen
+_strstr
diff --git a/test/link/cases/36_tls_basic/j_targets b/test/link/cases/36_tls_basic/j_targets
@@ -1,3 +0,0 @@
-aa64-elf
-rv64-elf
-x64-elf
diff --git a/test/link/harness/jit_runner.c b/test/link/harness/jit_runner.c
@@ -249,6 +249,89 @@ static void* extern_resolver(void* user, const char* name) {
return &g_extern_default_value;
}
+/* ---- jit_tls (pthread-key backed) ----
+ * Mirrors driver/env.c's CfreeJitTls implementation so 36_tls_basic/J on
+ * aa64-macho can resolve TLV accesses without depending on the driver
+ * binary. The ctx layout is fixed by src/jit/tlv_thunk.h: first 8 bytes
+ * MUST be a function pointer `void* (*)(void* ctx)` that returns the
+ * per-thread block. */
+#include <pthread.h>
+typedef struct JitTlsCtx {
+ void* (*get_block)(void* ctx); /* must stay first: the asm thunk loads it from ctx+0 */
+ pthread_key_t key; /* per-thread value is that thread's TLS block */
+ size_t image_size; /* total bytes of one per-thread TLS block */
+ size_t image_filesz; /* leading bytes seeded from init_bytes */
+ size_t align; /* requested block alignment; 0 is stored as sizeof(void*) */
+ void* init_bytes; /* malloc'd copy of the init bytes, or NULL */
+} JitTlsCtx;
+static void jit_tls_thread_dtor(void* block) { free(block); } /* pthread key destructor: releases an exiting thread's block */
+static void* jit_tls_alloc_block(JitTlsCtx* ctx) {
+  /* Allocate and seed one thread's TLS block.  Returns NULL on failure;
+   * the caller aborts.
+   *
+   * macOS aligned_alloc requires alignment >= sizeof(void*) (8); bump
+   * smaller request alignments up.  Round size to a multiple of align
+   * (aligned_alloc requires this).  The mask-based round-up is only
+   * correct for a power-of-two alignment, so reject anything else. */
+  size_t a = ctx->align ? ctx->align : sizeof(void*);
+  if (a < sizeof(void*)) a = sizeof(void*);
+  if ((a & (a - 1u)) != 0) return NULL;
+  if (ctx->image_size > (size_t)-1 - (a - 1u)) return NULL; /* round-up would wrap */
+  size_t sz = (ctx->image_size + a - 1u) & ~(a - 1u);
+  if (sz == 0) sz = a;
+  void* block = aligned_alloc(a, sz);
+  if (!block) return NULL;
+  /* Zero the whole allocation so the BSS tail, the padding, and the
+   * initialized range (when there are no init bytes to copy) are all
+   * well-defined. */
+  memset(block, 0, sz);
+  /* Never copy past the image, even if image_filesz > image_size. */
+  size_t ninit = ctx->image_filesz < ctx->image_size ? ctx->image_filesz
+                                                     : ctx->image_size;
+  if (ninit && ctx->init_bytes) memcpy(block, ctx->init_bytes, ninit);
+  return block;
+}
+/* Thunk-facing entry: return the calling thread's TLS block, creating
+ * and publishing it on first use.  Any failure aborts the process. */
+static void* jit_tls_get_block(void* ctx_v) {
+  JitTlsCtx* self = (JitTlsCtx*)ctx_v;
+  void* mine = pthread_getspecific(self->key);
+  if (mine == NULL) {
+    mine = jit_tls_alloc_block(self);
+    if (mine == NULL) abort();
+    if (pthread_setspecific(self->key, mine) != 0) abort();
+  }
+  return mine;
+}
+/* CfreeJitTls.ctx_new for the harness: copy the init bytes (when both
+ * `init_bytes` and `image_filesz` are non-zero), record sizes and
+ * alignment (0 becomes sizeof(void*)), and create the pthread key.
+ * Returns NULL, leaking nothing, if an allocation or the key creation
+ * fails. */
+static void* jit_tls_ctx_new(void* user, const void* init_bytes,
+                             size_t image_filesz, size_t image_size,
+                             size_t align) {
+  JitTlsCtx* fresh;
+  void* seed = NULL;
+  (void)user;
+
+  fresh = (JitTlsCtx*)malloc(sizeof(*fresh));
+  if (fresh == NULL) return NULL;
+
+  if (init_bytes != NULL && image_filesz != 0) {
+    seed = malloc(image_filesz);
+    if (seed == NULL) goto fail;
+    memcpy(seed, init_bytes, image_filesz);
+  }
+
+  fresh->get_block = jit_tls_get_block;
+  fresh->image_size = image_size;
+  fresh->image_filesz = image_filesz;
+  fresh->align = (align != 0) ? align : sizeof(void*);
+  fresh->init_bytes = seed;
+
+  if (pthread_key_create(&fresh->key, jit_tls_thread_dtor) != 0) goto fail;
+  return fresh;
+
+fail:
+  free(seed); /* free(NULL) is a no-op */
+  free(fresh);
+  return NULL;
+}
+static void jit_tls_ctx_destroy(void* user, void* ctx_v) {
+ JitTlsCtx* ctx = (JitTlsCtx*)ctx_v;
+ (void)user;
+ if (!ctx) return;
+ void* my = pthread_getspecific(ctx->key);
+ if (my) { /* only the calling thread's block is reachable from here */
+ pthread_setspecific(ctx->key, NULL);
+ free(my);
+ }
+ pthread_key_delete(ctx->key); /* runs no destructors; NOTE(review): blocks held by other live threads are never freed after this */
+ free(ctx->init_bytes);
+ free(ctx);
+}
+static CfreeJitTls g_jit_tls = {jit_tls_ctx_new, jit_tls_ctx_destroy, NULL};
+
int main(int argc, char** argv) {
{
long ps = sysconf(_SC_PAGESIZE);
@@ -324,6 +407,7 @@ int main(int argc, char** argv) {
env.heap = &g_heap;
env.diag = &g_diag;
env.execmem = &g_execmem;
+ env.jit_tls = &g_jit_tls;
env.now = -1;
CfreeCompiler* c = cfree_compiler_new(target, &env);